From a4bef0ca826a8145ef3cb288846017c034a817c2 Mon Sep 17 00:00:00 2001
From: "Stephan T. Lavavej" <stl@nuwen.net>
Date: Tue, 28 May 2024 12:15:50 -0700
Subject: [PATCH 001/230] [libc++] Mark P2845R8 `__cpp_lib_format_path` and
 P2587R3 `__cpp_lib_to_string` as C++26 (#93255)

[P2845R8](https://wg21.link/P2845R8) "Formatting of
`std::filesystem::path`" and [P2587R3](https://wg21.link/P2587R3)
"`to_string` or not `to_string`" are C++26 features, so they should be
marked accordingly in `generate_feature_test_macro_components.py`.

I verified that without my changes, running the script produced no
edits. Then with my changes, I ran the script to regenerate all files,
with no other manual edits.

Found while running libc++'s tests with MSVC's STL, which noticed this
because it's currently a C++23-only implementation.

Note that @H-G-Hristov has a draft implementation of P2587R3: #78100
---
 libcxx/docs/FeatureTestMacroTable.rst         |  8 ++--
 libcxx/include/version                        |  4 +-
 .../filesystem.version.compile.pass.cpp       | 23 +++-------
 .../string.version.compile.pass.cpp           | 23 +++-------
 .../version.version.compile.pass.cpp          | 46 ++++++-------------
 .../generate_feature_test_macro_components.py |  4 +-
 6 files changed, 36 insertions(+), 72 deletions(-)

diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst
index 17d2da907692e8..0297068785e8b8 100644
--- a/libcxx/docs/FeatureTestMacroTable.rst
+++ b/libcxx/docs/FeatureTestMacroTable.rst
@@ -326,8 +326,6 @@ Status
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_expected``                                     ``202211L``
     ---------------------------------------------------------- -----------------
-    ``__cpp_lib_format_path``                                  *unimplemented*
-    ---------------------------------------------------------- -----------------
     ``__cpp_lib_format_ranges``                                ``202207L``
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_formatters``                                   *unimplemented*
@@ -386,8 +384,6 @@ Status
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_string_resize_and_overwrite``                  ``202110L``
     ---------------------------------------------------------- -----------------
-    ``__cpp_lib_to_string``                                    *unimplemented*
-    ---------------------------------------------------------- -----------------
     ``__cpp_lib_to_underlying``                                ``202102L``
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_tuple_like``                                   *unimplemented*
@@ -412,6 +408,8 @@ Status
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_default_template_type_for_algorithm_values``   *unimplemented*
     ---------------------------------------------------------- -----------------
+    ``__cpp_lib_format_path``                                  *unimplemented*
+    ---------------------------------------------------------- -----------------
     ``__cpp_lib_freestanding_algorithm``                       *unimplemented*
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_freestanding_array``                           *unimplemented*
@@ -466,6 +464,8 @@ Status
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_to_chars``                                     *unimplemented*
     ---------------------------------------------------------- -----------------
+    ``__cpp_lib_to_string``                                    *unimplemented*
+    ---------------------------------------------------------- -----------------
     ``__cpp_lib_tuple_like``                                   *unimplemented*
     ========================================================== =================
 
diff --git a/libcxx/include/version b/libcxx/include/version
index 69556d731f1cfc..140a9a0d870360 100644
--- a/libcxx/include/version
+++ b/libcxx/include/version
@@ -459,7 +459,6 @@ __cpp_lib_void_t                                        201411L <type_traits>
 # define __cpp_lib_constexpr_typeinfo                   202106L
 # define __cpp_lib_containers_ranges                    202202L
 # define __cpp_lib_expected                             202211L
-// # define __cpp_lib_format_path                          202403L
 # define __cpp_lib_format_ranges                        202207L
 // # define __cpp_lib_formatters                           202302L
 # define __cpp_lib_forward_like                         202207L
@@ -490,7 +489,6 @@ __cpp_lib_void_t                                        201411L <type_traits>
 # define __cpp_lib_stdatomic_h                          202011L
 # define __cpp_lib_string_contains                      202011L
 # define __cpp_lib_string_resize_and_overwrite          202110L
-// # define __cpp_lib_to_string                            202306L
 # define __cpp_lib_to_underlying                        202102L
 // # define __cpp_lib_tuple_like                           202207L
 # define __cpp_lib_unreachable                          202202L
@@ -506,6 +504,7 @@ __cpp_lib_void_t                                        201411L <type_traits>
 // # define __cpp_lib_copyable_function                    202306L
 // # define __cpp_lib_debugging                            202311L
 // # define __cpp_lib_default_template_type_for_algorithm_values 202403L
+// # define __cpp_lib_format_path                          202403L
 // # define __cpp_lib_freestanding_algorithm               202311L
 // # define __cpp_lib_freestanding_array                   202311L
 // # define __cpp_lib_freestanding_cstring                 202306L
@@ -537,6 +536,7 @@ __cpp_lib_void_t                                        201411L <type_traits>
 // # define __cpp_lib_text_encoding                        202306L
 # undef  __cpp_lib_to_chars
 // # define __cpp_lib_to_chars                             202306L
+// # define __cpp_lib_to_string                            202306L
 # undef  __cpp_lib_tuple_like
 // # define __cpp_lib_tuple_like                           202311L
 #endif
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp
index 308cc2d43b0586..4aba33482f69c4 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp
@@ -20,7 +20,7 @@
 /*  Constant                 Value
     __cpp_lib_char8_t        201907L [C++20]
     __cpp_lib_filesystem     201703L [C++17]
-    __cpp_lib_format_path    202403L [C++23]
+    __cpp_lib_format_path    202403L [C++26]
 */
 
 #include <filesystem>
@@ -37,7 +37,7 @@
 # endif
 
 # ifdef __cpp_lib_format_path
-#   error "__cpp_lib_format_path should not be defined before c++23"
+#   error "__cpp_lib_format_path should not be defined before c++26"
 # endif
 
 #elif TEST_STD_VER == 14
@@ -51,7 +51,7 @@
 # endif
 
 # ifdef __cpp_lib_format_path
-#   error "__cpp_lib_format_path should not be defined before c++23"
+#   error "__cpp_lib_format_path should not be defined before c++26"
 # endif
 
 #elif TEST_STD_VER == 17
@@ -74,7 +74,7 @@
 # endif
 
 # ifdef __cpp_lib_format_path
-#   error "__cpp_lib_format_path should not be defined before c++23"
+#   error "__cpp_lib_format_path should not be defined before c++26"
 # endif
 
 #elif TEST_STD_VER == 20
@@ -106,7 +106,7 @@
 # endif
 
 # ifdef __cpp_lib_format_path
-#   error "__cpp_lib_format_path should not be defined before c++23"
+#   error "__cpp_lib_format_path should not be defined before c++26"
 # endif
 
 #elif TEST_STD_VER == 23
@@ -137,17 +137,8 @@
 #   endif
 # endif
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_format_path
-#     error "__cpp_lib_format_path should be defined in c++23"
-#   endif
-#   if __cpp_lib_format_path != 202403L
-#     error "__cpp_lib_format_path should have the value 202403L in c++23"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_format_path
-#     error "__cpp_lib_format_path should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifdef __cpp_lib_format_path
+#   error "__cpp_lib_format_path should not be defined before c++26"
 # endif
 
 #elif TEST_STD_VER > 23
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp
index 16a9a0a28de635..af6386a40a458a 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp
@@ -29,7 +29,7 @@
     __cpp_lib_string_udls                                   201304L [C++14]
     __cpp_lib_string_view                                   201606L [C++17]
                                                             201803L [C++20]
-    __cpp_lib_to_string                                     202306L [C++23]
+    __cpp_lib_to_string                                     202306L [C++26]
 */
 
 #include <string>
@@ -86,7 +86,7 @@
 # endif
 
 # ifdef __cpp_lib_to_string
-#   error "__cpp_lib_to_string should not be defined before c++23"
+#   error "__cpp_lib_to_string should not be defined before c++26"
 # endif
 
 #elif TEST_STD_VER == 14
@@ -143,7 +143,7 @@
 # endif
 
 # ifdef __cpp_lib_to_string
-#   error "__cpp_lib_to_string should not be defined before c++23"
+#   error "__cpp_lib_to_string should not be defined before c++26"
 # endif
 
 #elif TEST_STD_VER == 17
@@ -209,7 +209,7 @@
 # endif
 
 # ifdef __cpp_lib_to_string
-#   error "__cpp_lib_to_string should not be defined before c++23"
+#   error "__cpp_lib_to_string should not be defined before c++26"
 # endif
 
 #elif TEST_STD_VER == 20
@@ -293,7 +293,7 @@
 # endif
 
 # ifdef __cpp_lib_to_string
-#   error "__cpp_lib_to_string should not be defined before c++23"
+#   error "__cpp_lib_to_string should not be defined before c++26"
 # endif
 
 #elif TEST_STD_VER == 23
@@ -385,17 +385,8 @@
 #   error "__cpp_lib_string_view should have the value 201803L in c++23"
 # endif
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_to_string
-#     error "__cpp_lib_to_string should be defined in c++23"
-#   endif
-#   if __cpp_lib_to_string != 202306L
-#     error "__cpp_lib_to_string should have the value 202306L in c++23"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_to_string
-#     error "__cpp_lib_to_string should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifdef __cpp_lib_to_string
+#   error "__cpp_lib_to_string should not be defined before c++26"
 # endif
 
 #elif TEST_STD_VER > 23
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
index 7829e06f90760b..c1e1f9f340af48 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
@@ -88,7 +88,7 @@
     __cpp_lib_expected                                      202211L [C++23]
     __cpp_lib_filesystem                                    201703L [C++17]
     __cpp_lib_format                                        202106L [C++20]
-    __cpp_lib_format_path                                   202403L [C++23]
+    __cpp_lib_format_path                                   202403L [C++26]
     __cpp_lib_format_ranges                                 202207L [C++23]
     __cpp_lib_format_uchar                                  202311L [C++20]
     __cpp_lib_formatters                                    202302L [C++23]
@@ -216,7 +216,7 @@
     __cpp_lib_to_array                                      201907L [C++20]
     __cpp_lib_to_chars                                      201611L [C++17]
                                                             202306L [C++26]
-    __cpp_lib_to_string                                     202306L [C++23]
+    __cpp_lib_to_string                                     202306L [C++26]
     __cpp_lib_to_underlying                                 202102L [C++23]
     __cpp_lib_transformation_trait_aliases                  201304L [C++14]
     __cpp_lib_transparent_operators                         201210L [C++14]
@@ -513,7 +513,7 @@
 # endif
 
 # ifdef __cpp_lib_format_path
-#   error "__cpp_lib_format_path should not be defined before c++23"
+#   error "__cpp_lib_format_path should not be defined before c++26"
 # endif
 
 # ifdef __cpp_lib_format_ranges
@@ -1005,7 +1005,7 @@
 # endif
 
 # ifdef __cpp_lib_to_string
-#   error "__cpp_lib_to_string should not be defined before c++23"
+#   error "__cpp_lib_to_string should not be defined before c++26"
 # endif
 
 # ifdef __cpp_lib_to_underlying
@@ -1348,7 +1348,7 @@
 # endif
 
 # ifdef __cpp_lib_format_path
-#   error "__cpp_lib_format_path should not be defined before c++23"
+#   error "__cpp_lib_format_path should not be defined before c++26"
 # endif
 
 # ifdef __cpp_lib_format_ranges
@@ -1891,7 +1891,7 @@
 # endif
 
 # ifdef __cpp_lib_to_string
-#   error "__cpp_lib_to_string should not be defined before c++23"
+#   error "__cpp_lib_to_string should not be defined before c++26"
 # endif
 
 # ifdef __cpp_lib_to_underlying
@@ -2303,7 +2303,7 @@
 # endif
 
 # ifdef __cpp_lib_format_path
-#   error "__cpp_lib_format_path should not be defined before c++23"
+#   error "__cpp_lib_format_path should not be defined before c++26"
 # endif
 
 # ifdef __cpp_lib_format_ranges
@@ -2972,7 +2972,7 @@
 # endif
 
 # ifdef __cpp_lib_to_string
-#   error "__cpp_lib_to_string should not be defined before c++23"
+#   error "__cpp_lib_to_string should not be defined before c++26"
 # endif
 
 # ifdef __cpp_lib_to_underlying
@@ -3543,7 +3543,7 @@
 # endif
 
 # ifdef __cpp_lib_format_path
-#   error "__cpp_lib_format_path should not be defined before c++23"
+#   error "__cpp_lib_format_path should not be defined before c++26"
 # endif
 
 # ifdef __cpp_lib_format_ranges
@@ -4350,7 +4350,7 @@
 # endif
 
 # ifdef __cpp_lib_to_string
-#   error "__cpp_lib_to_string should not be defined before c++23"
+#   error "__cpp_lib_to_string should not be defined before c++26"
 # endif
 
 # ifdef __cpp_lib_to_underlying
@@ -4971,17 +4971,8 @@
 #   endif
 # endif
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_format_path
-#     error "__cpp_lib_format_path should be defined in c++23"
-#   endif
-#   if __cpp_lib_format_path != 202403L
-#     error "__cpp_lib_format_path should have the value 202403L in c++23"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_format_path
-#     error "__cpp_lib_format_path should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifdef __cpp_lib_format_path
+#   error "__cpp_lib_format_path should not be defined before c++26"
 # endif
 
 # ifndef __cpp_lib_format_ranges
@@ -5943,17 +5934,8 @@
 #   endif
 # endif
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_to_string
-#     error "__cpp_lib_to_string should be defined in c++23"
-#   endif
-#   if __cpp_lib_to_string != 202306L
-#     error "__cpp_lib_to_string should have the value 202306L in c++23"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_to_string
-#     error "__cpp_lib_to_string should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifdef __cpp_lib_to_string
+#   error "__cpp_lib_to_string should not be defined before c++26"
 # endif
 
 # ifndef __cpp_lib_to_underlying
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index b04cb4f5115547..1e79f6c140758c 100755
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -515,7 +515,7 @@ def add_version_header(tc):
         },
         {
             "name": "__cpp_lib_format_path",
-            "values": {"c++23": 202403}, # P2845R8: Formatting of std::filesystem::path
+            "values": {"c++26": 202403},  # P2845R8: Formatting of std::filesystem::path
             "headers": ["filesystem"],
             "unimplemented": True,
         },
@@ -1270,7 +1270,7 @@ def add_version_header(tc):
         },
         {
             "name": "__cpp_lib_to_string",
-            "values": {"c++23": 202306},  # P2587R3 to_string or not to_string
+            "values": {"c++26": 202306},  # P2587R3 to_string or not to_string
             "headers": ["string"],
             "unimplemented": True,
         },

From 51752ed0dd737f12014a89dec67d25494083153d Mon Sep 17 00:00:00 2001
From: Guray Ozen <guray.ozen@gmail.com>
Date: Tue, 28 May 2024 21:17:31 +0200
Subject: [PATCH 002/230] [mlir][nvgpu] verify the module

---
 mlir/test/Examples/NVGPU/tools/nvdsl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/test/Examples/NVGPU/tools/nvdsl.py b/mlir/test/Examples/NVGPU/tools/nvdsl.py
index 600cae5b47eeec..90dbb2355e1c87 100644
--- a/mlir/test/Examples/NVGPU/tools/nvdsl.py
+++ b/mlir/test/Examples/NVGPU/tools/nvdsl.py
@@ -431,7 +431,7 @@ def __str__(self):
                 # saveIR(module)
 
                 # Verify the module
-                # module.operation.verify()
+                module.operation.verify()
 
                 # Compile and JIT MLIR module
                 options = f"cubin-chip=sm_90a cubin-features=+ptx80 opt-level=3"

From 266fac8375bdf3f039503c559bb16ffab8895ae5 Mon Sep 17 00:00:00 2001
From: "Stephan T. Lavavej" <stl@nuwen.net>
Date: Tue, 28 May 2024 12:17:57 -0700
Subject: [PATCH 003/230] [libc++] [test] Fix MSVC warnings (#93257)

Found while running libc++'s tests with MSVC's STL.

* Avoid MSVC warning C5101: use of preprocessor directive in
  function-like macro argument list is undefined behavior.
  + We can easily make this portable by extracting `const bool is_newlib`.
  + Followup to #73440.
  + See #73598.
  + See #73836.
* Avoid MSVC warning C4267: 'return': conversion from 'size_t' to 'int',
  possible loss of data.
  + This warning is valid, but harmless for the test, so
    `static_cast<int>` will avoid it.
* Avoid MSVC warning C4146: unary minus operator applied to unsigned
  type, result still unsigned.
  + This warning is also valid (the scenario is sometimes intentional, but
    surprising enough that it's worth warning about). This is a C++17 test,
    so we can easily avoid it by testing `is_signed_v` at compile-time
    before testing `m < 0` and `n < 0` at run-time.
* Silence MSVC warning C4310: cast truncates constant value.
  + These warnings are being emitted by `T(255)`. Disabling the warning is
    simpler than attempting to restructure the code.
  + Followup to #79791.
* MSVC no longer emits warning C4521: multiple copy constructors
  specified.
  + This warning has been removed from the compiler since at least 2021-12-09.
---
 .../atomics.ref/compare_exchange_strong.pass.cpp      |  3 +++
 .../atomics.ref/compare_exchange_weak.pass.cpp        |  3 +++
 libcxx/test/std/atomics/atomics.ref/wait.pass.cpp     |  3 +++
 .../views.span/span.cons/initializer_list.pass.cpp    |  4 ++--
 .../syserr.errcat.objects/generic_category.pass.cpp   | 11 +++++++----
 .../syserr.errcat.objects/system_category.pass.cpp    | 11 +++++++----
 .../numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp | 10 ++++++----
 libcxx/test/support/msvc_stdlib_force_include.h       |  1 -
 8 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp b/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp
index 72b2f444c476c7..90aa5ea5b6df45 100644
--- a/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp
@@ -9,6 +9,9 @@
 // XFAIL: !has-64-bit-atomics
 // XFAIL: !has-1024-bit-atomics
 
+// MSVC warning C4310: cast truncates constant value
+// ADDITIONAL_COMPILE_FLAGS(cl-style-warnings): /wd4310
+
 // bool compare_exchange_strong(T&, T, memory_order, memory_order) const noexcept;
 // bool compare_exchange_strong(T&, T, memory_order = memory_order::seq_cst) const noexcept;
 
diff --git a/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp b/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp
index 5219a8e3714f98..99c1385a2fe0b7 100644
--- a/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp
@@ -9,6 +9,9 @@
 // XFAIL: !has-64-bit-atomics
 // XFAIL: !has-1024-bit-atomics
 
+// MSVC warning C4310: cast truncates constant value
+// ADDITIONAL_COMPILE_FLAGS(cl-style-warnings): /wd4310
+
 // bool compare_exchange_weak(T&, T, memory_order, memory_order) const noexcept;
 // bool compare_exchange_weak(T&, T, memory_order = memory_order::seq_cst) const noexcept;
 
diff --git a/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp b/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp
index e5310febf5c5eb..f246803ba25925 100644
--- a/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp
@@ -11,6 +11,9 @@
 // XFAIL: !has-64-bit-atomics
 // XFAIL: !has-1024-bit-atomics
 
+// MSVC warning C4310: cast truncates constant value
+// ADDITIONAL_COMPILE_FLAGS(cl-style-warnings): /wd4310
+
 // void wait(T, memory_order = memory_order::seq_cst) const noexcept;
 
 #include <atomic>
diff --git a/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp b/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp
index 74a5094f61261d..bc76e23fea3c03 100644
--- a/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp
@@ -93,9 +93,9 @@ constexpr bool test() {
 
 // Test P2447R4 "Annex C examples"
 
-constexpr int three(std::span<void* const> sp) { return sp.size(); }
+constexpr int three(std::span<void* const> sp) { return static_cast<int>(sp.size()); }
 
-constexpr int four(std::span<const std::any> sp) { return sp.size(); }
+constexpr int four(std::span<const std::any> sp) { return static_cast<int>(sp.size()); }
 
 bool test_P2447R4_annex_c_examples() {
   // 1. Overload resolution is affected
diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp
index d4bbde75ae8821..7283fdc769d86b 100644
--- a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp
+++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp
@@ -50,13 +50,16 @@ int main(int, char**)
         // responds with an empty message, which we probably want to
         // treat as a failure code otherwise, but we can detect that
         // with the preprocessor.
+#if defined(_NEWLIB_VERSION)
+        const bool is_newlib = true;
+#else
+        const bool is_newlib = false;
+#endif
+        (void)is_newlib;
         LIBCPP_ASSERT(msg.rfind("Error -1 occurred", 0) == 0       // AIX
                       || msg.rfind("No error information", 0) == 0 // Musl
                       || msg.rfind("Unknown error", 0) == 0        // Glibc
-#if defined(_NEWLIB_VERSION)
-                      || msg.empty()
-#endif
-        );
+                      || (is_newlib && msg.empty()));
         assert(errno == E2BIG);
     }
 
diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp
index eefbddd27a7f53..02a1baf5999831 100644
--- a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp
+++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp
@@ -56,13 +56,16 @@ int main(int, char**) {
     // responds with an empty message, which we probably want to
     // treat as a failure code otherwise, but we can detect that
     // with the preprocessor.
+#if defined(_NEWLIB_VERSION)
+    const bool is_newlib = true;
+#else
+    const bool is_newlib = false;
+#endif
+    (void)is_newlib;
     LIBCPP_ASSERT(msg.rfind("Error -1 occurred", 0) == 0       // AIX
                   || msg.rfind("No error information", 0) == 0 // Musl
                   || msg.rfind("Unknown error", 0) == 0        // Glibc
-#if defined(_NEWLIB_VERSION)
-                  || msg.empty()
-#endif
-    );
+                  || (is_newlib && msg.empty()));
     assert(errno == E2BIG);
   }
 
diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp
index 212804356a056d..bf40b174b209cc 100644
--- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp
+++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp
@@ -57,10 +57,12 @@ T basic_gcd_(T m, T n) {
 template <typename T>
 T basic_gcd(T m, T n) {
   using Tp = std::make_unsigned_t<T>;
-  if (m < 0 && m != std::numeric_limits<T>::min())
-    m = -m;
-  if (n < 0 && n != std::numeric_limits<T>::min())
-    n = -n;
+  if constexpr (std::is_signed_v<T>) {
+    if (m < 0 && m != std::numeric_limits<T>::min())
+      m = -m;
+    if (n < 0 && n != std::numeric_limits<T>::min())
+      n = -n;
+  }
   return basic_gcd_(static_cast<Tp>(m), static_cast<Tp>(n));
 }
 
diff --git a/libcxx/test/support/msvc_stdlib_force_include.h b/libcxx/test/support/msvc_stdlib_force_include.h
index 6c26085e72c45f..35783c1607b0e0 100644
--- a/libcxx/test/support/msvc_stdlib_force_include.h
+++ b/libcxx/test/support/msvc_stdlib_force_include.h
@@ -67,7 +67,6 @@ const AssertionDialogAvoider assertion_dialog_avoider{};
 // Silence compiler warnings.
 #  pragma warning(disable : 4180)  // qualifier applied to function type has no meaning; ignored
 #  pragma warning(disable : 4324)  // structure was padded due to alignment specifier
-#  pragma warning(disable : 4521)  // multiple copy constructors specified
 #  pragma warning(disable : 4702)  // unreachable code
 #  pragma warning(disable : 28251) // Inconsistent annotation for 'new': this instance has no annotations.
 #endif                             // !defined(__clang__)

From 2ba08386156ef25913b1bee170d8fe95aaceb234 Mon Sep 17 00:00:00 2001
From: "Stephan T. Lavavej" <stl@nuwen.net>
Date: Tue, 28 May 2024 12:20:58 -0700
Subject: [PATCH 004/230] [libc++] [test] Fix portability issues for MSVC
 (#93259)

* Guard `std::__make_from_tuple_impl` tests with `#ifdef _LIBCPP_VERSION` and `LIBCPP_STATIC_ASSERT`.
* Change `_LIBCPP_CONSTEXPR_SINCE_CXX20` to `TEST_CONSTEXPR_CXX20`.
  + Other functions in `variant.swap/swap.pass.cpp` were already using the proper test macro.
* Mark `what` as `[[maybe_unused]]` when used by `TEST_LIBCPP_REQUIRE`.
  + This updates one occurrence in `libcxx/test/libcxx` for consistency.
* Windows `_putenv_s()` takes 2 arguments, not 3.
  + See MSVC documentation: https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/putenv-s-wputenv-s?view=msvc-170
  + POSIX `setenv()` takes `int overwrite`, but Windows `_putenv_s()` always overwrites.
* Avoid non-Standard zero-length arrays.
  + Followup to #74183 and #79792.
* Add `operator++()` to `unsized_it`.
  + The Standard requires this due to [N4981][] [move.iter.requirements]/1 "The template parameter `Iterator` shall
    either meet the *Cpp17InputIterator* requirements ([input.iterators])
    or model `input_iterator` ([iterator.concept.input])."
  + MSVC's STL requires this because it has a strengthened exception
    specification in `move_iterator` that inspects the underlying iterator's
    increment operator.
* `uniform_int_distribution` forbids `int8_t`/`uint8_t`.
  + See [N4981][] [rand.req.genl]/1.5. MSVC's STL enforces this.
  + Note that when changing the distribution's `IntType`, we need to be
    careful to preserve the original value range of `[0, max_input]`.
* fstreams are constructible from `const fs::path::value_type*` on wide systems.
  + See [ifstream.cons], [ofstream.cons], [fstream.cons].
* In `msvc_stdlib_force_include.h`, map `_HAS_CXX23` to `TEST_STD_VER` 23 instead of 99.
  + On 2023-05-23, https://github.com/llvm/llvm-project/commit/71400505ca048507e827013eb1ea0bc863525cab
    started recognizing 23 as a distinct value.
* Fix test name typo: `destory_elements.pass.cpp` => `destroy_elements.pass.cpp`

[N4981]: https://wg21.link/N4981
---
 .../time.zone.db.tzdb/locate_zone.pass.cpp    |  2 +-
 .../ranges.contains_subrange.pass.cpp         | 25 +++++++++--------
 ...nts.pass.cpp => destroy_elements.pass.cpp} |  0
 .../fstreams/fstream.cons/path.pass.cpp       |  2 +-
 .../fstreams/ifstream.cons/path.pass.cpp      |  2 +-
 .../fstreams/ofstream.cons/path.pass.cpp      |  2 +-
 .../sized_sentinel.compile.pass.cpp           |  1 +
 .../numeric.ops/numeric.ops.gcd/gcd.pass.cpp  |  9 ++++--
 .../time.zone.db.access/current_zone.pass.cpp |  2 +-
 .../time.zone.db.access/locate_zone.pass.cpp  |  2 +-
 .../time.zone.db.tzdb/current_zone.pass.cpp   |  2 +-
 .../time.zone.db.tzdb/locate_zone.pass.cpp    |  2 +-
 .../tuple.apply/make_from_tuple.pass.cpp      | 28 ++++++++++---------
 .../variant.swap/swap.pass.cpp                |  2 +-
 .../test/support/msvc_stdlib_force_include.h  |  2 +-
 15 files changed, 45 insertions(+), 38 deletions(-)
 rename libcxx/test/std/containers/sequences/vector/vector.modifiers/{destory_elements.pass.cpp => destroy_elements.pass.cpp} (100%)

diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp
index 3ee213358f3524..08c682964c3745 100644
--- a/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp
+++ b/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp
@@ -73,7 +73,7 @@ L link link_to_link
   TEST_VALIDATE_EXCEPTION(
       std::runtime_error,
       [&]([[maybe_unused]] const std::runtime_error& e) {
-        std::string_view what{"tzdb: requested time zone not found"};
+        [[maybe_unused]] std::string_view what{"tzdb: requested time zone not found"};
         TEST_LIBCPP_REQUIRE(
             e.what() == what,
             TEST_WRITE_CONCATENATED("\nExpected exception ", what, "\nActual exception   ", e.what(), '\n'));
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp
index 761691c2afdcb9..890ac23fff8327 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp
@@ -24,6 +24,7 @@
 //                                              Proj1 proj1 = {}, Proj2 proj2 = {});                 // since C++23
 
 #include <algorithm>
+#include <array>
 #include <cassert>
 #include <concepts>
 #include <ranges>
@@ -130,10 +131,10 @@ constexpr void test_iterators() {
   }
 
   { // range has zero length
-    int a[]       = {};
-    int p[]       = {3, 4, 2};
-    auto whole    = std::ranges::subrange(Iter1(a), Sent1(Iter1(a)));
-    auto subrange = std::ranges::subrange(Iter2(p), Sent2(Iter2(std::end(p))));
+    std::array<int, 0> a = {};
+    int p[]              = {3, 4, 2};
+    auto whole           = std::ranges::subrange(Iter1(a.data()), Sent1(Iter1(a.data())));
+    auto subrange        = std::ranges::subrange(Iter2(p), Sent2(Iter2(std::end(p))));
     {
       bool ret = std::ranges::contains_subrange(whole.begin(), whole.end(), subrange.begin(), subrange.end());
       assert(!ret);
@@ -145,10 +146,10 @@ constexpr void test_iterators() {
   }
 
   { // subrange has zero length
-    int a[]       = {3, 4, 2};
-    int p[]       = {};
-    auto whole    = std::ranges::subrange(Iter1(a), Sent1(Iter1(std::end(a))));
-    auto subrange = std::ranges::subrange(Iter2(p), Sent2(Iter2(p)));
+    int a[]              = {3, 4, 2};
+    std::array<int, 0> p = {};
+    auto whole           = std::ranges::subrange(Iter1(a), Sent1(Iter1(std::end(a))));
+    auto subrange        = std::ranges::subrange(Iter2(p.data()), Sent2(Iter2(p.data())));
     {
       bool ret = std::ranges::contains_subrange(whole.begin(), whole.end(), subrange.begin(), subrange.end());
       assert(ret);
@@ -160,10 +161,10 @@ constexpr void test_iterators() {
   }
 
   { // range and subrange both have zero length
-    int a[]       = {};
-    int p[]       = {};
-    auto whole    = std::ranges::subrange(Iter1(a), Sent1(Iter1(a)));
-    auto subrange = std::ranges::subrange(Iter2(p), Sent2(Iter2(p)));
+    std::array<int, 0> a = {};
+    std::array<int, 0> p = {};
+    auto whole           = std::ranges::subrange(Iter1(a.data()), Sent1(Iter1(a.data())));
+    auto subrange        = std::ranges::subrange(Iter2(p.data()), Sent2(Iter2(p.data())));
     {
       bool ret = std::ranges::contains_subrange(whole.begin(), whole.end(), subrange.begin(), subrange.end());
       assert(ret);
diff --git a/libcxx/test/std/containers/sequences/vector/vector.modifiers/destory_elements.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.modifiers/destroy_elements.pass.cpp
similarity index 100%
rename from libcxx/test/std/containers/sequences/vector/vector.modifiers/destory_elements.pass.cpp
rename to libcxx/test/std/containers/sequences/vector/vector.modifiers/destroy_elements.pass.cpp
diff --git a/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/path.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/path.pass.cpp
index 5edf22eaacf31f..d6bb56d9b78b79 100644
--- a/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/path.pass.cpp
+++ b/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/path.pass.cpp
@@ -37,7 +37,7 @@ constexpr bool test_non_convert_to_path() {
   static_assert(!std::is_constructible_v<std::fstream, const std::basic_string_view<CharT>>);
 
   // Char* pointers
-  if constexpr (!std::is_same_v<CharT, char>)
+  if constexpr (!std::is_same_v<CharT, char> && !std::is_same_v<CharT, fs::path::value_type>)
     static_assert(!std::is_constructible_v<std::fstream, const CharT*>);
 
   // Iterators
diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp
index 2f27fd8e6e93d3..792b65615679a7 100644
--- a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp
+++ b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp
@@ -38,7 +38,7 @@ constexpr bool test_non_convert_to_path() {
   static_assert(!std::is_constructible_v<std::ifstream, const std::basic_string_view<CharT>>);
 
   // Char* pointers
-  if constexpr (!std::is_same_v<CharT, char>)
+  if constexpr (!std::is_same_v<CharT, char> && !std::is_same_v<CharT, fs::path::value_type>)
     static_assert(!std::is_constructible_v<std::ifstream, const CharT*>);
 
   // Iterators
diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/path.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/path.pass.cpp
index e55adfd83fc3c7..602bdadd85813f 100644
--- a/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/path.pass.cpp
+++ b/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/path.pass.cpp
@@ -37,7 +37,7 @@ constexpr bool test_non_convert_to_path() {
   static_assert(!std::is_constructible_v<std::ofstream, const std::basic_string_view<CharT>>);
 
   // Char* pointers
-  if constexpr (!std::is_same_v<CharT, char>)
+  if constexpr (!std::is_same_v<CharT, char> && !std::is_same_v<CharT, fs::path::value_type>)
     static_assert(!std::is_constructible_v<std::ofstream, const CharT*>);
 
   // Iterators
diff --git a/libcxx/test/std/iterators/predef.iterators/move.iterators/sized_sentinel.compile.pass.cpp b/libcxx/test/std/iterators/predef.iterators/move.iterators/sized_sentinel.compile.pass.cpp
index cb49086dd6802b..998b13ed494552 100644
--- a/libcxx/test/std/iterators/predef.iterators/move.iterators/sized_sentinel.compile.pass.cpp
+++ b/libcxx/test/std/iterators/predef.iterators/move.iterators/sized_sentinel.compile.pass.cpp
@@ -21,6 +21,7 @@ struct unsized_it {
   using difference_type = std::ptrdiff_t;
 
   value_type& operator*() const;
+  unsized_it& operator++();
   bool operator==(const unsized_it&) const;
   difference_type operator-(const unsized_it&) const { return 0; }
 };
diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp
index bf40b174b209cc..6a9ec1a2ffec24 100644
--- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp
+++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp
@@ -17,6 +17,7 @@
 #include <cassert>
 #include <climits>
 #include <cstdint>
+#include <limits>
 #include <random>
 #include <type_traits>
 
@@ -69,12 +70,14 @@ T basic_gcd(T m, T n) {
 template <typename Input>
 void do_fuzzy_tests() {
   std::mt19937 gen(1938);
-  std::uniform_int_distribution<Input> distrib;
+  using DistIntType         = std::conditional_t<sizeof(Input) == 1, int, Input>; // See N4981 [rand.req.genl]/1.5
+  constexpr Input max_input = std::numeric_limits<Input>::max();
+  std::uniform_int_distribution<DistIntType> distrib(0, max_input);
 
   constexpr int nb_rounds = 10000;
   for (int i = 0; i < nb_rounds; ++i) {
-    Input n = distrib(gen);
-    Input m = distrib(gen);
+    Input n = static_cast<Input>(distrib(gen));
+    Input m = static_cast<Input>(distrib(gen));
     assert(std::gcd(n, m) == basic_gcd(n, m));
   }
 }
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp
index 2c43e121613c77..f31a679dd6214f 100644
--- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp
@@ -32,7 +32,7 @@ static void set_tz(std::string zone) {
   // Unlike POSIX it does not mention the string of putenv becomes part
   // of the environment.
 
-  int status = _putenv_s("TZ", zone.c_str(), 1);
+  int status = _putenv_s("TZ", zone.c_str());
   assert(status == 0);
 }
 
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp
index 4d600fcdf40e3f..8dd895fd21814f 100644
--- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp
@@ -40,7 +40,7 @@ static void test_exception([[maybe_unused]] std::string_view zone) {
   TEST_VALIDATE_EXCEPTION(
       std::runtime_error,
       [&]([[maybe_unused]] const std::runtime_error& e) {
-        std::string_view what{"tzdb: requested time zone not found"};
+        [[maybe_unused]] std::string_view what{"tzdb: requested time zone not found"};
         TEST_LIBCPP_REQUIRE(
             e.what() == what,
             TEST_WRITE_CONCATENATED("\nExpected exception ", what, "\nActual exception   ", e.what(), '\n'));
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp
index e6497e26323ce6..98509c298ebcb8 100644
--- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp
@@ -34,7 +34,7 @@ static void set_tz(std::string zone) {
   // Unlike POSIX it does not mention the string of putenv becomes part
   // of the environment.
 
-  int status = _putenv_s("TZ", zone.c_str(), 1);
+  int status = _putenv_s("TZ", zone.c_str());
   assert(status == 0);
 }
 
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp
index f929dafcc96838..08ce48dfd0edb2 100644
--- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp
@@ -42,7 +42,7 @@ static void test_exception([[maybe_unused]] std::string_view zone) {
   TEST_VALIDATE_EXCEPTION(
       std::runtime_error,
       [&]([[maybe_unused]] const std::runtime_error& e) {
-        std::string_view what{"tzdb: requested time zone not found"};
+        [[maybe_unused]] std::string_view what{"tzdb: requested time zone not found"};
         TEST_LIBCPP_REQUIRE(
             e.what() == what,
             TEST_WRITE_CONCATENATED("\nExpected exception ", what, "\nActual exception   ", e.what(), '\n'));
diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.pass.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.pass.cpp
index d7374351afa8bf..accb601dd00365 100644
--- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.pass.cpp
+++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.pass.cpp
@@ -209,6 +209,7 @@ template <class T, class Tuple>
 static constexpr bool can_make_from_tuple =
     std::is_same_v<decltype(test_make_from_tuple<T, Tuple>(T{}, Tuple{})), uint8_t>;
 
+#ifdef _LIBCPP_VERSION
 template <class T, class Tuple>
 auto test_make_from_tuple_impl(T&&, Tuple&& t)
     -> decltype(std::__make_from_tuple_impl<T>(
@@ -224,6 +225,7 @@ uint32_t test_make_from_tuple_impl(...) {
 template <class T, class Tuple>
 static constexpr bool can_make_from_tuple_impl =
     std::is_same_v<decltype(test_make_from_tuple_impl<T, Tuple>(T{}, Tuple{})), uint8_t>;
+#endif // _LIBCPP_VERSION
 
 struct A {
   int a;
@@ -263,23 +265,23 @@ static_assert(can_make_from_tuple<float, std::tuple<double>>);
 // Test std::__make_from_tuple_impl constraints.
 
 // reinterpret_cast
-static_assert(!can_make_from_tuple_impl<int*, std::tuple<A*>>);
-static_assert(can_make_from_tuple_impl<A*, std::tuple<A*>>);
+LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl<int*, std::tuple<A*>>);
+LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl<A*, std::tuple<A*>>);
 
 // const_cast
-static_assert(!can_make_from_tuple_impl<char*, std::tuple<const char*>>);
-static_assert(!can_make_from_tuple_impl<volatile char*, std::tuple<const volatile char*>>);
-static_assert(can_make_from_tuple_impl<volatile char*, std::tuple<volatile char*>>);
-static_assert(can_make_from_tuple_impl<char*, std::tuple<char*>>);
-static_assert(can_make_from_tuple_impl<const char*, std::tuple<char*>>);
-static_assert(can_make_from_tuple_impl<const volatile char*, std::tuple<volatile char*>>);
+LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl<char*, std::tuple<const char*>>);
+LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl<volatile char*, std::tuple<const volatile char*>>);
+LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl<volatile char*, std::tuple<volatile char*>>);
+LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl<char*, std::tuple<char*>>);
+LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl<const char*, std::tuple<char*>>);
+LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl<const volatile char*, std::tuple<volatile char*>>);
 
 // static_cast
-static_assert(!can_make_from_tuple_impl<int, std::tuple<D>>);
-static_assert(!can_make_from_tuple_impl<D, std::tuple<int>>);
-static_assert(can_make_from_tuple_impl<long, std::tuple<int>>);
-static_assert(can_make_from_tuple_impl<double, std::tuple<float>>);
-static_assert(can_make_from_tuple_impl<float, std::tuple<double>>);
+LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl<int, std::tuple<D>>);
+LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl<D, std::tuple<int>>);
+LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl<long, std::tuple<int>>);
+LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl<double, std::tuple<float>>);
+LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl<float, std::tuple<double>>);
 
 } // namespace LWG3528
 
diff --git a/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp b/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp
index db05691c55818c..039a2373348c4e 100644
--- a/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp
@@ -516,7 +516,7 @@ constexpr void test_swap_sfinae() {
   }
 }
 
-_LIBCPP_CONSTEXPR_SINCE_CXX20 void test_swap_noexcept() {
+TEST_CONSTEXPR_CXX20 void test_swap_noexcept() {
   {
     using V = std::variant<int, NothrowMoveable>;
     static_assert(std::is_swappable_v<V> && has_swap_member<V>(), "");
diff --git a/libcxx/test/support/msvc_stdlib_force_include.h b/libcxx/test/support/msvc_stdlib_force_include.h
index 35783c1607b0e0..785670224c3b18 100644
--- a/libcxx/test/support/msvc_stdlib_force_include.h
+++ b/libcxx/test/support/msvc_stdlib_force_include.h
@@ -90,7 +90,7 @@ const AssertionDialogAvoider assertion_dialog_avoider{};
 #include <version>
 
 #if _HAS_CXX23
-#  define TEST_STD_VER 99
+#  define TEST_STD_VER 23
 #elif _HAS_CXX20
 #  define TEST_STD_VER 20
 #elif _HAS_CXX17

From bc247ba113543b07fcff769ab616cf9509eb2794 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Tue, 28 May 2024 12:42:31 -0700
Subject: [PATCH 005/230] [memprof] Rename memprof-merge-v0.test to
 memprof-merge-versions.test (#93602)

Despite the name, the test is used to test merge/show roundtrips for
different MemProf versions.  This patch renames the test to match the
reality.
---
 .../{memprof-merge-v0.test => memprof-merge-versions.test}        | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename llvm/test/tools/llvm-profdata/{memprof-merge-v0.test => memprof-merge-versions.test} (100%)

diff --git a/llvm/test/tools/llvm-profdata/memprof-merge-v0.test b/llvm/test/tools/llvm-profdata/memprof-merge-versions.test
similarity index 100%
rename from llvm/test/tools/llvm-profdata/memprof-merge-v0.test
rename to llvm/test/tools/llvm-profdata/memprof-merge-versions.test

From 1c3a3f0e79a9c6a7c1c4a71c43a9eab783c3b266 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Tue, 28 May 2024 12:49:42 -0700
Subject: [PATCH 006/230] [LegalizeTypes] Use VP_AND and VP_SHL/VP_SRA to
 promote operands of VP arithmetic. (#92799)

This adds VPSExtPromotedInteger and VPZExtPromotedInteger and uses them
to promote many arithmetic operations.

VPSExtPromotedInteger uses a shift pair because we don't have
VP_SIGN_EXTEND_INREG yet.
---
 .../SelectionDAG/LegalizeIntegerTypes.cpp     | 113 ++++++++++++------
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |  21 ++++
 llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll        |  12 +-
 llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll       |   6 +-
 .../RISCV/rvv/fixed-vectors-vdiv-vp.ll        |   9 +-
 .../RISCV/rvv/fixed-vectors-vdivu-vp.ll       |   5 +-
 .../RISCV/rvv/fixed-vectors-vmax-vp.ll        |   9 +-
 .../RISCV/rvv/fixed-vectors-vmaxu-vp.ll       |   5 +-
 .../RISCV/rvv/fixed-vectors-vmin-vp.ll        |   9 +-
 .../RISCV/rvv/fixed-vectors-vminu-vp.ll       |   5 +-
 .../RISCV/rvv/fixed-vectors-vrem-vp.ll        |   9 +-
 .../RISCV/rvv/fixed-vectors-vremu-vp.ll       |   5 +-
 .../RISCV/rvv/fixed-vectors-vshl-vp.ll        |   3 +-
 .../RISCV/rvv/fixed-vectors-vsra-vp.ll        |   7 +-
 .../RISCV/rvv/fixed-vectors-vsrl-vp.ll        |   5 +-
 llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll   |  40 +++----
 llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll        |  10 +-
 llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll       |   5 +-
 llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll        |  10 +-
 llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll       |   5 +-
 llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll        |  10 +-
 llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll       |   5 +-
 llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll        |  10 +-
 llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll       |   5 +-
 llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll        |   2 +-
 llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll        |   7 +-
 llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll        |   5 +-
 27 files changed, 201 insertions(+), 136 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 8fda35f0086329..12f1d005249d60 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -646,18 +646,21 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
     }
   }
 
-  // Zero extend to the promoted type and do the count there.
-  SDValue Op = ZExtPromotedInteger(N->getOperand(0));
-
   // Subtract off the extra leading bits in the bigger type.
   SDValue ExtractLeadingBits = DAG.getConstant(
       NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(), dl, NVT);
-  if (!N->isVPOpcode())
+  if (!N->isVPOpcode()) {
+    // Zero extend to the promoted type and do the count there.
+    SDValue Op = ZExtPromotedInteger(N->getOperand(0));
     return DAG.getNode(ISD::SUB, dl, NVT,
                        DAG.getNode(N->getOpcode(), dl, NVT, Op),
                        ExtractLeadingBits);
+  }
+
   SDValue Mask = N->getOperand(1);
   SDValue EVL = N->getOperand(2);
+  // Zero extend to the promoted type and do the count there.
+  SDValue Op = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL);
   return DAG.getNode(ISD::VP_SUB, dl, NVT,
                      DAG.getNode(N->getOpcode(), dl, NVT, Op, Mask, EVL),
                      ExtractLeadingBits, Mask, EVL);
@@ -681,11 +684,16 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP_PARITY(SDNode *N) {
   }
 
   // Zero extend to the promoted type and do the count or parity there.
-  SDValue Op = ZExtPromotedInteger(N->getOperand(0));
-  if (!N->isVPOpcode())
+  if (!N->isVPOpcode()) {
+    SDValue Op = ZExtPromotedInteger(N->getOperand(0));
     return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op);
-  return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op,
-                     N->getOperand(1), N->getOperand(2));
+  }
+
+  SDValue Mask = N->getOperand(1);
+  SDValue EVL = N->getOperand(2);
+  SDValue Op = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL);
+  return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op, Mask,
+                     EVL);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) {
@@ -1335,12 +1343,19 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FFREXP(SDNode *N) {
 SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N) {
   SDValue LHS = GetPromotedInteger(N->getOperand(0));
   SDValue RHS = N->getOperand(1);
-  if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
-    RHS = ZExtPromotedInteger(RHS);
-  if (N->getOpcode() != ISD::VP_SHL)
+  if (N->getOpcode() != ISD::VP_SHL) {
+    if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
+      RHS = ZExtPromotedInteger(RHS);
+
     return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+  }
+
+  SDValue Mask = N->getOperand(2);
+  SDValue EVL = N->getOperand(3);
+  if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
+    RHS = VPZExtPromotedInteger(RHS, Mask, EVL);
   return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
-                     N->getOperand(2), N->getOperand(3));
+                     Mask, EVL);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N) {
@@ -1364,27 +1379,39 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SimpleIntBinOp(SDNode *N) {
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_SExtIntBinOp(SDNode *N) {
-  // Sign extend the input.
-  SDValue LHS = SExtPromotedInteger(N->getOperand(0));
-  SDValue RHS = SExtPromotedInteger(N->getOperand(1));
-  if (N->getNumOperands() == 2)
+  if (N->getNumOperands() == 2) {
+    // Sign extend the input.
+    SDValue LHS = SExtPromotedInteger(N->getOperand(0));
+    SDValue RHS = SExtPromotedInteger(N->getOperand(1));
     return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+  }
   assert(N->getNumOperands() == 4 && "Unexpected number of operands!");
   assert(N->isVPOpcode() && "Expected VP opcode");
+  SDValue Mask = N->getOperand(2);
+  SDValue EVL = N->getOperand(3);
+  // Sign extend the input.
+  SDValue LHS = VPSExtPromotedInteger(N->getOperand(0), Mask, EVL);
+  SDValue RHS = VPSExtPromotedInteger(N->getOperand(1), Mask, EVL);
   return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
-                     N->getOperand(2), N->getOperand(3));
+                     Mask, EVL);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_ZExtIntBinOp(SDNode *N) {
-  // Zero extend the input.
-  SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
-  SDValue RHS = ZExtPromotedInteger(N->getOperand(1));
-  if (N->getNumOperands() == 2)
+  if (N->getNumOperands() == 2) {
+    // Zero extend the input.
+    SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
+    SDValue RHS = ZExtPromotedInteger(N->getOperand(1));
     return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+  }
   assert(N->getNumOperands() == 4 && "Unexpected number of operands!");
   assert(N->isVPOpcode() && "Expected VP opcode");
+  // Zero extend the input.
+  SDValue Mask = N->getOperand(2);
+  SDValue EVL = N->getOperand(3);
+  SDValue LHS = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL);
+  SDValue RHS = VPZExtPromotedInteger(N->getOperand(1), Mask, EVL);
   return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
-                     N->getOperand(2), N->getOperand(3));
+                     Mask, EVL);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_UMINUMAX(SDNode *N) {
@@ -1400,27 +1427,43 @@ SDValue DAGTypeLegalizer::PromoteIntRes_UMINUMAX(SDNode *N) {
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N) {
-  // The input value must be properly sign extended.
-  SDValue LHS = SExtPromotedInteger(N->getOperand(0));
   SDValue RHS = N->getOperand(1);
-  if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
-    RHS = ZExtPromotedInteger(RHS);
-  if (N->getOpcode() != ISD::VP_SRA)
+  if (N->getOpcode() != ISD::VP_SRA) {
+    // The input value must be properly sign extended.
+    SDValue LHS = SExtPromotedInteger(N->getOperand(0));
+    if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
+      RHS = ZExtPromotedInteger(RHS);
     return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+  }
+
+  SDValue Mask = N->getOperand(2);
+  SDValue EVL = N->getOperand(3);
+  // The input value must be properly sign extended.
+  SDValue LHS = VPSExtPromotedInteger(N->getOperand(0), Mask, EVL);
+  if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
+    RHS = VPZExtPromotedInteger(RHS, Mask, EVL);
   return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
-                     N->getOperand(2), N->getOperand(3));
+                     Mask, EVL);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N) {
-  // The input value must be properly zero extended.
-  SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
   SDValue RHS = N->getOperand(1);
-  if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
-    RHS = ZExtPromotedInteger(RHS);
-  if (N->getOpcode() != ISD::VP_SRL)
+  if (N->getOpcode() != ISD::VP_SRL) {
+    // The input value must be properly zero extended.
+    SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
+    if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
+      RHS = ZExtPromotedInteger(RHS);
     return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+  }
+
+  SDValue Mask = N->getOperand(2);
+  SDValue EVL = N->getOperand(3);
+  // The input value must be properly zero extended.
+  SDValue LHS = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL);
+  if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
+    RHS = VPZExtPromotedInteger(RHS, Mask, EVL);
   return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
-                     N->getOperand(2), N->getOperand(3));
+                     Mask, EVL);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_Rotate(SDNode *N) {
@@ -1487,7 +1530,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VPFunnelShift(SDNode *N) {
   SDValue Mask = N->getOperand(3);
   SDValue EVL = N->getOperand(4);
   if (getTypeAction(Amt.getValueType()) == TargetLowering::TypePromoteInteger)
-    Amt = ZExtPromotedInteger(Amt);
+    Amt = VPZExtPromotedInteger(Amt, Mask, EVL);
   EVT AmtVT = Amt.getValueType();
 
   SDLoc DL(N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index d925089d5689f1..ba3c7582d5a8a2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -275,6 +275,27 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
     return DAG.getZeroExtendInReg(Op, dl, OldVT);
   }
 
+  /// Get a promoted operand and sign extend it to the final size.
+  SDValue VPSExtPromotedInteger(SDValue Op, SDValue Mask, SDValue EVL) {
+    EVT OldVT = Op.getValueType();
+    SDLoc dl(Op);
+    Op = GetPromotedInteger(Op);
+    // FIXME: Add VP_SIGN_EXTEND_INREG.
+    EVT VT = Op.getValueType();
+    unsigned BitsDiff = VT.getScalarSizeInBits() - OldVT.getScalarSizeInBits();
+    SDValue ShiftCst = DAG.getShiftAmountConstant(BitsDiff, VT, dl);
+    SDValue Shl = DAG.getNode(ISD::VP_SHL, dl, VT, Op, ShiftCst, Mask, EVL);
+    return DAG.getNode(ISD::VP_SRA, dl, VT, Shl, ShiftCst, Mask, EVL);
+  }
+
+  /// Get a promoted operand and zero extend it to the final size.
+  SDValue VPZExtPromotedInteger(SDValue Op, SDValue Mask, SDValue EVL) {
+    EVT OldVT = Op.getValueType();
+    SDLoc dl(Op);
+    Op = GetPromotedInteger(Op);
+    return DAG.getVPZeroExtendInReg(Op, Mask, EVL, dl, OldVT);
+  }
+
   // Promote the given operand V (vector or scalar) according to N's specific
   // reduction kind. N must be an integer VECREDUCE_* or VP_REDUCE_*. Returns
   // the nominal extension opcode (ISD::(ANY|ZERO|SIGN)_EXTEND) and the
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
index fff280c005b542..df413b878172bd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
@@ -2574,9 +2574,8 @@ define <vscale x 1 x i9> @vp_ctlz_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1
 ; CHECK-LABEL: vp_ctlz_nxv1i9:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 511
-; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
@@ -2593,9 +2592,8 @@ define <vscale x 1 x i9> @vp_ctlz_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv1i9:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    li a1, 511
-; CHECK-ZVBB-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vand.vx v8, v8, a1
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
 ; CHECK-ZVBB-NEXT:    li a0, 7
 ; CHECK-ZVBB-NEXT:    vsub.vx v8, v8, a0, v0.t
@@ -2607,9 +2605,8 @@ define <vscale x 1 x i9> @vp_ctlz_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vsca
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv1i9:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 511
-; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
@@ -2624,9 +2621,8 @@ define <vscale x 1 x i9> @vp_ctlz_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vsca
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv1i9:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    li a1, 511
-; CHECK-ZVBB-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vand.vx v8, v8, a1
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
 ; CHECK-ZVBB-NEXT:    li a0, 7
 ; CHECK-ZVBB-NEXT:    vsub.vx v8, v8, a0, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
index e3c53212e91b77..b5cafe410ae8d5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
@@ -2549,9 +2549,8 @@ define <vscale x 1 x i9> @vp_ctpop_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i
 ; CHECK-LABEL: vp_ctpop_nxv1i9:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 511
-; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
@@ -2576,9 +2575,8 @@ define <vscale x 1 x i9> @vp_ctpop_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv1i9:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    li a1, 511
-; CHECK-ZVBB-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vand.vx v8, v8, a1
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i9> @llvm.vp.ctpop.nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll
index 29f8eaba900527..e3c7d02462cc7f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll
@@ -9,12 +9,11 @@ declare <8 x i7> @llvm.vp.sdiv.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32)
 define <8 x i7> @vdiv_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vdiv_vv_v8i7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vadd.vv v9, v9, v9
-; CHECK-NEXT:    vsra.vi v9, v9, 1
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vsra.vi v8, v8, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsll.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsra.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vdiv.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x i7> @llvm.vp.sdiv.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll
index 3f8eb0ff276b7f..03bd85bf5e69e2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll
@@ -10,10 +10,9 @@ define <8 x i7> @vdivu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe
 ; CHECK-LABEL: vdivu_vv_v8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 127
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v9, v9, a1
-; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a1, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vdivu.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x i7> @llvm.vp.udiv.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
index 9789afda9344ad..0b0d758ad8ded8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
@@ -9,12 +9,11 @@ declare <8 x i7> @llvm.vp.smax.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32)
 define <8 x i7> @vmax_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vmax_vv_v8i7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vadd.vv v9, v9, v9
-; CHECK-NEXT:    vsra.vi v9, v9, 1
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vsra.vi v8, v8, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsll.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsra.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vmax.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x i7> @llvm.vp.smax.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
index 36b0a4642b6169..98e630a0e59e5a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
@@ -10,10 +10,9 @@ define <8 x i7> @vmaxu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe
 ; CHECK-LABEL: vmaxu_vv_v8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 127
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v9, v9, a1
-; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a1, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vmaxu.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x i7> @llvm.vp.umax.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
index adb0a30f34d35a..a6e3764b37550d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
@@ -9,12 +9,11 @@ declare <8 x i7> @llvm.vp.smin.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32)
 define <8 x i7> @vmin_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vmin_vv_v8i7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vadd.vv v9, v9, v9
-; CHECK-NEXT:    vsra.vi v9, v9, 1
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vsra.vi v8, v8, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsll.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsra.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vmin.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x i7> @llvm.vp.smin.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
index 671ce82d4ae795..c59b65edd1ec10 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
@@ -10,10 +10,9 @@ define <8 x i7> @vminu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe
 ; CHECK-LABEL: vminu_vv_v8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 127
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v9, v9, a1
-; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a1, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vminu.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x i7> @llvm.vp.umin.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll
index 4bbbad5ed0e0e8..ff8a63e371c8ef 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll
@@ -9,12 +9,11 @@ declare <8 x i7> @llvm.vp.srem.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32)
 define <8 x i7> @vrem_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vrem_vv_v8i7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vadd.vv v9, v9, v9
-; CHECK-NEXT:    vsra.vi v9, v9, 1
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vsra.vi v8, v8, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsll.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsra.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vrem.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x i7> @llvm.vp.srem.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll
index ee11307bddc88c..b5eec4142c7824 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll
@@ -10,10 +10,9 @@ define <8 x i7> @vremu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe
 ; CHECK-LABEL: vremu_vv_v8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 127
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v9, v9, a1
-; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a1, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vremu.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x i7> @llvm.vp.urem.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll
index c4b7c1f2f19f0f..16a0fddfa98277 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll
@@ -10,9 +10,8 @@ define <8 x i7> @vsll_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroex
 ; CHECK-LABEL: vsll_vv_v8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 127
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v9, v9, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a1, v0.t
 ; CHECK-NEXT:    vsll.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x i7> @llvm.vp.shl.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll
index 7ea5b1f0b505a3..180fafa9659b1c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll
@@ -10,11 +10,10 @@ define <8 x i7> @vsra_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroex
 ; CHECK-LABEL: vsra_vv_v8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 127
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v9, v9, a1
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vsra.vi v8, v8, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a1, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x i7> @llvm.vp.ashr.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll
index 9f9d4af0cc2f3f..22f04803eadd74 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll
@@ -10,10 +10,9 @@ define <8 x i7> @vsrl_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroex
 ; CHECK-LABEL: vsrl_vv_v8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 127
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v9, v9, a1
-; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a1, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x i7> @llvm.vp.lshr.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
index bc5617957d7d08..2c5a3dfffc2cfc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
@@ -1282,18 +1282,17 @@ define <vscale x 1 x i9> @fshr_v1i9(<vscale x 1 x i9> %a, <vscale x 1 x i9> %b,
 ; CHECK-LABEL: fshr_v1i9:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 511
-; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsll.vi v9, v9, 7, v0.t
+; CHECK-NEXT:    vand.vx v10, v10, a1, v0.t
 ; CHECK-NEXT:    li a0, 9
 ; CHECK-NEXT:    vremu.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vadd.vi v10, v10, 7, v0.t
 ; CHECK-NEXT:    vand.vi v11, v10, 15, v0.t
+; CHECK-NEXT:    vsll.vi v9, v9, 7, v0.t
 ; CHECK-NEXT:    vsrl.vv v9, v9, v11, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vnot.v v10, v10, v0.t
 ; CHECK-NEXT:    vand.vi v10, v10, 15, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsll.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
@@ -1306,18 +1305,17 @@ define <vscale x 1 x i9> @fshl_v1i9(<vscale x 1 x i9> %a, <vscale x 1 x i9> %b,
 ; CHECK-LABEL: fshl_v1i9:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 511
-; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsll.vi v9, v9, 7, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vand.vx v10, v10, a1, v0.t
 ; CHECK-NEXT:    li a0, 9
 ; CHECK-NEXT:    vremu.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vnot.v v11, v10, v0.t
-; CHECK-NEXT:    vand.vi v11, v11, 15, v0.t
-; CHECK-NEXT:    vsrl.vv v9, v9, v11, v0.t
+; CHECK-NEXT:    vand.vi v11, v10, 15, v0.t
+; CHECK-NEXT:    vsll.vv v8, v8, v11, v0.t
+; CHECK-NEXT:    vnot.v v10, v10, v0.t
 ; CHECK-NEXT:    vand.vi v10, v10, 15, v0.t
-; CHECK-NEXT:    vsll.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    vsll.vi v9, v9, 7, v0.t
+; CHECK-NEXT:    vsrl.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsrl.vv v9, v9, v10, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %res = call <vscale x 1 x i9> @llvm.vp.fshl.nxv1i9(<vscale x 1 x i9> %a, <vscale x 1 x i9> %b, <vscale x 1 x i9> %c, <vscale x 1 x i1> %m, i32 %evl)
@@ -1330,15 +1328,14 @@ declare <vscale x 1 x i4> @llvm.vp.fshr.nxv1i4(<vscale x 1 x i4>, <vscale x 1 x
 define <vscale x 1 x i8> @fshr_v1i4(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, <vscale x 1 x i8> %c, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: fshr_v1i4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vand.vi v10, v10, 15
-; CHECK-NEXT:    li a1, 4
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vremu.vx v10, v10, a1, v0.t
+; CHECK-NEXT:    vand.vi v10, v10, 15, v0.t
 ; CHECK-NEXT:    vand.vi v9, v9, 15, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    li a0, 4
+; CHECK-NEXT:    vremu.vx v9, v10, a0, v0.t
+; CHECK-NEXT:    vsrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    ret
   %trunca = call <vscale x 1 x i4> @llvm.vp.trunc.nxv1i4.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i1> %m, i32 zeroext %evl)
@@ -1353,15 +1350,14 @@ declare <vscale x 1 x i4> @llvm.vp.fshl.nxv1i4(<vscale x 1 x i4>, <vscale x 1 x
 define <vscale x 1 x i8> @fshl_v1i4(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, <vscale x 1 x i8> %c, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: fshl_v1i4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vand.vi v10, v10, 15
-; CHECK-NEXT:    li a1, 4
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vremu.vx v10, v10, a1, v0.t
+; CHECK-NEXT:    vand.vi v10, v10, 15, v0.t
 ; CHECK-NEXT:    vand.vi v9, v9, 15, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsll.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    li a0, 4
+; CHECK-NEXT:    vremu.vx v9, v10, a0, v0.t
+; CHECK-NEXT:    vsll.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll
index 26089706cf99ef..a4b7ca7f39768f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll
@@ -9,11 +9,15 @@ declare <vscale x 8 x i7> @llvm.vp.sdiv.nxv8i7(<vscale x 8 x i7>, <vscale x 8 x
 define <vscale x 8 x i7> @vdiv_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <vscale x 8 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: vdiv_vx_nxv8i7:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vsra.vi v8, v8, 1
+; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vdiv.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsll.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsra.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vdiv.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
   %vb = shufflevector <vscale x 8 x i7> %elt.head, <vscale x 8 x i7> poison, <vscale x 8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll
index f41b885a66eaae..67c3f9dbf2869a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll
@@ -10,11 +10,12 @@ define <vscale x 8 x i7> @vdivu_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <
 ; CHECK-LABEL: vdivu_vx_nxv8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 127
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vmv.v.x v9, a0
-; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a2, v0.t
 ; CHECK-NEXT:    vdivu.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
index 8a76467986620c..c15caa31bb0986 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
@@ -9,11 +9,15 @@ declare <vscale x 8 x i7> @llvm.vp.smax.nxv8i7(<vscale x 8 x i7>, <vscale x 8 x
 define <vscale x 8 x i7> @vmax_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <vscale x 8 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: vmax_vx_nxv8i7:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vsra.vi v8, v8, 1
+; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vmax.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsll.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsra.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vmax.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
   %vb = shufflevector <vscale x 8 x i7> %elt.head, <vscale x 8 x i7> poison, <vscale x 8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
index 1c74887c1b20fb..df494f8af7387c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
@@ -10,11 +10,12 @@ define <vscale x 8 x i7> @vmaxu_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <
 ; CHECK-LABEL: vmaxu_vx_nxv8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 127
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vmv.v.x v9, a0
-; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a2, v0.t
 ; CHECK-NEXT:    vmaxu.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
index 1c71242c3c7d79..794a21c7c6abac 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
@@ -9,11 +9,15 @@ declare <vscale x 8 x i7> @llvm.vp.smin.nxv8i7(<vscale x 8 x i7>, <vscale x 8 x
 define <vscale x 8 x i7> @vmin_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <vscale x 8 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: vmin_vx_nxv8i7:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vsra.vi v8, v8, 1
+; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vmin.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsll.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsra.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vmin.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
   %vb = shufflevector <vscale x 8 x i7> %elt.head, <vscale x 8 x i7> poison, <vscale x 8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
index 6d89a9777cf917..d54de281a7fd28 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
@@ -10,11 +10,12 @@ define <vscale x 8 x i7> @vminu_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <
 ; CHECK-LABEL: vminu_vx_nxv8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 127
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vmv.v.x v9, a0
-; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a2, v0.t
 ; CHECK-NEXT:    vminu.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll
index cf85fd827b51f1..2ef96f4b3896fc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll
@@ -9,11 +9,15 @@ declare <vscale x 8 x i7> @llvm.vp.srem.nxv8i7(<vscale x 8 x i7>, <vscale x 8 x
 define <vscale x 8 x i7> @vrem_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <vscale x 8 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: vrem_vx_nxv8i7:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vsra.vi v8, v8, 1
+; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vrem.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsll.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsra.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vrem.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
   %vb = shufflevector <vscale x 8 x i7> %elt.head, <vscale x 8 x i7> poison, <vscale x 8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll
index 61bdd5b8d3c8a7..1f1ed4a1269acb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll
@@ -10,11 +10,12 @@ define <vscale x 8 x i7> @vremu_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <
 ; CHECK-LABEL: vremu_vx_nxv8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 127
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vmv.v.x v9, a0
-; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a2, v0.t
 ; CHECK-NEXT:    vremu.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll
index c04d5ea2da3c1b..380835494ed17d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll
@@ -12,8 +12,8 @@ define <vscale x 8 x i7> @vsll_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <v
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vsll.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll
index 632c4db5c5bb57..cff8cc710d21f3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll
@@ -9,13 +9,14 @@ declare <vscale x 8 x i7> @llvm.vp.ashr.nxv8i7(<vscale x 8 x i7>, <vscale x 8 x
 define <vscale x 8 x i7> @vsra_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <vscale x 8 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: vsra_vx_nxv8i7:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vsra.vi v8, v8, 1
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vsra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll
index ec5b7f3faf7ca8..ff6771b643031f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll
@@ -10,11 +10,12 @@ define <vscale x 8 x i7> @vsrl_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <v
 ; CHECK-LABEL: vsrl_vx_nxv8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 127
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vmv.v.x v9, a0
-; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a2, v0.t
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0

From 0e96eebc7f681a7ce41f35909e609c7c61a11455 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston@google.com>
Date: Tue, 28 May 2024 12:52:45 -0700
Subject: [PATCH 007/230] [msan] Reland: Increase k num stack origin descrs
 (limited to non-PowerPC) (#93117)

The original pull request
(https://github.com/llvm/llvm-project/pull/92838) was reverted due to a
PowerPC buildbot breakage
(https://github.com/llvm/llvm-project/commit/df626dd11c360c58eddae813ce6a0524d0a53696).
This reland limits the scope of the change to non-PowerPC platforms. I
am unaware of any PowerPC use cases that would benefit from a larger
kNumStackOriginDescrs constant.

Original CL description: This increases the constant size of
kNumStackOriginDescrs to 4M (64MB of BSS across two arrays), which ought
to be enough for anybody.

This is the easier alternative suggested by eugenis@ in
https://github.com/llvm/llvm-project/pull/92826.
---
 compiler-rt/lib/msan/msan.cpp | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/compiler-rt/lib/msan/msan.cpp b/compiler-rt/lib/msan/msan.cpp
index a2fc27de1901b4..9375e27d4f4d24 100644
--- a/compiler-rt/lib/msan/msan.cpp
+++ b/compiler-rt/lib/msan/msan.cpp
@@ -100,7 +100,17 @@ int msan_report_count = 0;
 
 // Array of stack origins.
 // FIXME: make it resizable.
-static const uptr kNumStackOriginDescrs = 1024 * 1024;
+// Although BSS memory doesn't cost anything until used, it is limited to 2GB
+// in some configurations (e.g., "relocation R_X86_64_PC32 out of range:
+// ... is not in [-2147483648, 2147483647]; references section '.bss'").
+// We use kNumStackOriginDescrs * (sizeof(char*) + sizeof(uptr)) == 64MB.
+#if SANITIZER_PPC
+// soft_rss_limit test (release_origin.c) fails on PPC if kNumStackOriginDescrs
+// is too high
+static const uptr kNumStackOriginDescrs = 1 * 1024 * 1024;
+#else
+static const uptr kNumStackOriginDescrs = 4 * 1024 * 1024;
+#endif  // SANITIZER_PPC
 static const char *StackOriginDescr[kNumStackOriginDescrs];
 static uptr StackOriginPC[kNumStackOriginDescrs];
 static atomic_uint32_t NumStackOriginDescrs;

From d9dec109375ded13d61da20877c399fb8fbb877d Mon Sep 17 00:00:00 2001
From: Lucile Rose Nihlen <luci.the.rose@gmail.com>
Date: Tue, 28 May 2024 19:53:21 +0000
Subject: [PATCH 008/230] [ci] limit parallel windows compile jobs to 24
 (#93329)

This is an experiment to see if we can prevent some of the compiler OOMs
happening without unduly impacting the Windows build latency.
---
 .ci/monolithic-windows.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh
index 4fd88ea81c84a8..91e719c52d4363 100755
--- a/.ci/monolithic-windows.sh
+++ b/.ci/monolithic-windows.sh
@@ -44,6 +44,8 @@ pip install -q -r "${MONOREPO_ROOT}"/mlir/python/requirements.txt
 # see https://github.com/llvm/llvm-project/pull/82393 and
 # https://discourse.llvm.org/t/rfc-future-of-windows-pre-commit-ci/76840/40
 # for further information.
+# We limit the number of parallel compile jobs to 16 to control memory
+# consumption and improve build reliability.
 cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \
       -D LLVM_ENABLE_PROJECTS="${projects}" \
       -G Ninja \
@@ -58,7 +60,9 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \
       -D MLIR_ENABLE_BINDINGS_PYTHON=ON \
       -D CMAKE_EXE_LINKER_FLAGS="/MANIFEST:NO" \
       -D CMAKE_MODULE_LINKER_FLAGS="/MANIFEST:NO" \
-      -D CMAKE_SHARED_LINKER_FLAGS="/MANIFEST:NO"
+      -D CMAKE_SHARED_LINKER_FLAGS="/MANIFEST:NO" \
+      -D LLVM_PARALLEL_COMPILE_JOBS=16 \
+      -D LLVM_PARALLEL_LINK_JOBS=4
 
 echo "--- ninja"
 # Targets are not escaped as they are passed as separate arguments.

From c96860aea2c77392bad16f1c4f55014164669de3 Mon Sep 17 00:00:00 2001
From: Piotr Zegar <me@piotrzegar.pl>
Date: Tue, 28 May 2024 22:09:34 +0200
Subject: [PATCH 009/230] [clang-tidy] Optimize realpath in
 readability-identifier-naming (#92659)

- Reduce disk I/O by adding a cache for the realpath lookup introduced by
#81985
---
 .../clang-tidy/readability/IdentifierNamingCheck.cpp | 12 ++++++++++--
 .../clang-tidy/readability/IdentifierNamingCheck.h   |  2 ++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp
index c3208392df1566..828f13805a6980 100644
--- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp
@@ -1414,13 +1414,21 @@ IdentifierNamingCheck::getDiagInfo(const NamingCheckId &ID,
                   }};
 }
 
+StringRef IdentifierNamingCheck::getRealFileName(StringRef FileName) const {
+  // Resolve each distinct FileName once; later calls return the cached entry.
+  auto [Entry, Inserted] = RealFileNameCache.try_emplace(FileName);
+  SmallString<256U> &Resolved = Entry->getValue();
+  if (Inserted)
+    llvm::sys::fs::real_path(FileName, Resolved);
+  return Resolved;
+}
+
 const IdentifierNamingCheck::FileStyle &
 IdentifierNamingCheck::getStyleForFile(StringRef FileName) const {
   if (!GetConfigPerFile)
     return *MainFileStyle;
 
-  SmallString<128> RealFileName;
-  llvm::sys::fs::real_path(FileName, RealFileName);
+  StringRef RealFileName = getRealFileName(FileName);
   StringRef Parent = llvm::sys::path::parent_path(RealFileName);
   auto Iter = NamingStylesCache.find(Parent);
   if (Iter != NamingStylesCache.end())
diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h
index 27c8e4bc768c40..646ec0eac8dd1c 100644
--- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h
+++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h
@@ -205,6 +205,7 @@ class IdentifierNamingCheck final : public RenamerClangTidyCheck {
                        const NamingCheckFailure &Failure) const override;
 
   const FileStyle &getStyleForFile(StringRef FileName) const;
+  StringRef getRealFileName(StringRef FileName) const;
 
   /// Find the style kind of a field in an anonymous record.
   StyleKind findStyleKindForAnonField(
@@ -222,6 +223,7 @@ class IdentifierNamingCheck final : public RenamerClangTidyCheck {
   /// Stores the style options as a vector, indexed by the specified \ref
   /// StyleKind, for a given directory.
   mutable llvm::StringMap<FileStyle> NamingStylesCache;
+  mutable llvm::StringMap<SmallString<256U>> RealFileNameCache;
   FileStyle *MainFileStyle;
   ClangTidyContext *Context;
   const bool GetConfigPerFile;

From 0aacef3abc41cfc8efb5f1b9483bc37599352a59 Mon Sep 17 00:00:00 2001
From: Mattan Elkaim <73639004+mattanelkaim@users.noreply.github.com>
Date: Tue, 28 May 2024 23:19:01 +0300
Subject: [PATCH 010/230] [clang-tidy][NFC] Update identifier-length.rst
 (#93467)

Swapped code blocks of parameter and variable, which have been confused
(in a clang-tidy doc file)
---
 .../checks/readability/identifier-length.rst           | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-length.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-length.rst
index 44d97f7b363bff..271970c292c8fa 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-length.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-length.rst
@@ -28,10 +28,7 @@ The following options are described below:
 
     .. code-block:: c++
 
-         int doubler(int x)   // warns that x is too short
-         {
-            return 2 * x;
-         }
+      int i = 42;    // warns that 'i' is too short
 
     This check does not have any fix suggestions in the general case since
     variable names have semantic value.
@@ -50,7 +47,10 @@ The following options are described below:
 
     .. code-block:: c++
 
-      int i = 42;    // warns that 'i' is too short
+         int doubler(int x)   // warns that x is too short
+         {
+            return 2 * x;
+         }
 
     This check does not have any fix suggestions in the general case since
     variable names have semantic value.

From c108c1e94580d70e2be66172ab4397fcff004376 Mon Sep 17 00:00:00 2001
From: Heejin Ahn <aheejin@gmail.com>
Date: Tue, 28 May 2024 13:26:36 -0700
Subject: [PATCH 011/230] [WebAssembly] Rename old EH tests to *-legacy
 (#93585)

I think the test files for the legacy and the new EH (exnref) are better
kept separate, and I'd like to use the current test file names for the new
EH, rather than keeping the current files and naming the new ones as
`-new` or something.
---
 .../WebAssembly/{cfg-stackify-eh.ll => cfg-stackify-eh-legacy.ll} | 0
 .../CodeGen/WebAssembly/{exception.ll => exception-legacy.ll}     | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename llvm/test/CodeGen/WebAssembly/{cfg-stackify-eh.ll => cfg-stackify-eh-legacy.ll} (100%)
 rename llvm/test/CodeGen/WebAssembly/{exception.ll => exception-legacy.ll} (100%)

diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll
similarity index 100%
rename from llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll
rename to llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll
diff --git a/llvm/test/CodeGen/WebAssembly/exception.ll b/llvm/test/CodeGen/WebAssembly/exception-legacy.ll
similarity index 100%
rename from llvm/test/CodeGen/WebAssembly/exception.ll
rename to llvm/test/CodeGen/WebAssembly/exception-legacy.ll

From 9e89d107a6ec2ade15eddb549fa473cf09bf230e Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Tue, 28 May 2024 13:30:00 -0700
Subject: [PATCH 012/230] [memprof] Add MemProf format Version 3 (#93608)

This patch adds Version 3 for development purposes.  For now, this
patch adds V3 as a copy of V2.

For the most part, this patch adds "case Version3:" wherever "case
Version2:" appears.  One exception is writeMemProfV3, which is copied
from writeMemProfV2 but updated to write out memprof::Version3 to the
MemProf header.  We'll incrementally modify writeMemProfV3 in
subsequent patches.
---
 llvm/include/llvm/ProfileData/MemProf.h       |  4 +-
 llvm/lib/ProfileData/InstrProfReader.cpp      |  4 +-
 llvm/lib/ProfileData/InstrProfWriter.cpp      | 52 +++++++++++++++++++
 llvm/lib/ProfileData/MemProf.cpp              |  4 ++
 .../llvm-profdata/memprof-merge-versions.test |  6 +++
 llvm/tools/llvm-profdata/llvm-profdata.cpp    |  3 +-
 6 files changed, 70 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h
index 17cef15344285b..d44a2d1e2fb117 100644
--- a/llvm/include/llvm/ProfileData/MemProf.h
+++ b/llvm/include/llvm/ProfileData/MemProf.h
@@ -28,10 +28,12 @@ enum IndexedVersion : uint64_t {
   Version1 = 1,
   // Version 2: Added a call stack table.
   Version2 = 2,
+  // Version 3: Under development.
+  Version3 = 3,
 };
 
 constexpr uint64_t MinimumSupportedVersion = Version0;
-constexpr uint64_t MaximumSupportedVersion = Version2;
+constexpr uint64_t MaximumSupportedVersion = Version3;
 
 // Verify that the minimum and maximum satisfy the obvious constraint.
 static_assert(MinimumSupportedVersion <= MaximumSupportedVersion);
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index 836206a4fd86e2..798236c295194a 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -1212,7 +1212,8 @@ Error IndexedMemProfReader::deserialize(const unsigned char *Start,
   const uint64_t FirstWord =
       support::endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
 
-  if (FirstWord == memprof::Version1 || FirstWord == memprof::Version2) {
+  if (FirstWord == memprof::Version1 || FirstWord == memprof::Version2 ||
+      FirstWord == memprof::Version3) {
     // Everything is good.  We can proceed to deserialize the rest.
     Version = static_cast<memprof::IndexedVersion>(FirstWord);
   } else if (FirstWord >= 24) {
@@ -1559,6 +1560,7 @@ IndexedMemProfReader::getMemProfRecord(const uint64_t FuncNameHash) const {
            "MemProfCallStackTable must not be available");
     return getMemProfRecordV0(IndexedRecord, *MemProfFrameTable);
   case memprof::Version2:
+  case memprof::Version3:
     assert(MemProfFrameTable && "MemProfFrameTable must be available");
     assert(MemProfCallStackTable && "MemProfCallStackTable must be available");
     return getMemProfRecordV2(IndexedRecord, *MemProfFrameTable,
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index b67a9700b680ab..b16714ae8b9a2d 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -617,6 +617,56 @@ static Error writeMemProfV2(ProfOStream &OS,
   return Error::success();
 }
 
+// Write out MemProf Version3 as follows:
+// uint64_t Version
+// uint64_t RecordTableOffset = RecordTableGenerator.Emit
+// uint64_t FramePayloadOffset = Offset for the frame payload
+// uint64_t FrameTableOffset = FrameTableGenerator.Emit
+// uint64_t CallStackPayloadOffset = Offset for the call stack payload
+// uint64_t CallStackTableOffset = CallStackTableGenerator.Emit
+// uint64_t Num schema entries
+// uint64_t Schema entry 0
+// uint64_t Schema entry 1
+// ....
+// uint64_t Schema entry N - 1
+// OnDiskChainedHashTable MemProfRecordData
+// OnDiskChainedHashTable MemProfFrameData
+// OnDiskChainedHashTable MemProfCallStackData
+static Error writeMemProfV3(ProfOStream &OS,
+                            memprof::IndexedMemProfData &MemProfData,
+                            bool MemProfFullSchema) {
+  OS.write(memprof::Version3);
+  uint64_t HeaderUpdatePos = OS.tell();
+  OS.write(0ULL); // Reserve space for the memprof record table offset.
+  OS.write(0ULL); // Reserve space for the memprof frame payload offset.
+  OS.write(0ULL); // Reserve space for the memprof frame table offset.
+  OS.write(0ULL); // Reserve space for the memprof call stack payload offset.
+  OS.write(0ULL); // Reserve space for the memprof call stack table offset.
+
+  auto Schema = memprof::getHotColdSchema();
+  if (MemProfFullSchema)
+    Schema = memprof::getFullSchema();
+  writeMemProfSchema(OS, Schema);
+
+  uint64_t RecordTableOffset = writeMemProfRecords(OS, MemProfData.RecordData,
+                                                   &Schema, memprof::Version3);
+
+  uint64_t FramePayloadOffset = OS.tell();
+  uint64_t FrameTableOffset = writeMemProfFrames(OS, MemProfData.FrameData);
+
+  uint64_t CallStackPayloadOffset = OS.tell();
+  uint64_t CallStackTableOffset =
+      writeMemProfCallStacks(OS, MemProfData.CallStackData);
+
+  uint64_t Header[] = {
+      RecordTableOffset,      FramePayloadOffset,   FrameTableOffset,
+      CallStackPayloadOffset, CallStackTableOffset,
+  };
+  OS.patch({{HeaderUpdatePos, Header, std::size(Header)}});
+
+  return Error::success();
+}
+
 // Write out the MemProf data in a requested version.
 static Error writeMemProf(ProfOStream &OS,
                           memprof::IndexedMemProfData &MemProfData,
@@ -629,6 +679,8 @@ static Error writeMemProf(ProfOStream &OS,
     return writeMemProfV1(OS, MemProfData);
   case memprof::Version2:
     return writeMemProfV2(OS, MemProfData, MemProfFullSchema);
+  case memprof::Version3:
+    return writeMemProfV3(OS, MemProfData, MemProfFullSchema);
   }
 
   return make_error<InstrProfError>(
diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp
index 89afe7c39027c6..2f0e53736c82e5 100644
--- a/llvm/lib/ProfileData/MemProf.cpp
+++ b/llvm/lib/ProfileData/MemProf.cpp
@@ -52,6 +52,7 @@ size_t IndexedAllocationInfo::serializedSize(const MemProfSchema &Schema,
   case Version1:
     return serializedSizeV0(*this, Schema);
   case Version2:
+  case Version3:
     return serializedSizeV2(*this, Schema);
   }
   llvm_unreachable("unsupported MemProf version");
@@ -95,6 +96,7 @@ size_t IndexedMemProfRecord::serializedSize(const MemProfSchema &Schema,
   case Version1:
     return serializedSizeV0(*this, Schema);
   case Version2:
+  case Version3:
     return serializedSizeV2(*this, Schema);
   }
   llvm_unreachable("unsupported MemProf version");
@@ -149,6 +151,7 @@ void IndexedMemProfRecord::serialize(const MemProfSchema &Schema,
     serializeV0(*this, Schema, OS);
     return;
   case Version2:
+  case Version3:
     serializeV2(*this, Schema, OS);
     return;
   }
@@ -239,6 +242,7 @@ IndexedMemProfRecord::deserialize(const MemProfSchema &Schema,
   case Version1:
     return deserializeV0(Schema, Ptr);
   case Version2:
+  case Version3:
     return deserializeV2(Schema, Ptr);
   }
   llvm_unreachable("unsupported MemProf version");
diff --git a/llvm/test/tools/llvm-profdata/memprof-merge-versions.test b/llvm/test/tools/llvm-profdata/memprof-merge-versions.test
index 28f65e0781bc63..aa7d0329425dc5 100644
--- a/llvm/test/tools/llvm-profdata/memprof-merge-versions.test
+++ b/llvm/test/tools/llvm-profdata/memprof-merge-versions.test
@@ -19,6 +19,12 @@ RUN: llvm-profdata show %t.prof.v2 | FileCheck %s
 RUN: llvm-profdata merge %t.proftext %p/Inputs/basic.memprofraw --memprof-version=2 --memprof-full-schema --profiled-binary %p/Inputs/basic.memprofexe -o %t.prof.v2
 RUN: llvm-profdata show %t.prof.v2 | FileCheck %s
 
+RUN: llvm-profdata merge %t.proftext %p/Inputs/basic.memprofraw --memprof-version=3 --profiled-binary %p/Inputs/basic.memprofexe -o %t.prof.v3
+RUN: llvm-profdata show %t.prof.v3 | FileCheck %s
+
+RUN: llvm-profdata merge %t.proftext %p/Inputs/basic.memprofraw --memprof-version=3 --memprof-full-schema --profiled-binary %p/Inputs/basic.memprofexe -o %t.prof.v3
+RUN: llvm-profdata show %t.prof.v3 | FileCheck %s
+
 For now we only check the validity of the instrumented profile since we don't
 have a way to display the contents of the memprof indexed format yet.
 
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
index 28c3afa1016473..fae6d1e989ab5a 100644
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -306,7 +306,8 @@ cl::opt<memprof::IndexedVersion> MemProfVersionRequested(
     cl::init(memprof::Version0),
     cl::values(clEnumValN(memprof::Version0, "0", "version 0"),
                clEnumValN(memprof::Version1, "1", "version 1"),
-               clEnumValN(memprof::Version2, "2", "version 2")));
+               clEnumValN(memprof::Version2, "2", "version 2"),
+               clEnumValN(memprof::Version3, "3", "version 3")));
 
 cl::opt<bool> MemProfFullSchema(
     "memprof-full-schema", cl::Hidden, cl::sub(MergeSubcommand),

From 193e9007ef0bef6c881ab26746221f22ec674447 Mon Sep 17 00:00:00 2001
From: erichkeane <ekeane@nvidia.com>
Date: Tue, 28 May 2024 13:18:46 -0700
Subject: [PATCH 013/230] [OpenACC][NFC] Fix begin loc and split it from the
 directive location

I discovered while working on something else that we were using the
location of the directive name as the 'beginloc', which caused some
problems in a few places.  This patch makes it so our beginloc is the
'#' as we originally designed, and then adds a DirectiveLoc concept to a
construct for use when diagnosing the name.
---
 clang/include/clang/AST/StmtOpenACC.h     | 32 ++++++++++++++---------
 clang/include/clang/Parse/Parser.h        |  1 +
 clang/include/clang/Sema/SemaOpenACC.h    |  3 ++-
 clang/lib/AST/StmtOpenACC.cpp             | 13 +++++----
 clang/lib/Parse/ParseOpenACC.cpp          | 19 +++++++-------
 clang/lib/Sema/SemaOpenACC.cpp            |  7 ++---
 clang/lib/Sema/TreeTransform.h            |  9 ++++---
 clang/lib/Serialization/ASTReaderStmt.cpp |  1 +
 clang/lib/Serialization/ASTWriterStmt.cpp |  1 +
 9 files changed, 49 insertions(+), 37 deletions(-)

diff --git a/clang/include/clang/AST/StmtOpenACC.h b/clang/include/clang/AST/StmtOpenACC.h
index b706864798baaf..04daf511f58713 100644
--- a/clang/include/clang/AST/StmtOpenACC.h
+++ b/clang/include/clang/AST/StmtOpenACC.h
@@ -31,6 +31,8 @@ class OpenACCConstructStmt : public Stmt {
   /// The location of the directive statement, from the '#' to the last token of
   /// the directive.
   SourceRange Range;
+  /// The location of the directive name.
+  SourceLocation DirectiveLoc;
 
   /// The list of clauses.  This is stored here as an ArrayRef, as this is the
   /// most convienient place to access the list, however the list itself should
@@ -39,8 +41,9 @@ class OpenACCConstructStmt : public Stmt {
 
 protected:
   OpenACCConstructStmt(StmtClass SC, OpenACCDirectiveKind K,
-                       SourceLocation Start, SourceLocation End)
-      : Stmt(SC), Kind(K), Range(Start, End) {}
+                       SourceLocation Start, SourceLocation DirectiveLoc,
+                       SourceLocation End)
+      : Stmt(SC), Kind(K), Range(Start, End), DirectiveLoc(DirectiveLoc) {}
 
   // Used only for initialization, the leaf class can initialize this to
   // trailing storage.
@@ -59,6 +62,7 @@ class OpenACCConstructStmt : public Stmt {
 
   SourceLocation getBeginLoc() const { return Range.getBegin(); }
   SourceLocation getEndLoc() const { return Range.getEnd(); }
+  SourceLocation getDirectiveLoc() const { return DirectiveLoc; }
   ArrayRef<const OpenACCClause *> clauses() const { return Clauses; }
 
   child_range children() {
@@ -81,9 +85,11 @@ class OpenACCAssociatedStmtConstruct : public OpenACCConstructStmt {
 
 protected:
   OpenACCAssociatedStmtConstruct(StmtClass SC, OpenACCDirectiveKind K,
-                                 SourceLocation Start, SourceLocation End,
-                                 Stmt *AssocStmt)
-      : OpenACCConstructStmt(SC, K, Start, End), AssociatedStmt(AssocStmt) {}
+                                 SourceLocation Start,
+                                 SourceLocation DirectiveLoc,
+                                 SourceLocation End, Stmt *AssocStmt)
+      : OpenACCConstructStmt(SC, K, Start, DirectiveLoc, End),
+        AssociatedStmt(AssocStmt) {}
 
   void setAssociatedStmt(Stmt *S) { AssociatedStmt = S; }
   Stmt *getAssociatedStmt() { return AssociatedStmt; }
@@ -126,10 +132,10 @@ class OpenACCComputeConstruct final
   friend class ASTStmtReader;
   friend class ASTContext;
   OpenACCComputeConstruct(unsigned NumClauses)
-      : OpenACCAssociatedStmtConstruct(OpenACCComputeConstructClass,
-                                       OpenACCDirectiveKind::Invalid,
-                                       SourceLocation{}, SourceLocation{},
-                                       /*AssociatedStmt=*/nullptr) {
+      : OpenACCAssociatedStmtConstruct(
+            OpenACCComputeConstructClass, OpenACCDirectiveKind::Invalid,
+            SourceLocation{}, SourceLocation{}, SourceLocation{},
+            /*AssociatedStmt=*/nullptr) {
     // We cannot send the TrailingObjects storage to the base class (which holds
     // a reference to the data) until it is constructed, so we have to set it
     // separately here.
@@ -141,11 +147,11 @@ class OpenACCComputeConstruct final
   }
 
   OpenACCComputeConstruct(OpenACCDirectiveKind K, SourceLocation Start,
-                          SourceLocation End,
+                          SourceLocation DirectiveLoc, SourceLocation End,
                           ArrayRef<const OpenACCClause *> Clauses,
                           Stmt *StructuredBlock)
       : OpenACCAssociatedStmtConstruct(OpenACCComputeConstructClass, K, Start,
-                                       End, StructuredBlock) {
+                                       DirectiveLoc, End, StructuredBlock) {
     assert(isOpenACCComputeDirectiveKind(K) &&
            "Only parallel, serial, and kernels constructs should be "
            "represented by this type");
@@ -169,8 +175,8 @@ class OpenACCComputeConstruct final
                                               unsigned NumClauses);
   static OpenACCComputeConstruct *
   Create(const ASTContext &C, OpenACCDirectiveKind K, SourceLocation BeginLoc,
-         SourceLocation EndLoc, ArrayRef<const OpenACCClause *> Clauses,
-         Stmt *StructuredBlock);
+         SourceLocation DirectiveLoc, SourceLocation EndLoc,
+         ArrayRef<const OpenACCClause *> Clauses, Stmt *StructuredBlock);
 
   Stmt *getStructuredBlock() { return getAssociatedStmt(); }
   const Stmt *getStructuredBlock() const {
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 00b475e5b42824..d054b8cf0d2405 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -3659,6 +3659,7 @@ class Parser : public CodeCompletionHandler {
   struct OpenACCDirectiveParseInfo {
     OpenACCDirectiveKind DirKind;
     SourceLocation StartLoc;
+    SourceLocation DirLoc;
     SourceLocation EndLoc;
     SmallVector<OpenACCClause *> Clauses;
     // TODO OpenACC: As we implement support for the Atomic, Routine, Cache, and
diff --git a/clang/include/clang/Sema/SemaOpenACC.h b/clang/include/clang/Sema/SemaOpenACC.h
index 6f69fa08939b82..66144de4340a8a 100644
--- a/clang/include/clang/Sema/SemaOpenACC.h
+++ b/clang/include/clang/Sema/SemaOpenACC.h
@@ -379,7 +379,7 @@ class SemaOpenACC : public SemaBase {
   /// Called after the construct has been parsed, but clauses haven't been
   /// parsed.  This allows us to diagnose not-implemented, as well as set up any
   /// state required for parsing the clauses.
-  void ActOnConstruct(OpenACCDirectiveKind K, SourceLocation StartLoc);
+  void ActOnConstruct(OpenACCDirectiveKind K, SourceLocation DirLoc);
 
   /// Called after the directive, including its clauses, have been parsed and
   /// parsing has consumed the 'annot_pragma_openacc_end' token. This DOES
@@ -400,6 +400,7 @@ class SemaOpenACC : public SemaBase {
   /// declaration group or associated statement.
   StmtResult ActOnEndStmtDirective(OpenACCDirectiveKind K,
                                    SourceLocation StartLoc,
+                                   SourceLocation DirLoc,
                                    SourceLocation EndLoc,
                                    ArrayRef<OpenACCClause *> Clauses,
                                    StmtResult AssocStmt);
diff --git a/clang/lib/AST/StmtOpenACC.cpp b/clang/lib/AST/StmtOpenACC.cpp
index a381a8dd7b62c3..47899b344c97ab 100644
--- a/clang/lib/AST/StmtOpenACC.cpp
+++ b/clang/lib/AST/StmtOpenACC.cpp
@@ -23,15 +23,14 @@ OpenACCComputeConstruct::CreateEmpty(const ASTContext &C, unsigned NumClauses) {
   return Inst;
 }
 
-OpenACCComputeConstruct *
-OpenACCComputeConstruct::Create(const ASTContext &C, OpenACCDirectiveKind K,
-                                SourceLocation BeginLoc, SourceLocation EndLoc,
-                                ArrayRef<const OpenACCClause *> Clauses,
-                                Stmt *StructuredBlock) {
+OpenACCComputeConstruct *OpenACCComputeConstruct::Create(
+    const ASTContext &C, OpenACCDirectiveKind K, SourceLocation BeginLoc,
+    SourceLocation DirLoc, SourceLocation EndLoc,
+    ArrayRef<const OpenACCClause *> Clauses, Stmt *StructuredBlock) {
   void *Mem = C.Allocate(
       OpenACCComputeConstruct::totalSizeToAlloc<const OpenACCClause *>(
           Clauses.size()));
-  auto *Inst = new (Mem)
-      OpenACCComputeConstruct(K, BeginLoc, EndLoc, Clauses, StructuredBlock);
+  auto *Inst = new (Mem) OpenACCComputeConstruct(K, BeginLoc, DirLoc, EndLoc,
+                                                 Clauses, StructuredBlock);
   return Inst;
 }
diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp
index e9c60f76165b68..63afc18783a1f7 100644
--- a/clang/lib/Parse/ParseOpenACC.cpp
+++ b/clang/lib/Parse/ParseOpenACC.cpp
@@ -1347,11 +1347,13 @@ void Parser::ParseOpenACCCacheVarList() {
   ParseOpenACCVarList(OpenACCClauseKind::Invalid);
 }
 
-Parser::OpenACCDirectiveParseInfo Parser::ParseOpenACCDirective() {
-  SourceLocation StartLoc = getCurToken().getLocation();
+Parser::OpenACCDirectiveParseInfo
+Parser::ParseOpenACCDirective() {
+  SourceLocation StartLoc = ConsumeAnnotationToken();
+  SourceLocation DirLoc = getCurToken().getLocation();
   OpenACCDirectiveKind DirKind = ParseOpenACCDirectiveKind(*this);
 
-  getActions().OpenACC().ActOnConstruct(DirKind, StartLoc);
+  getActions().OpenACC().ActOnConstruct(DirKind, DirLoc);
 
   // Once we've parsed the construct/directive name, some have additional
   // specifiers that need to be taken care of. Atomic has an 'atomic-clause'
@@ -1390,7 +1392,7 @@ Parser::OpenACCDirectiveParseInfo Parser::ParseOpenACCDirective() {
       break;
     case OpenACCDirectiveKind::Wait:
       // OpenACC has an optional paren-wrapped 'wait-argument'.
-      if (ParseOpenACCWaitArgument(StartLoc, /*IsDirective=*/true).Failed)
+      if (ParseOpenACCWaitArgument(DirLoc, /*IsDirective=*/true).Failed)
         T.skipToEnd();
       else
         T.consumeClose();
@@ -1404,7 +1406,8 @@ Parser::OpenACCDirectiveParseInfo Parser::ParseOpenACCDirective() {
   }
 
   // Parses the list of clauses, if present, plus set up return value.
-  OpenACCDirectiveParseInfo ParseInfo{DirKind, StartLoc, SourceLocation{},
+  OpenACCDirectiveParseInfo ParseInfo{DirKind, StartLoc, DirLoc,
+                                      SourceLocation{},
                                       ParseOpenACCClauseList(DirKind)};
 
   assert(Tok.is(tok::annot_pragma_openacc_end) &&
@@ -1421,7 +1424,6 @@ Parser::DeclGroupPtrTy Parser::ParseOpenACCDirectiveDecl() {
   assert(Tok.is(tok::annot_pragma_openacc) && "expected OpenACC Start Token");
 
   ParsingOpenACCDirectiveRAII DirScope(*this);
-  ConsumeAnnotationToken();
 
   OpenACCDirectiveParseInfo DirInfo = ParseOpenACCDirective();
 
@@ -1438,7 +1440,6 @@ StmtResult Parser::ParseOpenACCDirectiveStmt() {
   assert(Tok.is(tok::annot_pragma_openacc) && "expected OpenACC Start Token");
 
   ParsingOpenACCDirectiveRAII DirScope(*this);
-  ConsumeAnnotationToken();
 
   OpenACCDirectiveParseInfo DirInfo = ParseOpenACCDirective();
   if (getActions().OpenACC().ActOnStartStmtDirective(DirInfo.DirKind,
@@ -1456,6 +1457,6 @@ StmtResult Parser::ParseOpenACCDirectiveStmt() {
   }
 
   return getActions().OpenACC().ActOnEndStmtDirective(
-      DirInfo.DirKind, DirInfo.StartLoc, DirInfo.EndLoc, DirInfo.Clauses,
-      AssocStmt);
+      DirInfo.DirKind, DirInfo.StartLoc, DirInfo.DirLoc, DirInfo.EndLoc,
+      DirInfo.Clauses, AssocStmt);
 }
diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp
index 09d91b31cfe5f9..15239f4f35c39f 100644
--- a/clang/lib/Sema/SemaOpenACC.cpp
+++ b/clang/lib/Sema/SemaOpenACC.cpp
@@ -844,7 +844,7 @@ ExprResult SemaOpenACC::CheckReductionVar(Expr *VarExpr) {
 }
 
 void SemaOpenACC::ActOnConstruct(OpenACCDirectiveKind K,
-                                 SourceLocation StartLoc) {
+                                 SourceLocation DirLoc) {
   switch (K) {
   case OpenACCDirectiveKind::Invalid:
     // Nothing to do here, an invalid kind has nothing we can check here.  We
@@ -859,7 +859,7 @@ void SemaOpenACC::ActOnConstruct(OpenACCDirectiveKind K,
     // here as these constructs do not take any arguments.
     break;
   default:
-    Diag(StartLoc, diag::warn_acc_construct_unimplemented) << K;
+    Diag(DirLoc, diag::warn_acc_construct_unimplemented) << K;
     break;
   }
 }
@@ -1265,6 +1265,7 @@ bool SemaOpenACC::ActOnStartStmtDirective(OpenACCDirectiveKind K,
 
 StmtResult SemaOpenACC::ActOnEndStmtDirective(OpenACCDirectiveKind K,
                                               SourceLocation StartLoc,
+                                              SourceLocation DirLoc,
                                               SourceLocation EndLoc,
                                               ArrayRef<OpenACCClause *> Clauses,
                                               StmtResult AssocStmt) {
@@ -1278,7 +1279,7 @@ StmtResult SemaOpenACC::ActOnEndStmtDirective(OpenACCDirectiveKind K,
   case OpenACCDirectiveKind::Kernels:
     // TODO OpenACC: Add clauses to the construct here.
     return OpenACCComputeConstruct::Create(
-        getASTContext(), K, StartLoc, EndLoc, Clauses,
+        getASTContext(), K, StartLoc, DirLoc, EndLoc, Clauses,
         AssocStmt.isUsable() ? AssocStmt.get() : nullptr);
   }
   llvm_unreachable("Unhandled case in directive handling?");
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index dee335b526991b..765e6177d202d1 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -4033,11 +4033,12 @@ class TreeTransform {
 
   StmtResult RebuildOpenACCComputeConstruct(OpenACCDirectiveKind K,
                                             SourceLocation BeginLoc,
+                                            SourceLocation DirLoc,
                                             SourceLocation EndLoc,
                                             ArrayRef<OpenACCClause *> Clauses,
                                             StmtResult StrBlock) {
-    return getSema().OpenACC().ActOnEndStmtDirective(K, BeginLoc, EndLoc,
-                                                     Clauses, StrBlock);
+    return getSema().OpenACC().ActOnEndStmtDirective(K, BeginLoc, DirLoc,
+                                                     EndLoc, Clauses, StrBlock);
   }
 
 private:
@@ -11559,8 +11560,8 @@ StmtResult TreeTransform<Derived>::TransformOpenACCComputeConstruct(
       getSema().OpenACC().ActOnAssociatedStmt(C->getDirectiveKind(), StrBlock);
 
   return getDerived().RebuildOpenACCComputeConstruct(
-      C->getDirectiveKind(), C->getBeginLoc(), C->getEndLoc(),
-      TransformedClauses, StrBlock);
+      C->getDirectiveKind(), C->getBeginLoc(), C->getDirectiveLoc(),
+      C->getEndLoc(), TransformedClauses, StrBlock);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index eac4faff285490..bea2b949891070 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -2797,6 +2797,7 @@ void ASTStmtReader::VisitOpenACCConstructStmt(OpenACCConstructStmt *S) {
   (void)Record.readInt();
   S->Kind = Record.readEnum<OpenACCDirectiveKind>();
   S->Range = Record.readSourceRange();
+  S->DirectiveLoc = Record.readSourceLocation();
   Record.readOpenACCClauseList(S->Clauses);
 }
 
diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp
index a44852af97bea3..3c586b270fbf4f 100644
--- a/clang/lib/Serialization/ASTWriterStmt.cpp
+++ b/clang/lib/Serialization/ASTWriterStmt.cpp
@@ -2847,6 +2847,7 @@ void ASTStmtWriter::VisitOpenACCConstructStmt(OpenACCConstructStmt *S) {
   Record.push_back(S->clauses().size());
   Record.writeEnum(S->Kind);
   Record.AddSourceRange(S->Range);
+  Record.AddSourceLocation(S->DirectiveLoc);
   Record.writeOpenACCClauseList(S->clauses());
 }
 

From 5a23d31c5033dcb41d374692ed26d87ed8e2665a Mon Sep 17 00:00:00 2001
From: William Junda Huang <williamjhuang@google.com>
Date: Tue, 28 May 2024 16:41:53 -0400
Subject: [PATCH 014/230] [Sample Profile] Check hot callsite threshold when
 inlining a function with a sample profile (#93286)

Currently, if a callsite is hot as determined by the sample profile, it
is unconditionally inlined barring invalid cases (such as recursion).
The inline cost check should still apply because a function's hotness and
its inline cost are two different things.
For example, if a function calls another very large function
multiple times (on different code paths), the large function should not
be inlined even if it's hot.
---
 llvm/lib/Transforms/IPO/SampleProfile.cpp     |  7 ++-
 .../Inputs/inline-hot-callsite-threshold.prof |  3 +
 .../inline-hot-callsite-threshold.ll          | 61 +++++++++++++++++++
 .../SampleProfile/pseudo-probe-inline.ll      |  2 +-
 llvm/test/Transforms/SampleProfile/remarks.ll |  4 +-
 5 files changed, 71 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/Transforms/SampleProfile/Inputs/inline-hot-callsite-threshold.prof
 create mode 100644 llvm/test/Transforms/SampleProfile/inline-hot-callsite-threshold.ll

diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 0920179fb76b73..92ad4c34da6e7e 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -1391,10 +1391,11 @@ SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
       return InlineCost::getAlways("preinliner");
   }
 
-  // For old FDO inliner, we inline the call site as long as cost is not
-  // "Never". The cost-benefit check is done earlier.
+  // For old FDO inliner, we inline the call site if it is below hot threshold,
+  // even if the function is hot based on sample profile data. This is to
+  // prevent huge functions from being inlined.
   if (!CallsitePrioritizedInline) {
-    return InlineCost::get(Cost.getCost(), INT_MAX);
+    return InlineCost::get(Cost.getCost(), SampleHotCallSiteThreshold);
   }
 
   // Otherwise only use the cost from call analyzer, but overwite threshold with
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/inline-hot-callsite-threshold.prof b/llvm/test/Transforms/SampleProfile/Inputs/inline-hot-callsite-threshold.prof
new file mode 100644
index 00000000000000..d1c0408210f498
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/inline-hot-callsite-threshold.prof
@@ -0,0 +1,3 @@
+foo:100:100
+ 1: bar:100
+  1:100
diff --git a/llvm/test/Transforms/SampleProfile/inline-hot-callsite-threshold.ll b/llvm/test/Transforms/SampleProfile/inline-hot-callsite-threshold.ll
new file mode 100644
index 00000000000000..914ab4f1e3da58
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/inline-hot-callsite-threshold.ll
@@ -0,0 +1,61 @@
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-hot-callsite-threshold.prof -S -pass-remarks=sample-profile -sample-profile-hot-inline-threshold=100 2>&1 | FileCheck %s
+
+; CHECK: remark: a.cc:6:12: 'bar' inlined into 'foo' to match profiling context with (cost={{.*}}, threshold=100)
+; CHECK:     define dso_local noundef i32 @foo(i32 noundef %0)
+; CHECK-NOT:   %2 = tail call noundef i32 @bar(i32 noundef %0)
+; CHECK-NEXT:  %2 = icmp sgt i32 %0, 1
+; CHECK-NEXT:  br i1 %2, label %3, label %bar.exit
+
+; Manually lower cost threshold for hot function inlining, so that the function
+; is not inlined even profile indicates it as hot.
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-hot-callsite-threshold.prof -S -pass-remarks=sample-profile -sample-profile-hot-inline-threshold=1 2>&1 | FileCheck %s --check-prefix=COST
+
+; COST-NOT:  remark
+; COST: define dso_local noundef i32 @foo(i32 noundef %0)
+; COST-NEXT: %2 = tail call noundef i32 @bar(i32 noundef %0)
+
+define dso_local noundef i32 @bar(i32 noundef %0) #0 !dbg !10 {
+  %2 = icmp sgt i32 %0, 1
+  br i1 %2, label %3, label %15
+3:                                                ; preds = %1
+  %4 = add nsw i32 %0, -2
+  %5 = mul i32 %4, %4
+  %6 = add i32 %5, %0
+  %7 = zext nneg i32 %4 to i33
+  %8 = add nsw i32 %0, -3
+  %9 = zext i32 %8 to i33
+  %10 = mul i33 %7, %9
+  %11 = lshr i33 %10, 1
+  %12 = trunc nuw i33 %11 to i32
+  %13 = xor i32 %12, -1
+  %14 = add i32 %6, %13
+  br label %15
+15:                                               ; preds = %3, %1
+  %16 = phi i32 [ 0, %1 ], [ %14, %3 ]
+  ret i32 %16
+}
+
+define dso_local noundef i32 @foo(i32 noundef %0) #1 !dbg !20 {
+  %2 = tail call noundef i32 @bar(i32 noundef %0), !dbg !24
+  ret i32 %2
+}
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable  "use-sample-profile" }
+attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable  "use-sample-profile" }
+attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
+!1 = !DIFile(filename: "a.cc", directory: ".")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!10 = distinct !DISubprogram(name: "bar", linkageName: "bar", scope: !1, file: !1, line: 1, type: !12, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+!11 = !DIFile(filename: "a.cc", directory: ".")
+!12 = !DISubroutineType(types: !13)
+!13 = !{!14, !14}
+!14 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!20 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !11, file: !11, line: 5, type: !12, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+!23 = !DILocation(line: 0, scope: !20)
+!24 = !DILocation(line: 6, column: 12, scope: !20)
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll
index 18cbd857d97bb2..2cd9abf0e11e94 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll
@@ -98,7 +98,7 @@ if.end:
 ;YAML-NEXT:    - String:          '(cost='
 ;YAML-NEXT:    - Cost:            '15'
 ;YAML-NEXT:    - String:          ', threshold='
-;YAML-NEXT:    - Threshold:       '2147483647'
+;YAML-NEXT:    - Threshold:       '3000'
 ;YAML-NEXT:    - String:          ')'
 ;YAML-NEXT:    - String:          ' at callsite '
 ;YAML-NEXT:    - String:          foo
diff --git a/llvm/test/Transforms/SampleProfile/remarks.ll b/llvm/test/Transforms/SampleProfile/remarks.ll
index 997e02bb5b5444..9c0143ae65ca77 100644
--- a/llvm/test/Transforms/SampleProfile/remarks.ll
+++ b/llvm/test/Transforms/SampleProfile/remarks.ll
@@ -22,7 +22,7 @@
 
 ; We are expecting foo() to be inlined in main() (almost all the cycles are
 ; spent inside foo).
-; CHECK: remark: remarks.cc:13:21: '_Z3foov' inlined into 'main' to match profiling context with (cost=130, threshold=2147483647) at callsite main:0:21;
+; CHECK: remark: remarks.cc:13:21: '_Z3foov' inlined into 'main' to match profiling context with (cost=130, threshold=3000) at callsite main:0:21;
 ; CHECK: remark: remarks.cc:9:19: 'rand' inlined into 'main' to match profiling context with (cost=always): always inline attribute at callsite _Z3foov:6:19 @ main:0:21;
 
 ; The back edge for the loop is the hottest edge in the loop subgraph.
@@ -51,7 +51,7 @@
 ;YAML-NEXT:    - String:          '(cost='
 ;YAML-NEXT:    - Cost:            '130'
 ;YAML-NEXT:    - String:          ', threshold='
-;YAML-NEXT:    - Threshold:       '2147483647'
+;YAML-NEXT:    - Threshold:       '3000'
 ;YAML-NEXT:    - String:          ')'
 ;YAML-NEXT:    - String:          ' at callsite '
 ;YAML-NEXT:    - String:          main

From 6a47315a3cb2c6d381809f0ba5c89bd8dcdbcaa0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= <stefan.graenitz@gmail.com>
Date: Tue, 28 May 2024 22:45:32 +0200
Subject: [PATCH 015/230] [clang-repl] Even more tests create the Interpreter
 and must check host JIT support (#84758)

---
 .../Interpreter/CodeCompletionTest.cpp        | 85 +++++++++++++++++++
 .../Interpreter/IncrementalProcessingTest.cpp |  3 +
 2 files changed, 88 insertions(+)

diff --git a/clang/unittests/Interpreter/CodeCompletionTest.cpp b/clang/unittests/Interpreter/CodeCompletionTest.cpp
index 873fbda32f0579..72c02c683fafd4 100644
--- a/clang/unittests/Interpreter/CodeCompletionTest.cpp
+++ b/clang/unittests/Interpreter/CodeCompletionTest.cpp
@@ -4,6 +4,7 @@
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Sema/CodeCompleteConsumer.h"
 #include "clang/Sema/Sema.h"
+#include "llvm/ExecutionEngine/Orc/LLJIT.h"
 #include "llvm/LineEditor/LineEditor.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/raw_ostream.h"
@@ -11,6 +12,10 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
+#if defined(_AIX) || defined(__MVS__)
+#define CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+#endif
+
 using namespace clang;
 namespace {
 auto CB = clang::IncrementalCompilerBuilder();
@@ -50,7 +55,21 @@ static std::vector<std::string> runComp(clang::Interpreter &MainInterp,
   return Comps;
 }
 
+static bool HostSupportsJit() {
+  auto J = llvm::orc::LLJITBuilder().create();
+  if (J)
+    return true;
+  LLVMConsumeError(llvm::wrap(J.takeError()));
+  return false;
+}
+
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_Sanity) {
+#else
 TEST(CodeCompletionTest, Sanity) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   cantFail(Interp->Parse("int foo = 12;"));
   auto Err = llvm::Error::success();
@@ -61,7 +80,13 @@ TEST(CodeCompletionTest, Sanity) {
   EXPECT_EQ((bool)Err, false);
 }
 
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_SanityNoneValid) {
+#else
 TEST(CodeCompletionTest, SanityNoneValid) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   cantFail(Interp->Parse("int foo = 12;"));
   auto Err = llvm::Error::success();
@@ -70,7 +95,13 @@ TEST(CodeCompletionTest, SanityNoneValid) {
   EXPECT_EQ((bool)Err, false);
 }
 
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_TwoDecls) {
+#else
 TEST(CodeCompletionTest, TwoDecls) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   cantFail(Interp->Parse("int application = 12;"));
   cantFail(Interp->Parse("int apple = 12;"));
@@ -80,14 +111,26 @@ TEST(CodeCompletionTest, TwoDecls) {
   EXPECT_EQ((bool)Err, false);
 }
 
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_CompFunDeclsNoError) {
+#else
 TEST(CodeCompletionTest, CompFunDeclsNoError) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   auto Err = llvm::Error::success();
   auto comps = runComp(*Interp, "void app(", Err);
   EXPECT_EQ((bool)Err, false);
 }
 
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_TypedDirected) {
+#else
 TEST(CodeCompletionTest, TypedDirected) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   cantFail(Interp->Parse("int application = 12;"));
   cantFail(Interp->Parse("char apple = '2';"));
@@ -119,7 +162,13 @@ TEST(CodeCompletionTest, TypedDirected) {
   }
 }
 
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_SanityClasses) {
+#else
 TEST(CodeCompletionTest, SanityClasses) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   cantFail(Interp->Parse("struct Apple{};"));
   cantFail(Interp->Parse("void takeApple(Apple &a1){}"));
@@ -142,7 +191,13 @@ TEST(CodeCompletionTest, SanityClasses) {
   }
 }
 
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_SubClassing) {
+#else
 TEST(CodeCompletionTest, SubClassing) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   cantFail(Interp->Parse("struct Fruit {};"));
   cantFail(Interp->Parse("struct Apple : Fruit{};"));
@@ -157,7 +212,13 @@ TEST(CodeCompletionTest, SubClassing) {
   EXPECT_EQ((bool)Err, false);
 }
 
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_MultipleArguments) {
+#else
 TEST(CodeCompletionTest, MultipleArguments) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   cantFail(Interp->Parse("int foo = 42;"));
   cantFail(Interp->Parse("char fowl = 'A';"));
@@ -169,7 +230,13 @@ TEST(CodeCompletionTest, MultipleArguments) {
   EXPECT_EQ((bool)Err, false);
 }
 
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_Methods) {
+#else
 TEST(CodeCompletionTest, Methods) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   cantFail(Interp->Parse(
       "struct Foo{int add(int a){return 42;} int par(int b){return 42;}};"));
@@ -183,7 +250,13 @@ TEST(CodeCompletionTest, Methods) {
   EXPECT_EQ((bool)Err, false);
 }
 
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_MethodsInvocations) {
+#else
 TEST(CodeCompletionTest, MethodsInvocations) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   cantFail(Interp->Parse(
       "struct Foo{int add(int a){return 42;} int par(int b){return 42;}};"));
@@ -197,7 +270,13 @@ TEST(CodeCompletionTest, MethodsInvocations) {
   EXPECT_EQ((bool)Err, false);
 }
 
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_NestedInvocations) {
+#else
 TEST(CodeCompletionTest, NestedInvocations) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   cantFail(Interp->Parse(
       "struct Foo{int add(int a){return 42;} int par(int b){return 42;}};"));
@@ -212,7 +291,13 @@ TEST(CodeCompletionTest, NestedInvocations) {
   EXPECT_EQ((bool)Err, false);
 }
 
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_TemplateFunctions) {
+#else
 TEST(CodeCompletionTest, TemplateFunctions) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   cantFail(
       Interp->Parse("template <typename T> T id(T a) { return a;} "));
diff --git a/clang/unittests/Interpreter/IncrementalProcessingTest.cpp b/clang/unittests/Interpreter/IncrementalProcessingTest.cpp
index f3b091b0c0e6cb..9a99ff6262fa3c 100644
--- a/clang/unittests/Interpreter/IncrementalProcessingTest.cpp
+++ b/clang/unittests/Interpreter/IncrementalProcessingTest.cpp
@@ -61,6 +61,9 @@ TEST(IncrementalProcessing, DISABLED_EmitCXXGlobalInitFunc) {
 #else
 TEST(IncrementalProcessing, EmitCXXGlobalInitFunc) {
 #endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
+
   std::vector<const char *> ClangArgv = {"-Xclang", "-emit-llvm-only"};
   auto CB = clang::IncrementalCompilerBuilder();
   CB.SetCompilerArgs(ClangArgv);

From 98fa0f6981f33b7d8f5aa38babc1e71bc0209de8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 28 May 2024 20:40:58 +0200
Subject: [PATCH 016/230] DAG: Handle vector splitting for
 fminnum_ieee/fmaxnum_ieee

Avoids a regression in a future commit that starts producing
illegal instances.
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 14e8708fd3f38f..361416edb554ca 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1174,8 +1174,12 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FADD: case ISD::VP_FADD:
   case ISD::FSUB: case ISD::VP_FSUB:
   case ISD::FMUL: case ISD::VP_FMUL:
-  case ISD::FMINNUM: case ISD::VP_FMINNUM:
-  case ISD::FMAXNUM: case ISD::VP_FMAXNUM:
+  case ISD::FMINNUM:
+  case ISD::FMINNUM_IEEE:
+  case ISD::VP_FMINNUM:
+  case ISD::FMAXNUM:
+  case ISD::FMAXNUM_IEEE:
+  case ISD::VP_FMAXNUM:
   case ISD::FMINIMUM:
   case ISD::VP_FMINIMUM:
   case ISD::FMAXIMUM:

From bbca20f0b1ab7c6ea36a84e88a6abb07f94ca80b Mon Sep 17 00:00:00 2001
From: cor3ntin <corentinjabot@gmail.com>
Date: Tue, 28 May 2024 23:04:12 +0200
Subject: [PATCH 017/230] [Clang][NFC] remove CHAR_PUNCT duplication introduced
 by #93216 (#93605)

---
 clang/include/clang/Basic/CharInfo.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/clang/include/clang/Basic/CharInfo.h b/clang/include/clang/Basic/CharInfo.h
index 4d90528f7992e3..d71857e8e5dcc3 100644
--- a/clang/include/clang/Basic/CharInfo.h
+++ b/clang/include/clang/Basic/CharInfo.h
@@ -151,8 +151,7 @@ LLVM_READONLY inline bool isHexDigit(unsigned char c) {
 /// Note that '_' is both a punctuation character and an identifier character!
 LLVM_READONLY inline bool isPunctuation(unsigned char c) {
   using namespace charinfo;
-  return (InfoTable[c] &
-          (CHAR_UNDER | CHAR_PERIOD | CHAR_PUNCT | CHAR_PUNCT)) != 0;
+  return (InfoTable[c] & (CHAR_UNDER | CHAR_PERIOD | CHAR_PUNCT)) != 0;
 }
 
 /// Return true if this character is an ASCII printable character; that is, a

From df542e1ed82bd4e5a9e345d3a3ae63a76893a0cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= <stefan.graenitz@gmail.com>
Date: Tue, 28 May 2024 23:18:45 +0200
Subject: [PATCH 018/230] Fix build: [clang-repl] Even more tests create the
 Interpreter and must check host JIT support (#84758)

fea7399e97b73a3209fcbe3338d412069769a637 had removed the then-unused HostSupportsJit() function, which was still there when I tested.
---
 clang/unittests/Interpreter/IncrementalProcessingTest.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/clang/unittests/Interpreter/IncrementalProcessingTest.cpp b/clang/unittests/Interpreter/IncrementalProcessingTest.cpp
index 9a99ff6262fa3c..732753f11306e6 100644
--- a/clang/unittests/Interpreter/IncrementalProcessingTest.cpp
+++ b/clang/unittests/Interpreter/IncrementalProcessingTest.cpp
@@ -56,6 +56,14 @@ const Function *getGlobalInit(llvm::Module *M) {
   return nullptr;
 }
 
+static bool HostSupportsJit() {
+  auto J = llvm::orc::LLJITBuilder().create();
+  if (J)
+    return true;
+  LLVMConsumeError(llvm::wrap(J.takeError()));
+  return false;
+}
+
 #ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
 TEST(IncrementalProcessing, DISABLED_EmitCXXGlobalInitFunc) {
 #else

From ed4227aad37f2c4adf307b63050fb9aee52b07f8 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 28 May 2024 14:37:15 -0700
Subject: [PATCH 019/230] [SCEV] Add tests for symbolic max BTC requiring
 predicates.

Add extra tests for https://github.com/llvm/llvm-project/pull/93498.
---
 ...cated-symbolic-max-backedge-taken-count.ll | 77 +++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll

diff --git a/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll b/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll
new file mode 100644
index 00000000000000..d40416359b65c6
--- /dev/null
+++ b/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='print<scalar-evolution>' -scalar-evolution-classify-expressions=0 -disable-output %s 2>&1 | FileCheck %s
+
+; %i and %i + 1 can overflow.
+define void @test1(i64 %x, ptr %a, ptr %b) {
+; CHECK-LABEL: 'test1'
+; CHECK-NEXT:  Determining loop execution counts for: @test1
+; CHECK-NEXT:  Loop %header: <multiple exits> Unpredictable backedge-taken count.
+; CHECK-NEXT:    exit count for header: ***COULDNOTCOMPUTE***
+; CHECK-NEXT:    exit count for latch: ***COULDNOTCOMPUTE***
+; CHECK-NEXT:  Loop %header: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %header: Unpredictable symbolic max backedge-taken count.
+; CHECK-NEXT:    symbolic max exit count for header: ***COULDNOTCOMPUTE***
+; CHECK-NEXT:    symbolic max exit count for latch: ***COULDNOTCOMPUTE***
+;
+entry:
+  br label %header
+
+header:
+  %conv11 = phi i64 [ 0, %entry ], [ %conv, %latch ]
+  %i.010 = phi i32 [ 0, %entry ], [ %add, %latch ]
+  %add = add i32 %i.010, 1
+  %idxprom = zext i32 %add to i64
+  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %idxprom
+  %ld = load i32, ptr %arrayidx, align 4
+  %uncountable.c = icmp eq i32 %ld, 10
+  br i1 %uncountable.c, label %exit, label %latch
+
+latch:
+  %add2 = add nsw i32 %ld, 1
+  %arrayidx4 = getelementptr inbounds i32, ptr %b, i64 %conv11
+  store i32 %add2, ptr %arrayidx4, align 4
+  %conv = zext i32 %add to i64
+  %cmp = icmp ult i64 %conv, %x
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  ret void
+}
+
+; %i can overflow.
+;
+; We need to check that i doesn't wrap, but we don't need a run-time alias
+; check. We also need an extra no-wrap check to get the backedge taken count.
+define void @test2(i64 %x, ptr %a) {
+; CHECK-LABEL: 'test2'
+; CHECK-NEXT:  Determining loop execution counts for: @test2
+; CHECK-NEXT:  Loop %header: <multiple exits> Unpredictable backedge-taken count.
+; CHECK-NEXT:    exit count for header: ***COULDNOTCOMPUTE***
+; CHECK-NEXT:    exit count for latch: ***COULDNOTCOMPUTE***
+; CHECK-NEXT:  Loop %header: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %header: Unpredictable symbolic max backedge-taken count.
+; CHECK-NEXT:    symbolic max exit count for header: ***COULDNOTCOMPUTE***
+; CHECK-NEXT:    symbolic max exit count for latch: ***COULDNOTCOMPUTE***
+;
+entry:
+  br label %header
+
+header:
+  %conv11 = phi i64 [ 0, %entry ], [ %conv, %latch ]
+  %i.010 = phi i32  [ 0, %entry ], [ %inc, %latch ]
+  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %conv11
+  %ld = load i32, ptr %arrayidx, align 4
+  %uncountable.c = icmp eq i32 %ld, 10
+  br i1 %uncountable.c, label %exit, label %latch
+
+latch:
+  %add = add nsw i32 %ld, 1
+  store i32 %add, ptr %arrayidx, align 4
+  %inc = add i32 %i.010, 1
+  %conv = zext i32 %inc to i64
+  %cmp = icmp ult i64 %conv, %x
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  ret void
+}

From e3f74d4589e29279e9f543b58577a2ece102dc6f Mon Sep 17 00:00:00 2001
From: erichkeane <ekeane@nvidia.com>
Date: Tue, 28 May 2024 14:25:13 -0700
Subject: [PATCH 020/230] [OpenACC] Correct serialization of certain clause
 sub-expressions

For some reason I was using writeStmtRef when I meant AddStmt, so this
corrects that.
---
 clang/lib/Serialization/ASTWriter.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index dd548fabfd9551..e830c4026ea78f 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -7835,7 +7835,7 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) {
   case OpenACCClauseKind::If: {
     const auto *IC = cast<OpenACCIfClause>(C);
     writeSourceLocation(IC->getLParenLoc());
-    writeStmtRef(IC->getConditionExpr());
+    AddStmt(const_cast<Expr*>(IC->getConditionExpr()));
     return;
   }
   case OpenACCClauseKind::Self: {
@@ -7843,7 +7843,7 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) {
     writeSourceLocation(SC->getLParenLoc());
     writeBool(SC->hasConditionExpr());
     if (SC->hasConditionExpr())
-      writeStmtRef(SC->getConditionExpr());
+      AddStmt(const_cast<Expr*>(SC->getConditionExpr()));
     return;
   }
   case OpenACCClauseKind::NumGangs: {
@@ -7857,13 +7857,13 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) {
   case OpenACCClauseKind::NumWorkers: {
     const auto *NWC = cast<OpenACCNumWorkersClause>(C);
     writeSourceLocation(NWC->getLParenLoc());
-    writeStmtRef(NWC->getIntExpr());
+    AddStmt(const_cast<Expr*>(NWC->getIntExpr()));
     return;
   }
   case OpenACCClauseKind::VectorLength: {
     const auto *NWC = cast<OpenACCVectorLengthClause>(C);
     writeSourceLocation(NWC->getLParenLoc());
-    writeStmtRef(NWC->getIntExpr());
+    AddStmt(const_cast<Expr*>(NWC->getIntExpr()));
     return;
   }
   case OpenACCClauseKind::Private: {
@@ -7942,15 +7942,15 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) {
     writeSourceLocation(AC->getLParenLoc());
     writeBool(AC->hasIntExpr());
     if (AC->hasIntExpr())
-      writeStmtRef(AC->getIntExpr());
+      AddStmt(const_cast<Expr*>(AC->getIntExpr()));
     return;
   }
   case OpenACCClauseKind::Wait: {
     const auto *WC = cast<OpenACCWaitClause>(C);
     writeSourceLocation(WC->getLParenLoc());
     writeBool(WC->getDevNumExpr());
-    if (const Expr *DNE = WC->getDevNumExpr())
-      writeStmtRef(DNE);
+    if (Expr *DNE = WC->getDevNumExpr())
+      AddStmt(DNE);
     writeSourceLocation(WC->getQueuesLoc());
 
     writeOpenACCIntExprList(WC->getQueueIdExprs());

From 060b3023e198d197b47c652f19af5f7dea3a22cc Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Tue, 28 May 2024 14:49:57 -0700
Subject: [PATCH 021/230] [RISCV] Move TRUNCATE_VECTOR_VL combine into a helper
 function. NFC (#93574)

I plan to add other combines on TRUNCATE_VECTOR_VL.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 103 ++++++++++----------
 1 file changed, 53 insertions(+), 50 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index c826892c1668ec..5fc613c1b2a140 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -16087,6 +16087,57 @@ static bool matchIndexAsWiderOp(EVT VT, SDValue Index, SDValue Mask,
   return true;
 }
 
+static SDValue combineTruncOfSraSext(SDNode *N, SelectionDAG &DAG) {
+  // trunc (sra sext (X), zext (Y)) -> sra (X, smin (Y, scalarsize(Y) - 1))
+  // This would be benefit for the cases where X and Y are both the same value
+  // type of low precision vectors. Since the truncate would be lowered into
+  // n-levels TRUNCATE_VECTOR_VL to satisfy RVV's SEW*2->SEW truncate
+  // restriction, such pattern would be expanded into a series of "vsetvli"
+  // and "vnsrl" instructions later to reach this point.
+  auto IsTruncNode = [](SDValue V) {
+    if (V.getOpcode() != RISCVISD::TRUNCATE_VECTOR_VL)
+      return false;
+    SDValue VL = V.getOperand(2);
+    auto *C = dyn_cast<ConstantSDNode>(VL);
+    // Assume all TRUNCATE_VECTOR_VL nodes use VLMAX for VMSET_VL operand
+    bool IsVLMAXForVMSET = (C && C->isAllOnes()) ||
+                           (isa<RegisterSDNode>(VL) &&
+                            cast<RegisterSDNode>(VL)->getReg() == RISCV::X0);
+    return V.getOperand(1).getOpcode() == RISCVISD::VMSET_VL && IsVLMAXForVMSET;
+  };
+
+  SDValue Op = N->getOperand(0);
+
+  // We need to first find the inner level of TRUNCATE_VECTOR_VL node
+  // to distinguish such pattern.
+  while (IsTruncNode(Op)) {
+    if (!Op.hasOneUse())
+      return SDValue();
+    Op = Op.getOperand(0);
+  }
+
+  if (Op.getOpcode() != ISD::SRA || !Op.hasOneUse())
+    return SDValue();
+
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  if (N0.getOpcode() != ISD::SIGN_EXTEND || !N0.hasOneUse() ||
+      N1.getOpcode() != ISD::ZERO_EXTEND || !N1.hasOneUse())
+    return SDValue();
+
+  SDValue N00 = N0.getOperand(0);
+  SDValue N10 = N1.getOperand(0);
+  if (!N00.getValueType().isVector() ||
+      N00.getValueType() != N10.getValueType() ||
+      N->getValueType(0) != N10.getValueType())
+    return SDValue();
+
+  unsigned MaxShAmt = N10.getValueType().getScalarSizeInBits() - 1;
+  SDValue SMin =
+      DAG.getNode(ISD::SMIN, SDLoc(N1), N->getValueType(0), N10,
+                  DAG.getConstant(MaxShAmt, SDLoc(N1), N->getValueType(0)));
+  return DAG.getNode(ISD::SRA, SDLoc(N), N->getValueType(0), N00, SMin);
+}
 
 SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
@@ -16304,56 +16355,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
       }
     }
     return SDValue();
-  case RISCVISD::TRUNCATE_VECTOR_VL: {
-    // trunc (sra sext (X), zext (Y)) -> sra (X, smin (Y, scalarsize(Y) - 1))
-    // This would be benefit for the cases where X and Y are both the same value
-    // type of low precision vectors. Since the truncate would be lowered into
-    // n-levels TRUNCATE_VECTOR_VL to satisfy RVV's SEW*2->SEW truncate
-    // restriction, such pattern would be expanded into a series of "vsetvli"
-    // and "vnsrl" instructions later to reach this point.
-    auto IsTruncNode = [](SDValue V) {
-      if (V.getOpcode() != RISCVISD::TRUNCATE_VECTOR_VL)
-        return false;
-      SDValue VL = V.getOperand(2);
-      auto *C = dyn_cast<ConstantSDNode>(VL);
-      // Assume all TRUNCATE_VECTOR_VL nodes use VLMAX for VMSET_VL operand
-      bool IsVLMAXForVMSET = (C && C->isAllOnes()) ||
-                             (isa<RegisterSDNode>(VL) &&
-                              cast<RegisterSDNode>(VL)->getReg() == RISCV::X0);
-      return V.getOperand(1).getOpcode() == RISCVISD::VMSET_VL &&
-             IsVLMAXForVMSET;
-    };
-
-    SDValue Op = N->getOperand(0);
-
-    // We need to first find the inner level of TRUNCATE_VECTOR_VL node
-    // to distinguish such pattern.
-    while (IsTruncNode(Op)) {
-      if (!Op.hasOneUse())
-        return SDValue();
-      Op = Op.getOperand(0);
-    }
-
-    if (Op.getOpcode() == ISD::SRA && Op.hasOneUse()) {
-      SDValue N0 = Op.getOperand(0);
-      SDValue N1 = Op.getOperand(1);
-      if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() &&
-          N1.getOpcode() == ISD::ZERO_EXTEND && N1.hasOneUse()) {
-        SDValue N00 = N0.getOperand(0);
-        SDValue N10 = N1.getOperand(0);
-        if (N00.getValueType().isVector() &&
-            N00.getValueType() == N10.getValueType() &&
-            N->getValueType(0) == N10.getValueType()) {
-          unsigned MaxShAmt = N10.getValueType().getScalarSizeInBits() - 1;
-          SDValue SMin = DAG.getNode(
-              ISD::SMIN, SDLoc(N1), N->getValueType(0), N10,
-              DAG.getConstant(MaxShAmt, SDLoc(N1), N->getValueType(0)));
-          return DAG.getNode(ISD::SRA, SDLoc(N), N->getValueType(0), N00, SMin);
-        }
-      }
-    }
-    break;
-  }
+  case RISCVISD::TRUNCATE_VECTOR_VL:
+    return combineTruncOfSraSext(N, DAG);
   case ISD::TRUNCATE:
     return performTRUNCATECombine(N, DAG, Subtarget);
   case ISD::SELECT:

From 00bd2fa1982f3114536323209fffad909463effc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com>
Date: Tue, 28 May 2024 14:57:13 -0700
Subject: [PATCH 022/230] [flang][cuda] Add bind c to cudadevice procedures
 (#92822)

This patch adds bind(c) names to the functions and subroutines in cudadevice
so they can be lowered without hitting the intrinsic procedure TODOs.
---
 flang/module/cudadevice.f90                | 16 +++++-----
 flang/test/Lower/CUDA/cuda-device-proc.cuf | 36 ++++++++++++++++++++++
 2 files changed, 44 insertions(+), 8 deletions(-)
 create mode 100644 flang/test/Lower/CUDA/cuda-device-proc.cuf

diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index f34820dd10792a..0224ecfdde7c60 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -18,34 +18,34 @@ module cudadevice
   ! Synchronization Functions
 
   interface
-    attributes(device) subroutine syncthreads()
+    attributes(device) subroutine syncthreads() bind(c, name='__syncthreads')
     end subroutine
   end interface
   public :: syncthreads
 
   interface
-    attributes(device) integer function syncthreads_and(value)
+    attributes(device) integer function syncthreads_and(value) bind(c, name='__syncthreads_and')
       integer :: value
     end function
   end interface
   public :: syncthreads_and
 
   interface
-    attributes(device) integer function syncthreads_count(value)
+    attributes(device) integer function syncthreads_count(value) bind(c, name='__syncthreads_count')
       integer :: value
     end function
   end interface
   public :: syncthreads_count
 
   interface
-    attributes(device) integer function syncthreads_or(value)
+    attributes(device) integer function syncthreads_or(value) bind(c, name='__syncthreads_or')
       integer :: value
     end function
   end interface
   public :: syncthreads_or
 
   interface
-    attributes(device) subroutine syncwarp(mask)
+    attributes(device) subroutine syncwarp(mask) bind(c, name='__syncwarp')
       integer :: mask
     end subroutine
   end interface
@@ -54,19 +54,19 @@ attributes(device) subroutine syncwarp(mask)
   ! Memory Fences
 
   interface
-    attributes(device) subroutine threadfence()
+    attributes(device) subroutine threadfence() bind(c, name='__threadfence')
     end subroutine
   end interface
   public :: threadfence
 
   interface
-    attributes(device) subroutine threadfence_block()
+    attributes(device) subroutine threadfence_block() bind(c, name='__threadfence_block')
     end subroutine
   end interface
   public :: threadfence_block
 
   interface
-    attributes(device) subroutine threadfence_system()
+    attributes(device) subroutine threadfence_system() bind(c, name='__threadfence_system')
     end subroutine
   end interface
   public :: threadfence_system
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
new file mode 100644
index 00000000000000..0c71ea6efcd632
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -0,0 +1,36 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Test CUDA Fortran procedures available in cudadevice module
+
+attributes(global) subroutine devsub()
+  implicit none
+  integer :: ret
+
+  call syncthreads()
+  call syncwarp(1)
+  call threadfence()
+  call threadfence_block()
+  call threadfence_system()
+  ret = syncthreads_and(1)
+  ret = syncthreads_count(1)
+  ret = syncthreads_or(1)
+end
+
+! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
+! CHECK: fir.call @__syncthreads()
+! CHECK: fir.call @__syncwarp(%{{.*}}) fastmath<contract> : (!fir.ref<i32>) -> ()
+! CHECK: fir.call @__threadfence()
+! CHECK: fir.call @__threadfence_block()
+! CHECK: fir.call @__threadfence_system()
+! CHECK: %{{.*}} = fir.call @__syncthreads_and(%{{.*}}) fastmath<contract> : (!fir.ref<i32>) -> i32
+! CHECK: %{{.*}} = fir.call @__syncthreads_count(%{{.*}}) fastmath<contract> : (!fir.ref<i32>) -> i32
+! CHECK: %{{.*}} = fir.call @__syncthreads_or(%{{.*}}) fastmath<contract> : (!fir.ref<i32>) -> i32
+
+! CHECK: func.func private @__syncthreads() attributes {cuf.proc_attr = #cuf.cuda_proc<device>, fir.bindc_name = "__syncthreads"}
+! CHECK: func.func private @__syncwarp(!fir.ref<i32> {cuf.data_attr = #cuf.cuda<device>}) attributes {cuf.proc_attr = #cuf.cuda_proc<device>, fir.bindc_name = "__syncwarp"}
+! CHECK: func.func private @__threadfence() attributes {cuf.proc_attr = #cuf.cuda_proc<device>, fir.bindc_name = "__threadfence"}
+! CHECK: func.func private @__threadfence_block() attributes {cuf.proc_attr = #cuf.cuda_proc<device>, fir.bindc_name = "__threadfence_block"}
+! CHECK: func.func private @__threadfence_system() attributes {cuf.proc_attr = #cuf.cuda_proc<device>, fir.bindc_name = "__threadfence_system"}
+! CHECK: func.func private @__syncthreads_and(!fir.ref<i32> {cuf.data_attr = #cuf.cuda<device>}) -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc<device>, fir.bindc_name = "__syncthreads_and"}
+! CHECK: func.func private @__syncthreads_count(!fir.ref<i32> {cuf.data_attr = #cuf.cuda<device>}) -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc<device>, fir.bindc_name = "__syncthreads_count"}
+! CHECK: func.func private @__syncthreads_or(!fir.ref<i32> {cuf.data_attr = #cuf.cuda<device>}) -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc<device>, fir.bindc_name = "__syncthreads_or"}

From 2d00c6fe06b6d709b4ab3d6b253df304c04e0c1f Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Tue, 28 May 2024 15:05:23 -0700
Subject: [PATCH 023/230] [RISCV] Add a rematerializable pseudo instruction for
 LUI+ADDI for global addresses. (#93352)

This allows register allocation to rematerialize these instead of
spilling and reloading. We need to make it a single instruction due to
limitations in rematerialization.

This pseudo is expanded to an LUI+ADDI pair between regalloc and post RA
scheduling.

This improves the dynamic instruction count on 531.deepsjeng_r from
spec2017 by 3.2% for the train dataset. 500.perlbench and 502.gcc see a
1% improvement. There are a couple of regressions, but they are 0.1% or
smaller.

AArch64 has similar pseudo instructions, such as MOVaddr.
---
 llvm/lib/Target/RISCV/RISCVInstrInfo.td       |  20 ++
 .../lib/Target/RISCV/RISCVMergeBaseOffset.cpp |  35 ++-
 .../RISCV/RISCVPostRAExpandPseudoInsts.cpp    |  23 ++
 llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll    |  22 +-
 .../CodeGen/RISCV/ctz_zero_return_test.ll     |   8 +-
 .../early-clobber-tied-def-subreg-liveness.ll |  14 +-
 .../test/CodeGen/RISCV/fold-addi-loadstore.ll |   4 +-
 llvm/test/CodeGen/RISCV/rv32xtheadbb.ll       |   4 +-
 llvm/test/CodeGen/RISCV/rv32zbb.ll            |   4 +-
 .../CodeGen/RISCV/rvv/active_lane_mask.ll     |  40 +--
 .../CodeGen/RISCV/rvv/fixed-vectors-int.ll    |   4 +-
 .../rvv/fixed-vectors-interleaved-access.ll   | 275 +++++++++---------
 .../RISCV/rvv/fixed-vectors-mask-buildvec.ll  |  20 +-
 .../RISCV/rvv/fixed-vectors-masked-gather.ll  |  16 +-
 .../rvv/fixed-vectors-shuffle-reverse.ll      |  80 ++---
 .../RISCV/rvv/fixed-vectors-stepvector.ll     |  10 +-
 .../test/CodeGen/RISCV/rvv/shuffle-reverse.ll |  50 ++--
 llvm/test/CodeGen/RISCV/tail-calls.ll         |   8 +-
 llvm/test/CodeGen/RISCV/unroll-loop-cse.ll    |  32 +-
 19 files changed, 358 insertions(+), 311 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index ce50fe6e2cbb02..a1b078910e29c9 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1311,6 +1311,26 @@ def : Pat<(FrameAddrRegImm (iPTR GPR:$rs1), simm12:$imm12),
 
 /// HI and ADD_LO address nodes.
 
+// Pseudo for a rematerializable LUI+ADDI sequence for loading an address.
+// It will be expanded after register allocation.
+// FIXME: The scheduling information does not reflect the multiple instructions.
+let Size = 8, isReMaterializable = 1 in
+def PseudoMovAddr : Pseudo<(outs GPR:$dst), (ins uimm20_lui:$hi, simm12:$lo), []>,
+                    Sched<[WriteIALU]>;
+
+def riscv_hi_oneuse : unop_oneuse<riscv_hi>;
+def addr_hi_lo : PatFrag<(ops node:$hi, node:$lo),
+                         (riscv_add_lo (riscv_hi_oneuse node:$hi), node:$lo)>;
+
+def : Pat<(addr_hi_lo tglobaladdr:$hi, tglobaladdr:$lo),
+          (PseudoMovAddr tglobaladdr:$hi, tglobaladdr:$lo)>;
+def : Pat<(addr_hi_lo tblockaddress:$hi, tblockaddress:$lo),
+          (PseudoMovAddr tblockaddress:$hi, tblockaddress:$lo)>;
+def : Pat<(addr_hi_lo tjumptable:$hi, tjumptable:$lo),
+          (PseudoMovAddr tjumptable:$hi, tjumptable:$lo)>;
+def : Pat<(addr_hi_lo tconstpool:$hi, tconstpool:$lo),
+          (PseudoMovAddr tconstpool:$hi, tconstpool:$lo)>;
+
 def : Pat<(riscv_hi tglobaladdr:$in), (LUI tglobaladdr:$in)>;
 def : Pat<(riscv_hi tblockaddress:$in), (LUI tblockaddress:$in)>;
 def : Pat<(riscv_hi tjumptable:$in), (LUI tjumptable:$in)>;
diff --git a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
index 410989177a8b9c..fecc83a821f420 100644
--- a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
+++ b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
@@ -84,7 +84,8 @@ INITIALIZE_PASS(RISCVMergeBaseOffsetOpt, DEBUG_TYPE,
 //    3) The offset value in the Global Address or Constant Pool is 0.
 bool RISCVMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi,
                                              MachineInstr *&Lo) {
-  if (Hi.getOpcode() != RISCV::LUI && Hi.getOpcode() != RISCV::AUIPC)
+  if (Hi.getOpcode() != RISCV::LUI && Hi.getOpcode() != RISCV::AUIPC &&
+      Hi.getOpcode() != RISCV::PseudoMovAddr)
     return false;
 
   const MachineOperand &HiOp1 = Hi.getOperand(1);
@@ -97,16 +98,22 @@ bool RISCVMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi,
       HiOp1.getOffset() != 0)
     return false;
 
-  Register HiDestReg = Hi.getOperand(0).getReg();
-  if (!MRI->hasOneUse(HiDestReg))
-    return false;
+  if (Hi.getOpcode() == RISCV::PseudoMovAddr) {
+    // Most of the code should handle it correctly without modification if
+    // both Lo and Hi are set to point to the PseudoMovAddr.
+    Lo = &Hi;
+  } else {
+    Register HiDestReg = Hi.getOperand(0).getReg();
+    if (!MRI->hasOneUse(HiDestReg))
+      return false;
 
-  Lo = &*MRI->use_instr_begin(HiDestReg);
-  if (Lo->getOpcode() != RISCV::ADDI)
-    return false;
+    Lo = &*MRI->use_instr_begin(HiDestReg);
+    if (Lo->getOpcode() != RISCV::ADDI)
+      return false;
+  }
 
   const MachineOperand &LoOp2 = Lo->getOperand(2);
-  if (Hi.getOpcode() == RISCV::LUI) {
+  if (Hi.getOpcode() == RISCV::LUI || Hi.getOpcode() == RISCV::PseudoMovAddr) {
     if (LoOp2.getTargetFlags() != RISCVII::MO_LO ||
         !(LoOp2.isGlobal() || LoOp2.isCPI() || LoOp2.isBlockAddress()) ||
         LoOp2.getOffset() != 0)
@@ -466,6 +473,13 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi,
 
   Hi.getOperand(1).setOffset(NewOffset);
   MachineOperand &ImmOp = Lo.getOperand(2);
+  // Expand PseudoMovAddr into LUI
+  if (Hi.getOpcode() == RISCV::PseudoMovAddr) {
+    auto *TII = ST->getInstrInfo();
+    Hi.setDesc(TII->get(RISCV::LUI));
+    Hi.removeOperand(2);
+  }
+
   if (Hi.getOpcode() != RISCV::AUIPC)
     ImmOp.setOffset(NewOffset);
 
@@ -501,6 +515,11 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi,
     }
   }
 
+  // Prevent Lo (originally PseudoMovAddr, which is also pointed to by Hi)
+  // from being erased.
+  if (&Lo == &Hi)
+    return true;
+
   MRI->replaceRegWith(Lo.getOperand(0).getReg(), Hi.getOperand(0).getReg());
   Lo.eraseFromParent();
   return true;
diff --git a/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp
index 52f2ce27164d6e..b7b0c47c084c64 100644
--- a/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp
@@ -44,6 +44,7 @@ class RISCVPostRAExpandPseudo : public MachineFunctionPass {
   bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                 MachineBasicBlock::iterator &NextMBBI);
   bool expandMovImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
+  bool expandMovAddr(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
 };
 
 char RISCVPostRAExpandPseudo::ID = 0;
@@ -75,6 +76,8 @@ bool RISCVPostRAExpandPseudo::expandMI(MachineBasicBlock &MBB,
   switch (MBBI->getOpcode()) {
   case RISCV::PseudoMovImm:
     return expandMovImm(MBB, MBBI);
+  case RISCV::PseudoMovAddr:
+    return expandMovAddr(MBB, MBBI);
   default:
     return false;
   }
@@ -101,6 +104,26 @@ bool RISCVPostRAExpandPseudo::expandMovImm(MachineBasicBlock &MBB,
   return true;
 }
 
+bool RISCVPostRAExpandPseudo::expandMovAddr(MachineBasicBlock &MBB,
+                                            MachineBasicBlock::iterator MBBI) {
+  DebugLoc DL = MBBI->getDebugLoc();
+
+  Register DstReg = MBBI->getOperand(0).getReg();
+  bool DstIsDead = MBBI->getOperand(0).isDead();
+  bool Renamable = MBBI->getOperand(0).isRenamable();
+
+  BuildMI(MBB, MBBI, DL, TII->get(RISCV::LUI))
+      .addReg(DstReg, RegState::Define | getRenamableRegState(Renamable))
+      .add(MBBI->getOperand(1));
+  BuildMI(MBB, MBBI, DL, TII->get(RISCV::ADDI))
+      .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead) |
+                          getRenamableRegState(Renamable))
+      .addReg(DstReg, RegState::Kill | getRenamableRegState(Renamable))
+      .add(MBBI->getOperand(2));
+  MBBI->eraseFromParent();
+  return true;
+}
+
 } // end of anonymous namespace
 
 INITIALIZE_PASS(RISCVPostRAExpandPseudo, "riscv-expand-pseudolisimm32",
diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index 549d531e829ea5..a90c244437a033 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -383,8 +383,8 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    mv a1, s3
 ; RV32I-NEXT:    call __mulsi3
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV32I-NEXT:    addi s4, a0, %lo(.LCPI3_0)
+; RV32I-NEXT:    lui s4, %hi(.LCPI3_0)
+; RV32I-NEXT:    addi s4, s4, %lo(.LCPI3_0)
 ; RV32I-NEXT:    neg a0, s2
 ; RV32I-NEXT:    and a0, s2, a0
 ; RV32I-NEXT:    mv a1, s3
@@ -442,9 +442,9 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
 ; RV32M-LABEL: test_cttz_i64:
 ; RV32M:       # %bb.0:
 ; RV32M-NEXT:    lui a2, 30667
-; RV32M-NEXT:    addi a2, a2, 1329
-; RV32M-NEXT:    lui a3, %hi(.LCPI3_0)
-; RV32M-NEXT:    addi a3, a3, %lo(.LCPI3_0)
+; RV32M-NEXT:    addi a3, a2, 1329
+; RV32M-NEXT:    lui a2, %hi(.LCPI3_0)
+; RV32M-NEXT:    addi a2, a2, %lo(.LCPI3_0)
 ; RV32M-NEXT:    bnez a1, .LBB3_3
 ; RV32M-NEXT:  # %bb.1:
 ; RV32M-NEXT:    li a1, 32
@@ -452,18 +452,18 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
 ; RV32M-NEXT:  .LBB3_2:
 ; RV32M-NEXT:    neg a1, a0
 ; RV32M-NEXT:    and a0, a0, a1
-; RV32M-NEXT:    mul a0, a0, a2
+; RV32M-NEXT:    mul a0, a0, a3
 ; RV32M-NEXT:    srli a0, a0, 27
-; RV32M-NEXT:    add a0, a3, a0
+; RV32M-NEXT:    add a0, a2, a0
 ; RV32M-NEXT:    lbu a0, 0(a0)
 ; RV32M-NEXT:    li a1, 0
 ; RV32M-NEXT:    ret
 ; RV32M-NEXT:  .LBB3_3:
 ; RV32M-NEXT:    neg a4, a1
 ; RV32M-NEXT:    and a1, a1, a4
-; RV32M-NEXT:    mul a1, a1, a2
+; RV32M-NEXT:    mul a1, a1, a3
 ; RV32M-NEXT:    srli a1, a1, 27
-; RV32M-NEXT:    add a1, a3, a1
+; RV32M-NEXT:    add a1, a2, a1
 ; RV32M-NEXT:    lbu a1, 0(a1)
 ; RV32M-NEXT:    bnez a0, .LBB3_2
 ; RV32M-NEXT:  .LBB3_4:
@@ -814,8 +814,8 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind {
 ; RV32I-NEXT:    mv a1, s3
 ; RV32I-NEXT:    call __mulsi3
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lui a0, %hi(.LCPI7_0)
-; RV32I-NEXT:    addi s4, a0, %lo(.LCPI7_0)
+; RV32I-NEXT:    lui s4, %hi(.LCPI7_0)
+; RV32I-NEXT:    addi s4, s4, %lo(.LCPI7_0)
 ; RV32I-NEXT:    neg a0, s1
 ; RV32I-NEXT:    and a0, s1, a0
 ; RV32I-NEXT:    mv a1, s3
diff --git a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
index 9ae30e646fdbf7..fe6e20d852d590 100644
--- a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
+++ b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
@@ -48,8 +48,8 @@ define signext i32 @ctz_dereferencing_pointer(ptr %b) nounwind {
 ; RV32I-NEXT:    mv a1, s1
 ; RV32I-NEXT:    call __mulsi3
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lui a0, %hi(.LCPI0_0)
-; RV32I-NEXT:    addi s3, a0, %lo(.LCPI0_0)
+; RV32I-NEXT:    lui s3, %hi(.LCPI0_0)
+; RV32I-NEXT:    addi s3, s3, %lo(.LCPI0_0)
 ; RV32I-NEXT:    neg a0, s4
 ; RV32I-NEXT:    and a0, s4, a0
 ; RV32I-NEXT:    mv a1, s1
@@ -511,8 +511,8 @@ define signext i32 @ctz4(i64 %b) nounwind {
 ; RV32I-NEXT:    mv a1, s3
 ; RV32I-NEXT:    call __mulsi3
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lui a0, %hi(.LCPI6_0)
-; RV32I-NEXT:    addi s4, a0, %lo(.LCPI6_0)
+; RV32I-NEXT:    lui s4, %hi(.LCPI6_0)
+; RV32I-NEXT:    addi s4, s4, %lo(.LCPI6_0)
 ; RV32I-NEXT:    neg a0, s2
 ; RV32I-NEXT:    and a0, s2, a0
 ; RV32I-NEXT:    mv a1, s3
diff --git a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
index eb6ac985287a10..478d2eae9dca2c 100644
--- a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
+++ b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
@@ -24,31 +24,31 @@ define void @_Z3foov() {
 ; CHECK-NEXT:    lui a0, %hi(.L__const._Z3foov.var_49)
 ; CHECK-NEXT:    addi a0, a0, %lo(.L__const._Z3foov.var_49)
 ; CHECK-NEXT:    vsetivli zero, 2, e16, m2, ta, ma
-; CHECK-NEXT:    vle16.v v10, (a0)
+; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    lui a0, %hi(.L__const._Z3foov.var_48)
 ; CHECK-NEXT:    addi a0, a0, %lo(.L__const._Z3foov.var_48)
-; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vle8.v v10, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs1r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    lui a0, %hi(.L__const._Z3foov.var_46)
 ; CHECK-NEXT:    addi a0, a0, %lo(.L__const._Z3foov.var_46)
-; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    lui a0, %hi(.L__const._Z3foov.var_45)
 ; CHECK-NEXT:    addi a0, a0, %lo(.L__const._Z3foov.var_45)
-; CHECK-NEXT:    vle16.v v14, (a0)
+; CHECK-NEXT:    vle16.v v12, (a0)
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    vs2r.v v12, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    vs2r.v v14, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vs2r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    lui a0, %hi(.L__const._Z3foov.var_40)
diff --git a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
index 3c2e84689c979c..62b1549a5d58ad 100644
--- a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
+++ b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
@@ -389,8 +389,8 @@ define dso_local i32 @load_ga() local_unnamed_addr #0 {
 define dso_local i64 @load_ga_8() nounwind {
 ; RV32I-LABEL: load_ga_8:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lui a0, %hi(ga_8)
-; RV32I-NEXT:    addi a1, a0, %lo(ga_8)
+; RV32I-NEXT:    lui a1, %hi(ga_8)
+; RV32I-NEXT:    addi a1, a1, %lo(ga_8)
 ; RV32I-NEXT:    lw a0, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
 ; RV32I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
index b45ab135fa1c7c..197366e7e05fe8 100644
--- a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
@@ -209,8 +209,8 @@ define i64 @cttz_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    mv a1, s3
 ; RV32I-NEXT:    call __mulsi3
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV32I-NEXT:    addi s4, a0, %lo(.LCPI3_0)
+; RV32I-NEXT:    lui s4, %hi(.LCPI3_0)
+; RV32I-NEXT:    addi s4, s4, %lo(.LCPI3_0)
 ; RV32I-NEXT:    neg a0, s2
 ; RV32I-NEXT:    and a0, s2, a0
 ; RV32I-NEXT:    mv a1, s3
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index 7e6c3f9c87d277..f25aa0de89da88 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -199,8 +199,8 @@ define i64 @cttz_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    mv a1, s3
 ; RV32I-NEXT:    call __mulsi3
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV32I-NEXT:    addi s4, a0, %lo(.LCPI3_0)
+; RV32I-NEXT:    lui s4, %hi(.LCPI3_0)
+; RV32I-NEXT:    addi s4, s4, %lo(.LCPI3_0)
 ; RV32I-NEXT:    neg a0, s2
 ; RV32I-NEXT:    and a0, s2, a0
 ; RV32I-NEXT:    mv a1, s3
diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
index 9cb3991f31f94d..08b310213d16e1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
@@ -126,28 +126,28 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vid.v v8
 ; CHECK-NEXT:    vsaddu.vx v8, v8, a1
-; CHECK-NEXT:    vmsltu.vx v0, v8, a2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI9_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI9_0)
-; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vle8.v v16, (a0)
+; CHECK-NEXT:    vmsltu.vx v0, v8, a2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI9_1)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI9_1)
-; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v24, v16
+; CHECK-NEXT:    vsaddu.vx v16, v24, a1
+; CHECK-NEXT:    vmsltu.vx v9, v16, a2
 ; CHECK-NEXT:    vsext.vf8 v16, v8
 ; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v8, v16, a2
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
 ; CHECK-NEXT:    lui a0, %hi(.LCPI9_2)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI9_2)
-; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    vmsltu.vx v10, v16, a2
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v8, 2
+; CHECK-NEXT:    vslideup.vi v0, v9, 2
 ; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v0, v10, 4
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vsext.vf8 v16, v9
+; CHECK-NEXT:    vsext.vf8 v16, v8
 ; CHECK-NEXT:    vsaddu.vx v8, v16, a1
 ; CHECK-NEXT:    vmsltu.vx v16, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
@@ -169,13 +169,13 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-NEXT:    vle8.v v9, (a0)
 ; CHECK-NEXT:    vsext.vf8 v16, v8
 ; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v10, v16, a2
+; CHECK-NEXT:    vmsltu.vx v8, v16, a2
 ; CHECK-NEXT:    vsext.vf8 v16, v9
 ; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v8, v16, a2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_2)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_2)
 ; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vmsltu.vx v10, v16, a2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_3)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_3)
 ; CHECK-NEXT:    vle8.v v11, (a0)
@@ -187,10 +187,10 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-NEXT:    vmsltu.vx v11, v16, a2
 ; CHECK-NEXT:    vid.v v16
 ; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v0, v16, a2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_4)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_4)
 ; CHECK-NEXT:    vle8.v v12, (a0)
+; CHECK-NEXT:    vmsltu.vx v0, v16, a2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_5)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_5)
 ; CHECK-NEXT:    vle8.v v13, (a0)
@@ -201,27 +201,27 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-NEXT:    vsaddu.vx v16, v16, a1
 ; CHECK-NEXT:    vmsltu.vx v13, v16, a2
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 2
+; CHECK-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 4
+; CHECK-NEXT:    vslideup.vi v10, v9, 4
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_6)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_6)
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v11, 6
+; CHECK-NEXT:    vslideup.vi v10, v11, 6
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v0, v12, 2
 ; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v0, v13, 4
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vsext.vf8 v16, v9
+; CHECK-NEXT:    vsext.vf8 v16, v8
 ; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v9, v16, a2
+; CHECK-NEXT:    vmsltu.vx v8, v16, a2
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v0, v9, 6
+; CHECK-NEXT:    vslideup.vi v0, v8, 6
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v0, v8, 8
+; CHECK-NEXT:    vslideup.vi v0, v10, 8
 ; CHECK-NEXT:    ret
   %mask = call <128 x i1> @llvm.get.active.lane.mask.v128i1.i64(i64 %index, i64 %tc)
   ret <128 x i1> %mask
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
index 79c36a629465d9..f4d7074c7f6b27 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
@@ -3459,6 +3459,8 @@ define void @mulhu_v4i64(ptr %x) {
 ; RV64-NEXT:    lui a1, %hi(.LCPI184_0)
 ; RV64-NEXT:    addi a1, a1, %lo(.LCPI184_0)
 ; RV64-NEXT:    vle64.v v10, (a1)
+; RV64-NEXT:    vmulhu.vv v10, v8, v10
+; RV64-NEXT:    vsub.vv v8, v8, v10
 ; RV64-NEXT:    li a1, -1
 ; RV64-NEXT:    slli a1, a1, 63
 ; RV64-NEXT:    vmv.s.x v12, a1
@@ -3466,8 +3468,6 @@ define void @mulhu_v4i64(ptr %x) {
 ; RV64-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
 ; RV64-NEXT:    vslideup.vi v14, v12, 2
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmulhu.vv v10, v8, v10
-; RV64-NEXT:    vsub.vv v8, v8, v10
 ; RV64-NEXT:    vmulhu.vv v8, v8, v14
 ; RV64-NEXT:    vadd.vv v8, v8, v10
 ; RV64-NEXT:    lui a1, 12320
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 178a920169ad96..bc3e135a588a6f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -159,17 +159,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 82
+; RV32-NEXT:    li a3, 80
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd2, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 82 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 80 * vlenb
 ; RV32-NEXT:    addi a3, a1, 256
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    vle32.v v16, (a3)
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 57
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 6
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
@@ -177,26 +176,26 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vslideup.vi v8, v16, 4
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 41
+; RV32-NEXT:    li a5, 40
 ; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
 ; RV32-NEXT:    vs4r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a4, 12
-; RV32-NEXT:    vmv.s.x v1, a4
+; RV32-NEXT:    vmv.s.x v0, a4
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v16, v16, 16
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a5, a4, 6
-; RV32-NEXT:    add a4, a5, a4
+; RV32-NEXT:    li a5, 56
+; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
 ; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vmv1r.v v3, v0
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; RV32-NEXT:    vslideup.vi v8, v16, 10, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 45
+; RV32-NEXT:    li a5, 44
 ; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
@@ -206,8 +205,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
 ; RV32-NEXT:    vle16.v v8, (a4)
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a5, a4, 5
-; RV32-NEXT:    add a4, a5, a4
+; RV32-NEXT:    slli a4, a4, 5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
 ; RV32-NEXT:    vs4r.v v8, (a4) # Unknown-size Folded Spill
@@ -216,21 +214,21 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    lui a5, 1
 ; RV32-NEXT:    vle16.v v8, (a4)
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a6, 25
+; RV32-NEXT:    li a6, 24
 ; RV32-NEXT:    mul a4, a4, a6
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
 ; RV32-NEXT:    vs4r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vle32.v v8, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a4, 73
+; RV32-NEXT:    li a4, 72
 ; RV32-NEXT:    mul a1, a1, a4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vle32.v v24, (a3)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 49
+; RV32-NEXT:    li a3, 48
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -238,27 +236,26 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi a1, a5, -64
 ; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 37
+; RV32-NEXT:    li a3, 36
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 5
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v4, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vrgatherei16.vv v16, v8, v4
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 25
+; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vrgatherei16.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 45
+; RV32-NEXT:    li a3, 44
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -266,259 +263,257 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v8, v16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 45
+; RV32-NEXT:    li a3, 44
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 57
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 6
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; RV32-NEXT:    vslideup.vi v12, v8, 2
+; RV32-NEXT:    vmv1r.v v8, v3
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 21
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs1r.v v1, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vs1r.v v3, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vmv1r.v v0, v3
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 6
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vslideup.vi v12, v16, 8, v0.t
-; RV32-NEXT:    vmv.v.v v20, v12
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_2)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_2)
 ; RV32-NEXT:    lui a3, %hi(.LCPI6_3)
 ; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_3)
-; RV32-NEXT:    lui a4, %hi(.LCPI6_4)
 ; RV32-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; RV32-NEXT:    vle16.v v4, (a1)
-; RV32-NEXT:    vle16.v v16, (a3)
-; RV32-NEXT:    addi a1, a4, %lo(.LCPI6_4)
+; RV32-NEXT:    vle16.v v0, (a1)
+; RV32-NEXT:    vle16.v v4, (a3)
+; RV32-NEXT:    lui a1, %hi(.LCPI6_4)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_4)
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT:    vle16.v v2, (a1)
+; RV32-NEXT:    vle16.v v10, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 73
+; RV32-NEXT:    li a3, 72
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vrgatherei16.vv v24, v8, v4
+; RV32-NEXT:    vrgatherei16.vv v24, v16, v0
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 37
+; RV32-NEXT:    li a3, 36
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 49
+; RV32-NEXT:    li a3, 48
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v24, v16, v4, v0.t
 ; RV32-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
-; RV32-NEXT:    vmv.v.v v20, v24
+; RV32-NEXT:    vmv.v.v v12, v24
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 37
+; RV32-NEXT:    li a3, 36
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v20, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 57
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 6
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vrgatherei16.vv v16, v24, v2
-; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vrgatherei16.vv v12, v24, v10
+; RV32-NEXT:    vmv1r.v v0, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 6
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vslideup.vi v16, v8, 6, v0.t
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vslideup.vi v12, v24, 6, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 5
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_5)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_5)
 ; RV32-NEXT:    lui a3, %hi(.LCPI6_6)
 ; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_6)
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vle16.v v16, (a1)
-; RV32-NEXT:    vle16.v v4, (a3)
-; RV32-NEXT:    li a1, 960
-; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vle16.v v12, (a1)
+; RV32-NEXT:    vle16.v v8, (a3)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 13
+; RV32-NEXT:    li a3, 12
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    li a1, 960
+; RV32-NEXT:    vmv.s.x v8, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 73
+; RV32-NEXT:    li a3, 72
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v24, v16
+; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v24, v0, v12
+; RV32-NEXT:    vmv1r.v v3, v8
+; RV32-NEXT:    vmv1r.v v0, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 49
+; RV32-NEXT:    li a3, 12
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v16, v4, v0.t
+; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v24, v16, v8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 25
+; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_7)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_7)
 ; RV32-NEXT:    lui a3, %hi(.LCPI6_8)
 ; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_8)
-; RV32-NEXT:    lui a4, %hi(.LCPI6_9)
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV32-NEXT:    vle16.v v8, (a1)
-; RV32-NEXT:    addi a1, a4, %lo(.LCPI6_9)
+; RV32-NEXT:    lui a1, %hi(.LCPI6_9)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_9)
 ; RV32-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; RV32-NEXT:    vle16.v v24, (a3)
-; RV32-NEXT:    vle16.v v28, (a1)
+; RV32-NEXT:    vle16.v v4, (a3)
+; RV32-NEXT:    vle16.v v12, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 57
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vrgatherei16.vv v4, v0, v8
+; RV32-NEXT:    vrgatherei16.vv v12, v24, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 21
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 6
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vslideup.vi v4, v8, 4, v0.t
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vmv4r.v v24, v16
+; RV32-NEXT:    vslideup.vi v12, v16, 4, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 21
+; RV32-NEXT:    li a3, 12
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v4, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 73
+; RV32-NEXT:    li a3, 72
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vrgatherei16.vv v8, v0, v24
+; RV32-NEXT:    vrgatherei16.vv v8, v16, v4
+; RV32-NEXT:    vmv1r.v v0, v3
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 13
+; RV32-NEXT:    li a3, 48
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl4r.v v28, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vrgatherei16.vv v8, v16, v28, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 13
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_10)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_10)
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vle16.v v8, (a1)
+; RV32-NEXT:    vle16.v v12, (a1)
 ; RV32-NEXT:    lui a1, 15
 ; RV32-NEXT:    vmv.s.x v3, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 57
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 6
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vslideup.vi v12, v16, 6
+; RV32-NEXT:    vslideup.vi v8, v16, 6
 ; RV32-NEXT:    vmv1r.v v0, v3
+; RV32-NEXT:    vrgatherei16.vv v8, v24, v12, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 6
-; RV32-NEXT:    add a1, a3, a1
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v12, v16, v8, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 57
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_11)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_11)
 ; RV32-NEXT:    lui a3, %hi(.LCPI6_12)
 ; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_12)
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vle16.v v8, (a1)
-; RV32-NEXT:    vle16.v v12, (a3)
+; RV32-NEXT:    vle16.v v24, (a1)
+; RV32-NEXT:    vle16.v v4, (a3)
 ; RV32-NEXT:    li a1, 1008
 ; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 73
+; RV32-NEXT:    li a3, 72
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v24, v16, v8
+; RV32-NEXT:    vrgatherei16.vv v8, v16, v24
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 49
+; RV32-NEXT:    li a3, 48
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v24, v16, v12, v0.t
+; RV32-NEXT:    vrgatherei16.vv v8, v16, v4, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 2
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    slli a1, a1, 6
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_13)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_13)
 ; RV32-NEXT:    lui a3, %hi(.LCPI6_14)
 ; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_14)
-; RV32-NEXT:    lui a4, %hi(.LCPI6_15)
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV32-NEXT:    vle16.v v20, (a1)
-; RV32-NEXT:    addi a1, a4, %lo(.LCPI6_15)
+; RV32-NEXT:    lui a1, %hi(.LCPI6_15)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_15)
 ; RV32-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; RV32-NEXT:    vle16.v v24, (a3)
 ; RV32-NEXT:    vle16.v v8, (a1)
@@ -526,27 +521,26 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vmv1r.v v0, v3
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 41
+; RV32-NEXT:    li a3, 40
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 6
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; RV32-NEXT:    vrgatherei16.vv v16, v8, v20, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 5
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v20, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 25
+; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -554,7 +548,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v20, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 73
+; RV32-NEXT:    li a3, 72
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -562,12 +556,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
 ; RV32-NEXT:    vrgatherei16.vv v8, v0, v24
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 49
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -576,31 +570,28 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vl4r.v v4, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vrgatherei16.vv v8, v24, v4, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 21
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 13
+; RV32-NEXT:    li a2, 12
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl4r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v24, v0
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 57
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 6
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v28, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a2, a1, 2
-; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    slli a1, a1, 2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl4r.v v28, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmv.v.v v28, v0
 ; RV32-NEXT:    vmv.v.v v16, v8
 ; RV32-NEXT:    addi a1, a0, 320
@@ -614,21 +605,21 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vse32.v v20, (a1)
 ; RV32-NEXT:    addi a1, a0, 64
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 37
+; RV32-NEXT:    li a3, 36
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 45
+; RV32-NEXT:    li a2, 44
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a0)
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 82
+; RV32-NEXT:    li a1, 80
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll
index 17483151869365..7608349ef7aeff 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll
@@ -549,20 +549,20 @@ define <128 x i1> @buildvec_mask_v128i1() {
 define <128 x i1> @buildvec_mask_optsize_v128i1() optsize {
 ; CHECK-LABEL: buildvec_mask_optsize_v128i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI21_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI21_0)
-; CHECK-NEXT:    li a1, 128
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vlm.v v0, (a0)
+; CHECK-NEXT:    li a0, 128
+; CHECK-NEXT:    lui a1, %hi(.LCPI21_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI21_0)
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    ret
 ;
 ; ZVE32F-LABEL: buildvec_mask_optsize_v128i1:
 ; ZVE32F:       # %bb.0:
-; ZVE32F-NEXT:    lui a0, %hi(.LCPI21_0)
-; ZVE32F-NEXT:    addi a0, a0, %lo(.LCPI21_0)
-; ZVE32F-NEXT:    li a1, 128
-; ZVE32F-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; ZVE32F-NEXT:    vlm.v v0, (a0)
+; ZVE32F-NEXT:    li a0, 128
+; ZVE32F-NEXT:    lui a1, %hi(.LCPI21_0)
+; ZVE32F-NEXT:    addi a1, a1, %lo(.LCPI21_0)
+; ZVE32F-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; ZVE32F-NEXT:    vlm.v v0, (a1)
 ; ZVE32F-NEXT:    ret
   ret <128 x i1> <i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index db0969c85a8e24..69341981288b91 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -13327,22 +13327,22 @@ define <8 x i16> @mgather_shuffle_rotate(ptr %base) {
 define <8 x i16> @mgather_shuffle_vrgather(ptr %base) {
 ; RV32-LABEL: mgather_shuffle_vrgather:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    lui a1, %hi(.LCPI119_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI119_0)
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT:    vle16.v v9, (a0)
-; RV32-NEXT:    lui a0, %hi(.LCPI119_0)
-; RV32-NEXT:    addi a0, a0, %lo(.LCPI119_0)
+; RV32-NEXT:    vle16.v v9, (a1)
 ; RV32-NEXT:    vle16.v v10, (a0)
-; RV32-NEXT:    vrgather.vv v8, v9, v10
+; RV32-NEXT:    vrgather.vv v8, v10, v9
 ; RV32-NEXT:    ret
 ;
 ; RV64V-LABEL: mgather_shuffle_vrgather:
 ; RV64V:       # %bb.0:
+; RV64V-NEXT:    lui a1, %hi(.LCPI119_0)
+; RV64V-NEXT:    addi a1, a1, %lo(.LCPI119_0)
 ; RV64V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; RV64V-NEXT:    vle16.v v9, (a0)
-; RV64V-NEXT:    lui a0, %hi(.LCPI119_0)
-; RV64V-NEXT:    addi a0, a0, %lo(.LCPI119_0)
+; RV64V-NEXT:    vle16.v v9, (a1)
 ; RV64V-NEXT:    vle16.v v10, (a0)
-; RV64V-NEXT:    vrgather.vv v8, v9, v10
+; RV64V-NEXT:    vrgather.vv v8, v10, v9
 ; RV64V-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_shuffle_vrgather:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
index d70ed2fb0e2665..4b1f0beb487008 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
@@ -228,11 +228,11 @@ define <16 x i8> @reverse_v16i8(<16 x i8> %a) {
 define <32 x i8> @reverse_v32i8(<32 x i8> %a) {
 ; CHECK-LABEL: reverse_v32i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI12_0)
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vle8.v v12, (a0)
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI12_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI12_0)
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vle8.v v12, (a1)
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -243,11 +243,11 @@ define <32 x i8> @reverse_v32i8(<32 x i8> %a) {
 define <64 x i8> @reverse_v64i8(<64 x i8> %a) {
 ; CHECK-LABEL: reverse_v64i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI13_0)
-; CHECK-NEXT:    li a1, 64
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vle8.v v16, (a0)
+; CHECK-NEXT:    li a0, 64
+; CHECK-NEXT:    lui a1, %hi(.LCPI13_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI13_0)
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vle8.v v16, (a1)
 ; CHECK-NEXT:    vrgather.vv v12, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
@@ -323,11 +323,11 @@ define <16 x i16> @reverse_v16i16(<16 x i16> %a) {
 define <32 x i16> @reverse_v32i16(<32 x i16> %a) {
 ; CHECK-LABEL: reverse_v32i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI19_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI19_0)
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vle8.v v12, (a0)
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI19_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI19_0)
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vle8.v v12, (a1)
 ; CHECK-NEXT:    vsext.vf2 v16, v12
 ; CHECK-NEXT:    vrgather.vv v12, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v12
@@ -520,11 +520,11 @@ define <16 x half> @reverse_v16f16(<16 x half> %a) {
 define <32 x half> @reverse_v32f16(<32 x half> %a) {
 ; CHECK-LABEL: reverse_v32f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI34_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI34_0)
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vle8.v v12, (a0)
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI34_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI34_0)
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vle8.v v12, (a1)
 ; CHECK-NEXT:    vsext.vf2 v16, v12
 ; CHECK-NEXT:    vrgather.vv v12, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v12
@@ -820,33 +820,33 @@ define <6 x i64> @reverse_v6i64(<6 x i64> %a) {
 define <12 x i64> @reverse_v12i64(<12 x i64> %a) {
 ; RV32-BITS-UNKNOWN-LABEL: reverse_v12i64:
 ; RV32-BITS-UNKNOWN:       # %bb.0:
-; RV32-BITS-UNKNOWN-NEXT:    lui a0, %hi(.LCPI46_0)
-; RV32-BITS-UNKNOWN-NEXT:    addi a0, a0, %lo(.LCPI46_0)
-; RV32-BITS-UNKNOWN-NEXT:    li a1, 32
-; RV32-BITS-UNKNOWN-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-BITS-UNKNOWN-NEXT:    vle16.v v24, (a0)
+; RV32-BITS-UNKNOWN-NEXT:    li a0, 32
+; RV32-BITS-UNKNOWN-NEXT:    lui a1, %hi(.LCPI46_0)
+; RV32-BITS-UNKNOWN-NEXT:    addi a1, a1, %lo(.LCPI46_0)
+; RV32-BITS-UNKNOWN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV32-BITS-UNKNOWN-NEXT:    vle16.v v24, (a1)
 ; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v16, v8, v24
 ; RV32-BITS-UNKNOWN-NEXT:    vmv.v.v v8, v16
 ; RV32-BITS-UNKNOWN-NEXT:    ret
 ;
 ; RV32-BITS-256-LABEL: reverse_v12i64:
 ; RV32-BITS-256:       # %bb.0:
-; RV32-BITS-256-NEXT:    lui a0, %hi(.LCPI46_0)
-; RV32-BITS-256-NEXT:    addi a0, a0, %lo(.LCPI46_0)
-; RV32-BITS-256-NEXT:    li a1, 32
-; RV32-BITS-256-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-BITS-256-NEXT:    vle16.v v24, (a0)
+; RV32-BITS-256-NEXT:    li a0, 32
+; RV32-BITS-256-NEXT:    lui a1, %hi(.LCPI46_0)
+; RV32-BITS-256-NEXT:    addi a1, a1, %lo(.LCPI46_0)
+; RV32-BITS-256-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV32-BITS-256-NEXT:    vle16.v v24, (a1)
 ; RV32-BITS-256-NEXT:    vrgatherei16.vv v16, v8, v24
 ; RV32-BITS-256-NEXT:    vmv.v.v v8, v16
 ; RV32-BITS-256-NEXT:    ret
 ;
 ; RV32-BITS-512-LABEL: reverse_v12i64:
 ; RV32-BITS-512:       # %bb.0:
-; RV32-BITS-512-NEXT:    lui a0, %hi(.LCPI46_0)
-; RV32-BITS-512-NEXT:    addi a0, a0, %lo(.LCPI46_0)
-; RV32-BITS-512-NEXT:    li a1, 32
-; RV32-BITS-512-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-BITS-512-NEXT:    vle16.v v24, (a0)
+; RV32-BITS-512-NEXT:    li a0, 32
+; RV32-BITS-512-NEXT:    lui a1, %hi(.LCPI46_0)
+; RV32-BITS-512-NEXT:    addi a1, a1, %lo(.LCPI46_0)
+; RV32-BITS-512-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV32-BITS-512-NEXT:    vle16.v v24, (a1)
 ; RV32-BITS-512-NEXT:    vrgatherei16.vv v16, v8, v24
 ; RV32-BITS-512-NEXT:    vmv.v.v v8, v16
 ; RV32-BITS-512-NEXT:    ret
@@ -883,11 +883,11 @@ define <12 x i64> @reverse_v12i64(<12 x i64> %a) {
 ;
 ; RV32-ZVBB-LABEL: reverse_v12i64:
 ; RV32-ZVBB:       # %bb.0:
-; RV32-ZVBB-NEXT:    lui a0, %hi(.LCPI46_0)
-; RV32-ZVBB-NEXT:    addi a0, a0, %lo(.LCPI46_0)
-; RV32-ZVBB-NEXT:    li a1, 32
-; RV32-ZVBB-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-ZVBB-NEXT:    vle16.v v24, (a0)
+; RV32-ZVBB-NEXT:    li a0, 32
+; RV32-ZVBB-NEXT:    lui a1, %hi(.LCPI46_0)
+; RV32-ZVBB-NEXT:    addi a1, a1, %lo(.LCPI46_0)
+; RV32-ZVBB-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV32-ZVBB-NEXT:    vle16.v v24, (a1)
 ; RV32-ZVBB-NEXT:    vrgatherei16.vv v16, v8, v24
 ; RV32-ZVBB-NEXT:    vmv.v.v v8, v16
 ; RV32-ZVBB-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll
index 0161ac4bc338db..e2580c132f65e9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll
@@ -225,11 +225,11 @@ declare <16 x i64> @llvm.experimental.stepvector.v16i64()
 define <16 x i64> @stepvector_v16i64() {
 ; RV32-LABEL: stepvector_v16i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lui a0, %hi(.LCPI16_0)
-; RV32-NEXT:    addi a0, a0, %lo(.LCPI16_0)
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vle8.v v16, (a0)
+; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    lui a1, %hi(.LCPI16_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI16_0)
+; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT:    vle8.v v16, (a1)
 ; RV32-NEXT:    vsext.vf4 v8, v16
 ; RV32-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll
index 6e327457bebffc..368f454fa5fda1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll
@@ -106,11 +106,11 @@ define <16 x i8> @v16i8(<16 x i8> %a) {
 define <32 x i8> @v16i8_2(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: v16i8_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI7_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI7_0)
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vle8.v v12, (a0)
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI7_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI7_0)
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vle8.v v12, (a1)
 ; CHECK-NEXT:    vmv1r.v v14, v9
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vid.v v8
@@ -230,11 +230,11 @@ define <16 x i16> @v16i16(<16 x i16> %a) {
 define <32 x i16> @v16i16_2(<16 x i16> %a, <16 x i16> %b) {
 ; CHECK-LABEL: v16i16_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI15_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI15_0)
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vle16.v v16, (a0)
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI15_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI15_0)
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vle16.v v16, (a1)
 ; CHECK-NEXT:    vmv2r.v v20, v10
 ; CHECK-NEXT:    vmv2r.v v12, v8
 ; CHECK-NEXT:    vrgather.vv v8, v12, v16
@@ -363,11 +363,11 @@ define <16 x i32> @v16i32(<16 x i32> %a) {
 define <32 x i32> @v16i32_2(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: v16i32_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI23_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI23_0)
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle16.v v20, (a0)
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI23_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI23_0)
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vle16.v v20, (a1)
 ; CHECK-NEXT:    vmv4r.v v24, v12
 ; CHECK-NEXT:    vmv4r.v v16, v8
 ; CHECK-NEXT:    vrgatherei16.vv v8, v16, v20
@@ -548,11 +548,11 @@ define <16 x half> @v16f16(<16 x half> %a) {
 define <32 x half> @v16f16_2(<16 x half> %a) {
 ; CHECK-LABEL: v16f16_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI35_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI35_0)
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vle16.v v16, (a0)
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI35_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI35_0)
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vle16.v v16, (a1)
 ; CHECK-NEXT:    vrgather.vv v12, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
@@ -719,11 +719,11 @@ define <8 x double> @v4f64_2(<4 x double> %a, <4 x double> %b) {
 define <32 x i8> @v32i8(<32 x i8> %a) {
 ; CHECK-LABEL: v32i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI46_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI46_0)
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vle8.v v12, (a0)
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI46_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI46_0)
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vle8.v v12, (a1)
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/tail-calls.ll b/llvm/test/CodeGen/RISCV/tail-calls.ll
index 87d69bfad38c2b..d3e495bb723ad8 100644
--- a/llvm/test/CodeGen/RISCV/tail-calls.ll
+++ b/llvm/test/CodeGen/RISCV/tail-calls.ll
@@ -56,12 +56,12 @@ define void @caller_indirect_tail(i32 %a) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    beqz a0, .LBB3_2
 ; CHECK-NEXT:  # %bb.1: # %entry
-; CHECK-NEXT:    lui a0, %hi(callee_indirect2)
-; CHECK-NEXT:    addi t1, a0, %lo(callee_indirect2)
+; CHECK-NEXT:    lui t1, %hi(callee_indirect2)
+; CHECK-NEXT:    addi t1, t1, %lo(callee_indirect2)
 ; CHECK-NEXT:    jr t1
 ; CHECK-NEXT:  .LBB3_2:
-; CHECK-NEXT:    lui a0, %hi(callee_indirect1)
-; CHECK-NEXT:    addi t1, a0, %lo(callee_indirect1)
+; CHECK-NEXT:    lui t1, %hi(callee_indirect1)
+; CHECK-NEXT:    addi t1, t1, %lo(callee_indirect1)
 ; CHECK-NEXT:    jr t1
 
 
diff --git a/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll b/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll
index 2fd4572d234567..65307363048376 100644
--- a/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll
+++ b/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll
@@ -10,36 +10,30 @@
 define signext i32 @unroll_loop_cse() {
 ; CHECK-LABEL: unroll_loop_cse:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(x)
-; CHECK-NEXT:    lw a3, %lo(x)(a1)
-; CHECK-NEXT:    lui a2, %hi(check)
-; CHECK-NEXT:    lw a4, %lo(check)(a2)
+; CHECK-NEXT:    lui a0, %hi(x)
+; CHECK-NEXT:    lw a1, %lo(x)(a0)
+; CHECK-NEXT:    lui a0, %hi(check)
+; CHECK-NEXT:    lw a2, %lo(check)(a0)
 ; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    bne a3, a4, .LBB0_6
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    addi a1, a1, %lo(x)
-; CHECK-NEXT:    lw a1, 4(a1)
-; CHECK-NEXT:    addi a2, a2, %lo(check)
-; CHECK-NEXT:    lw a2, 4(a2)
 ; CHECK-NEXT:    bne a1, a2, .LBB0_6
-; CHECK-NEXT:  # %bb.2:
+; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    lui a1, %hi(x)
 ; CHECK-NEXT:    addi a1, a1, %lo(x)
-; CHECK-NEXT:    lw a3, 8(a1)
+; CHECK-NEXT:    lw a3, 4(a1)
 ; CHECK-NEXT:    lui a2, %hi(check)
 ; CHECK-NEXT:    addi a2, a2, %lo(check)
+; CHECK-NEXT:    lw a4, 4(a2)
+; CHECK-NEXT:    bne a3, a4, .LBB0_6
+; CHECK-NEXT:  # %bb.2:
+; CHECK-NEXT:    lw a3, 8(a1)
 ; CHECK-NEXT:    lw a4, 8(a2)
 ; CHECK-NEXT:    bne a3, a4, .LBB0_6
 ; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    lw a1, 12(a1)
-; CHECK-NEXT:    lw a2, 12(a2)
-; CHECK-NEXT:    bne a1, a2, .LBB0_6
+; CHECK-NEXT:    lw a3, 12(a1)
+; CHECK-NEXT:    lw a4, 12(a2)
+; CHECK-NEXT:    bne a3, a4, .LBB0_6
 ; CHECK-NEXT:  # %bb.4:
-; CHECK-NEXT:    lui a1, %hi(x)
-; CHECK-NEXT:    addi a1, a1, %lo(x)
 ; CHECK-NEXT:    lw a3, 16(a1)
-; CHECK-NEXT:    lui a2, %hi(check)
-; CHECK-NEXT:    addi a2, a2, %lo(check)
 ; CHECK-NEXT:    lw a4, 16(a2)
 ; CHECK-NEXT:    bne a3, a4, .LBB0_6
 ; CHECK-NEXT:  # %bb.5:

From 765206e050453018e861637a08a4520f29238074 Mon Sep 17 00:00:00 2001
From: gulfemsavrun <gulfem@google.com>
Date: Tue, 28 May 2024 15:06:11 -0700
Subject: [PATCH 024/230] [CodeGen] Hidden visibility for prof version var
 (#93496)

This patch adds hidden visibility to the variable
that is used by the single-byte counters mode in
source-based code coverage.
---
 clang/lib/CodeGen/CodeGenPGO.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
index 76704c4d7be4a4..db8e6f55302adc 100644
--- a/clang/lib/CodeGen/CodeGenPGO.cpp
+++ b/clang/lib/CodeGen/CodeGenPGO.cpp
@@ -1340,7 +1340,7 @@ void CodeGenPGO::setProfileVersion(llvm::Module &M) {
                                         llvm::APInt(64, ProfileVersion)),
         VarName);
 
-    IRLevelVersionVariable->setVisibility(llvm::GlobalValue::DefaultVisibility);
+    IRLevelVersionVariable->setVisibility(llvm::GlobalValue::HiddenVisibility);
     llvm::Triple TT(M.getTargetTriple());
     if (TT.supportsCOMDAT()) {
       IRLevelVersionVariable->setLinkage(llvm::GlobalValue::ExternalLinkage);

From 067b4ccb4b5ab93ac2dc2243248a8934fa1f7ce3 Mon Sep 17 00:00:00 2001
From: Eric <eric@efcs.ca>
Date: Tue, 28 May 2024 15:19:04 -0700
Subject: [PATCH 025/230] Upstream libc++ buildbot restarter. (#93582)

I've been running a cronjob on my local machine to restart preempted
libc++ CI runs. This is bad and brittle. This upstreams a much better
version of the restarter.

It works by matching on check run annotations looking for mention
of the machine being shut down.

If there are both preempted jobs and failing jobs, we don't restart
the workflow. Maybe we should change that?
---
 .../restart-preempted-libcxx-jobs.yaml        | 109 ++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 .github/workflows/restart-preempted-libcxx-jobs.yaml

diff --git a/.github/workflows/restart-preempted-libcxx-jobs.yaml b/.github/workflows/restart-preempted-libcxx-jobs.yaml
new file mode 100644
index 00000000000000..a71f2084182e5e
--- /dev/null
+++ b/.github/workflows/restart-preempted-libcxx-jobs.yaml
@@ -0,0 +1,109 @@
+name: Restart Preempted Libc++ Workflow
+
+# The libc++ builders run on preemptable VMs, which can be shutdown at any time.
+# This workflow identifies when a workflow run was canceled due to the VM being preempted,
+# and restarts the workflow run.
+
+# We identify a canceled workflow run by checking the annotations of the check runs in the check suite,
+# which should contain the message "The runner has received a shutdown signal."
+
+# Note: If a job is both preempted and also contains a non-preemption failure, we do not restart the workflow.
+
+on:
+  workflow_run:
+    workflows:
+      - "Build and Test libc\+\+"
+    types:
+      - failure
+      - canceled
+
+permissions:
+  contents: read
+
+jobs:
+  restart:
+    if: github.repository_owner == 'llvm'
+    name: "Restart Job"
+    permissions:
+      statuses: read
+      checks: read
+      actions: write
+    runs-on: ubuntu-latest
+    steps:
+      - name: "Restart Job"
+        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1
+        with:
+          script: |
+            const failure_regex = /Process completed with exit code 1./
+            const preemption_regex = /The runner has received a shutdown signal/
+            
+            console.log('Listing check runs for suite')
+            const check_suites = await github.rest.checks.listForSuite({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              check_suite_id: context.payload.workflow_run.check_suite_id
+            })
+
+            check_run_ids = [];
+            for (check_run of check_suites.data.check_runs) {
+              console.log('Checking check run: ' + check_run.id);
+              console.log(check_run);
+              if (check_run.status != 'completed') {
+                console.log('Check run was not completed. Skipping.');
+                continue;
+              }
+              if (check_run.conclusion != 'failure' && check_run.conclusion != 'cancelled') {
+                console.log('Check run had conclusion: ' + check_run.conclusion + '. Skipping.');
+                continue;
+              }
+              check_run_ids.push(check_run.id);
+            }
+            
+            has_preempted_job = false;
+
+            for (check_run_id of check_run_ids) {
+              console.log('Listing annotations for check run: ' + check_run_id);
+                 
+              annotations = await github.rest.checks.listAnnotations({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                check_run_id: check_run_id
+              })
+              
+              console.log(annotations);
+              for (annotation of annotations.data) {
+                if (annotation.annotation_level != 'failure') {
+                  continue;
+                }
+                
+                const preemption_match = annotation.message.match(preemption_regex);
+              
+                if (preemption_match != null) {
+                  console.log('Found preemption message: ' + annotation.message);
+                  has_preempted_job = true;
+                }
+                
+                const failure_match = annotation.message.match(failure_regex);
+                if (failure_match != null) {
+                  // We only want to restart the workflow if all of the failures were due to preemption.
+                  // We don't want to restart the workflow if there were other failures.
+                  console.log('Choosing not to rerun workflow because we found a non-preemption failure');
+                  console.log('Failure message: ' + annotation.message);
+                  return;
+                }
+              }
+            } 
+             
+            if (!has_preempted_job) {
+              console.log('No preempted jobs found. Not restarting workflow.');
+              return;
+            }
+            
+            console.log("Restarted workflow: " + context.payload.workflow_run.id);
+            await github.rest.actions.reRunWorkflowFailedJobs({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                run_id: context.payload.workflow_run.id
+              })
+            
+        

From b9cdea66b62e2eb91814ef7c57ea01aa27440e72 Mon Sep 17 00:00:00 2001
From: Eric Fiselier <eric@efcs.ca>
Date: Tue, 28 May 2024 18:23:14 -0400
Subject: [PATCH 026/230] Attempt to fix issue with plus sign in libc++
 workflow name

---
 .github/workflows/restart-preempted-libcxx-jobs.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/restart-preempted-libcxx-jobs.yaml b/.github/workflows/restart-preempted-libcxx-jobs.yaml
index a71f2084182e5e..5682b0a4f52c3d 100644
--- a/.github/workflows/restart-preempted-libcxx-jobs.yaml
+++ b/.github/workflows/restart-preempted-libcxx-jobs.yaml
@@ -12,7 +12,7 @@ name: Restart Preempted Libc++ Workflow
 on:
   workflow_run:
     workflows:
-      - "Build and Test libc\+\+"
+      - Build and Test libc\+\+
     types:
       - failure
       - canceled

From 6aeea700df6f3f8db9e6a79be4aa593c6fcc7d18 Mon Sep 17 00:00:00 2001
From: Spenser Bauman <sbauman@mathworks.com>
Date: Tue, 28 May 2024 18:29:17 -0400
Subject: [PATCH 027/230] [mlir][dataflow] Fix for integer range analysis
 propagation bug (#93199)

Integer range analysis will not update the range of an operation when
any of the inferred input lattices are uninitialized. In the current
behavior, all lattice values for non-integer types are uninitialized.

For operations like arith.cmpf

```mlir
%3 = arith.cmpf ugt, %arg0, %arg1 : f32
```

that will result in the range of the output also being uninitialized,
and so on for any consumer of the arith.cmpf result. When control-flow
ops are involved, the lack of propagation results in incorrect ranges,
as the back edges for loop carried values are not properly joined with
the definitions from the body region.

For example, consider an scf.while loop whose body region produces a value that
is in a dataflow relationship with some floating-point values through an
arith.cmpf operation:

```mlir
func.func @test_bad_range(%arg0: f32, %arg1: f32) -> (index, index) {
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index

  %3 = arith.cmpf ugt, %arg0, %arg1 : f32

  %1:2 = scf.while (%arg2 = %c0, %arg3 = %c0) : (index, index) -> (index, index) {
    %2 = arith.cmpi ult, %arg2, %c4 : index
    scf.condition(%2) %arg2, %arg3 : index, index
  } do {
  ^bb0(%arg2: index, %arg3: index):
    %4 = arith.select %3, %arg3, %arg3 : index
    %5 = arith.addi %arg2, %c1 : index
    scf.yield %5, %4 : index, index
  }

  return %1#0, %1#1 : index, index
}
```

The existing behavior results in the control condition %2 being
optimized to true, turning the while loop into an infinite loop. The
update to %arg2 through the body region is never factored into the range
calculation, as the ranges for the body ops all test as uninitialized.

This change causes all values initialized with setToEntryState to be set
to some initialized range, even if the values are not integers.

---------

Co-authored-by: Spenser Bauman <sabauma@fastmail>
---
 .../Analysis/DataFlow/IntegerRangeAnalysis.h  | 45 -----------
 .../include/mlir/Dialect/Arith/IR/ArithOps.td | 16 ++--
 mlir/include/mlir/Dialect/GPU/IR/GPUOps.td    | 12 +--
 .../include/mlir/Dialect/Index/IR/IndexOps.td |  2 +-
 .../mlir/Interfaces/InferIntRangeInterface.h  | 75 ++++++++++++++++++-
 .../mlir/Interfaces/InferIntRangeInterface.td | 46 +++++++++---
 .../Interfaces/Utils/InferIntRangeCommon.h    |  8 +-
 .../DataFlow/IntegerRangeAnalysis.cpp         | 51 ++++---------
 .../Arith/IR/InferIntRangeInterfaceImpls.cpp  | 18 +++--
 .../lib/Interfaces/InferIntRangeInterface.cpp | 48 ++++++++++++
 .../Interfaces/Utils/InferIntRangeCommon.cpp  |  2 +-
 .../Dialect/Arith/int-range-interface.mlir    | 19 +++++
 mlir/test/lib/Dialect/Test/TestOps.td         |  9 ++-
 13 files changed, 230 insertions(+), 121 deletions(-)

diff --git a/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h
index 8bd7cf880c6afb..191c023fb642cb 100644
--- a/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h
+++ b/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h
@@ -24,51 +24,6 @@
 namespace mlir {
 namespace dataflow {
 
-/// This lattice value represents the integer range of an SSA value.
-class IntegerValueRange {
-public:
-  /// Create a maximal range ([0, uint_max(t)] / [int_min(t), int_max(t)])
-  /// range that is used to mark the value as unable to be analyzed further,
-  /// where `t` is the type of `value`.
-  static IntegerValueRange getMaxRange(Value value);
-
-  /// Create an integer value range lattice value.
-  IntegerValueRange(std::optional<ConstantIntRanges> value = std::nullopt)
-      : value(std::move(value)) {}
-
-  /// Whether the range is uninitialized. This happens when the state hasn't
-  /// been set during the analysis.
-  bool isUninitialized() const { return !value.has_value(); }
-
-  /// Get the known integer value range.
-  const ConstantIntRanges &getValue() const {
-    assert(!isUninitialized());
-    return *value;
-  }
-
-  /// Compare two ranges.
-  bool operator==(const IntegerValueRange &rhs) const {
-    return value == rhs.value;
-  }
-
-  /// Take the union of two ranges.
-  static IntegerValueRange join(const IntegerValueRange &lhs,
-                                const IntegerValueRange &rhs) {
-    if (lhs.isUninitialized())
-      return rhs;
-    if (rhs.isUninitialized())
-      return lhs;
-    return IntegerValueRange{lhs.getValue().rangeUnion(rhs.getValue())};
-  }
-
-  /// Print the integer value range.
-  void print(raw_ostream &os) const { os << value; }
-
-private:
-  /// The known integer value range.
-  std::optional<ConstantIntRanges> value;
-};
-
 /// This lattice element represents the integer value range of an SSA value.
 /// When this lattice is updated, it automatically updates the constant value
 /// of the SSA value (if the range can be narrowed to one).
diff --git a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
index ead52332e8eec3..46248dad3be9e0 100644
--- a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
+++ b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
@@ -49,7 +49,7 @@ class Arith_BinaryOp<string mnemonic, list<Trait> traits = []> :
 // Base class for integer binary operations.
 class Arith_IntBinaryOp<string mnemonic, list<Trait> traits = []> :
     Arith_BinaryOp<mnemonic, traits #
-      [DeclareOpInterfaceMethods<InferIntRangeInterface>]>,
+      [DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
     Arguments<(ins SignlessIntegerLike:$lhs, SignlessIntegerLike:$rhs)>,
     Results<(outs SignlessIntegerLike:$result)>;
 
@@ -107,7 +107,7 @@ class Arith_IToICastOp<string mnemonic, list<Trait> traits = []> :
     Arith_CastOp<mnemonic, SignlessFixedWidthIntegerLike,
                            SignlessFixedWidthIntegerLike,
                            traits #
-                           [DeclareOpInterfaceMethods<InferIntRangeInterface>]>;
+                           [DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>;
 // Cast from an integer type to a floating point type.
 class Arith_IToFCastOp<string mnemonic, list<Trait> traits = []> :
     Arith_CastOp<mnemonic, SignlessFixedWidthIntegerLike, FloatLike, traits>;
@@ -139,7 +139,7 @@ class Arith_CompareOpOfAnyRank<string mnemonic, list<Trait> traits = []> :
 
 class Arith_IntBinaryOpWithOverflowFlags<string mnemonic, list<Trait> traits = []> :
     Arith_BinaryOp<mnemonic, traits #
-      [Pure, DeclareOpInterfaceMethods<InferIntRangeInterface>,
+      [Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
        DeclareOpInterfaceMethods<ArithIntegerOverflowFlagsInterface>]>,
     Arguments<(ins SignlessIntegerLike:$lhs, SignlessIntegerLike:$rhs,
       DefaultValuedAttr<
@@ -159,7 +159,7 @@ def Arith_ConstantOp : Op<Arith_Dialect, "constant",
     [ConstantLike, Pure,
      DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>,
      AllTypesMatch<["value", "result"]>,
-     DeclareOpInterfaceMethods<InferIntRangeInterface>]> {
+     DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]> {
   let summary = "integer or floating point constant";
   let description = [{
     The `constant` operation produces an SSA value equal to some integer or
@@ -1327,7 +1327,7 @@ def IndexCastTypeConstraint : TypeConstraint<Or<[
 
 def Arith_IndexCastOp
   : Arith_CastOp<"index_cast", IndexCastTypeConstraint, IndexCastTypeConstraint,
-                 [DeclareOpInterfaceMethods<InferIntRangeInterface>]> {
+                 [DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]> {
   let summary = "cast between index and integer types";
   let description = [{
     Casts between scalar or vector integers and corresponding 'index' scalar or
@@ -1346,7 +1346,7 @@ def Arith_IndexCastOp
 
 def Arith_IndexCastUIOp
   : Arith_CastOp<"index_castui", IndexCastTypeConstraint, IndexCastTypeConstraint,
-                 [DeclareOpInterfaceMethods<InferIntRangeInterface>]> {
+                 [DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]> {
   let summary = "unsigned cast between index and integer types";
   let description = [{
     Casts between scalar or vector integers and corresponding 'index' scalar or
@@ -1400,7 +1400,7 @@ def Arith_BitcastOp : Arith_CastOp<"bitcast", BitcastTypeConstraint,
 
 def Arith_CmpIOp
   : Arith_CompareOpOfAnyRank<"cmpi",
-                             [DeclareOpInterfaceMethods<InferIntRangeInterface>]> {
+                             [DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]> {
   let summary = "integer comparison operation";
   let description = [{
     The `cmpi` operation is a generic comparison for integer-like types. Its two
@@ -1555,7 +1555,7 @@ class ScalarConditionOrMatchingShape<list<string> names> :
 def SelectOp : Arith_Op<"select", [Pure,
     AllTypesMatch<["true_value", "false_value", "result"]>,
     ScalarConditionOrMatchingShape<["condition", "result"]>,
-    DeclareOpInterfaceMethods<InferIntRangeInterface>,
+    DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRangesFromOptional"]>,
   ] # ElementwiseMappable.traits> {
   let summary = "select operation";
   let description = [{
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 1da68ed2176d8f..10719aae5c8b46 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -52,7 +52,7 @@ def GPU_DimensionAttr : EnumAttr<GPU_Dialect, GPU_Dimension, "dim">;
 class GPU_IndexOp<string mnemonic, list<Trait> traits = []> :
     GPU_Op<mnemonic, !listconcat(traits, [
         Pure,
-        DeclareOpInterfaceMethods<InferIntRangeInterface>,
+        DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
         DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>])>,
     Arguments<(ins GPU_DimensionAttr:$dimension)>, Results<(outs Index)> {
   let assemblyFormat = "$dimension attr-dict";
@@ -144,7 +144,7 @@ def GPU_ThreadIdOp : GPU_IndexOp<"thread_id"> {
 }
 
 def GPU_LaneIdOp : GPU_Op<"lane_id", [
-      Pure, DeclareOpInterfaceMethods<InferIntRangeInterface>]> {
+      Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]> {
   let description = [{
     Returns the lane id within the subgroup (warp/wave).
 
@@ -158,7 +158,7 @@ def GPU_LaneIdOp : GPU_Op<"lane_id", [
 }
 
 def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [
-      Pure, DeclareOpInterfaceMethods<InferIntRangeInterface>]>,
+      Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
     Arguments<(ins)>, Results<(outs Index:$result)> {
   let description = [{
     Returns the subgroup id, i.e., the index of the current subgroup within the
@@ -190,7 +190,7 @@ def GPU_GlobalIdOp : GPU_IndexOp<"global_id"> {
 
 
 def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [
-      Pure, DeclareOpInterfaceMethods<InferIntRangeInterface>]>,
+      Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
     Arguments<(ins)>, Results<(outs Index:$result)> {
   let description = [{
     Returns the number of subgroups within a workgroup.
@@ -206,7 +206,7 @@ def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [
 }
 
 def GPU_SubgroupSizeOp : GPU_Op<"subgroup_size", [
-      Pure, DeclareOpInterfaceMethods<InferIntRangeInterface>]>,
+      Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
     Arguments<(ins)>, Results<(outs Index:$result)> {
   let description = [{
     Returns the number of threads within a subgroup.
@@ -687,7 +687,7 @@ def GPU_LaunchFuncOp :GPU_Op<"launch_func", [
 
 def GPU_LaunchOp : GPU_Op<"launch", [
       AutomaticAllocationScope, AttrSizedOperandSegments, GPU_AsyncOpInterface,
-      DeclareOpInterfaceMethods<InferIntRangeInterface>,
+      DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
       RecursiveMemoryEffects]>,
     Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
diff --git a/mlir/include/mlir/Dialect/Index/IR/IndexOps.td b/mlir/include/mlir/Dialect/Index/IR/IndexOps.td
index c6079cb8a98c81..a30ae9f739cbc6 100644
--- a/mlir/include/mlir/Dialect/Index/IR/IndexOps.td
+++ b/mlir/include/mlir/Dialect/Index/IR/IndexOps.td
@@ -25,7 +25,7 @@ include "mlir/IR/OpBase.td"
 /// Base class for Index dialect operations.
 class IndexOp<string mnemonic, list<Trait> traits = []>
     : Op<IndexDialect, mnemonic,
-      [DeclareOpInterfaceMethods<InferIntRangeInterface>] # traits>;
+      [DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>] # traits>;
 
 //===----------------------------------------------------------------------===//
 // IndexBinaryOp
diff --git a/mlir/include/mlir/Interfaces/InferIntRangeInterface.h b/mlir/include/mlir/Interfaces/InferIntRangeInterface.h
index 05064a72ef02e7..0e107e88f5232f 100644
--- a/mlir/include/mlir/Interfaces/InferIntRangeInterface.h
+++ b/mlir/include/mlir/Interfaces/InferIntRangeInterface.h
@@ -105,10 +105,83 @@ class ConstantIntRanges {
 
 raw_ostream &operator<<(raw_ostream &, const ConstantIntRanges &);
 
+/// This lattice value represents the integer range of an SSA value.
+class IntegerValueRange {
+public:
+  /// Create a maximal range ([0, uint_max(t)] / [int_min(t), int_max(t)])
+  /// range that is used to mark the value as unable to be analyzed further,
+  /// where `t` is the type of `value`.
+  static IntegerValueRange getMaxRange(Value value);
+
+  /// Create an integer value range lattice value.
+  IntegerValueRange(ConstantIntRanges value) : value(std::move(value)) {}
+
+  /// Create an integer value range lattice value.
+  IntegerValueRange(std::optional<ConstantIntRanges> value = std::nullopt)
+      : value(std::move(value)) {}
+
+  /// Whether the range is uninitialized. This happens when the state hasn't
+  /// been set during the analysis.
+  bool isUninitialized() const { return !value.has_value(); }
+
+  /// Get the known integer value range.
+  const ConstantIntRanges &getValue() const {
+    assert(!isUninitialized());
+    return *value;
+  }
+
+  /// Compare two ranges.
+  bool operator==(const IntegerValueRange &rhs) const {
+    return value == rhs.value;
+  }
+
+  /// Compute the least upper bound of two ranges.
+  static IntegerValueRange join(const IntegerValueRange &lhs,
+                                const IntegerValueRange &rhs) {
+    if (lhs.isUninitialized())
+      return rhs;
+    if (rhs.isUninitialized())
+      return lhs;
+    return IntegerValueRange{lhs.getValue().rangeUnion(rhs.getValue())};
+  }
+
+  /// Print the integer value range.
+  void print(raw_ostream &os) const { os << value; }
+
+private:
+  /// The known integer value range.
+  std::optional<ConstantIntRanges> value;
+};
+
+raw_ostream &operator<<(raw_ostream &, const IntegerValueRange &);
+
 /// The type of the `setResultRanges` callback provided to ops implementing
 /// InferIntRangeInterface. It should be called once for each integer result
 /// value and be passed the ConstantIntRanges corresponding to that value.
-using SetIntRangeFn = function_ref<void(Value, const ConstantIntRanges &)>;
+using SetIntRangeFn =
+    llvm::function_ref<void(Value, const ConstantIntRanges &)>;
+
+/// Similar to SetIntRangeFn, but operating on IntegerValueRange lattice values.
+/// This is the `setResultRanges` callback for the IntegerValueRange based
+/// interface method.
+using SetIntLatticeFn =
+    llvm::function_ref<void(Value, const IntegerValueRange &)>;
+
+class InferIntRangeInterface;
+
+namespace intrange::detail {
+/// Default implementation of `inferResultRanges` which dispatches to the
+/// `inferResultRangesFromOptional`.
+void defaultInferResultRanges(InferIntRangeInterface interface,
+                              ArrayRef<IntegerValueRange> argRanges,
+                              SetIntLatticeFn setResultRanges);
+
+/// Default implementation of `inferResultRangesFromOptional` which dispatches
+/// to the `inferResultRanges`.
+void defaultInferResultRangesFromOptional(InferIntRangeInterface interface,
+                                          ArrayRef<ConstantIntRanges> argRanges,
+                                          SetIntRangeFn setResultRanges);
+} // end namespace intrange::detail
 } // end namespace mlir
 
 #include "mlir/Interfaces/InferIntRangeInterface.h.inc"
diff --git a/mlir/include/mlir/Interfaces/InferIntRangeInterface.td b/mlir/include/mlir/Interfaces/InferIntRangeInterface.td
index dbdc526c6f10b6..6ee436ce4d6c2f 100644
--- a/mlir/include/mlir/Interfaces/InferIntRangeInterface.td
+++ b/mlir/include/mlir/Interfaces/InferIntRangeInterface.td
@@ -28,9 +28,10 @@ def InferIntRangeInterface : OpInterface<"InferIntRangeInterface"> {
       Infer the bounds on the results of this op given the bounds on its arguments.
       For each result value or block argument (that isn't a branch argument,
       since the dataflow analysis handles those case), the method should call
-      `setValueRange` with that `Value` as an argument. When `setValueRange`
-      is not called for some value, it will recieve a default value of the mimimum
-      and maximum values for its type (the unbounded range).
+      `setValueRange` with that `Value` as an argument. When implemented,
+      `setValueRange` should be called on all result values for the operation.
+      When operations take non-integer inputs, the
+     `inferResultRangesFromOptional` method should be implemented instead.
 
       When called on an op that also implements the RegionBranchOpInterface
       or BranchOpInterface, this method should not attempt to infer the values
@@ -39,14 +40,39 @@ def InferIntRangeInterface : OpInterface<"InferIntRangeInterface"> {
 
       This function will only be called when at least one result of the op is a
       scalar integer value or the op has a region.
+    }],
+    /*retTy=*/"void",
+    /*methodName=*/"inferResultRanges",
+    /*args=*/(ins "::llvm::ArrayRef<::mlir::ConstantIntRanges>":$argRanges,
+                  "::mlir::SetIntRangeFn":$setResultRanges),
+    /*methodBody=*/"",
+    /*defaultImplementation=*/[{
+      ::mlir::intrange::detail::defaultInferResultRangesFromOptional($_op,
+                                                                     argRanges,
+                                                                     setResultRanges);
+    }]>,
+
+    InterfaceMethod<[{
+      Infer the bounds on the results of this op given the lattice representation
+      of the bounds for its arguments. For each result value or block argument
+      (that isn't a branch argument, since the dataflow analysis handles
+      those case), the method should call `setValueRange` with that `Value`
+      as an argument. When implemented, `setValueRange` should be called on
+      all result values for the operation.
 
-      `argRanges` contains one `IntRangeAttrs` for each argument to the op in ODS
-       order. Non-integer arguments will have the an unbounded range of width-0
-       APInts in their `argRanges` element.
+      This method allows for more precise implementations when operations
+      want to reason about inputs which may be undefined during the analysis.
     }],
-    "void", "inferResultRanges", (ins
-      "::llvm::ArrayRef<::mlir::ConstantIntRanges>":$argRanges,
-      "::mlir::SetIntRangeFn":$setResultRanges)
-  >];
+    /*retTy=*/"void",
+    /*methodName=*/"inferResultRangesFromOptional",
+    /*args=*/(ins "::llvm::ArrayRef<::mlir::IntegerValueRange>":$argRanges,
+                  "::mlir::SetIntLatticeFn":$setResultRanges),
+    /*methodBody=*/"",
+    /*defaultImplementation=*/[{
+      ::mlir::intrange::detail::defaultInferResultRanges($_op,
+                                                         argRanges,
+                                                         setResultRanges);
+    }]>
+  ];
 }
 #endif // MLIR_INTERFACES_INFERINTRANGEINTERFACE
diff --git a/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h b/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h
index 851bb534bc7ee1..3988a8826498a9 100644
--- a/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h
+++ b/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h
@@ -25,7 +25,11 @@ namespace intrange {
 /// abstracted away here to permit writing the function that handles both
 /// 64- and 32-bit index types.
 using InferRangeFn =
-    function_ref<ConstantIntRanges(ArrayRef<ConstantIntRanges>)>;
+    std::function<ConstantIntRanges(ArrayRef<ConstantIntRanges>)>;
+
+/// Function that performs inference on an array of `IntegerValueRange`.
+using InferIntegerValueRangeFn =
+    std::function<IntegerValueRange(ArrayRef<IntegerValueRange>)>;
 
 static constexpr unsigned indexMinWidth = 32;
 static constexpr unsigned indexMaxWidth = 64;
@@ -52,7 +56,7 @@ using InferRangeWithOvfFlagsFn =
 ///
 /// The `mode` argument specifies if the unsigned, signed, or both results of
 /// the inference computation should be used when comparing the results.
-ConstantIntRanges inferIndexOp(InferRangeFn inferFn,
+ConstantIntRanges inferIndexOp(const InferRangeFn &inferFn,
                                ArrayRef<ConstantIntRanges> argRanges,
                                CmpMode mode);
 
diff --git a/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp b/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp
index a82c30717e275b..9721620807a0f0 100644
--- a/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp
+++ b/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp
@@ -36,17 +36,6 @@
 using namespace mlir;
 using namespace mlir::dataflow;
 
-IntegerValueRange IntegerValueRange::getMaxRange(Value value) {
-  unsigned width = ConstantIntRanges::getStorageBitwidth(value.getType());
-  if (width == 0)
-    return {};
-  APInt umin = APInt::getMinValue(width);
-  APInt umax = APInt::getMaxValue(width);
-  APInt smin = width != 0 ? APInt::getSignedMinValue(width) : umin;
-  APInt smax = width != 0 ? APInt::getSignedMaxValue(width) : umax;
-  return IntegerValueRange{ConstantIntRanges{umin, umax, smin, smax}};
-}
-
 void IntegerValueRangeLattice::onUpdate(DataFlowSolver *solver) const {
   Lattice::onUpdate(solver);
 
@@ -72,24 +61,17 @@ void IntegerValueRangeLattice::onUpdate(DataFlowSolver *solver) const {
 void IntegerRangeAnalysis::visitOperation(
     Operation *op, ArrayRef<const IntegerValueRangeLattice *> operands,
     ArrayRef<IntegerValueRangeLattice *> results) {
-  // If the lattice on any operand is unitialized, bail out.
-  if (llvm::any_of(operands, [](const IntegerValueRangeLattice *lattice) {
-        return lattice->getValue().isUninitialized();
-      })) {
-    return;
-  }
-
   auto inferrable = dyn_cast<InferIntRangeInterface>(op);
   if (!inferrable)
     return setAllToEntryStates(results);
 
   LLVM_DEBUG(llvm::dbgs() << "Inferring ranges for " << *op << "\n");
-  SmallVector<ConstantIntRanges> argRanges(
-      llvm::map_range(operands, [](const IntegerValueRangeLattice *val) {
-        return val->getValue().getValue();
-      }));
+  auto argRanges = llvm::map_to_vector(
+      operands, [](const IntegerValueRangeLattice *lattice) {
+        return lattice->getValue();
+      });
 
-  auto joinCallback = [&](Value v, const ConstantIntRanges &attrs) {
+  auto joinCallback = [&](Value v, const IntegerValueRange &attrs) {
     auto result = dyn_cast<OpResult>(v);
     if (!result)
       return;
@@ -99,7 +81,7 @@ void IntegerRangeAnalysis::visitOperation(
     IntegerValueRangeLattice *lattice = results[result.getResultNumber()];
     IntegerValueRange oldRange = lattice->getValue();
 
-    ChangeResult changed = lattice->join(IntegerValueRange{attrs});
+    ChangeResult changed = lattice->join(attrs);
 
     // Catch loop results with loop variant bounds and conservatively make
     // them [-inf, inf] so we don't circle around infinitely often (because
@@ -116,7 +98,7 @@ void IntegerRangeAnalysis::visitOperation(
     propagateIfChanged(lattice, changed);
   };
 
-  inferrable.inferResultRanges(argRanges, joinCallback);
+  inferrable.inferResultRangesFromOptional(argRanges, joinCallback);
 }
 
 void IntegerRangeAnalysis::visitNonControlFlowArguments(
@@ -124,17 +106,12 @@ void IntegerRangeAnalysis::visitNonControlFlowArguments(
     ArrayRef<IntegerValueRangeLattice *> argLattices, unsigned firstIndex) {
   if (auto inferrable = dyn_cast<InferIntRangeInterface>(op)) {
     LLVM_DEBUG(llvm::dbgs() << "Inferring ranges for " << *op << "\n");
-    // If the lattice on any operand is unitialized, bail out.
-    if (llvm::any_of(op->getOperands(), [&](Value value) {
-          return getLatticeElementFor(op, value)->getValue().isUninitialized();
-        }))
-      return;
-    SmallVector<ConstantIntRanges> argRanges(
-        llvm::map_range(op->getOperands(), [&](Value value) {
-          return getLatticeElementFor(op, value)->getValue().getValue();
-        }));
 
-    auto joinCallback = [&](Value v, const ConstantIntRanges &attrs) {
+    auto argRanges = llvm::map_to_vector(op->getOperands(), [&](Value value) {
+      return getLatticeElementFor(op, value)->getValue();
+    });
+
+    auto joinCallback = [&](Value v, const IntegerValueRange &attrs) {
       auto arg = dyn_cast<BlockArgument>(v);
       if (!arg)
         return;
@@ -145,7 +122,7 @@ void IntegerRangeAnalysis::visitNonControlFlowArguments(
       IntegerValueRangeLattice *lattice = argLattices[arg.getArgNumber()];
       IntegerValueRange oldRange = lattice->getValue();
 
-      ChangeResult changed = lattice->join(IntegerValueRange{attrs});
+      ChangeResult changed = lattice->join(attrs);
 
       // Catch loop results with loop variant bounds and conservatively make
       // them [-inf, inf] so we don't circle around infinitely often (because
@@ -162,7 +139,7 @@ void IntegerRangeAnalysis::visitNonControlFlowArguments(
       propagateIfChanged(lattice, changed);
     };
 
-    inferrable.inferResultRanges(argRanges, joinCallback);
+    inferrable.inferResultRangesFromOptional(argRanges, joinCallback);
     return;
   }
 
diff --git a/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp
index fbe2ecab8adcaa..462044417b5fb8 100644
--- a/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp
+++ b/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp
@@ -295,18 +295,24 @@ void arith::CmpIOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
 // SelectOp
 //===----------------------------------------------------------------------===//
 
-void arith::SelectOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
-                                        SetIntRangeFn setResultRange) {
-  std::optional<APInt> mbCondVal = argRanges[0].getConstantValue();
+void arith::SelectOp::inferResultRangesFromOptional(
+    ArrayRef<IntegerValueRange> argRanges, SetIntLatticeFn setResultRange) {
+  std::optional<APInt> mbCondVal =
+      argRanges[0].isUninitialized()
+          ? std::nullopt
+          : argRanges[0].getValue().getConstantValue();
+
+  const IntegerValueRange &trueCase = argRanges[1];
+  const IntegerValueRange &falseCase = argRanges[2];
 
   if (mbCondVal) {
     if (mbCondVal->isZero())
-      setResultRange(getResult(), argRanges[2]);
+      setResultRange(getResult(), falseCase);
     else
-      setResultRange(getResult(), argRanges[1]);
+      setResultRange(getResult(), trueCase);
     return;
   }
-  setResultRange(getResult(), argRanges[1].rangeUnion(argRanges[2]));
+  setResultRange(getResult(), IntegerValueRange::join(trueCase, falseCase));
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Interfaces/InferIntRangeInterface.cpp b/mlir/lib/Interfaces/InferIntRangeInterface.cpp
index b3f6c0ee3cc32d..d879b93586899b 100644
--- a/mlir/lib/Interfaces/InferIntRangeInterface.cpp
+++ b/mlir/lib/Interfaces/InferIntRangeInterface.cpp
@@ -126,3 +126,51 @@ raw_ostream &mlir::operator<<(raw_ostream &os, const ConstantIntRanges &range) {
   return os << "unsigned : [" << range.umin() << ", " << range.umax()
             << "] signed : [" << range.smin() << ", " << range.smax() << "]";
 }
+
+IntegerValueRange IntegerValueRange::getMaxRange(Value value) {
+  unsigned width = ConstantIntRanges::getStorageBitwidth(value.getType());
+  if (width == 0)
+    return {};
+
+  APInt umin = APInt::getMinValue(width);
+  APInt umax = APInt::getMaxValue(width);
+  APInt smin = width != 0 ? APInt::getSignedMinValue(width) : umin;
+  APInt smax = width != 0 ? APInt::getSignedMaxValue(width) : umax;
+  return IntegerValueRange{ConstantIntRanges{umin, umax, smin, smax}};
+}
+
+raw_ostream &mlir::operator<<(raw_ostream &os, const IntegerValueRange &range) {
+  range.print(os);
+  return os;
+}
+
+void mlir::intrange::detail::defaultInferResultRanges(
+    InferIntRangeInterface interface, ArrayRef<IntegerValueRange> argRanges,
+    SetIntLatticeFn setResultRanges) {
+  llvm::SmallVector<ConstantIntRanges> unpacked;
+  unpacked.reserve(argRanges.size());
+
+  for (const IntegerValueRange &range : argRanges) {
+    if (range.isUninitialized())
+      return;
+    unpacked.push_back(range.getValue());
+  }
+
+  interface.inferResultRanges(
+      unpacked,
+      [&setResultRanges](Value value, const ConstantIntRanges &argRanges) {
+        setResultRanges(value, IntegerValueRange{argRanges});
+      });
+}
+
+void mlir::intrange::detail::defaultInferResultRangesFromOptional(
+    InferIntRangeInterface interface, ArrayRef<ConstantIntRanges> argRanges,
+    SetIntRangeFn setResultRanges) {
+  auto ranges = llvm::to_vector_of<IntegerValueRange>(argRanges);
+  interface.inferResultRangesFromOptional(
+      ranges,
+      [&setResultRanges](Value value, const IntegerValueRange &argRanges) {
+        if (!argRanges.isUninitialized())
+          setResultRanges(value, argRanges.getValue());
+      });
+}
diff --git a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
index fe1a67d6287386..5b8d35e7bd5197 100644
--- a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
+++ b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
@@ -76,7 +76,7 @@ static ConstantIntRanges minMaxBy(ConstArithFn op, ArrayRef<APInt> lhs,
 //===----------------------------------------------------------------------===//
 
 ConstantIntRanges
-mlir::intrange::inferIndexOp(InferRangeFn inferFn,
+mlir::intrange::inferIndexOp(const InferRangeFn &inferFn,
                              ArrayRef<ConstantIntRanges> argRanges,
                              intrange::CmpMode mode) {
   ConstantIntRanges sixtyFour = inferFn(argRanges);
diff --git a/mlir/test/Dialect/Arith/int-range-interface.mlir b/mlir/test/Dialect/Arith/int-range-interface.mlir
index 5b538197a0c117..60f0ab41afa48d 100644
--- a/mlir/test/Dialect/Arith/int-range-interface.mlir
+++ b/mlir/test/Dialect/Arith/int-range-interface.mlir
@@ -899,3 +899,22 @@ func.func @test_shl_i8_nowrap() -> i8 {
   %2 = test.reflect_bounds %1 : i8
   return %2: i8
 }
+
+/// A test case to ensure that the ranges for unsupported ops are initialized
+/// properly to maxRange, rather than left uninitialized.
+/// In this test case, the previous behavior would leave the ranges for %a and
+/// %b uninitialized, resulting in arith.cmpf's range not being updated, even
+/// though it has an integer valued result.
+
+// CHECK-LABEL: func @test_cmpf_propagates
+// CHECK: test.reflect_bounds {smax = 2 : index, smin = 1 : index, umax = 2 : index, umin = 1 : index}
+func.func @test_cmpf_propagates(%a: f32, %b: f32) -> index {
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+
+  %0 = arith.cmpf ueq, %a, %b : f32
+  %1 = arith.select %0, %c1, %c2 : index
+  %2 = test.reflect_bounds %1 : index
+  func.return %2 : index
+}
+
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 18324482153a54..9d7e0a7928ab8d 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -2750,7 +2750,7 @@ def TestGraphLoopOp : TEST_Op<"graph_loop",
 def InferIntRangeType : AnyTypeOf<[AnyInteger, Index]>;
 
 def TestWithBoundsOp : TEST_Op<"with_bounds",
-                          [DeclareOpInterfaceMethods<InferIntRangeInterface>,
+                          [DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
                            NoMemoryEffect]> {
   let arguments = (ins APIntAttr:$umin,
                        APIntAttr:$umax,
@@ -2762,7 +2762,7 @@ def TestWithBoundsOp : TEST_Op<"with_bounds",
 }
 
 def TestWithBoundsRegionOp : TEST_Op<"with_bounds_region",
-                          [DeclareOpInterfaceMethods<InferIntRangeInterface>,
+                          [DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
                            SingleBlock, NoTerminator]> {
   let arguments = (ins APIntAttr:$umin,
                        APIntAttr:$umax,
@@ -2774,7 +2774,7 @@ def TestWithBoundsRegionOp : TEST_Op<"with_bounds_region",
 }
 
 def TestIncrementOp : TEST_Op<"increment",
-                         [DeclareOpInterfaceMethods<InferIntRangeInterface>,
+                         [DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
                          NoMemoryEffect, AllTypesMatch<["value", "result"]>]> {
   let arguments = (ins InferIntRangeType:$value);
   let results = (outs InferIntRangeType:$result);
@@ -2783,7 +2783,8 @@ def TestIncrementOp : TEST_Op<"increment",
 }
 
 def TestReflectBoundsOp : TEST_Op<"reflect_bounds",
-                         [DeclareOpInterfaceMethods<InferIntRangeInterface>, AllTypesMatch<["value", "result"]>]> {
+                         [DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
+                          AllTypesMatch<["value", "result"]>]> {
   let arguments = (ins InferIntRangeType:$value,
                        OptionalAttr<APIntAttr>:$umin,
                        OptionalAttr<APIntAttr>:$umax,

From 20d497c26fc95c80a1bacb38820d92e5f52bec58 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Tue, 28 May 2024 15:33:59 -0700
Subject: [PATCH 028/230] [Driver] Remove unneeded *-linux-gnu after D158183

Recommit 435ea21c897f94b5a3777a9f152e4c5bb4a371a3.
As the comment added by a07727199db0525e9d2df41e466a2a1611b3c8e1
suggests, these `*Triples` lists should shrink over time.

https://reviews.llvm.org/D158183 allows *-unknown-linux-gnu to detect
*-linux-gnu. If we additionally allow x86_64-unknown-linux-gnu
-m32/-mx32 to detect x86_64-linux-gnu, we can mostly remove these
*-linux-gnu elements.

Retain x86_64-linux-gnu for now to work around #93609.
(In addition, Debian /usr/bin/clang --version uses x86_64-pc-linux-gnu).
Retain i586-linux-gnu for now to work around #93502.
---
 clang/lib/Driver/ToolChains/Gnu.cpp | 69 ++++++++++++++---------------
 1 file changed, 33 insertions(+), 36 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index 9849c59685cca7..b141e5f2adfab1 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -2227,10 +2227,19 @@ void Generic_GCC::GCCInstallationDetector::init(
   SmallVector<StringRef, 16> CandidateBiarchTripleAliases;
   // Add some triples that we want to check first.
   CandidateTripleAliases.push_back(TargetTriple.str());
-  std::string TripleNoVendor = TargetTriple.getArchName().str() + "-" +
-                               TargetTriple.getOSAndEnvironmentName().str();
-  if (TargetTriple.getVendor() == llvm::Triple::UnknownVendor)
+  std::string TripleNoVendor, BiarchTripleNoVendor;
+  if (TargetTriple.getVendor() == llvm::Triple::UnknownVendor) {
+    StringRef OSEnv = TargetTriple.getOSAndEnvironmentName();
+    if (TargetTriple.getEnvironment() == llvm::Triple::GNUX32)
+      OSEnv = "linux-gnu";
+    TripleNoVendor = (TargetTriple.getArchName().str() + '-' + OSEnv).str();
     CandidateTripleAliases.push_back(TripleNoVendor);
+    if (BiarchVariantTriple.getArch() != llvm::Triple::UnknownArch) {
+      BiarchTripleNoVendor =
+          (BiarchVariantTriple.getArchName().str() + '-' + OSEnv).str();
+      CandidateBiarchTripleAliases.push_back(BiarchTripleNoVendor);
+    }
+  }
 
   CollectLibDirsAndTriples(TargetTriple, BiarchVariantTriple, CandidateLibDirs,
                            CandidateTripleAliases, CandidateBiarchLibDirs,
@@ -2453,11 +2462,9 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes(
   // lists should shrink over time. Please don't add more elements to *Triples.
   static const char *const AArch64LibDirs[] = {"/lib64", "/lib"};
   static const char *const AArch64Triples[] = {
-      "aarch64-none-linux-gnu", "aarch64-linux-gnu", "aarch64-redhat-linux",
-      "aarch64-suse-linux"};
+      "aarch64-none-linux-gnu", "aarch64-redhat-linux", "aarch64-suse-linux"};
   static const char *const AArch64beLibDirs[] = {"/lib"};
-  static const char *const AArch64beTriples[] = {"aarch64_be-none-linux-gnu",
-                                                 "aarch64_be-linux-gnu"};
+  static const char *const AArch64beTriples[] = {"aarch64_be-none-linux-gnu"};
 
   static const char *const ARMLibDirs[] = {"/lib"};
   static const char *const ARMTriples[] = {"arm-linux-gnueabi"};
@@ -2482,9 +2489,8 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes(
       "x86_64-linux-gnu",       "x86_64-unknown-linux-gnu",
       "x86_64-pc-linux-gnu",    "x86_64-redhat-linux6E",
       "x86_64-redhat-linux",    "x86_64-suse-linux",
-      "x86_64-manbo-linux-gnu", "x86_64-linux-gnu",
-      "x86_64-slackware-linux", "x86_64-unknown-linux",
-      "x86_64-amazon-linux"};
+      "x86_64-manbo-linux-gnu", "x86_64-slackware-linux",
+      "x86_64-unknown-linux",   "x86_64-amazon-linux"};
   static const char *const X32Triples[] = {"x86_64-linux-gnux32",
                                            "x86_64-pc-linux-gnux32"};
   static const char *const X32LibDirs[] = {"/libx32", "/lib"};
@@ -2500,26 +2506,24 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes(
       "loongarch64-linux-gnu", "loongarch64-unknown-linux-gnu"};
 
   static const char *const M68kLibDirs[] = {"/lib"};
-  static const char *const M68kTriples[] = {
-      "m68k-linux-gnu", "m68k-unknown-linux-gnu", "m68k-suse-linux"};
+  static const char *const M68kTriples[] = {"m68k-unknown-linux-gnu",
+                                            "m68k-suse-linux"};
 
   static const char *const MIPSLibDirs[] = {"/libo32", "/lib"};
   static const char *const MIPSTriples[] = {
       "mips-linux-gnu", "mips-mti-linux", "mips-mti-linux-gnu",
       "mips-img-linux-gnu", "mipsisa32r6-linux-gnu"};
   static const char *const MIPSELLibDirs[] = {"/libo32", "/lib"};
-  static const char *const MIPSELTriples[] = {
-      "mipsel-linux-gnu", "mips-img-linux-gnu", "mipsisa32r6el-linux-gnu"};
+  static const char *const MIPSELTriples[] = {"mipsel-linux-gnu",
+                                              "mips-img-linux-gnu"};
 
   static const char *const MIPS64LibDirs[] = {"/lib64", "/lib"};
   static const char *const MIPS64Triples[] = {
-      "mips64-linux-gnu",      "mips-mti-linux-gnu",
-      "mips-img-linux-gnu",    "mips64-linux-gnuabi64",
+      "mips-mti-linux-gnu", "mips-img-linux-gnu", "mips64-linux-gnuabi64",
       "mipsisa64r6-linux-gnu", "mipsisa64r6-linux-gnuabi64"};
   static const char *const MIPS64ELLibDirs[] = {"/lib64", "/lib"};
   static const char *const MIPS64ELTriples[] = {
-      "mips64el-linux-gnu",      "mips-mti-linux-gnu",
-      "mips-img-linux-gnu",      "mips64el-linux-gnuabi64",
+      "mips-mti-linux-gnu", "mips-img-linux-gnu", "mips64el-linux-gnuabi64",
       "mipsisa64r6el-linux-gnu", "mipsisa64r6el-linux-gnuabi64"};
 
   static const char *const MIPSN32LibDirs[] = {"/lib32"};
@@ -2534,46 +2538,39 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes(
 
   static const char *const PPCLibDirs[] = {"/lib32", "/lib"};
   static const char *const PPCTriples[] = {
-      "powerpc-linux-gnu", "powerpc-unknown-linux-gnu", "powerpc-linux-gnuspe",
+      "powerpc-unknown-linux-gnu",
       // On 32-bit PowerPC systems running SUSE Linux, gcc is configured as a
       // 64-bit compiler which defaults to "-m32", hence "powerpc64-suse-linux".
       "powerpc64-suse-linux", "powerpc-montavista-linuxspe"};
   static const char *const PPCLELibDirs[] = {"/lib32", "/lib"};
-  static const char *const PPCLETriples[] = {"powerpcle-linux-gnu",
-                                             "powerpcle-unknown-linux-gnu",
+  static const char *const PPCLETriples[] = {"powerpcle-unknown-linux-gnu",
                                              "powerpcle-linux-musl"};
 
   static const char *const PPC64LibDirs[] = {"/lib64", "/lib"};
-  static const char *const PPC64Triples[] = {
-      "powerpc64-linux-gnu", "powerpc64-unknown-linux-gnu",
-      "powerpc64-suse-linux", "ppc64-redhat-linux"};
+  static const char *const PPC64Triples[] = {"powerpc64-unknown-linux-gnu",
+                                             "powerpc64-suse-linux",
+                                             "ppc64-redhat-linux"};
   static const char *const PPC64LELibDirs[] = {"/lib64", "/lib"};
   static const char *const PPC64LETriples[] = {
-      "powerpc64le-linux-gnu", "powerpc64le-unknown-linux-gnu",
-      "powerpc64le-none-linux-gnu", "powerpc64le-suse-linux",
-      "ppc64le-redhat-linux"};
+      "powerpc64le-unknown-linux-gnu", "powerpc64le-none-linux-gnu",
+      "powerpc64le-suse-linux", "ppc64le-redhat-linux"};
 
   static const char *const RISCV32LibDirs[] = {"/lib32", "/lib"};
   static const char *const RISCV32Triples[] = {"riscv32-unknown-linux-gnu",
-                                               "riscv32-linux-gnu",
                                                "riscv32-unknown-elf"};
   static const char *const RISCV64LibDirs[] = {"/lib64", "/lib"};
   static const char *const RISCV64Triples[] = {"riscv64-unknown-linux-gnu",
-                                               "riscv64-linux-gnu",
                                                "riscv64-unknown-elf"};
 
   static const char *const SPARCv8LibDirs[] = {"/lib32", "/lib"};
-  static const char *const SPARCv8Triples[] = {"sparc-linux-gnu",
-                                               "sparcv8-linux-gnu"};
+  static const char *const SPARCv8Triples[] = {"sparcv8-linux-gnu"};
   static const char *const SPARCv9LibDirs[] = {"/lib64", "/lib"};
-  static const char *const SPARCv9Triples[] = {"sparc64-linux-gnu",
-                                               "sparcv9-linux-gnu"};
+  static const char *const SPARCv9Triples[] = {"sparcv9-linux-gnu"};
 
   static const char *const SystemZLibDirs[] = {"/lib64", "/lib"};
   static const char *const SystemZTriples[] = {
-      "s390x-linux-gnu", "s390x-unknown-linux-gnu", "s390x-ibm-linux-gnu",
-      "s390x-suse-linux", "s390x-redhat-linux"};
-
+      "s390x-unknown-linux-gnu", "s390x-ibm-linux-gnu", "s390x-suse-linux",
+      "s390x-redhat-linux"};
 
   using std::begin;
   using std::end;

From 760c2aa55f0c5f56bed944328b23aa3f2f764346 Mon Sep 17 00:00:00 2001
From: PiJoules <6019989+PiJoules@users.noreply.github.com>
Date: Tue, 28 May 2024 15:37:03 -0700
Subject: [PATCH 029/230] [lld] Support thumb PLTs (#86223)

We are using PLTs for cortex-m33 which only supports thumb. More
specifically, this is for a very restricted use case. There's no MMU so
there's no sharing of virtual addresses between two processes, but this
is fine. The MCU is used for running [chre
nanoapps](https://android.googlesource.com/platform/system/chre/+/HEAD/doc/nanoapp_overview.md)
for android. Each nanoapp is a shared library (but effectively acts as
an executable containing a test suite) that is loaded and run on the MCU
one binary at a time and there's only one process running at a time, so
we ensure that the same text segment cannot be shared by two different
running executables. GNU LD supports thumb PLTs but we want to migrate
to a clang toolchain and use LLD, so thumb PLTs are needed.
---
 lld/ELF/Arch/ARM.cpp                 | 176 +++++++++++++++++++--------
 lld/ELF/Config.h                     |   1 +
 lld/ELF/InputFiles.cpp               |  12 ++
 lld/test/ELF/armv8-thumb-plt-reloc.s | 126 +++++++++++++++++++
 4 files changed, 262 insertions(+), 53 deletions(-)
 create mode 100644 lld/test/ELF/armv8-thumb-plt-reloc.s

diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp
index 687f9499009d5e..3e0efe540e1bf1 100644
--- a/lld/ELF/Arch/ARM.cpp
+++ b/lld/ELF/Arch/ARM.cpp
@@ -231,36 +231,71 @@ static void writePltHeaderLong(uint8_t *buf) {
 // The default PLT header requires the .got.plt to be within 128 Mb of the
 // .plt in the positive direction.
 void ARM::writePltHeader(uint8_t *buf) const {
-  // Use a similar sequence to that in writePlt(), the difference is the calling
-  // conventions mean we use lr instead of ip. The PLT entry is responsible for
-  // saving lr on the stack, the dynamic loader is responsible for reloading
-  // it.
-  const uint32_t pltData[] = {
-      0xe52de004, // L1: str lr, [sp,#-4]!
-      0xe28fe600, //     add lr, pc,  #0x0NN00000 &(.got.plt - L1 - 4)
-      0xe28eea00, //     add lr, lr,  #0x000NN000 &(.got.plt - L1 - 4)
-      0xe5bef000, //     ldr pc, [lr, #0x00000NNN] &(.got.plt -L1 - 4)
-  };
-
-  uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 4;
-  if (!llvm::isUInt<27>(offset)) {
-    // We cannot encode the Offset, use the long form.
-    writePltHeaderLong(buf);
-    return;
+  if (config->armThumbPLTs) {
+    // The instruction sequence for thumb:
+    //
+    // 0: b500          push    {lr}
+    // 2: f8df e008     ldr.w   lr, [pc, #0x8]          @ 0xc <func+0xc>
+    // 6: 44fe          add     lr, pc
+    // 8: f85e ff08     ldr     pc, [lr, #8]!
+    // c:               .word   .got.plt - .plt - 16
+    //
+    // At 0x8, we want to jump to .got.plt, the -16 accounts for 8 bytes from
+    // `pc` in the add instruction and 8 bytes for the `lr` adjustment.
+    //
+    uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 16;
+    assert(llvm::isUInt<32>(offset) && "This should always fit into a 32-bit offset");
+    write16(buf + 0, 0xb500);
+    // Split into two halves to support endianness correctly.
+    write16(buf + 2, 0xf8df);
+    write16(buf + 4, 0xe008);
+    write16(buf + 6, 0x44fe);
+    // Split into two halves to support endianness correctly.
+    write16(buf + 8, 0xf85e);
+    write16(buf + 10, 0xff08);
+    write32(buf + 12, offset);
+
+    memcpy(buf + 16, trapInstr.data(), 4);  // Pad to 32-byte boundary
+    memcpy(buf + 20, trapInstr.data(), 4);
+    memcpy(buf + 24, trapInstr.data(), 4);
+    memcpy(buf + 28, trapInstr.data(), 4);
+  } else {
+    // Use a similar sequence to that in writePlt(), the difference is the
+    // calling conventions mean we use lr instead of ip. The PLT entry is
+    // responsible for saving lr on the stack, the dynamic loader is responsible
+    // for reloading it.
+    const uint32_t pltData[] = {
+        0xe52de004, // L1: str lr, [sp,#-4]!
+        0xe28fe600, //     add lr, pc,  #0x0NN00000 &(.got.plt - L1 - 4)
+        0xe28eea00, //     add lr, lr,  #0x000NN000 &(.got.plt - L1 - 4)
+        0xe5bef000, //     ldr pc, [lr, #0x00000NNN] &(.got.plt -L1 - 4)
+    };
+
+    uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 4;
+    if (!llvm::isUInt<27>(offset)) {
+      // We cannot encode the Offset, use the long form.
+      writePltHeaderLong(buf);
+      return;
+    }
+    write32(buf + 0, pltData[0]);
+    write32(buf + 4, pltData[1] | ((offset >> 20) & 0xff));
+    write32(buf + 8, pltData[2] | ((offset >> 12) & 0xff));
+    write32(buf + 12, pltData[3] | (offset & 0xfff));
+    memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary
+    memcpy(buf + 20, trapInstr.data(), 4);
+    memcpy(buf + 24, trapInstr.data(), 4);
+    memcpy(buf + 28, trapInstr.data(), 4);
   }
-  write32(buf + 0, pltData[0]);
-  write32(buf + 4, pltData[1] | ((offset >> 20) & 0xff));
-  write32(buf + 8, pltData[2] | ((offset >> 12) & 0xff));
-  write32(buf + 12, pltData[3] | (offset & 0xfff));
-  memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary
-  memcpy(buf + 20, trapInstr.data(), 4);
-  memcpy(buf + 24, trapInstr.data(), 4);
-  memcpy(buf + 28, trapInstr.data(), 4);
 }
 
 void ARM::addPltHeaderSymbols(InputSection &isec) const {
-  addSyntheticLocal("$a", STT_NOTYPE, 0, 0, isec);
-  addSyntheticLocal("$d", STT_NOTYPE, 16, 0, isec);
+  if (config->armThumbPLTs) {
+    addSyntheticLocal("$t", STT_NOTYPE, 0, 0, isec);
+    addSyntheticLocal("$d", STT_NOTYPE, 12, 0, isec);
+  } else {
+    addSyntheticLocal("$a", STT_NOTYPE, 0, 0, isec);
+    addSyntheticLocal("$d", STT_NOTYPE, 16, 0, isec);
+  }
 }
 
 // Long form PLT entries that do not have any restrictions on the displacement
@@ -279,32 +314,65 @@ static void writePltLong(uint8_t *buf, uint64_t gotPltEntryAddr,
 // .plt in the positive direction.
 void ARM::writePlt(uint8_t *buf, const Symbol &sym,
                    uint64_t pltEntryAddr) const {
-  // The PLT entry is similar to the example given in Appendix A of ELF for
-  // the Arm Architecture. Instead of using the Group Relocations to find the
-  // optimal rotation for the 8-bit immediate used in the add instructions we
-  // hard code the most compact rotations for simplicity. This saves a load
-  // instruction over the long plt sequences.
-  const uint32_t pltData[] = {
-      0xe28fc600, // L1: add ip, pc,  #0x0NN00000  Offset(&(.got.plt) - L1 - 8
-      0xe28cca00, //     add ip, ip,  #0x000NN000  Offset(&(.got.plt) - L1 - 8
-      0xe5bcf000, //     ldr pc, [ip, #0x00000NNN] Offset(&(.got.plt) - L1 - 8
-  };
 
-  uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 8;
-  if (!llvm::isUInt<27>(offset)) {
-    // We cannot encode the Offset, use the long form.
-    writePltLong(buf, sym.getGotPltVA(), pltEntryAddr);
-    return;
+  if (!config->armThumbPLTs) {
+    uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 8;
+
+    // The PLT entry is similar to the example given in Appendix A of ELF for
+    // the Arm Architecture. Instead of using the Group Relocations to find the
+    // optimal rotation for the 8-bit immediate used in the add instructions we
+    // hard code the most compact rotations for simplicity. This saves a load
+    // instruction over the long plt sequences.
+    const uint32_t pltData[] = {
+        0xe28fc600, // L1: add ip, pc,  #0x0NN00000  Offset(&(.got.plt) - L1 - 8
+        0xe28cca00, //     add ip, ip,  #0x000NN000  Offset(&(.got.plt) - L1 - 8
+        0xe5bcf000, //     ldr pc, [ip, #0x00000NNN] Offset(&(.got.plt) - L1 - 8
+    };
+    if (!llvm::isUInt<27>(offset)) {
+      // We cannot encode the Offset, use the long form.
+      writePltLong(buf, sym.getGotPltVA(), pltEntryAddr);
+      return;
+    }
+    write32(buf + 0, pltData[0] | ((offset >> 20) & 0xff));
+    write32(buf + 4, pltData[1] | ((offset >> 12) & 0xff));
+    write32(buf + 8, pltData[2] | (offset & 0xfff));
+    memcpy(buf + 12, trapInstr.data(), 4); // Pad to 16-byte boundary
+  } else {
+    uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 12;
+    assert(llvm::isUInt<32>(offset) && "This should always fit into a 32-bit offset");
+
+    // A PLT entry will be:
+    //
+    //       movw ip, #<lower 16 bits>
+    //       movt ip, #<upper 16 bits>
+    //       add ip, pc
+    //   L1: ldr.w pc, [ip]
+    //       b L1
+    //
+    // where ip = r12 = 0xc
+
+    // movw ip, #<lower 16 bits>
+    write16(buf + 2, 0x0c00); // use `ip`
+    relocateNoSym(buf, R_ARM_THM_MOVW_ABS_NC, offset);
+
+    // movt ip, #<upper 16 bits>
+    write16(buf + 6, 0x0c00); // use `ip`
+    relocateNoSym(buf + 4, R_ARM_THM_MOVT_ABS, offset);
+
+    write16(buf + 8, 0x44fc);       // add ip, pc
+    write16(buf + 10, 0xf8dc);      // ldr.w   pc, [ip] (bottom half)
+    write16(buf + 12, 0xf000);      // ldr.w   pc, [ip] (upper half)
+    write16(buf + 14, 0xe7fc);      // Branch to previous instruction
   }
-  write32(buf + 0, pltData[0] | ((offset >> 20) & 0xff));
-  write32(buf + 4, pltData[1] | ((offset >> 12) & 0xff));
-  write32(buf + 8, pltData[2] | (offset & 0xfff));
-  memcpy(buf + 12, trapInstr.data(), 4); // Pad to 16-byte boundary
 }
 
 void ARM::addPltSymbols(InputSection &isec, uint64_t off) const {
-  addSyntheticLocal("$a", STT_NOTYPE, off, 0, isec);
-  addSyntheticLocal("$d", STT_NOTYPE, off + 12, 0, isec);
+  if (config->armThumbPLTs) {
+    addSyntheticLocal("$t", STT_NOTYPE, off, 0, isec);
+  } else {
+    addSyntheticLocal("$a", STT_NOTYPE, off, 0, isec);
+    addSyntheticLocal("$d", STT_NOTYPE, off + 12, 0, isec);
+  }
 }
 
 bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file,
@@ -325,6 +393,8 @@ bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file,
   case R_ARM_JUMP24:
     // Source is ARM, all PLT entries are ARM so no interworking required.
     // Otherwise we need to interwork if STT_FUNC Symbol has bit 0 set (Thumb).
+    assert(!config->armThumbPLTs &&
+           "If the source is ARM, we should not need Thumb PLTs");
     if (s.isFunc() && expr == R_PC && (s.getVA() & 1))
       return true;
     [[fallthrough]];
@@ -335,9 +405,9 @@ bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file,
   }
   case R_ARM_THM_JUMP19:
   case R_ARM_THM_JUMP24:
-    // Source is Thumb, all PLT entries are ARM so interworking is required.
+    // Source is Thumb; when all PLT entries are ARM, interworking is required.
     // Otherwise we need to interwork if STT_FUNC Symbol has bit 0 clear (ARM).
-    if (expr == R_PLT_PC || (s.isFunc() && (s.getVA() & 1) == 0))
+    if ((expr == R_PLT_PC && !config->armThumbPLTs) || (s.isFunc() && (s.getVA() & 1) == 0))
       return true;
     [[fallthrough]];
   case R_ARM_THM_CALL: {
@@ -547,7 +617,6 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const {
     // STT_FUNC we choose whether to write a BL or BLX depending on the
     // value of bit 0 of Val. With bit 0 == 1 denoting Thumb. If the symbol is
     // not of type STT_FUNC then we must preserve the original instruction.
-    // PLT entries are always ARM state so we know we don't need to interwork.
     assert(rel.sym); // R_ARM_CALL is always reached via relocate().
     bool bit0Thumb = val & 1;
     bool isBlx = (read32(loc) & 0xfe000000) == 0xfa000000;
@@ -606,12 +675,13 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const {
     // PLT entries are always ARM state so we know we need to interwork.
     assert(rel.sym); // R_ARM_THM_CALL is always reached via relocate().
     bool bit0Thumb = val & 1;
+    bool useThumb = bit0Thumb || config->armThumbPLTs;
     bool isBlx = (read16(loc + 2) & 0x1000) == 0;
     // lld 10.0 and before always used bit0Thumb when deciding to write a BLX
-    // even when type not STT_FUNC. PLT entries generated by LLD are always ARM.
-    if (!rel.sym->isFunc() && !rel.sym->isInPlt() && isBlx == bit0Thumb)
+    // even when type not STT_FUNC.
+    if (!rel.sym->isFunc() && !rel.sym->isInPlt() && isBlx == useThumb)
       stateChangeWarning(loc, rel.type, *rel.sym);
-    if (rel.sym->isFunc() || rel.sym->isInPlt() ? !bit0Thumb : isBlx) {
+    if ((rel.sym->isFunc() || rel.sym->isInPlt()) ? !useThumb : isBlx) {
       // We are writing a BLX. Ensure BLX destination is 4-byte aligned. As
       // the BLX instruction may only be two byte aligned. This must be done
       // before overflow check.
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index f0dfe7f377de0e..883c4a2f84294c 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -217,6 +217,7 @@ struct Config {
   bool allowMultipleDefinition;
   bool fatLTOObjects;
   bool androidPackDynRelocs = false;
+  bool armThumbPLTs = false;
   bool armHasBlx = false;
   bool armHasMovtMovw = false;
   bool armJ1J2BranchEncoding = false;
diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp
index 1f496026d3ae20..d760dddcf5ec5c 100644
--- a/lld/ELF/InputFiles.cpp
+++ b/lld/ELF/InputFiles.cpp
@@ -194,6 +194,18 @@ static void updateSupportedARMFeatures(const ARMAttributeParser &attributes) {
   if (arch >= ARMBuildAttrs::CPUArch::v8_M_Base &&
       profile == ARMBuildAttrs::MicroControllerProfile)
     config->armCMSESupport = true;
+
+  // The thumb PLT entries require Thumb2 which can be used on multiple archs.
+  // For now, let's limit it to ones where ARM isn't available and we know we
+  // have Thumb2.
+  std::optional<unsigned> armISA =
+      attributes.getAttributeValue(ARMBuildAttrs::ARM_ISA_use);
+  std::optional<unsigned> thumb =
+      attributes.getAttributeValue(ARMBuildAttrs::THUMB_ISA_use);
+  bool noArmISA = !armISA || *armISA == ARMBuildAttrs::Not_Allowed;
+  bool hasThumb2 = thumb && *thumb >= ARMBuildAttrs::AllowThumb32;
+  if (noArmISA && hasThumb2)
+    config->armThumbPLTs = true;
 }
 
 InputFile::InputFile(Kind k, MemoryBufferRef m)
diff --git a/lld/test/ELF/armv8-thumb-plt-reloc.s b/lld/test/ELF/armv8-thumb-plt-reloc.s
new file mode 100644
index 00000000000000..47cd5c1b741ee0
--- /dev/null
+++ b/lld/test/ELF/armv8-thumb-plt-reloc.s
@@ -0,0 +1,126 @@
+// REQUIRES: arm
+// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumb --mcpu=cortex-m33 %p/Inputs/arm-plt-reloc.s -o %t1
+// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumb --mcpu=cortex-m33 %s -o %t2
+// RUN: ld.lld %t1 %t2 -o %t
+// RUN: llvm-objdump --no-print-imm-hex -d %t | FileCheck %s
+// RUN: ld.lld -shared %t1 %t2 -o %t.so
+// RUN: llvm-objdump --no-print-imm-hex -d %t.so | FileCheck --check-prefix=DSO %s
+// RUN: llvm-readelf -S -r %t.so | FileCheck -check-prefix=DSOREL %s
+
+// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumbeb --mcpu=cortex-m33 %p/Inputs/arm-plt-reloc.s -o %t1.be
+// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumbeb --mcpu=cortex-m33 %s -o %t2.be
+// RUN: ld.lld %t1.be %t2.be -o %t.be
+// RUN: llvm-objdump --no-print-imm-hex -d %t.be | FileCheck %s
+// RUN: ld.lld -shared %t1.be %t2.be -o %t.so.be
+// RUN: llvm-objdump --no-print-imm-hex -d %t.so.be | FileCheck --check-prefix=DSO %s
+// RUN: llvm-readelf -S -r %t.so.be | FileCheck -check-prefix=DSOREL %s
+
+// RUN: ld.lld --be8 %t1.be %t2.be -o %t.be
+// RUN: llvm-objdump --no-print-imm-hex -d %t.be | FileCheck %s
+// RUN: ld.lld --be8 -shared %t1.be %t2.be -o %t.so.be
+// RUN: llvm-objdump --no-print-imm-hex -d %t.so.be | FileCheck --check-prefix=DSO %s
+// RUN: llvm-readelf -S -r %t.so.be | FileCheck -check-prefix=DSOREL %s
+
+/// Test PLT entry generation
+ .text
+ .align 2
+ .globl _start
+ .type  _start,%function
+_start:
+ bl func1
+ bl func2
+ bl func3
+ b.w func1
+ b.w func2
+ b.w func3
+ beq.w func1
+ beq.w func2
+ beq.w func3
+
+/// Executable, expect no PLT
+// CHECK: Disassembly of section .text:
+// CHECK-EMPTY:
+// CHECK-NEXT: <func1>:
+// CHECK-NEXT:   bx      lr
+// CHECK: <func2>:
+// CHECK-NEXT:   bx      lr
+// CHECK: <func3>:
+// CHECK-NEXT:   bx      lr
+// CHECK-NEXT:   d4d4 
+// CHECK: <_start>:
+// CHECK-NEXT:   bl      {{.*}} <func1>
+// CHECK-NEXT:   bl      {{.*}} <func2>
+// CHECK-NEXT:   bl      {{.*}} <func3>
+// CHECK-NEXT:   b.w     {{.*}} <func1>
+// CHECK-NEXT:   b.w     {{.*}} <func2>
+// CHECK-NEXT:   b.w     {{.*}} <func3>
+// CHECK-NEXT:   beq.w	 {{.*}} <func1>
+// CHECK-NEXT:   beq.w	 {{.*}} <func2>
+// CHECK-NEXT:   beq.w	 {{.*}} <func3>
+
+// DSO: Disassembly of section .text:
+// DSO-EMPTY:
+// DSO-NEXT: <func1>:
+// DSO-NEXT:     bx      lr
+// DSO: <func2>:
+// DSO-NEXT:     bx      lr
+// DSO: <func3>:
+// DSO-NEXT:     bx      lr
+// DSO-NEXT:     d4d4 
+// DSO: <_start>:
+/// 0x10260 = PLT func1
+// DSO-NEXT:     bl     0x10260
+/// 0x10270 = PLT func2
+// DSO-NEXT:     bl     0x10270
+/// 0x10280 = PLT func3
+// DSO-NEXT:     bl     0x10280
+/// 0x10260 = PLT func1
+// DSO-NEXT:     b.w    0x10260
+/// 0x10270 = PLT func2
+// DSO-NEXT:     b.w    0x10270
+/// 0x10280 = PLT func3
+// DSO-NEXT:     b.w    0x10280
+/// 0x10260 = PLT func1
+// DSO-NEXT:     beq.w	 0x10260
+/// 0x10270 = PLT func2
+// DSO-NEXT:     beq.w	 0x10270
+/// 0x10280 = PLT func3
+// DSO-NEXT:     beq.w	 0x10280
+// DSO: Disassembly of section .plt:
+// DSO-EMPTY:
+// DSO-NEXT: 10240 <.plt>:
+// DSO-NEXT:     push    {lr}
+// DSO-NEXT:     ldr.w   lr, [pc, #8]
+// DSO-NEXT:     add     lr, pc
+// DSO-NEXT:     ldr     pc, [lr, #8]!
+/// 0x20098 = .got.plt (0x302E8) - pc (0x10248 = .plt + 8) - 8
+// DSO-NEXT:     .word   0x00020098
+// DSO-NEXT:     .word   0xd4d4d4d4
+// DSO-NEXT:     .word   0xd4d4d4d4
+// DSO-NEXT:     .word   0xd4d4d4d4
+// DSO-NEXT:     .word   0xd4d4d4d4
+
+/// 136 + 2 << 16 + 0x1026c = 0x302f4 = got entry 1
+// DSO-NEXT:     10260:       f240 0c88     movw    r12, #136
+// DSO-NEXT:                  f2c0 0c02     movt    r12, #2
+// DSO-NEXT:                  44fc          add     r12, pc
+// DSO-NEXT:                  f8dc f000     ldr.w   pc, [r12]
+// DSO-NEXT:                  e7fc          b       0x1026a
+/// 124 + 2 << 16 + 0x1027c = 0x302f8 = got entry 2
+// DSO-NEXT:     10270:       f240 0c7c     movw    r12, #124
+// DSO-NEXT:                  f2c0 0c02     movt    r12, #2
+// DSO-NEXT:                  44fc          add     r12, pc
+// DSO-NEXT:                  f8dc f000     ldr.w   pc, [r12]
+// DSO-NEXT:                  e7fc          b       0x1027a
+/// 112 + 2 << 16 + 0x1028c = 0x302fc = got entry 3
+// DSO-NEXT:     10280:       f240 0c70     movw    r12, #112
+// DSO-NEXT:                  f2c0 0c02     movt    r12, #2
+// DSO-NEXT:                  44fc          add     r12, pc
+// DSO-NEXT:                  f8dc f000     ldr.w   pc, [r12]
+// DSO-NEXT:                  e7fc          b       0x1028a
+
+// DSOREL: .got.plt PROGBITS 000302e8 {{.*}} 000018 00  WA  0   0  4
+// DSOREL: Relocation section '.rel.plt'
+// DSOREL: 000302f4 {{.*}} R_ARM_JUMP_SLOT {{.*}} func1
+// DSOREL: 000302f8 {{.*}} R_ARM_JUMP_SLOT {{.*}} func2
+// DSOREL: 000302fc {{.*}} R_ARM_JUMP_SLOT {{.*}} func3

From f7c8a0339c64810a3c1b28d9b3b20e02a2be6232 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Tue, 28 May 2024 15:54:44 -0700
Subject: [PATCH 030/230] [RISCV] Combine vXi32 (mul (and (lshr X, 15),
 0x10001), 0xffff) -> (bitcast (sra (v2Xi16 (bitcast X)), 15)) (#93565)

Similar for i16 and i64 elements for both fixed and scalable vectors.

This reduces the number of vector instructions, but increases vl/vtype
toggles.

This reduces some code in 525.x264_r from SPEC2017. In that usage, the
vectors are fixed with a small number of elements so vsetivli can be
used.

This is similar to `performMulVectorCmpZeroCombine` from AArch64.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp |  41 +++++++
 llvm/test/CodeGen/RISCV/rvv/mul-combine.ll  | 117 ++++++++++++++++++++
 2 files changed, 158 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/mul-combine.ll

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 5fc613c1b2a140..e99c6208594e3b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -13704,6 +13704,44 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Combine vXi32 (mul (and (lshr X, 15), 0x10001), 0xffff) ->
+// (bitcast (sra (v2Xi16 (bitcast X)), 15))
+// Same for other equivalent types with other equivalent constants.
+static SDValue combineVectorMulToSraBitcast(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // Do this for legal vectors unless they are i1 or i8 vectors.
+  if (!VT.isVector() || !TLI.isTypeLegal(VT) || VT.getScalarSizeInBits() < 16)
+    return SDValue();
+
+  if (N->getOperand(0).getOpcode() != ISD::AND ||
+      N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
+    return SDValue();
+
+  SDValue And = N->getOperand(0);
+  SDValue Srl = And.getOperand(0);
+
+  APInt V1, V2, V3;
+  if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
+      !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
+      !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
+    return SDValue();
+
+  unsigned HalfSize = VT.getScalarSizeInBits() / 2;
+  if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
+      V3 != (HalfSize - 1))
+    return SDValue();
+
+  EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
+                                EVT::getIntegerVT(*DAG.getContext(), HalfSize),
+                                VT.getVectorElementCount() * 2);
+  SDLoc DL(N);
+  SDValue Cast = DAG.getNode(ISD::BITCAST, DL, HalfVT, Srl.getOperand(0));
+  SDValue Sra = DAG.getNode(ISD::SRA, DL, HalfVT, Cast,
+                            DAG.getConstant(HalfSize - 1, DL, HalfVT));
+  return DAG.getNode(ISD::BITCAST, DL, VT, Sra);
+}
 
 static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
@@ -13748,6 +13786,9 @@ static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG,
   if (SDValue V = combineBinOpOfZExt(N, DAG))
     return V;
 
+  if (SDValue V = combineVectorMulToSraBitcast(N, DAG))
+    return V;
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/mul-combine.ll b/llvm/test/CodeGen/RISCV/rvv/mul-combine.ll
new file mode 100644
index 00000000000000..6a7da925b4d43d
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/mul-combine.ll
@@ -0,0 +1,117 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-RV64
+
+define <2 x i16> @test_v2i16(<2 x i16> %x) {
+; CHECK-RV32-LABEL: test_v2i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-RV32-NEXT:    vsra.vi v8, v8, 7
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_v2i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-RV64-NEXT:    vsra.vi v8, v8, 7
+; CHECK-RV64-NEXT:    ret
+  %1 = lshr <2 x i16> %x, <i16 7, i16 7>
+  %2 = and <2 x i16> %1, <i16 257, i16 257>
+  %3 = mul <2 x i16> %2, <i16 255, i16 255>
+  ret <2 x i16> %3
+}
+
+define <vscale x 2 x i16> @test_nxv2i16(<vscale x 2 x i16> %x) {
+; CHECK-RV32-LABEL: test_nxv2i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-RV32-NEXT:    vsrl.vi v8, v8, 7
+; CHECK-RV32-NEXT:    li a0, 257
+; CHECK-RV32-NEXT:    vand.vx v8, v8, a0
+; CHECK-RV32-NEXT:    vsll.vi v8, v8, 8
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_nxv2i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-RV64-NEXT:    vsrl.vi v8, v8, 7
+; CHECK-RV64-NEXT:    li a0, 257
+; CHECK-RV64-NEXT:    vand.vx v8, v8, a0
+; CHECK-RV64-NEXT:    vsll.vi v8, v8, 8
+; CHECK-RV64-NEXT:    ret
+  %1 = lshr <vscale x 2 x i16> %x, splat (i16 7)
+  %2 = and <vscale x 2 x i16> %1, splat (i16 257)
+  %3 = mul <vscale x 2 x i16> %2, splat (i16 256)
+  ret <vscale x 2 x i16> %3
+}
+
+define <2 x i32> @test_v2i32(<2 x i32> %x) {
+; CHECK-RV32-LABEL: test_v2i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-RV32-NEXT:    vsra.vi v8, v8, 15
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_v2i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-RV64-NEXT:    vsra.vi v8, v8, 15
+; CHECK-RV64-NEXT:    ret
+  %1 = lshr <2 x i32> %x, <i32 15, i32 15>
+  %2 = and <2 x i32> %1, <i32 65537, i32 65537>
+  %3 = mul <2 x i32> %2, <i32 65535, i32 65535>
+  ret <2 x i32> %3
+}
+
+define <vscale x 2 x i32> @test_nxv2i32(<vscale x 2 x i32> %x) {
+; CHECK-RV32-LABEL: test_nxv2i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-RV32-NEXT:    vsra.vi v8, v8, 15
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_nxv2i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-RV64-NEXT:    vsra.vi v8, v8, 15
+; CHECK-RV64-NEXT:    ret
+  %1 = lshr <vscale x 2 x i32> %x, splat (i32 15)
+  %2 = and <vscale x 2 x i32> %1, splat (i32 65537)
+  %3 = mul <vscale x 2 x i32> %2, splat (i32 65535)
+  ret <vscale x 2 x i32> %3
+}
+
+define <2 x i64> @test_v2i64(<2 x i64> %x) {
+; CHECK-RV32-LABEL: test_v2i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV32-NEXT:    vsra.vi v8, v8, 31
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_v2i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV64-NEXT:    vsra.vi v8, v8, 31
+; CHECK-RV64-NEXT:    ret
+  %1 = lshr <2 x i64> %x, <i64 31, i64 31>
+  %2 = and <2 x i64> %1, <i64 4294967297, i64 4294967297>
+  %3 = mul <2 x i64> %2, <i64 4294967295, i64 4294967295>
+  ret <2 x i64> %3
+}
+
+define <vscale x 2 x i64> @test_nxv2i64(<vscale x 2 x i64> %x) {
+; CHECK-RV32-LABEL: test_nxv2i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-RV32-NEXT:    vsra.vi v8, v8, 31
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_nxv2i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-RV64-NEXT:    vsra.vi v8, v8, 31
+; CHECK-RV64-NEXT:    ret
+  %1 = lshr <vscale x 2 x i64> %x, splat (i64 31)
+  %2 = and <vscale x 2 x i64> %1, splat (i64 4294967297)
+  %3 = mul <vscale x 2 x i64> %2, splat (i64 4294967295)
+  ret <vscale x 2 x i64> %3
+}

From 0694552cb7e8b2041fd5e765cf5b83fc40664087 Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan <yifanzhu@rochester.edu>
Date: Tue, 28 May 2024 15:56:17 -0700
Subject: [PATCH 031/230] [libc] clean up MutexLock (#93619)

---
 libc/src/__support/threads/linux/CMakeLists.txt |  1 +
 libc/src/__support/threads/linux/CndVar.cpp     |  7 ++++---
 libc/src/__support/threads/mutex.h              | 14 --------------
 3 files changed, 5 insertions(+), 17 deletions(-)

diff --git a/libc/src/__support/threads/linux/CMakeLists.txt b/libc/src/__support/threads/linux/CMakeLists.txt
index 39c4ad20201ca6..f6913ef0834289 100644
--- a/libc/src/__support/threads/linux/CMakeLists.txt
+++ b/libc/src/__support/threads/linux/CMakeLists.txt
@@ -75,4 +75,5 @@ add_object_library(
     libc.src.__support.OSUtil.osutil
     libc.src.__support.threads.linux.futex_word_type
     libc.src.__support.threads.mutex
+    libc.src.__support.CPP.mutex
 )
diff --git a/libc/src/__support/threads/linux/CndVar.cpp b/libc/src/__support/threads/linux/CndVar.cpp
index daf56bca1ed21b..b3a0fdbda4e9ea 100644
--- a/libc/src/__support/threads/linux/CndVar.cpp
+++ b/libc/src/__support/threads/linux/CndVar.cpp
@@ -7,9 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/threads/CndVar.h"
+#include "src/__support/CPP/mutex.h"
 #include "src/__support/OSUtil/syscall.h"           // syscall_impl
 #include "src/__support/threads/linux/futex_word.h" // FutexWordType
-#include "src/__support/threads/mutex.h"            // Mutex, MutexLock
+#include "src/__support/threads/mutex.h"            // Mutex
 
 #include <sys/syscall.h> // For syscall numbers.
 
@@ -27,7 +28,7 @@ int CndVar::wait(Mutex *m) {
 
   CndWaiter waiter;
   {
-    MutexLock ml(&qmtx);
+    cpp::lock_guard ml(qmtx);
     CndWaiter *old_back = nullptr;
     if (waitq_front == nullptr) {
       waitq_front = waitq_back = &waiter;
@@ -83,7 +84,7 @@ void CndVar::notify_one() {
 }
 
 void CndVar::broadcast() {
-  MutexLock ml(&qmtx);
+  cpp::lock_guard ml(qmtx);
   uint32_t dummy_futex_word;
   CndWaiter *waiter = waitq_front;
   waitq_front = waitq_back = nullptr;
diff --git a/libc/src/__support/threads/mutex.h b/libc/src/__support/threads/mutex.h
index 9dded2e3f952a1..392b38984dc0ae 100644
--- a/libc/src/__support/threads/mutex.h
+++ b/libc/src/__support/threads/mutex.h
@@ -43,18 +43,4 @@
 #include "src/__support/threads/gpu/mutex.h"
 #endif // __linux__
 
-namespace LIBC_NAMESPACE {
-
-// An RAII class for easy locking and unlocking of mutexes.
-class MutexLock {
-  Mutex *mutex;
-
-public:
-  explicit MutexLock(Mutex *m) : mutex(m) { mutex->lock(); }
-
-  ~MutexLock() { mutex->unlock(); }
-};
-
-} // namespace LIBC_NAMESPACE
-
 #endif // LLVM_LIBC_SRC___SUPPORT_THREADS_MUTEX_H

From c179d50fd3d84311708701d84e3bca60570d3d7f Mon Sep 17 00:00:00 2001
From: Heejin Ahn <aheejin@gmail.com>
Date: Tue, 28 May 2024 16:10:11 -0700
Subject: [PATCH 032/230] [WebAssembly] Add exnref type (#93586)

This adds (back) the exnref type restored in the new EH proposal adopted
in Oct 2023 CG meeting:

https://github.com/WebAssembly/exception-handling/blob/main/proposals/exception-handling/Exceptions.md
---
 lld/wasm/WriterUtils.cpp                      |  2 ++
 llvm/include/llvm/BinaryFormat/Wasm.h         |  9 ++++---
 llvm/include/llvm/CodeGen/ValueTypes.td       |  9 ++++---
 llvm/include/llvm/IR/Intrinsics.td            |  2 ++
 llvm/include/llvm/IR/IntrinsicsWebAssembly.td | 18 +++++++++++++
 llvm/lib/CodeGen/ValueTypes.cpp               |  1 +
 llvm/lib/Object/WasmObjectFile.cpp            |  8 ++++--
 llvm/lib/ObjectYAML/WasmYAML.cpp              |  2 ++
 .../MCTargetDesc/WebAssemblyMCTargetDesc.h    | 12 +++++++++
 .../WebAssemblyMCTypeUtilities.cpp            |  6 +++++
 .../MCTargetDesc/WebAssemblyMCTypeUtilities.h |  4 ++-
 .../Utils/WebAssemblyTypeUtilities.cpp        |  3 +++
 .../WebAssembly/WebAssemblyAsmPrinter.cpp     |  2 ++
 .../WebAssembly/WebAssemblyExplicitLocals.cpp | 10 +++++++
 .../WebAssembly/WebAssemblyFastISel.cpp       | 16 ++++++++++++
 .../WebAssembly/WebAssemblyISelLowering.cpp   |  3 +++
 .../WebAssembly/WebAssemblyInstrInfo.td       |  3 +++
 .../Target/WebAssembly/WebAssemblyInstrRef.td |  8 +++---
 .../WebAssembly/WebAssemblyInstrTable.td      |  2 ++
 .../WebAssembly/WebAssemblyRegStackify.cpp    |  2 ++
 .../WebAssembly/WebAssemblyRegisterInfo.td    |  2 ++
 .../WebAssembly/WebAssemblyUtilities.cpp      |  2 ++
 .../test/CodeGen/WebAssembly/reg-argument.mir | 11 ++++++++
 llvm/test/CodeGen/WebAssembly/reg-copy.mir    | 11 ++++++++
 llvm/test/MC/WebAssembly/basic-assembly.s     | 21 +++++++++------
 llvm/test/MC/WebAssembly/reference-types.s    | 26 +++++++++++++++++--
 .../test/MC/WebAssembly/type-checker-errors.s | 16 ++++++++++++
 27 files changed, 188 insertions(+), 23 deletions(-)

diff --git a/lld/wasm/WriterUtils.cpp b/lld/wasm/WriterUtils.cpp
index cdd2c42f939efe..c6a1592012e64c 100644
--- a/lld/wasm/WriterUtils.cpp
+++ b/lld/wasm/WriterUtils.cpp
@@ -35,6 +35,8 @@ std::string toString(ValType type) {
     return "funcref";
   case ValType::EXTERNREF:
     return "externref";
+  case ValType::EXNREF:
+    return "exnref";
   case ValType::OTHERREF:
     return "otherref";
   }
diff --git a/llvm/include/llvm/BinaryFormat/Wasm.h b/llvm/include/llvm/BinaryFormat/Wasm.h
index 38ef8e37df91d3..acf89885af6fdb 100644
--- a/llvm/include/llvm/BinaryFormat/Wasm.h
+++ b/llvm/include/llvm/BinaryFormat/Wasm.h
@@ -58,15 +58,16 @@ enum : unsigned {
   WASM_TYPE_V128 = 0x7B,
   WASM_TYPE_NULLFUNCREF = 0x73,
   WASM_TYPE_NULLEXTERNREF = 0x72,
+  WASM_TYPE_NULLEXNREF = 0x74,
   WASM_TYPE_NULLREF = 0x71,
   WASM_TYPE_FUNCREF = 0x70,
   WASM_TYPE_EXTERNREF = 0x6F,
+  WASM_TYPE_EXNREF = 0x69,
   WASM_TYPE_ANYREF = 0x6E,
   WASM_TYPE_EQREF = 0x6D,
   WASM_TYPE_I31REF = 0x6C,
   WASM_TYPE_STRUCTREF = 0x6B,
   WASM_TYPE_ARRAYREF = 0x6A,
-  WASM_TYPE_EXNREF = 0x69,
   WASM_TYPE_NONNULLABLE = 0x64,
   WASM_TYPE_NULLABLE = 0x63,
   WASM_TYPE_FUNC = 0x60,
@@ -261,8 +262,9 @@ enum class ValType {
   V128 = WASM_TYPE_V128,
   FUNCREF = WASM_TYPE_FUNCREF,
   EXTERNREF = WASM_TYPE_EXTERNREF,
+  EXNREF = WASM_TYPE_EXNREF,
   // Unmodeled value types include ref types with heap types other than
-  // func or extern, and type-specialized funcrefs
+  // func, extern or exn, and type-specialized funcrefs
   OTHERREF = 0xff,
 };
 
@@ -410,7 +412,8 @@ struct WasmDataSegment {
 // 1) Does not model passive or declarative segments (Segment will end up with
 // an Offset field of i32.const 0)
 // 2) Does not model init exprs (Segment will get an empty Functions list)
-// 2) Does not model types other than basic funcref/externref (see ValType)
+// 3) Does not model types other than basic funcref/externref/exnref (see
+// ValType)
 struct WasmElemSegment {
   uint32_t Flags;
   uint32_t TableNumber;
diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index c3e378ed8f6edb..e322cc04c1c769 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -280,11 +280,12 @@ def untyped   : ValueType<8,    193> { // Produces an untyped value
 }
 def funcref   : ValueType<0,    194>;  // WebAssembly's funcref type
 def externref : ValueType<0,    195>;  // WebAssembly's externref type
-def x86amx    : ValueType<8192, 196>;  // X86 AMX value
-def i64x8     : ValueType<512,  197>;  // 8 Consecutive GPRs (AArch64)
+def exnref    : ValueType<0,    196>;  // WebAssembly's exnref type
+def x86amx    : ValueType<8192, 197>;  // X86 AMX value
+def i64x8     : ValueType<512,  198>;  // 8 Consecutive GPRs (AArch64)
 def aarch64svcount
-              : ValueType<16,  198>;  // AArch64 predicate-as-counter
-def spirvbuiltin : ValueType<0, 199>; // SPIR-V's builtin type
+              : ValueType<16,  199>;  // AArch64 predicate-as-counter
+def spirvbuiltin : ValueType<0, 200>; // SPIR-V's builtin type
 
 def token      : ValueType<0, 248>;  // TokenTy
 def MetadataVT : ValueType<0, 249> { // Metadata
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 3019f68083d422..c3ac53837444ef 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -316,6 +316,7 @@ def IIT_PPCF128 : IIT_VT<ppcf128, 52>;
 def IIT_V3 : IIT_Vec<3, 53>;
 def IIT_EXTERNREF : IIT_VT<externref, 54>;
 def IIT_FUNCREF : IIT_VT<funcref, 55>;
+def IIT_EXNREF: IIT_VT<exnref, 56>;
 def IIT_I2 : IIT_Int<2, 57>;
 def IIT_I4 : IIT_Int<4, 58>;
 def IIT_AARCH64_SVCOUNT : IIT_VT<aarch64svcount, 59>;
@@ -581,6 +582,7 @@ def llvm_vararg_ty     : LLVMType<isVoid>;   // this means vararg here
 
 def llvm_externref_ty  : LLVMType<externref>;
 def llvm_funcref_ty    : LLVMType<funcref>;
+def llvm_exnref_ty     : LLVMType<exnref>;
 
 //===----------------------------------------------------------------------===//
 
diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
index 237f268784bb02..47aab196a6d4f9 100644
--- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -31,12 +31,17 @@ def int_wasm_ref_null_extern :
   DefaultAttrsIntrinsic<[llvm_externref_ty], [], [IntrNoMem]>;
 def int_wasm_ref_null_func :
   DefaultAttrsIntrinsic<[llvm_funcref_ty], [], [IntrNoMem]>;
+def int_wasm_ref_null_exn:
+  DefaultAttrsIntrinsic<[llvm_exnref_ty], [], [IntrNoMem]>;
 def int_wasm_ref_is_null_extern :
   DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_externref_ty], [IntrNoMem],
                         "llvm.wasm.ref.is_null.extern">;
 def int_wasm_ref_is_null_func :
   DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_funcref_ty],
                         [IntrNoMem], "llvm.wasm.ref.is_null.func">;
+def int_wasm_ref_is_null_exn :
+  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_exnref_ty], [IntrNoMem],
+                        "llvm.wasm.ref.is_null.exn">;
 
 //===----------------------------------------------------------------------===//
 // Table intrinsics
@@ -47,6 +52,9 @@ def int_wasm_table_set_externref :
 def int_wasm_table_set_funcref :
   DefaultAttrsIntrinsic<[], [llvm_table_ty, llvm_i32_ty, llvm_funcref_ty],
                         [IntrWriteMem]>;
+def int_wasm_table_set_exnref :
+  DefaultAttrsIntrinsic<[], [llvm_table_ty, llvm_i32_ty, llvm_exnref_ty],
+                        [IntrWriteMem]>;
 
 def int_wasm_table_get_externref :
   DefaultAttrsIntrinsic<[llvm_externref_ty], [llvm_table_ty, llvm_i32_ty],
@@ -54,6 +62,9 @@ def int_wasm_table_get_externref :
 def int_wasm_table_get_funcref :
   DefaultAttrsIntrinsic<[llvm_funcref_ty], [llvm_table_ty, llvm_i32_ty],
                         [IntrReadMem]>;
+def int_wasm_table_get_exnref :
+  DefaultAttrsIntrinsic<[llvm_exnref_ty], [llvm_table_ty, llvm_i32_ty],
+                        [IntrReadMem]>;
 
 // Query the current table size, and increase the current table size.
 def int_wasm_table_size :
@@ -68,6 +79,9 @@ def int_wasm_table_grow_externref :
 def int_wasm_table_grow_funcref :
   DefaultAttrsIntrinsic<[llvm_i32_ty],
                         [llvm_table_ty, llvm_funcref_ty, llvm_i32_ty], []>;
+def int_wasm_table_grow_exnref :
+  DefaultAttrsIntrinsic<[llvm_i32_ty],
+                        [llvm_table_ty, llvm_exnref_ty, llvm_i32_ty], []>;
 def int_wasm_table_fill_externref :
   DefaultAttrsIntrinsic<[],
                         [llvm_table_ty, llvm_i32_ty, llvm_externref_ty,
@@ -76,6 +90,10 @@ def int_wasm_table_fill_funcref :
   DefaultAttrsIntrinsic<[],
                         [llvm_table_ty, llvm_i32_ty, llvm_funcref_ty,
                          llvm_i32_ty], []>;
+def int_wasm_table_fill_exnref :
+  DefaultAttrsIntrinsic<[],
+                        [llvm_table_ty, llvm_i32_ty, llvm_exnref_ty,
+                         llvm_i32_ty], []>;
 
 //===----------------------------------------------------------------------===//
 // Trapping float-to-int conversions
diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp
index 3d5c58d282da56..df1c02c3dc67c2 100644
--- a/llvm/lib/CodeGen/ValueTypes.cpp
+++ b/llvm/lib/CodeGen/ValueTypes.cpp
@@ -181,6 +181,7 @@ std::string EVT::getEVTString() const {
   case MVT::Metadata:  return "Metadata";
   case MVT::Untyped:   return "Untyped";
   case MVT::funcref:   return "funcref";
+  case MVT::exnref:    return "exnref";
   case MVT::externref: return "externref";
   case MVT::aarch64svcount:
     return "aarch64svcount";
diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp
index 6507a0e5950ebe..23381955c60a88 100644
--- a/llvm/lib/Object/WasmObjectFile.cpp
+++ b/llvm/lib/Object/WasmObjectFile.cpp
@@ -177,8 +177,8 @@ static uint8_t readOpcode(WasmObjectFile::ReadContext &Ctx) {
 
 static wasm::ValType parseValType(WasmObjectFile::ReadContext &Ctx,
                                   uint32_t Code) {
-  // only directly encoded FUNCREF/EXTERNREF are supported
-  // (not ref null func or ref null extern)
+  // only directly encoded FUNCREF/EXTERNREF/EXNREF are supported
+  // (not ref null func, ref null extern, or ref null exn)
   switch (Code) {
   case wasm::WASM_TYPE_I32:
   case wasm::WASM_TYPE_I64:
@@ -187,6 +187,7 @@ static wasm::ValType parseValType(WasmObjectFile::ReadContext &Ctx,
   case wasm::WASM_TYPE_V128:
   case wasm::WASM_TYPE_FUNCREF:
   case wasm::WASM_TYPE_EXTERNREF:
+  case wasm::WASM_TYPE_EXNREF:
     return wasm::ValType(Code);
   }
   if (Code == wasm::WASM_TYPE_NULLABLE || Code == wasm::WASM_TYPE_NONNULLABLE) {
@@ -1288,6 +1289,7 @@ Error WasmObjectFile::parseImportSection(ReadContext &Ctx) {
       auto ElemType = Im.Table.ElemType;
       if (ElemType != wasm::ValType::FUNCREF &&
           ElemType != wasm::ValType::EXTERNREF &&
+          ElemType != wasm::ValType::EXNREF &&
           ElemType != wasm::ValType::OTHERREF)
         return make_error<GenericBinaryError>("invalid table element type",
                                               object_error::parse_failed);
@@ -1346,6 +1348,7 @@ Error WasmObjectFile::parseTableSection(ReadContext &Ctx) {
     auto ElemType = Tables.back().Type.ElemType;
     if (ElemType != wasm::ValType::FUNCREF &&
         ElemType != wasm::ValType::EXTERNREF &&
+        ElemType != wasm::ValType::EXNREF &&
         ElemType != wasm::ValType::OTHERREF) {
       return make_error<GenericBinaryError>("invalid table element type",
                                             object_error::parse_failed);
@@ -1680,6 +1683,7 @@ Error WasmObjectFile::parseElemSection(ReadContext &Ctx) {
         Segment.ElemKind = parseValType(Ctx, ElemKind);
         if (Segment.ElemKind != wasm::ValType::FUNCREF &&
             Segment.ElemKind != wasm::ValType::EXTERNREF &&
+            Segment.ElemKind != wasm::ValType::EXNREF &&
             Segment.ElemKind != wasm::ValType::OTHERREF) {
           return make_error<GenericBinaryError>("invalid elem type",
                                                 object_error::parse_failed);
diff --git a/llvm/lib/ObjectYAML/WasmYAML.cpp b/llvm/lib/ObjectYAML/WasmYAML.cpp
index 544a91d03dce01..7ad338f65706d5 100644
--- a/llvm/lib/ObjectYAML/WasmYAML.cpp
+++ b/llvm/lib/ObjectYAML/WasmYAML.cpp
@@ -606,6 +606,7 @@ void ScalarEnumerationTraits<WasmYAML::ValueType>::enumeration(
   ECase(V128);
   ECase(FUNCREF);
   ECase(EXTERNREF);
+  ECase(EXNREF);
   ECase(OTHERREF);
 #undef ECase
 }
@@ -640,6 +641,7 @@ void ScalarEnumerationTraits<WasmYAML::TableType>::enumeration(
 #define ECase(X) IO.enumCase(Type, #X, CONCAT(X));
   ECase(FUNCREF);
   ECase(EXTERNREF);
+  ECase(EXNREF);
   ECase(OTHERREF);
 #undef ECase
 }
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index 34502170a5c71f..b7498cb4299452 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -355,6 +355,8 @@ inline bool isArgument(unsigned Opc) {
   case WebAssembly::ARGUMENT_funcref_S:
   case WebAssembly::ARGUMENT_externref:
   case WebAssembly::ARGUMENT_externref_S:
+  case WebAssembly::ARGUMENT_exnref:
+  case WebAssembly::ARGUMENT_exnref_S:
     return true;
   default:
     return false;
@@ -377,6 +379,8 @@ inline bool isCopy(unsigned Opc) {
   case WebAssembly::COPY_FUNCREF_S:
   case WebAssembly::COPY_EXTERNREF:
   case WebAssembly::COPY_EXTERNREF_S:
+  case WebAssembly::COPY_EXNREF:
+  case WebAssembly::COPY_EXNREF_S:
     return true;
   default:
     return false;
@@ -399,6 +403,8 @@ inline bool isTee(unsigned Opc) {
   case WebAssembly::TEE_FUNCREF_S:
   case WebAssembly::TEE_EXTERNREF:
   case WebAssembly::TEE_EXTERNREF_S:
+  case WebAssembly::TEE_EXNREF:
+  case WebAssembly::TEE_EXNREF_S:
     return true;
   default:
     return false;
@@ -489,6 +495,8 @@ inline bool isLocalGet(unsigned Opc) {
   case WebAssembly::LOCAL_GET_FUNCREF_S:
   case WebAssembly::LOCAL_GET_EXTERNREF:
   case WebAssembly::LOCAL_GET_EXTERNREF_S:
+  case WebAssembly::LOCAL_GET_EXNREF:
+  case WebAssembly::LOCAL_GET_EXNREF_S:
     return true;
   default:
     return false;
@@ -511,6 +519,8 @@ inline bool isLocalSet(unsigned Opc) {
   case WebAssembly::LOCAL_SET_FUNCREF_S:
   case WebAssembly::LOCAL_SET_EXTERNREF:
   case WebAssembly::LOCAL_SET_EXTERNREF_S:
+  case WebAssembly::LOCAL_SET_EXNREF:
+  case WebAssembly::LOCAL_SET_EXNREF_S:
     return true;
   default:
     return false;
@@ -533,6 +543,8 @@ inline bool isLocalTee(unsigned Opc) {
   case WebAssembly::LOCAL_TEE_FUNCREF_S:
   case WebAssembly::LOCAL_TEE_EXTERNREF:
   case WebAssembly::LOCAL_TEE_EXTERNREF_S:
+  case WebAssembly::LOCAL_TEE_EXNREF:
+  case WebAssembly::LOCAL_TEE_EXNREF_S:
     return true;
   default:
     return false;
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp
index 8ea02bd2ad1ff0..d9c8e22bbbaf5b 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp
@@ -27,6 +27,7 @@ std::optional<wasm::ValType> WebAssembly::parseType(StringRef Type) {
              wasm::ValType::V128)
       .Case("funcref", wasm::ValType::FUNCREF)
       .Case("externref", wasm::ValType::EXTERNREF)
+      .Case("exnref", wasm::ValType::EXNREF)
       .Default(std::nullopt);
 }
 
@@ -40,6 +41,7 @@ WebAssembly::BlockType WebAssembly::parseBlockType(StringRef Type) {
       .Case("v128", WebAssembly::BlockType::V128)
       .Case("funcref", WebAssembly::BlockType::Funcref)
       .Case("externref", WebAssembly::BlockType::Externref)
+      .Case("exnref", WebAssembly::BlockType::Exnref)
       .Case("void", WebAssembly::BlockType::Void)
       .Default(WebAssembly::BlockType::Invalid);
 }
@@ -62,6 +64,8 @@ const char *WebAssembly::anyTypeToString(unsigned Type) {
     return "funcref";
   case wasm::WASM_TYPE_EXTERNREF:
     return "externref";
+  case wasm::WASM_TYPE_EXNREF:
+    return "exnref";
   case wasm::WASM_TYPE_FUNC:
     return "func";
   case wasm::WASM_TYPE_NORESULT:
@@ -110,6 +114,8 @@ wasm::ValType WebAssembly::regClassToValType(unsigned RC) {
     return wasm::ValType::FUNCREF;
   case WebAssembly::EXTERNREFRegClassID:
     return wasm::ValType::EXTERNREF;
+  case WebAssembly::EXNREFRegClassID:
+    return wasm::ValType::EXNREF;
   default:
     llvm_unreachable("unexpected type");
   }
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h
index 486cf264d13e2f..063ee4dba9068e 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h
@@ -32,6 +32,7 @@ enum class BlockType : unsigned {
   V128 = unsigned(wasm::ValType::V128),
   Externref = unsigned(wasm::ValType::EXTERNREF),
   Funcref = unsigned(wasm::ValType::FUNCREF),
+  Exnref = unsigned(wasm::ValType::EXNREF),
   // Multivalue blocks (and other non-void blocks) are only emitted when the
   // blocks will never be exited and are at the ends of functions (see
   // WebAssemblyCFGStackify::fixEndsAtEndOfFunction). They also are never made
@@ -41,7 +42,8 @@ enum class BlockType : unsigned {
 };
 
 inline bool isRefType(wasm::ValType Type) {
-  return Type == wasm::ValType::EXTERNREF || Type == wasm::ValType::FUNCREF;
+  return Type == wasm::ValType::EXTERNREF || Type == wasm::ValType::FUNCREF ||
+         Type == wasm::ValType::EXNREF;
 }
 
 // Convert ValType or a list/signature of ValTypes to a string.
diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
index 867953b4e8d71d..f9293460e701a0 100644
--- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
@@ -33,6 +33,7 @@ MVT WebAssembly::parseMVT(StringRef Type) {
       .Case("v2i64", MVT::v2i64)
       .Case("funcref", MVT::funcref)
       .Case("externref", MVT::externref)
+      .Case("exnref", MVT::exnref)
       .Default(MVT::INVALID_SIMPLE_VALUE_TYPE);
 }
 
@@ -58,6 +59,8 @@ wasm::ValType WebAssembly::toValType(MVT Type) {
     return wasm::ValType::FUNCREF;
   case MVT::externref:
     return wasm::ValType::EXTERNREF;
+  case MVT::exnref:
+    return wasm::ValType::EXNREF;
   default:
     llvm_unreachable("unexpected type");
   }
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 443558537da245..0b7ec6e74cab20 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -125,6 +125,8 @@ static char getInvokeSig(wasm::ValType VT) {
     return 'F';
   case wasm::ValType::EXTERNREF:
     return 'X';
+  case wasm::ValType::EXNREF:
+    return 'E';
   default:
     llvm_unreachable("Unhandled wasm::ValType enum");
   }
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
index 0159c44a79b76d..3c6a29311a10e4 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
@@ -100,6 +100,8 @@ static unsigned getDropOpcode(const TargetRegisterClass *RC) {
     return WebAssembly::DROP_FUNCREF;
   if (RC == &WebAssembly::EXTERNREFRegClass)
     return WebAssembly::DROP_EXTERNREF;
+  if (RC == &WebAssembly::EXNREFRegClass)
+    return WebAssembly::DROP_EXNREF;
   llvm_unreachable("Unexpected register class");
 }
 
@@ -119,6 +121,8 @@ static unsigned getLocalGetOpcode(const TargetRegisterClass *RC) {
     return WebAssembly::LOCAL_GET_FUNCREF;
   if (RC == &WebAssembly::EXTERNREFRegClass)
     return WebAssembly::LOCAL_GET_EXTERNREF;
+  if (RC == &WebAssembly::EXNREFRegClass)
+    return WebAssembly::LOCAL_GET_EXNREF;
   llvm_unreachable("Unexpected register class");
 }
 
@@ -138,6 +142,8 @@ static unsigned getLocalSetOpcode(const TargetRegisterClass *RC) {
     return WebAssembly::LOCAL_SET_FUNCREF;
   if (RC == &WebAssembly::EXTERNREFRegClass)
     return WebAssembly::LOCAL_SET_EXTERNREF;
+  if (RC == &WebAssembly::EXNREFRegClass)
+    return WebAssembly::LOCAL_SET_EXNREF;
   llvm_unreachable("Unexpected register class");
 }
 
@@ -157,6 +163,8 @@ static unsigned getLocalTeeOpcode(const TargetRegisterClass *RC) {
     return WebAssembly::LOCAL_TEE_FUNCREF;
   if (RC == &WebAssembly::EXTERNREFRegClass)
     return WebAssembly::LOCAL_TEE_EXTERNREF;
+  if (RC == &WebAssembly::EXNREFRegClass)
+    return WebAssembly::LOCAL_TEE_EXNREF;
   llvm_unreachable("Unexpected register class");
 }
 
@@ -176,6 +184,8 @@ static MVT typeForRegClass(const TargetRegisterClass *RC) {
     return MVT::funcref;
   if (RC == &WebAssembly::EXTERNREFRegClass)
     return MVT::externref;
+  if (RC == &WebAssembly::EXNREFRegClass)
+    return MVT::exnref;
   llvm_unreachable("unrecognized register class");
 }
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 26e13948bc9a68..aa3aa1b007a530 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -137,6 +137,10 @@ class WebAssemblyFastISel final : public FastISel {
       if (Subtarget->hasReferenceTypes())
         return VT;
       break;
+    case MVT::exnref:
+      if (Subtarget->hasReferenceTypes() && Subtarget->hasExceptionHandling())
+        return VT;
+      break;
     case MVT::f16:
       return MVT::f32;
     case MVT::v16i8:
@@ -717,6 +721,10 @@ bool WebAssemblyFastISel::fastLowerArguments() {
       Opc = WebAssembly::ARGUMENT_externref;
       RC = &WebAssembly::EXTERNREFRegClass;
       break;
+    case MVT::exnref:
+      Opc = WebAssembly::ARGUMENT_exnref;
+      RC = &WebAssembly::EXNREFRegClass;
+      break;
     default:
       return false;
     }
@@ -821,6 +829,9 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
     case MVT::externref:
       ResultReg = createResultReg(&WebAssembly::EXTERNREFRegClass);
       break;
+    case MVT::exnref:
+      ResultReg = createResultReg(&WebAssembly::EXNREFRegClass);
+      break;
     default:
       return false;
     }
@@ -948,6 +959,10 @@ bool WebAssemblyFastISel::selectSelect(const Instruction *I) {
     Opc = WebAssembly::SELECT_EXTERNREF;
     RC = &WebAssembly::EXTERNREFRegClass;
     break;
+  case MVT::exnref:
+    Opc = WebAssembly::SELECT_EXNREF;
+    RC = &WebAssembly::EXNREFRegClass;
+    break;
   default:
     return false;
   }
@@ -1355,6 +1370,7 @@ bool WebAssemblyFastISel::selectRet(const Instruction *I) {
   case MVT::v2f64:
   case MVT::funcref:
   case MVT::externref:
+  case MVT::exnref:
     break;
   default:
     return false;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 518b6932a0c879..f9f16498bb390c 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -76,6 +76,9 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
   if (Subtarget->hasReferenceTypes()) {
     addRegisterClass(MVT::externref, &WebAssembly::EXTERNREFRegClass);
     addRegisterClass(MVT::funcref, &WebAssembly::FUNCREFRegClass);
+    if (Subtarget->hasExceptionHandling()) {
+      addRegisterClass(MVT::exnref, &WebAssembly::EXNREFRegClass);
+    }
   }
   // Compute derived properties from the register classes.
   computeRegisterProperties(Subtarget->getRegisterInfo());
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index c1a5a45395e87d..3d37eb2fa27bce 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -292,6 +292,7 @@ defm "": ARGUMENT<F32, f32>;
 defm "": ARGUMENT<F64, f64>;
 defm "": ARGUMENT<FUNCREF, funcref>;
 defm "": ARGUMENT<EXTERNREF, externref>;
+defm "": ARGUMENT<EXNREF, exnref>;
 
 // local.get and local.set are not generated by instruction selection; they
 // are implied by virtual register uses and defs.
@@ -375,6 +376,8 @@ defm "" : LOCAL<F64, global_op32>;
 defm "" : LOCAL<V128, global_op32>, Requires<[HasSIMD128]>;
 defm "" : LOCAL<FUNCREF, global_op32>, Requires<[HasReferenceTypes]>;
 defm "" : LOCAL<EXTERNREF, global_op32>, Requires<[HasReferenceTypes]>;
+defm "" : LOCAL<EXNREF, global_op32>,
+          Requires<[HasReferenceTypes, HasExceptionHandling]>;
 
 let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 in {
 defm CONST_I32 : I<(outs I32:$res), (ins i32imm_op:$imm),
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
index 608963d588635e..2654a09387fd4a 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
@@ -17,8 +17,9 @@ multiclass REF_I<WebAssemblyRegClass rc, ValueType vt, string ht> {
                         [(set rc:$dst, (!cast<Intrinsic>("int_wasm_ref_null_" # ht)))],
                         "ref.null_" # ht # "$dst",
                         "ref.null_" # ht,
-                        !cond(!eq(ht, "func")   : 0xd070, 
-                              !eq(ht, "extern") : 0xd06f)>,
+                        !cond(!eq(ht, "func")   : 0xd070,
+                              !eq(ht, "extern") : 0xd06f,
+                              !eq(ht, "exn")    : 0xd069)>,
                       Requires<[HasReferenceTypes]>;
   defm SELECT_#rc: I<(outs rc:$dst), (ins rc:$lhs, rc:$rhs, I32:$cond),
                      (outs), (ins),
@@ -37,8 +38,9 @@ multiclass REF_I<WebAssemblyRegClass rc, ValueType vt, string ht> {
 
 defm "" : REF_I<FUNCREF, funcref, "func">;
 defm "" : REF_I<EXTERNREF, externref, "extern">;
+defm "" : REF_I<EXNREF, exnref, "exn">;
 
-foreach rc = [FUNCREF, EXTERNREF] in {
+foreach rc = [FUNCREF, EXTERNREF, EXNREF] in {
 def : Pat<(select (i32 (setne I32:$cond, 0)), rc:$lhs, rc:$rhs),
           (!cast<Instruction>("SELECT_"#rc) rc:$lhs, rc:$rhs, I32:$cond)>;
 def : Pat<(select (i32 (seteq I32:$cond, 0)), rc:$lhs, rc:$rhs),
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td
index 069ce5e3bc94a9..02f0ab8577c3d0 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td
@@ -64,6 +64,8 @@ multiclass TABLE<WebAssemblyRegClass rc, string suffix> {
 
 defm "" : TABLE<FUNCREF, "funcref">, Requires<[HasReferenceTypes]>;
 defm "" : TABLE<EXTERNREF, "externref">, Requires<[HasReferenceTypes]>;
+defm "" : TABLE<EXNREF, "exnref">,
+          Requires<[HasReferenceTypes, HasExceptionHandling]>;
 
 def : Pat<(WebAssemblyTableSet mcsym:$table, i32:$idx, funcref:$r),
           (TABLE_SET_FUNCREF mcsym:$table, i32:$idx, funcref:$r)>,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index ef174e1716ef1e..d4edb6bf18d932 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -504,6 +504,8 @@ static unsigned getTeeOpcode(const TargetRegisterClass *RC) {
     return WebAssembly::TEE_EXTERNREF;
   if (RC == &WebAssembly::FUNCREFRegClass)
     return WebAssembly::TEE_FUNCREF;
+  if (RC == &WebAssembly::EXNREFRegClass)
+    return WebAssembly::TEE_EXNREF;
   llvm_unreachable("Unexpected register class");
 }
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
index 4e2faa608be077..17889dacc868c2 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
@@ -45,6 +45,7 @@ def V128_0: WebAssemblyReg<"%v128">;
 
 def FUNCREF_0 : WebAssemblyReg<"%funcref.0">;
 def EXTERNREF_0 : WebAssemblyReg<"%externref.0">;
+def EXNREF_0 : WebAssemblyReg<"%exnref.0">;
 
 // The value stack "register". This is an opaque entity which serves to order
 // uses and defs that must remain in LIFO order.
@@ -68,3 +69,4 @@ def V128 : WebAssemblyRegClass<[v8f16, v4f32, v2f64, v2i64, v4i32, v16i8,
                                128, (add V128_0)>;
 def FUNCREF : WebAssemblyRegClass<[funcref], 0, (add FUNCREF_0)>;
 def EXTERNREF : WebAssemblyRegClass<[externref], 0, (add EXTERNREF_0)>;
+def EXNREF : WebAssemblyRegClass<[exnref], 0, (add EXNREF_0)>;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
index 60e872549f87d9..5e7279808cce63 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
@@ -175,6 +175,8 @@ unsigned WebAssembly::getCopyOpcodeForRegClass(const TargetRegisterClass *RC) {
     return WebAssembly::COPY_FUNCREF;
   case WebAssembly::EXTERNREFRegClassID:
     return WebAssembly::COPY_EXTERNREF;
+  case WebAssembly::EXNREFRegClassID:
+    return WebAssembly::COPY_EXNREF;
   default:
     llvm_unreachable("Unexpected register class");
   }
diff --git a/llvm/test/CodeGen/WebAssembly/reg-argument.mir b/llvm/test/CodeGen/WebAssembly/reg-argument.mir
index 23e66dfc71fa1b..a549990bdb0a2b 100644
--- a/llvm/test/CodeGen/WebAssembly/reg-argument.mir
+++ b/llvm/test/CodeGen/WebAssembly/reg-argument.mir
@@ -68,3 +68,14 @@ body: |
     %1:externref = ARGUMENT_externref 0, implicit $arguments
     RETURN implicit-def $arguments
 ...
+---
+name: argument_exnref
+# CHECK-LABEL: argument_exnref
+body: |
+  ; CHECK-LABEL: bb.0:
+  ; CHECK-NEXT: %1:exnref = ARGUMENT_exnref 0
+  bb.0:
+    %0:i32 = CONST_I32 0, implicit-def $arguments
+    %1:exnref = ARGUMENT_exnref 0, implicit $arguments
+    RETURN implicit-def $arguments
+...
diff --git a/llvm/test/CodeGen/WebAssembly/reg-copy.mir b/llvm/test/CodeGen/WebAssembly/reg-copy.mir
index 31a5bfa63a4ea2..763fe42d07b61a 100644
--- a/llvm/test/CodeGen/WebAssembly/reg-copy.mir
+++ b/llvm/test/CodeGen/WebAssembly/reg-copy.mir
@@ -77,3 +77,14 @@ body: |
     %0:externref = COPY %1:externref
     RETURN implicit-def $arguments
 ...
+---
+name: copy_exnref
+# CHECK-LABEL: copy_exnref
+body: |
+  ; CHECK-LABEL: bb.0:
+  ; CHECK-NEXT: %0:exnref = COPY_EXNREF %1:exnref
+  ; CHECK-NEXT: RETURN
+  bb.0:
+    %0:exnref = COPY %1:exnref
+    RETURN implicit-def $arguments
+...
diff --git a/llvm/test/MC/WebAssembly/basic-assembly.s b/llvm/test/MC/WebAssembly/basic-assembly.s
index 769cd7edfa8a3e..ac358c1b5c7a52 100644
--- a/llvm/test/MC/WebAssembly/basic-assembly.s
+++ b/llvm/test/MC/WebAssembly/basic-assembly.s
@@ -146,12 +146,14 @@ test0:
 
     .ident      "clang version 9.0.0 (trunk 364502) (llvm/trunk 364571)"
 
-.tabletype empty_eref_table, externref
-empty_eref_table:
+.tabletype empty_externref_table, externref
+empty_externref_table:
 
-.tabletype empty_fref_table, funcref
-empty_fref_table:
+.tabletype empty_funcref_table, funcref
+empty_funcref_table:
 
+.tabletype empty_exnref_table, exnref
+empty_exnref_table:
 
 # CHECK:           .text
 # CHECK:           .globaltype __stack_pointer, i32
@@ -283,8 +285,11 @@ empty_fref_table:
 # CHECK-NEXT:      .p2align    2
 # CHECK-NEXT:      .int32      test0
 
-# CHECK:           .tabletype empty_eref_table, externref
-# CHECK-NEXT: empty_eref_table:
+# CHECK:           .tabletype empty_externref_table, externref
+# CHECK-NEXT: empty_externref_table:
 
-# CHECK:           .tabletype empty_fref_table, funcref
-# CHECK-NEXT: empty_fref_table:
+# CHECK:           .tabletype empty_funcref_table, funcref
+# CHECK-NEXT: empty_funcref_table:
+
+# CHECK:           .tabletype empty_exnref_table, exnref
+# CHECK-NEXT: empty_exnref_table:
diff --git a/llvm/test/MC/WebAssembly/reference-types.s b/llvm/test/MC/WebAssembly/reference-types.s
index ab3e3ee6b155b1..2f8bfba68dcea1 100644
--- a/llvm/test/MC/WebAssembly/reference-types.s
+++ b/llvm/test/MC/WebAssembly/reference-types.s
@@ -4,22 +4,27 @@
 # CHECK-LABEL:ref_is_null:
 # CHECK: ref.is_null     # encoding: [0xd1]
 ref_is_null:
-  .functype ref_is_null () -> (i32, i32)
+  .functype ref_is_null () -> (i32, i32, i32)
   ref.null_extern
   ref.is_null
   ref.null_func
   ref.is_null
+  ref.null_exn
+  ref.is_null
   end_function
 
 # CHECK-LABEL: ref_null_test:
 # CHECK: ref.null_func   # encoding: [0xd0,0x70]
 # CHECK: ref.null_extern # encoding: [0xd0,0x6f]
+# CHECK: ref.null_exn    # encoding: [0xd0,0x69]
 ref_null_test:
   .functype ref_null_test () -> ()
   ref.null_func
   drop
   ref.null_extern
   drop
+  ref.null_exn
+  drop
   end_function
 
 # CHECK-LABEL: ref_sig_test_funcref:
@@ -36,9 +41,17 @@ ref_sig_test_externref:
   local.get 0
   end_function
 
+# CHECK-LABEL: ref_sig_test_exnref:
+# CHECK-NEXT: .functype ref_sig_test_exnref (exnref) -> (exnref)
+ref_sig_test_exnref:
+  .functype ref_sig_test_exnref (exnref) -> (exnref)
+  local.get 0
+  end_function
+
 # CHECK-LABEL: ref_select_test:
 # CHECK: funcref.select   # encoding: [0x1b]
 # CHECK: externref.select # encoding: [0x1b]
+# CHECK: exnref.select    # encoding: [0x1b]
 ref_select_test:
   .functype ref_select_test () -> ()
   ref.null_func
@@ -51,15 +64,24 @@ ref_select_test:
   i32.const 0
   externref.select
   drop
+  ref.null_exn
+  ref.null_exn
+  i32.const 0
+  exnref.select
+  drop
   end_function
 
 # CHECK-LABEL: ref_block_test:
 # CHECK: block funcref
 # CHECK: block externref
+# CHECK: block exnref
 ref_block_test:
-  .functype ref_block_test () -> (externref, funcref)
+  .functype ref_block_test () -> (exnref, externref, funcref)
   block funcref
   block externref
+  block exnref
+  ref.null_exn
+  end_block
   ref.null_extern
   end_block
   ref.null_func
diff --git a/llvm/test/MC/WebAssembly/type-checker-errors.s b/llvm/test/MC/WebAssembly/type-checker-errors.s
index 5e28d117501e98..d2841250137a8c 100644
--- a/llvm/test/MC/WebAssembly/type-checker-errors.s
+++ b/llvm/test/MC/WebAssembly/type-checker-errors.s
@@ -215,6 +215,22 @@ table_fill_type_mismatch_3:
   table.fill valid_table
   end_function
 
+table_fill_type_mismatch_4:
+  .functype table_fill_type_mismatch_4 () -> ()
+  ref.null_exn
+  i32.const 1
+# CHECK: [[@LINE+1]]:3: error: popped exnref, expected externref
+  table.fill valid_table
+  end_function
+
+table_fill_type_mismatch_5:
+  .functype table_fill_type_mismatch_5 () -> ()
+  ref.null_exn
+  i32.const 1
+# CHECK: [[@LINE+1]]:3: error: popped exnref, expected externref
+  table.fill valid_table
+  end_function
+
 table_grow_non_exist_table:
   .functype table_grow_non_exist_table (externref, i32) -> (i32)
   local.get 0

From 4486fcba756bfa4c8729673a9533578232f0bc04 Mon Sep 17 00:00:00 2001
From: lntue <35648136+lntue@users.noreply.github.com>
Date: Tue, 28 May 2024 19:14:26 -0400
Subject: [PATCH 033/230] [libc] Add proxy header for float.h. (#93504)

This is the continuation of
https://github.com/llvm/llvm-project/pull/88674.

Fixes #88433, #90496.

---------

Co-authored-by: aniplcc <aniplccode@gmail.com>
---
 libc/hdr/CMakeLists.txt                       | 10 ++++++
 libc/hdr/float_macros.h                       | 22 ++++++++++++
 libc/include/llvm-libc-macros/float-macros.h  | 35 ++++++++-----------
 .../macros/properties/CMakeLists.txt          |  2 +-
 libc/src/__support/macros/properties/types.h  |  2 +-
 libc/src/math/generic/CMakeLists.txt          |  4 +++
 libc/src/math/generic/scalbn.cpp              | 11 +++---
 libc/src/math/generic/scalbnf.cpp             | 11 +++---
 libc/src/math/generic/scalbnf128.cpp          | 13 +++----
 libc/src/math/generic/scalbnl.cpp             | 11 +++---
 .../llvm-project-overlay/libc/BUILD.bazel     |  7 +++-
 11 files changed, 78 insertions(+), 50 deletions(-)
 create mode 100644 libc/hdr/float_macros.h

diff --git a/libc/hdr/CMakeLists.txt b/libc/hdr/CMakeLists.txt
index 91b8cb71552a71..66b82c84dac499 100644
--- a/libc/hdr/CMakeLists.txt
+++ b/libc/hdr/CMakeLists.txt
@@ -87,4 +87,14 @@ add_proxy_header_library(
     libc.include.llvm-libc-macros.time_macros
 )
 
+add_proxy_header_library(
+  float_macros
+  HDRS
+    float_macros.h
+  DEPENDS
+    libc.include.llvm-libc-macros.float_macros
+  FULL_BUILD_DEPENDS
+    libc.include.float
+)
+
 add_subdirectory(types)
diff --git a/libc/hdr/float_macros.h b/libc/hdr/float_macros.h
new file mode 100644
index 00000000000000..a0ef5e29b98687
--- /dev/null
+++ b/libc/hdr/float_macros.h
@@ -0,0 +1,22 @@
+//===-- Definition of macros from float.h ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_FLOAT_MACROS_H
+#define LLVM_LIBC_HDR_FLOAT_MACROS_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-macros/float-macros.h"
+
+#else // Overlay mode
+
+#include <float.h>
+
+#endif // LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_FLOAT_MACROS_H
diff --git a/libc/include/llvm-libc-macros/float-macros.h b/libc/include/llvm-libc-macros/float-macros.h
index 4fe8590c5f70c8..81c1df868bf6cd 100644
--- a/libc/include/llvm-libc-macros/float-macros.h
+++ b/libc/include/llvm-libc-macros/float-macros.h
@@ -9,21 +9,6 @@
 #ifndef LLVM_LIBC_MACROS_FLOAT_MACROS_H
 #define LLVM_LIBC_MACROS_FLOAT_MACROS_H
 
-// Suppress `#include_next is a language extension` warnings.
-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wgnu-include-next"
-#pragma clang diagnostic ignored "-Winclude-next-absolute-path"
-#else // gcc
-#pragma GCC system_header
-#endif //__clang__
-
-#include_next <float.h>
-
-#ifdef __clang__
-#pragma clang diagnostic pop
-#endif //__clang__
-
 #ifndef FLT_RADIX
 #define FLT_RADIX __FLT_RADIX__
 #endif // FLT_RADIX
@@ -32,9 +17,13 @@
 #define FLT_EVAL_METHOD __FLT_EVAL_METHOD__
 #endif // FLT_EVAL_METHOD
 
-#ifndef DECIMAL_DIG
-#define DECIMAL_DIG __DECIMAL_DIG__
-#endif // DECIMAL_DIG
+#ifndef FLT_ROUNDS
+#if __has_builtin(__builtin_flt_rounds)
+#define FLT_ROUNDS __builtin_flt_rounds()
+#else
+#define FLT_ROUNDS 1
+#endif
+#endif // FLT_ROUNDS
 
 #ifndef FLT_DECIMAL_DIG
 #define FLT_DECIMAL_DIG __FLT_DECIMAL_DIG__
@@ -48,6 +37,10 @@
 #define LDBL_DECIMAL_DIG __LDBL_DECIMAL_DIG__
 #endif // LDBL_DECIMAL_DIG
 
+#ifndef DECIMAL_DIG
+#define DECIMAL_DIG __DECIMAL_DIG__
+#endif // DECIMAL_DIG
+
 #ifndef FLT_DIG
 #define FLT_DIG __FLT_DIG__
 #endif // FLT_DIG
@@ -97,15 +90,15 @@
 #endif // LDBL_MAX
 
 #ifndef FLT_TRUE_MIN
-#define FLT_TRUE_MIN __FLT_TRUE_MIN__
+#define FLT_TRUE_MIN __FLT_DENORM_MIN__
 #endif // FLT_TRUE_MIN
 
 #ifndef DBL_TRUE_MIN
-#define DBL_TRUE_MIN __DBL_TRUE_MIN__
+#define DBL_TRUE_MIN __DBL_DENORM_MIN__
 #endif // DBL_TRUE_MIN
 
 #ifndef LDBL_TRUE_MIN
-#define LDBL_TRUE_MIN __LDBL_TRUE_MIN__
+#define LDBL_TRUE_MIN __LDBL_DENORM_MIN__
 #endif // LDBL_TRUE_MIN
 
 #ifndef FLT_EPSILON
diff --git a/libc/src/__support/macros/properties/CMakeLists.txt b/libc/src/__support/macros/properties/CMakeLists.txt
index bbc45650f3fca3..7718aeaa3de5af 100644
--- a/libc/src/__support/macros/properties/CMakeLists.txt
+++ b/libc/src/__support/macros/properties/CMakeLists.txt
@@ -33,6 +33,6 @@ add_header_library(
     .compiler
     .cpu_features
     .os
-    libc.include.llvm-libc-macros.float_macros
+    libc.hdr.float_macros
     libc.include.llvm-libc-types.float128
 )
diff --git a/libc/src/__support/macros/properties/types.h b/libc/src/__support/macros/properties/types.h
index d43cf99e6859be..781cf1b7a2b627 100644
--- a/libc/src/__support/macros/properties/types.h
+++ b/libc/src/__support/macros/properties/types.h
@@ -10,7 +10,7 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_TYPES_H
 #define LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_TYPES_H
 
-#include "include/llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG
+#include "hdr/float_macros.h"                      // LDBL_MANT_DIG
 #include "include/llvm-libc-types/float128.h"      // float128
 #include "src/__support/macros/properties/architectures.h"
 #include "src/__support/macros/properties/compiler.h"
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index daaf505008ca11..269bc6be5d8343 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -2933,6 +2933,7 @@ add_entrypoint_object(
   HDRS
     ../scalbn.h
   DEPENDS
+    libc.hdr.float_macros
     libc.src.__support.FPUtil.manipulation_functions
   COMPILE_OPTIONS
     -O3
@@ -2945,6 +2946,7 @@ add_entrypoint_object(
   HDRS
     ../scalbnf.h
   DEPENDS
+    libc.hdr.float_macros
     libc.src.__support.FPUtil.manipulation_functions
   COMPILE_OPTIONS
     -O3
@@ -2957,6 +2959,7 @@ add_entrypoint_object(
   HDRS
     ../scalbnl.h
   DEPENDS
+    libc.hdr.float_macros
     libc.src.__support.FPUtil.manipulation_functions
   COMPILE_OPTIONS
     -O3
@@ -2969,6 +2972,7 @@ add_entrypoint_object(
   HDRS
     ../scalbnf128.h
   DEPENDS
+    libc.hdr.float_macros
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.manipulation_functions
   COMPILE_OPTIONS
diff --git a/libc/src/math/generic/scalbn.cpp b/libc/src/math/generic/scalbn.cpp
index 3908f5892f144f..207cce1550bc01 100644
--- a/libc/src/math/generic/scalbn.cpp
+++ b/libc/src/math/generic/scalbn.cpp
@@ -7,19 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/scalbn.h"
+#include "hdr/float_macros.h"
 #include "src/__support/FPUtil/ManipulationFunctions.h"
 #include "src/__support/common.h"
 
+#if FLT_RADIX != 2
+#error "FLT_RADIX != 2 is not supported."
+#endif
+
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(double, scalbn, (double x, int n)) {
-#if !defined(__FLT_RADIX__)
-#error __FLT_RADIX__ undefined.
-#elif __FLT_RADIX__ != 2
-#error __FLT_RADIX__!=2, unimplemented.
-#else
   return fputil::ldexp(x, n);
-#endif
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/scalbnf.cpp b/libc/src/math/generic/scalbnf.cpp
index 4a4fa86dcfd895..e478088d3ce5a5 100644
--- a/libc/src/math/generic/scalbnf.cpp
+++ b/libc/src/math/generic/scalbnf.cpp
@@ -7,19 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/scalbnf.h"
+#include "hdr/float_macros.h"
 #include "src/__support/FPUtil/ManipulationFunctions.h"
 #include "src/__support/common.h"
 
+#if FLT_RADIX != 2
+#error "FLT_RADIX != 2 is not supported."
+#endif
+
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(float, scalbnf, (float x, int n)) {
-#if !defined(__FLT_RADIX__)
-#error __FLT_RADIX__ undefined.
-#elif __FLT_RADIX__ != 2
-#error __FLT_RADIX__!=2, unimplemented.
-#else
   return fputil::ldexp(x, n);
-#endif
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/scalbnf128.cpp b/libc/src/math/generic/scalbnf128.cpp
index be3d29ed27e985..5fd59611d53de7 100644
--- a/libc/src/math/generic/scalbnf128.cpp
+++ b/libc/src/math/generic/scalbnf128.cpp
@@ -7,21 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/scalbnf128.h"
+#include "hdr/float_macros.h"
 #include "src/__support/FPUtil/ManipulationFunctions.h"
 #include "src/__support/common.h"
 
+#if FLT_RADIX != 2
+#error "FLT_RADIX != 2 is not supported."
+#endif
+
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(float128, scalbnf128, (float128 x, int n)) {
-// TODO: should be switched to use `FLT_RADIX` in hdr/float_macros.h" instead
-// see: https://github.com/llvm/llvm-project/issues/90496
-#if !defined(__FLT_RADIX__)
-#error __FLT_RADIX__ undefined.
-#elif __FLT_RADIX__ != 2
-#error __FLT_RADIX__!=2, unimplemented.
-#else
   return fputil::ldexp(x, n);
-#endif
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/scalbnl.cpp b/libc/src/math/generic/scalbnl.cpp
index 681338ec01f078..1225a7ebaf572d 100644
--- a/libc/src/math/generic/scalbnl.cpp
+++ b/libc/src/math/generic/scalbnl.cpp
@@ -7,19 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/scalbnl.h"
+#include "hdr/float_macros.h"
 #include "src/__support/FPUtil/ManipulationFunctions.h"
 #include "src/__support/common.h"
 
+#if FLT_RADIX != 2
+#error "FLT_RADIX != 2 is not supported."
+#endif
+
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(long double, scalbnl, (long double x, int n)) {
-#if !defined(__FLT_RADIX__)
-#error __FLT_RADIX__ undefined.
-#elif __FLT_RADIX__ != 2
-#error __FLT_RADIX__!=2, unimplemented.
-#else
   return fputil::ldexp(x, n);
-#endif
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 446499cf15d7b4..70ec3a48a5e2e3 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -127,6 +127,11 @@ libc_support_library(
     hdrs = ["hdr/time_macros.h"],
 )
 
+libc_support_library(
+    name = "hdr_float_macros",
+    hdrs = ["hdr/float_macros.h"],
+)
+
 ############################ Type Proxy Header Files ###########################
 
 libc_support_library(
@@ -189,7 +194,7 @@ libc_support_library(
         ":__support_macros_properties_compiler",
         ":__support_macros_properties_cpu_features",
         ":__support_macros_properties_os",
-        ":llvm_libc_macros_float_macros",
+        ":hdr_float_macros",
         ":llvm_libc_types_float128",
     ],
 )

From 39e5036c0e22cea24df73d28746bb8fe0a117f9d Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 28 May 2024 16:25:54 -0700
Subject: [PATCH 034/230] [SCEV] Add predicated version of
 getSymbolicMaxBackedgeTakenCount. (#93498)

This patch adds a predicated version of
getSymbolicMaxBackedgeTakenCount.

The intended use for this is loop access analysis for loops with
uncountable exits. When analyzing dependences and computing runtime
checks, we need the smallest upper bound on the number of iterations. In
terms of memory safety, it shouldn't matter if any uncomputable exits
leave the loop, as long as we prove that there are no dependences given
the minimum of the countable exits. The same should apply also for
generating runtime checks.

PR: https://github.com/llvm/llvm-project/pull/93498
---
 llvm/include/llvm/Analysis/ScalarEvolution.h  | 19 +++++++-
 llvm/lib/Analysis/ScalarEvolution.cpp         | 48 +++++++++++++++++--
 ...cated-symbolic-max-backedge-taken-count.ll |  6 +++
 3 files changed, 67 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index 1d016b28347d27..72f3d945424963 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -912,6 +912,13 @@ class ScalarEvolution {
     return getBackedgeTakenCount(L, SymbolicMaximum);
   }
 
+  /// Similar to getSymbolicMaxBackedgeTakenCount, except it will add a set of
+  /// SCEV predicates to Predicates that are required to be true in order for
+  /// the answer to be correct. Predicates can be checked with run-time
+  /// checks and can be used to perform loop versioning.
+  const SCEV *getPredicatedSymbolicMaxBackedgeTakenCount(
+      const Loop *L, SmallVector<const SCEVPredicate *, 4> &Predicates);
+
   /// Return true if the backedge taken count is either the value returned by
   /// getConstantMaxBackedgeTakenCount or zero.
   bool isBackedgeTakenCountMaxOrZero(const Loop *L);
@@ -1549,7 +1556,9 @@ class ScalarEvolution {
                                ScalarEvolution *SE) const;
 
     /// Get the symbolic max backedge taken count for the loop.
-    const SCEV *getSymbolicMax(const Loop *L, ScalarEvolution *SE);
+    const SCEV *
+    getSymbolicMax(const Loop *L, ScalarEvolution *SE,
+                   SmallVector<const SCEVPredicate *, 4> *Predicates = nullptr);
 
     /// Get the symbolic max backedge taken count for the particular loop exit.
     const SCEV *getSymbolicMax(const BasicBlock *ExitingBlock,
@@ -1746,7 +1755,7 @@ class ScalarEvolution {
 
   /// Similar to getBackedgeTakenInfo, but will add predicates as required
   /// with the purpose of returning complete information.
-  const BackedgeTakenInfo &getPredicatedBackedgeTakenInfo(const Loop *L);
+  BackedgeTakenInfo &getPredicatedBackedgeTakenInfo(const Loop *L);
 
   /// Compute the number of times the specified loop will iterate.
   /// If AllowPredicates is set, we will create new SCEV predicates as
@@ -2311,6 +2320,9 @@ class PredicatedScalarEvolution {
   /// Get the (predicated) backedge count for the analyzed loop.
   const SCEV *getBackedgeTakenCount();
 
+  /// Get the (predicated) symbolic max backedge count for the analyzed loop.
+  const SCEV *getSymbolicMaxBackedgeTakenCount();
+
   /// Adds a new predicate.
   void addPredicate(const SCEVPredicate &Pred);
 
@@ -2379,6 +2391,9 @@ class PredicatedScalarEvolution {
 
   /// The backedge taken count.
   const SCEV *BackedgeCount = nullptr;
+
+  /// The symbolic max backedge taken count.
+  const SCEV *SymbolicMaxBackedgeCount = nullptr;
 };
 
 template <> struct DenseMapInfo<ScalarEvolution::FoldID> {
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index bb56b41fe15d58..e46d7183a2a359 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -8295,6 +8295,11 @@ const SCEV *ScalarEvolution::getBackedgeTakenCount(const Loop *L,
   llvm_unreachable("Invalid ExitCountKind!");
 }
 
+const SCEV *ScalarEvolution::getPredicatedSymbolicMaxBackedgeTakenCount(
+    const Loop *L, SmallVector<const SCEVPredicate *, 4> &Preds) {
+  return getPredicatedBackedgeTakenInfo(L).getSymbolicMax(L, this, &Preds);
+}
+
 bool ScalarEvolution::isBackedgeTakenCountMaxOrZero(const Loop *L) {
   return getBackedgeTakenInfo(L).isConstantMaxOrZero(this);
 }
@@ -8311,7 +8316,7 @@ static void PushLoopPHIs(const Loop *L,
       Worklist.push_back(&PN);
 }
 
-const ScalarEvolution::BackedgeTakenInfo &
+ScalarEvolution::BackedgeTakenInfo &
 ScalarEvolution::getPredicatedBackedgeTakenInfo(const Loop *L) {
   auto &BTI = getBackedgeTakenInfo(L);
   if (BTI.hasFullInfo())
@@ -8644,9 +8649,9 @@ ScalarEvolution::BackedgeTakenInfo::getConstantMax(ScalarEvolution *SE) const {
   return getConstantMax();
 }
 
-const SCEV *
-ScalarEvolution::BackedgeTakenInfo::getSymbolicMax(const Loop *L,
-                                                   ScalarEvolution *SE) {
+const SCEV *ScalarEvolution::BackedgeTakenInfo::getSymbolicMax(
+    const Loop *L, ScalarEvolution *SE,
+    SmallVector<const SCEVPredicate *, 4> *Predicates) {
   if (!SymbolicMax) {
     // Form an expression for the maximum exit count possible for this loop. We
     // merge the max and exact information to approximate a version of
@@ -8661,6 +8666,12 @@ ScalarEvolution::BackedgeTakenInfo::getSymbolicMax(const Loop *L,
                "We should only have known counts for exiting blocks that "
                "dominate latch!");
         ExitCounts.push_back(ExitCount);
+        if (Predicates)
+          for (const auto *P : ENT.Predicates)
+            Predicates->push_back(P);
+
+        assert((Predicates || ENT.hasAlwaysTruePredicate()) &&
+               "Predicate should be always true!");
       }
     }
     if (ExitCounts.empty())
@@ -13609,6 +13620,24 @@ static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE,
       P->print(OS, 4);
   }
 
+  Preds.clear();
+  auto *PredSymbolicMax =
+      SE->getPredicatedSymbolicMaxBackedgeTakenCount(L, Preds);
+  if (SymbolicBTC != PredSymbolicMax) {
+    OS << "Loop ";
+    L->getHeader()->printAsOperand(OS, /*PrintType=*/false);
+    OS << ": ";
+    if (!isa<SCEVCouldNotCompute>(PredSymbolicMax)) {
+      OS << "Predicated symbolic max backedge-taken count is ";
+      PrintSCEVWithTypeHint(OS, PredSymbolicMax);
+    } else
+      OS << "Unpredictable predicated symbolic max backedge-taken count.";
+    OS << "\n";
+    OS << " Predicates:\n";
+    for (const auto *P : Preds)
+      P->print(OS, 4);
+  }
+
   if (SE->hasLoopInvariantBackedgeTakenCount(L)) {
     OS << "Loop ";
     L->getHeader()->printAsOperand(OS, /*PrintType=*/false);
@@ -14822,6 +14851,17 @@ const SCEV *PredicatedScalarEvolution::getBackedgeTakenCount() {
   return BackedgeCount;
 }
 
+const SCEV *PredicatedScalarEvolution::getSymbolicMaxBackedgeTakenCount() {
+  if (!SymbolicMaxBackedgeCount) {
+    SmallVector<const SCEVPredicate *, 4> Preds;
+    SymbolicMaxBackedgeCount =
+        SE.getPredicatedSymbolicMaxBackedgeTakenCount(&L, Preds);
+    for (const auto *P : Preds)
+      addPredicate(*P);
+  }
+  return SymbolicMaxBackedgeCount;
+}
+
 void PredicatedScalarEvolution::addPredicate(const SCEVPredicate &Pred) {
   if (Preds->implies(&Pred))
     return;
diff --git a/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll b/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll
index d40416359b65c6..8dc79a54eb97a5 100644
--- a/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll
+++ b/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll
@@ -12,6 +12,9 @@ define void @test1(i64 %x, ptr %a, ptr %b) {
 ; CHECK-NEXT:  Loop %header: Unpredictable symbolic max backedge-taken count.
 ; CHECK-NEXT:    symbolic max exit count for header: ***COULDNOTCOMPUTE***
 ; CHECK-NEXT:    symbolic max exit count for latch: ***COULDNOTCOMPUTE***
+; CHECK-NEXT:  Loop %header: Predicated symbolic max backedge-taken count is (-1 + (1 umax %x))
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {1,+,1}<%header> Added Flags: <nusw>
 ;
 entry:
   br label %header
@@ -52,6 +55,9 @@ define void @test2(i64 %x, ptr %a) {
 ; CHECK-NEXT:  Loop %header: Unpredictable symbolic max backedge-taken count.
 ; CHECK-NEXT:    symbolic max exit count for header: ***COULDNOTCOMPUTE***
 ; CHECK-NEXT:    symbolic max exit count for latch: ***COULDNOTCOMPUTE***
+; CHECK-NEXT:  Loop %header: Predicated symbolic max backedge-taken count is (-1 + (1 umax %x))
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {1,+,1}<%header> Added Flags: <nusw>
 ;
 entry:
   br label %header

From 722a5fce589cea76a0baf89ce731477bae8cf4b8 Mon Sep 17 00:00:00 2001
From: Heejin Ahn <aheejin@gmail.com>
Date: Tue, 28 May 2024 16:27:04 -0700
Subject: [PATCH 035/230] [WebAssembly] Add -wasm-enable-exnref option (#93597)

This adds `-wasm-enable-exnref`, which will enable the new EH
instructions using `exnref` (adopted in Oct 2023 CG meeting):
https://github.com/WebAssembly/exception-handling/blob/main/proposals/exception-handling/Exceptions.md
This option should be used with `-wasm-enable-eh`.
---
 .../WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp   | 7 +++++++
 .../WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h     | 1 +
 llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp   | 4 ++++
 llvm/test/CodeGen/WebAssembly/eh-option-errors.ll          | 3 +++
 4 files changed, 15 insertions(+)

diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
index e8f58a19d25e3b..71dfe1062956e3 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
@@ -54,6 +54,13 @@ cl::opt<bool>
 // setjmp/longjmp handling using wasm EH instrutions
 cl::opt<bool> WebAssembly::WasmEnableSjLj(
     "wasm-enable-sjlj", cl::desc("WebAssembly setjmp/longjmp handling"));
+// Whether we use the new exnref Wasm EH proposal adopted in Oct 2023.
+// Should be used with -wasm-enable-eh.
+// Currently set to false by default, but will later change to true and then
+// later can be removed after the legacy Wasm EH instructions are removed.
+cl::opt<bool> WebAssembly::WasmEnableExnref(
+    "wasm-enable-exnref", cl::desc("WebAssembly exception handling (exnref)"),
+    cl::init(false));
 
 static MCAsmInfo *createMCAsmInfo(const MCRegisterInfo & /*MRI*/,
                                   const Triple &TT,
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index b7498cb4299452..7f1a5f616ed484 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -44,6 +44,7 @@ extern cl::opt<bool> WasmEnableEmEH;   // asm.js-style EH
 extern cl::opt<bool> WasmEnableEmSjLj; // asm.js-style SjLJ
 extern cl::opt<bool> WasmEnableEH;     // EH using Wasm EH instructions
 extern cl::opt<bool> WasmEnableSjLj;   // SjLj using Wasm EH instructions
+extern cl::opt<bool> WasmEnableExnref; // EH using new Wasm EH (exnref)
 
 enum OperandType {
   /// Basic block label in a branch construct.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index 68126992ddcd72..fd92a35c2638a5 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -385,6 +385,7 @@ FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) {
 using WebAssembly::WasmEnableEH;
 using WebAssembly::WasmEnableEmEH;
 using WebAssembly::WasmEnableEmSjLj;
+using WebAssembly::WasmEnableExnref;
 using WebAssembly::WasmEnableSjLj;
 
 static void basicCheckForEHAndSjLj(TargetMachine *TM) {
@@ -401,6 +402,9 @@ static void basicCheckForEHAndSjLj(TargetMachine *TM) {
   if (WasmEnableEmEH && WasmEnableSjLj)
     report_fatal_error(
         "-enable-emscripten-cxx-exceptions not allowed with -wasm-enable-sjlj");
+  if (WasmEnableExnref && !WasmEnableEH)
+    report_fatal_error(
+        "-wasm-enable-exnref should be used with -wasm-enable-eh");
 
   // Here we make sure TargetOptions.ExceptionModel is the same as
   // MCAsmInfo.ExceptionsType. Normally these have to be the same, because clang
diff --git a/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll b/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll
index 74d02ddc405d3f..52a6364e122589 100644
--- a/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll
+++ b/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll
@@ -9,6 +9,9 @@ target triple = "wasm32-unknown-unknown"
 ; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -wasm-enable-sjlj 2>&1 | FileCheck %s --check-prefix=EM_EH_W_WASM_SJLJ
 ; EM_EH_W_WASM_SJLJ: LLVM ERROR: -enable-emscripten-cxx-exceptions not allowed with -wasm-enable-sjlj
 
+; RUN: not --crash llc < %s -wasm-enable-exnref 2>&1 | FileCheck %s --check-prefix=WASM_EXNREF_ONLY
+; WASM_EXNREF_ONLY: LLVM ERROR: -wasm-enable-exnref should be used with -wasm-enable-eh
+
 ; RUN: not --crash llc < %s -wasm-enable-eh -exception-model=dwarf 2>&1 | FileCheck %s --check-prefix=EH_MODEL_DWARF
 ; EH_MODEL_DWARF: LLVM ERROR: -exception-model should be either 'none' or 'wasm'
 

From 60bce6eab4d734b86f49b7638856eb8899bc89e8 Mon Sep 17 00:00:00 2001
From: Brendan Dahl <brendan.dahl@gmail.com>
Date: Tue, 28 May 2024 16:33:20 -0700
Subject: [PATCH 036/230] [WebAssembly] Implement all f16x8 binary
 instructions. (#93360)

This reuses most of the code that was created for f32x4 and f64x2 binary
instructions and tries to follow how they were implemented.

add/sub/mul/div - use regular LLVM IR instructions
min/max - use the minimum/maximum intrinsic, and also have builtins
pmin/pmax - use the wasm.pmax/pmin intrinsics and also have builtins

Specified at:

https://github.com/WebAssembly/half-precision/blob/29a9b9462c9285d4ccc1a5dc39214ddfd1892658/proposals/half-precision/Overview.md
---
 .../clang/Basic/BuiltinsWebAssembly.def       |  4 ++
 clang/lib/CodeGen/CGBuiltin.cpp               |  4 ++
 clang/test/CodeGen/builtins-wasm.c            | 24 +++++++
 .../WebAssembly/WebAssemblyISelLowering.cpp   |  5 ++
 .../WebAssembly/WebAssemblyInstrSIMD.td       | 43 +++++++++---
 .../CodeGen/WebAssembly/half-precision.ll     | 68 +++++++++++++++++++
 llvm/test/MC/WebAssembly/simd-encodings.s     | 24 +++++++
 7 files changed, 163 insertions(+), 9 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def
index fd8c1b480d6da0..4e48ff48b60f5f 100644
--- a/clang/include/clang/Basic/BuiltinsWebAssembly.def
+++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def
@@ -135,6 +135,10 @@ TARGET_BUILTIN(__builtin_wasm_min_f64x2, "V2dV2dV2d", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_max_f64x2, "V2dV2dV2d", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_pmin_f64x2, "V2dV2dV2d", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_pmax_f64x2, "V2dV2dV2d", "nc", "simd128")
+TARGET_BUILTIN(__builtin_wasm_min_f16x8, "V8hV8hV8h", "nc", "half-precision")
+TARGET_BUILTIN(__builtin_wasm_max_f16x8, "V8hV8hV8h", "nc", "half-precision")
+TARGET_BUILTIN(__builtin_wasm_pmin_f16x8, "V8hV8hV8h", "nc", "half-precision")
+TARGET_BUILTIN(__builtin_wasm_pmax_f16x8, "V8hV8hV8h", "nc", "half-precision")
 
 TARGET_BUILTIN(__builtin_wasm_ceil_f32x4, "V4fV4f", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_floor_f32x4, "V4fV4f", "nc", "simd128")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 5edf8c79709131..a3c65105033247 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -20806,6 +20806,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
   }
   case WebAssembly::BI__builtin_wasm_min_f32:
   case WebAssembly::BI__builtin_wasm_min_f64:
+  case WebAssembly::BI__builtin_wasm_min_f16x8:
   case WebAssembly::BI__builtin_wasm_min_f32x4:
   case WebAssembly::BI__builtin_wasm_min_f64x2: {
     Value *LHS = EmitScalarExpr(E->getArg(0));
@@ -20816,6 +20817,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
   }
   case WebAssembly::BI__builtin_wasm_max_f32:
   case WebAssembly::BI__builtin_wasm_max_f64:
+  case WebAssembly::BI__builtin_wasm_max_f16x8:
   case WebAssembly::BI__builtin_wasm_max_f32x4:
   case WebAssembly::BI__builtin_wasm_max_f64x2: {
     Value *LHS = EmitScalarExpr(E->getArg(0));
@@ -20824,6 +20826,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
         CGM.getIntrinsic(Intrinsic::maximum, ConvertType(E->getType()));
     return Builder.CreateCall(Callee, {LHS, RHS});
   }
+  case WebAssembly::BI__builtin_wasm_pmin_f16x8:
   case WebAssembly::BI__builtin_wasm_pmin_f32x4:
   case WebAssembly::BI__builtin_wasm_pmin_f64x2: {
     Value *LHS = EmitScalarExpr(E->getArg(0));
@@ -20832,6 +20835,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
         CGM.getIntrinsic(Intrinsic::wasm_pmin, ConvertType(E->getType()));
     return Builder.CreateCall(Callee, {LHS, RHS});
   }
+  case WebAssembly::BI__builtin_wasm_pmax_f16x8:
   case WebAssembly::BI__builtin_wasm_pmax_f32x4:
   case WebAssembly::BI__builtin_wasm_pmax_f64x2: {
     Value *LHS = EmitScalarExpr(E->getArg(0));
diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c
index 93a6ab06081c99..d6ee4f68700dca 100644
--- a/clang/test/CodeGen/builtins-wasm.c
+++ b/clang/test/CodeGen/builtins-wasm.c
@@ -825,6 +825,30 @@ float extract_lane_f16x8(f16x8 a, int i) {
   // WEBASSEMBLY-NEXT: ret float %0
   return __builtin_wasm_extract_lane_f16x8(a, i);
 }
+
+f16x8 min_f16x8(f16x8 a, f16x8 b) {
+  // WEBASSEMBLY:  %0 = tail call <8 x half> @llvm.minimum.v8f16(<8 x half> %a, <8 x half> %b)
+  // WEBASSEMBLY-NEXT: ret <8 x half> %0
+  return __builtin_wasm_min_f16x8(a, b);
+}
+
+f16x8 max_f16x8(f16x8 a, f16x8 b) {
+  // WEBASSEMBLY:  %0 = tail call <8 x half> @llvm.maximum.v8f16(<8 x half> %a, <8 x half> %b)
+  // WEBASSEMBLY-NEXT: ret <8 x half> %0
+  return __builtin_wasm_max_f16x8(a, b);
+}
+
+f16x8 pmin_f16x8(f16x8 a, f16x8 b) {
+  // WEBASSEMBLY:  %0 = tail call <8 x half> @llvm.wasm.pmin.v8f16(<8 x half> %a, <8 x half> %b)
+  // WEBASSEMBLY-NEXT: ret <8 x half> %0
+  return __builtin_wasm_pmin_f16x8(a, b);
+}
+
+f16x8 pmax_f16x8(f16x8 a, f16x8 b) {
+  // WEBASSEMBLY:  %0 = tail call <8 x half> @llvm.wasm.pmax.v8f16(<8 x half> %a, <8 x half> %b)
+  // WEBASSEMBLY-NEXT: ret <8 x half> %0
+  return __builtin_wasm_pmax_f16x8(a, b);
+}
 __externref_t externref_null() {
   return __builtin_wasm_ref_null_extern();
   // WEBASSEMBLY: tail call ptr addrspace(10) @llvm.wasm.ref.null.extern()
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index f9f16498bb390c..4beab9d091581b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -145,6 +145,11 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     setTruncStoreAction(T, MVT::f16, Expand);
   }
 
+  if (Subtarget->hasHalfPrecision()) {
+    setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
+  }
+
   // Expand unavailable integer operations.
   for (auto Op :
        {ISD::BSWAP, ISD::SMUL_LOHI, ISD::UMUL_LOHI, ISD::MULHS, ISD::MULHU,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 558e3d859dcd84..baf15ccdbe9edb 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -16,33 +16,34 @@
 multiclass ABSTRACT_SIMD_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
                            list<dag> pattern_r, string asmstr_r,
                            string asmstr_s, bits<32> simdop,
-                           Predicate simd_level> {
+                           list<Predicate> reqs> {
   defm "" : I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r, asmstr_s,
               !if(!ge(simdop, 0x100),
                   !or(0xfd0000, !and(0xffff, simdop)),
                   !or(0xfd00, !and(0xff, simdop)))>,
-            Requires<[simd_level]>;
+            Requires<reqs>;
 }
 
 multiclass SIMD_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
                   list<dag> pattern_r, string asmstr_r = "",
-                  string asmstr_s = "", bits<32> simdop = -1> {
+                  string asmstr_s = "", bits<32> simdop = -1,
+                  list<Predicate> reqs = []> {
   defm "" : ABSTRACT_SIMD_I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r,
-                            asmstr_s, simdop, HasSIMD128>;
+                            asmstr_s, simdop, !listconcat([HasSIMD128], reqs)>;
 }
 
 multiclass RELAXED_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
                      list<dag> pattern_r, string asmstr_r = "",
                      string asmstr_s = "", bits<32> simdop = -1> {
   defm "" : ABSTRACT_SIMD_I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r,
-                            asmstr_s, simdop, HasRelaxedSIMD>;
+                            asmstr_s, simdop, [HasRelaxedSIMD]>;
 }
 
 multiclass HALF_PRECISION_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
                             list<dag> pattern_r, string asmstr_r = "",
                             string asmstr_s = "", bits<32> simdop = -1> {
   defm "" : ABSTRACT_SIMD_I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r,
-                            asmstr_s, simdop, HasHalfPrecision>;
+                            asmstr_s, simdop, [HasHalfPrecision]>;
 }
 
 
@@ -152,6 +153,19 @@ def F64x2 : Vec {
   let prefix = "f64x2";
 }
 
+def F16x8 : Vec {
+ let vt = v8f16;
+ let int_vt = v8i16;
+ let lane_vt = f32;
+ let lane_rc = F32;
+ let lane_bits = 16;
+ let lane_idx = LaneIdx8;
+ let lane_load = int_wasm_loadf16_f32;
+ let splat = PatFrag<(ops node:$x), (v8f16 (splat_vector (f16 $x)))>;
+ let prefix = "f16x8";
+}
+
+// TODO: Include F16x8 here when half precision is better supported.
 defvar AllVecs = [I8x16, I16x8, I32x4, I64x2, F32x4, F64x2];
 defvar IntVecs = [I8x16, I16x8, I32x4, I64x2];
 
@@ -781,13 +795,19 @@ def : Pat<(v2i64 (nodes[0] (v2f64 V128:$lhs), (v2f64 V128:$rhs))),
 // Bitwise operations
 //===----------------------------------------------------------------------===//
 
-multiclass SIMDBinary<Vec vec, SDPatternOperator node, string name, bits<32> simdop> {
+multiclass SIMDBinary<Vec vec, SDPatternOperator node, string name,
+                      bits<32> simdop, list<Predicate> reqs = []> {
   defm _#vec : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
                       (outs), (ins),
                       [(set (vec.vt V128:$dst),
                         (node (vec.vt V128:$lhs), (vec.vt V128:$rhs)))],
                       vec.prefix#"."#name#"\t$dst, $lhs, $rhs",
-                      vec.prefix#"."#name, simdop>;
+                      vec.prefix#"."#name, simdop, reqs>;
+}
+
+multiclass HalfPrecisionBinary<Vec vec, SDPatternOperator node, string name,
+                               bits<32> simdop> {
+  defm "" : SIMDBinary<vec, node, name, simdop, [HasHalfPrecision]>;
 }
 
 multiclass SIMDBitwise<SDPatternOperator node, string name, bits<32> simdop,
@@ -1199,6 +1219,7 @@ def : Pat<(v2f64 (froundeven (v2f64 V128:$src))), (NEAREST_F64x2 V128:$src)>;
 multiclass SIMDBinaryFP<SDPatternOperator node, string name, bits<32> baseInst> {
   defm "" : SIMDBinary<F32x4, node, name, baseInst>;
   defm "" : SIMDBinary<F64x2, node, name, !add(baseInst, 12)>;
+  defm "" : HalfPrecisionBinary<F16x8, node, name, !add(baseInst, 80)>;
 }
 
 // Addition: add
@@ -1242,7 +1263,7 @@ defm PMAX : SIMDBinaryFP<pmax, "pmax", 235>;
 // Also match the pmin/pmax cases where the operands are int vectors (but the
 // comparison is still a floating point comparison). This can happen when using
 // the wasm_simd128.h intrinsics because v128_t is an integer vector.
-foreach vec = [F32x4, F64x2] in {
+foreach vec = [F32x4, F64x2, F16x8] in {
 defvar pmin = !cast<NI>("PMIN_"#vec);
 defvar pmax = !cast<NI>("PMAX_"#vec);
 def : Pat<(vec.int_vt (vselect
@@ -1266,6 +1287,10 @@ def : Pat<(v2f64 (int_wasm_pmin (v2f64 V128:$lhs), (v2f64 V128:$rhs))),
           (PMIN_F64x2 V128:$lhs, V128:$rhs)>;
 def : Pat<(v2f64 (int_wasm_pmax (v2f64 V128:$lhs), (v2f64 V128:$rhs))),
           (PMAX_F64x2 V128:$lhs, V128:$rhs)>;
+def : Pat<(v8f16 (int_wasm_pmin (v8f16 V128:$lhs), (v8f16 V128:$rhs))),
+          (PMIN_F16x8 V128:$lhs, V128:$rhs)>;
+def : Pat<(v8f16 (int_wasm_pmax (v8f16 V128:$lhs), (v8f16 V128:$rhs))),
+          (PMAX_F16x8 V128:$lhs, V128:$rhs)>;
 
 //===----------------------------------------------------------------------===//
 // Conversions
diff --git a/llvm/test/CodeGen/WebAssembly/half-precision.ll b/llvm/test/CodeGen/WebAssembly/half-precision.ll
index d9d3f6be800fdd..73ccea8d652db8 100644
--- a/llvm/test/CodeGen/WebAssembly/half-precision.ll
+++ b/llvm/test/CodeGen/WebAssembly/half-precision.ll
@@ -35,3 +35,71 @@ define float @extract_lane_v8f16(<8 x half> %v) {
   %r = call float @llvm.wasm.extract.lane.f16x8(<8 x half> %v, i32 1)
   ret float %r
 }
+
+; CHECK-LABEL: add_v8f16:
+; CHECK:       f16x8.add $push0=, $0, $1
+; CHECK-NEXT:  return $pop0
+define <8 x half> @add_v8f16(<8 x half> %a, <8 x half> %b) {
+  %r = fadd <8 x half> %a, %b
+  ret <8 x half> %r
+}
+
+; CHECK-LABEL: sub_v8f16:
+; CHECK:       f16x8.sub $push0=, $0, $1
+; CHECK-NEXT:  return $pop0
+define <8 x half> @sub_v8f16(<8 x half> %a, <8 x half> %b) {
+  %r = fsub <8 x half> %a, %b
+  ret <8 x half> %r
+}
+
+; CHECK-LABEL: mul_v8f16:
+; CHECK:       f16x8.mul $push0=, $0, $1
+; CHECK-NEXT:  return $pop0
+define <8 x half> @mul_v8f16(<8 x half> %a, <8 x half> %b) {
+  %r = fmul <8 x half> %a, %b
+  ret <8 x half> %r
+}
+
+; CHECK-LABEL: div_v8f16:
+; CHECK:       f16x8.div $push0=, $0, $1
+; CHECK-NEXT:  return $pop0
+define <8 x half> @div_v8f16(<8 x half> %a, <8 x half> %b) {
+  %r = fdiv <8 x half> %a, %b
+  ret <8 x half> %r
+}
+
+; CHECK-LABEL: min_intrinsic_v8f16:
+; CHECK:       f16x8.min $push0=, $0, $1
+; CHECK-NEXT:  return $pop0
+declare <8 x half> @llvm.minimum.v8f16(<8 x half>, <8 x half>)
+define <8 x half> @min_intrinsic_v8f16(<8 x half> %x, <8 x half> %y) {
+  %a = call <8 x half> @llvm.minimum.v8f16(<8 x half> %x, <8 x half> %y)
+  ret <8 x half> %a
+}
+
+; CHECK-LABEL: max_intrinsic_v8f16:
+; CHECK:       f16x8.max $push0=, $0, $1
+; CHECK-NEXT:  return $pop0
+declare <8 x half> @llvm.maximum.v8f16(<8 x half>, <8 x half>)
+define <8 x half> @max_intrinsic_v8f16(<8 x half> %x, <8 x half> %y) {
+  %a = call <8 x half> @llvm.maximum.v8f16(<8 x half> %x, <8 x half> %y)
+  ret <8 x half> %a
+}
+
+; CHECK-LABEL: pmin_intrinsic_v8f16:
+; CHECK:       f16x8.pmin $push0=, $0, $1
+; CHECK-NEXT:  return $pop0
+declare <8 x half> @llvm.wasm.pmin.v8f16(<8 x half>, <8 x half>)
+define <8 x half> @pmin_intrinsic_v8f16(<8 x half> %a, <8 x half> %b) {
+  %v = call <8 x half> @llvm.wasm.pmin.v8f16(<8 x half> %a, <8 x half> %b)
+  ret <8 x half> %v
+}
+
+; CHECK-LABEL: pmax_intrinsic_v8f16:
+; CHECK:       f16x8.pmax $push0=, $0, $1
+; CHECK-NEXT:  return $pop0
+declare <8 x half> @llvm.wasm.pmax.v8f16(<8 x half>, <8 x half>)
+define <8 x half> @pmax_intrinsic_v8f16(<8 x half> %a, <8 x half> %b) {
+  %v = call <8 x half> @llvm.wasm.pmax.v8f16(<8 x half> %a, <8 x half> %b)
+  ret <8 x half> %v
+}
diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s
index d397188a9882ea..113a23da776fa9 100644
--- a/llvm/test/MC/WebAssembly/simd-encodings.s
+++ b/llvm/test/MC/WebAssembly/simd-encodings.s
@@ -851,4 +851,28 @@ main:
     # CHECK: f16x8.extract_lane 1 # encoding: [0xfd,0xa1,0x02,0x01]
     f16x8.extract_lane 1
 
+    # CHECK: f16x8.add # encoding: [0xfd,0xb4,0x02]
+    f16x8.add
+
+    # CHECK: f16x8.sub # encoding: [0xfd,0xb5,0x02]
+    f16x8.sub
+
+    # CHECK: f16x8.mul # encoding: [0xfd,0xb6,0x02]
+    f16x8.mul
+
+    # CHECK: f16x8.div # encoding: [0xfd,0xb7,0x02]
+    f16x8.div
+
+    # CHECK: f16x8.min # encoding: [0xfd,0xb8,0x02]
+    f16x8.min
+
+    # CHECK: f16x8.max # encoding: [0xfd,0xb9,0x02]
+    f16x8.max
+
+    # CHECK: f16x8.pmin # encoding: [0xfd,0xba,0x02]
+    f16x8.pmin
+
+    # CHECK: f16x8.pmax # encoding: [0xfd,0xbb,0x02]
+    f16x8.pmax
+
     end_function

From 0edc97f119f3ac3ff96b11183fe5c001a48a9a8d Mon Sep 17 00:00:00 2001
From: Ahmed Bougacha <ahmed@bougacha.org>
Date: Tue, 28 May 2024 16:39:09 -0700
Subject: [PATCH 037/230] [IR][AArch64][PAC] Add "ptrauth(...)" Constant to
 represent signed pointers. (#85738)

This defines a new kind of IR Constant that represents a ptrauth signed
pointer, as used in AArch64 PAuth.

It allows representing most kinds of signed pointer constants used thus
far in the llvm ptrauth implementations, notably those used in the
Darwin and ELF ABIs being implemented for c/c++.  These signed pointer
constants are then lowered to ELF/MachO relocations.

These can be simply thought of as a constant `llvm.ptrauth.sign`, with
the interesting addition of discriminator computation: the `ptrauth`
constant can also represent a combined blend, when both address and
integer discriminator operands are used.  Both operands are otherwise
optional, with default values 0/null.
---
 llvm/docs/LangRef.rst                         |  34 +++++
 llvm/docs/PointerAuth.md                      |  22 ++++
 llvm/include/llvm/AsmParser/LLToken.h         |   1 +
 llvm/include/llvm/Bitcode/LLVMBitCodes.h      |   1 +
 llvm/include/llvm/IR/Constants.h              |  66 ++++++++++
 llvm/include/llvm/IR/Value.def                |   1 +
 llvm/lib/Analysis/ValueTracking.cpp           |   4 +
 llvm/lib/AsmParser/LLLexer.cpp                |   1 +
 llvm/lib/AsmParser/LLParser.cpp               |  54 ++++++++
 llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp   |   1 +
 llvm/lib/Bitcode/Reader/BitcodeReader.cpp     |  25 +++-
 llvm/lib/Bitcode/Writer/BitcodeWriter.cpp     |   6 +
 llvm/lib/IR/AsmWriter.cpp                     |  21 +++
 llvm/lib/IR/Constants.cpp                     | 121 ++++++++++++++++++
 llvm/lib/IR/ConstantsContext.h                |  47 +++++++
 llvm/lib/IR/LLVMContextImpl.h                 |   2 +
 llvm/lib/IR/Verifier.cpp                      |  23 ++++
 llvm/test/Assembler/invalid-ptrauth-const1.ll |   6 +
 llvm/test/Assembler/invalid-ptrauth-const2.ll |   6 +
 llvm/test/Assembler/invalid-ptrauth-const3.ll |   6 +
 llvm/test/Assembler/invalid-ptrauth-const4.ll |   6 +
 llvm/test/Assembler/invalid-ptrauth-const5.ll |   6 +
 llvm/test/Assembler/ptrauth-const.ll          |  24 ++++
 llvm/test/Bitcode/compatibility.ll            |   4 +
 llvm/utils/vim/syntax/llvm.vim                |   1 +
 25 files changed, 488 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Assembler/invalid-ptrauth-const1.ll
 create mode 100644 llvm/test/Assembler/invalid-ptrauth-const2.ll
 create mode 100644 llvm/test/Assembler/invalid-ptrauth-const3.ll
 create mode 100644 llvm/test/Assembler/invalid-ptrauth-const4.ll
 create mode 100644 llvm/test/Assembler/invalid-ptrauth-const5.ll
 create mode 100644 llvm/test/Assembler/ptrauth-const.ll

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 614dd98b013b35..7b64c477d13c7f 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -4754,6 +4754,40 @@ reference to the CFI jump table in the ``LowerTypeTests`` pass. These constants
 may be useful in low-level programs, such as operating system kernels, which
 need to refer to the actual function body.
 
+.. _ptrauth_constant:
+
+Pointer Authentication Constants
+--------------------------------
+
+``ptrauth (ptr CST, i32 KEY[, i64 DISC[, ptr ADDRDISC]?]?)``
+
+A '``ptrauth``' constant represents a pointer with a cryptographic
+authentication signature embedded into some bits, as described in the
+`Pointer Authentication <PointerAuth.html>`__ document.
+
+A '``ptrauth``' constant is simply a constant equivalent to the
+``llvm.ptrauth.sign`` intrinsic, potentially fed by a discriminator
+``llvm.ptrauth.blend`` if needed.
+
+Its type is the same as the first argument.  An integer constant discriminator
+and an address discriminator may be optionally specified.  Otherwise, they have
+values ``i64 0`` and ``ptr null``.
+
+If the address discriminator is ``null`` then the expression is equivalent to:
+
+.. code-block:: llvm
+
+    %tmp = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr CST to i64), i32 KEY, i64 DISC)
+    %val = inttoptr i64 %tmp to ptr
+
+Otherwise, the expression is equivalent to:
+
+.. code-block:: llvm
+
+    %tmp1 = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr ADDRDISC to i64), i64 DISC)
+    %tmp2 = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr CST to i64), i32 KEY, i64 %tmp1)
+    %val = inttoptr i64 %tmp2 to ptr
+
 .. _constantexprs:
 
 Constant Expressions
diff --git a/llvm/docs/PointerAuth.md b/llvm/docs/PointerAuth.md
index a8d2b4d8f5f0bd..cf2cc6305f130f 100644
--- a/llvm/docs/PointerAuth.md
+++ b/llvm/docs/PointerAuth.md
@@ -16,6 +16,7 @@ For more details, see the clang documentation page for
 At the IR level, it is represented using:
 
 * a [set of intrinsics](#intrinsics) (to sign/authenticate pointers)
+* a [signed pointer constant](#constant) (to sign globals)
 * a [call operand bundle](#operand-bundle) (to authenticate called pointers)
 
 The current implementation leverages the
@@ -225,6 +226,27 @@ with a pointer address discriminator, in a way that is specified by the target
 implementation.
 
 
+### Constant
+
+[Intrinsics](#intrinsics) can be used to produce signed pointers dynamically,
+in code, but not for signed pointers referenced by constants, in, e.g., global
+initializers.
+
+The latter are represented using a
+[``ptrauth`` constant](https://llvm.org/docs/LangRef.html#ptrauth-constant),
+which describes an authenticated relocation producing a signed pointer.
+
+```llvm
+ptrauth (ptr CST, i32 KEY, i64 DISC, ptr ADDRDISC)
+```
+
+is equivalent to:
+
+```llvm
+  %disc = call i64 @llvm.ptrauth.blend(i64 ptrtoint(ptr ADDRDISC to i64), i64 DISC)
+  %signedval = call i64 @llvm.ptrauth.sign(i64 ptrtoint(ptr CST to i64), i32 KEY, i64 %disc)
+```
+
 ### Operand Bundle
 
 Function pointers used as indirect call targets can be signed when materialized,
diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h
index df61ec6ed30e0b..69821c22dcd619 100644
--- a/llvm/include/llvm/AsmParser/LLToken.h
+++ b/llvm/include/llvm/AsmParser/LLToken.h
@@ -346,6 +346,7 @@ enum Kind {
   kw_blockaddress,
   kw_dso_local_equivalent,
   kw_no_cfi,
+  kw_ptrauth,
 
   kw_freeze,
 
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index d3b9e96520f88a..9999aee61528e5 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -413,6 +413,7 @@ enum ConstantsCodes {
                                       //                 asmstr,conststr]
   CST_CODE_CE_GEP_WITH_INRANGE = 31,  // [opty, flags, range, n x operands]
   CST_CODE_CE_GEP = 32,               // [opty, flags, n x operands]
+  CST_CODE_PTRAUTH = 33,              // [ptr, key, disc, addrdisc]
 };
 
 /// CastOpcodes - These are values used in the bitcode files to encode which
diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h
index a1e5005a9d1da5..86f6be7985a23f 100644
--- a/llvm/include/llvm/IR/Constants.h
+++ b/llvm/include/llvm/IR/Constants.h
@@ -1008,6 +1008,72 @@ struct OperandTraits<NoCFIValue> : public FixedNumOperandTraits<NoCFIValue, 1> {
 
 DEFINE_TRANSPARENT_OPERAND_ACCESSORS(NoCFIValue, Value)
 
+/// A signed pointer, in the ptrauth sense.
+class ConstantPtrAuth final : public Constant {
+  friend struct ConstantPtrAuthKeyType;
+  friend class Constant;
+
+  ConstantPtrAuth(Constant *Ptr, ConstantInt *Key, ConstantInt *Disc,
+                  Constant *AddrDisc);
+
+  void *operator new(size_t s) { return User::operator new(s, 4); }
+
+  void destroyConstantImpl();
+  Value *handleOperandChangeImpl(Value *From, Value *To);
+
+public:
+  /// Return a pointer signed with the specified parameters.
+  static ConstantPtrAuth *get(Constant *Ptr, ConstantInt *Key,
+                              ConstantInt *Disc, Constant *AddrDisc);
+
+  /// Produce a new ptrauth expression signing the given value using
+  /// the same schema as is stored in this one.
+  ConstantPtrAuth *getWithSameSchema(Constant *Pointer) const;
+
+  /// Transparently provide more efficient getOperand methods.
+  DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Constant);
+
+  /// The pointer that is signed in this ptrauth signed pointer.
+  Constant *getPointer() const { return cast<Constant>(Op<0>().get()); }
+
+  /// The Key ID, an i32 constant.
+  ConstantInt *getKey() const { return cast<ConstantInt>(Op<1>().get()); }
+
+  /// The integer discriminator, an i64 constant, or 0.
+  ConstantInt *getDiscriminator() const {
+    return cast<ConstantInt>(Op<2>().get());
+  }
+
+  /// The address discriminator if any, or the null constant.
+  /// If present, this must be a value equivalent to the storage location of
+  /// the only global-initializer user of the ptrauth signed pointer.
+  Constant *getAddrDiscriminator() const {
+    return cast<Constant>(Op<3>().get());
+  }
+
+  /// Whether there is any non-null address discriminator.
+  bool hasAddressDiscriminator() const {
+    return !getAddrDiscriminator()->isNullValue();
+  }
+
+  /// Check whether an authentication operation with key \p Key and (possibly
+  /// blended) discriminator \p Discriminator is known to be compatible with
+  /// this ptrauth signed pointer.
+  bool isKnownCompatibleWith(const Value *Key, const Value *Discriminator,
+                             const DataLayout &DL) const;
+
+  /// Methods for support type inquiry through isa, cast, and dyn_cast:
+  static bool classof(const Value *V) {
+    return V->getValueID() == ConstantPtrAuthVal;
+  }
+};
+
+template <>
+struct OperandTraits<ConstantPtrAuth>
+    : public FixedNumOperandTraits<ConstantPtrAuth, 4> {};
+
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantPtrAuth, Constant)
+
 //===----------------------------------------------------------------------===//
 /// A constant value that is initialized with an expression using
 /// other constant values.
diff --git a/llvm/include/llvm/IR/Value.def b/llvm/include/llvm/IR/Value.def
index 61f7a87666d094..3ece66a529e125 100644
--- a/llvm/include/llvm/IR/Value.def
+++ b/llvm/include/llvm/IR/Value.def
@@ -81,6 +81,7 @@ HANDLE_CONSTANT(BlockAddress)
 HANDLE_CONSTANT(ConstantExpr)
 HANDLE_CONSTANT_EXCLUDE_LLVM_C_API(DSOLocalEquivalent)
 HANDLE_CONSTANT_EXCLUDE_LLVM_C_API(NoCFIValue)
+HANDLE_CONSTANT_EXCLUDE_LLVM_C_API(ConstantPtrAuth)
 
 // ConstantAggregate.
 HANDLE_CONSTANT(ConstantArray)
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 3baa8ede28ffaf..08138a5e2f2d9d 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -3140,6 +3140,10 @@ bool isKnownNonZero(const Value *V, const APInt &DemandedElts,
       return true;
     }
 
+    // Constant ptrauth can be null, iff the base pointer can be.
+    if (auto *CPA = dyn_cast<ConstantPtrAuth>(V))
+      return isKnownNonZero(CPA->getPointer(), DemandedElts, Q, Depth);
+
     // A global variable in address space 0 is non null unless extern weak
     // or an absolute symbol reference. Other address spaces may have null as a
     // valid address for a global, so we can't assume anything.
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 20a1bd29577124..d3ab306904da12 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -710,6 +710,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(blockaddress);
   KEYWORD(dso_local_equivalent);
   KEYWORD(no_cfi);
+  KEYWORD(ptrauth);
 
   // Metadata types.
   KEYWORD(distinct);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 5d2056d2085672..df0827996396ef 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -4046,6 +4046,60 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) {
     ID.NoCFI = true;
     return false;
   }
+  case lltok::kw_ptrauth: {
+    // ValID ::= 'ptrauth' '(' ptr @foo ',' i32 <key>
+    //                         (',' i64 <disc> (',' ptr addrdisc)? )? ')'
+    Lex.Lex();
+
+    Constant *Ptr, *Key;
+    Constant *Disc = nullptr, *AddrDisc = nullptr;
+
+    if (parseToken(lltok::lparen,
+                   "expected '(' in constant ptrauth expression") ||
+        parseGlobalTypeAndValue(Ptr) ||
+        parseToken(lltok::comma,
+                   "expected comma in constant ptrauth expression") ||
+        parseGlobalTypeAndValue(Key))
+      return true;
+    // If present, parse the optional disc/addrdisc.
+    if (EatIfPresent(lltok::comma))
+      if (parseGlobalTypeAndValue(Disc) ||
+          (EatIfPresent(lltok::comma) && parseGlobalTypeAndValue(AddrDisc)))
+        return true;
+    if (parseToken(lltok::rparen,
+                   "expected ')' in constant ptrauth expression"))
+      return true;
+
+    if (!Ptr->getType()->isPointerTy())
+      return error(ID.Loc, "constant ptrauth base pointer must be a pointer");
+
+    auto *KeyC = dyn_cast<ConstantInt>(Key);
+    if (!KeyC || KeyC->getBitWidth() != 32)
+      return error(ID.Loc, "constant ptrauth key must be i32 constant");
+
+    ConstantInt *DiscC = nullptr;
+    if (Disc) {
+      DiscC = dyn_cast<ConstantInt>(Disc);
+      if (!DiscC || DiscC->getBitWidth() != 64)
+        return error(
+            ID.Loc,
+            "constant ptrauth integer discriminator must be i64 constant");
+    } else {
+      DiscC = ConstantInt::get(Type::getInt64Ty(Context), 0);
+    }
+
+    if (AddrDisc) {
+      if (!AddrDisc->getType()->isPointerTy())
+        return error(
+            ID.Loc, "constant ptrauth address discriminator must be a pointer");
+    } else {
+      AddrDisc = ConstantPointerNull::get(PointerType::get(Context, 0));
+    }
+
+    ID.ConstantVal = ConstantPtrAuth::get(Ptr, KeyC, DiscC, AddrDisc);
+    ID.Kind = ValID::t_Constant;
+    return false;
+  }
 
   case lltok::kw_trunc:
   case lltok::kw_bitcast:
diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
index c085c715179ba6..b7ed9cdf631454 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
@@ -222,6 +222,7 @@ GetCodeName(unsigned CodeID, unsigned BlockID,
       STRINGIFY_CODE(CST_CODE, CE_UNOP)
       STRINGIFY_CODE(CST_CODE, DSO_LOCAL_EQUIVALENT)
       STRINGIFY_CODE(CST_CODE, NO_CFI_VALUE)
+      STRINGIFY_CODE(CST_CODE, PTRAUTH)
     case bitc::CST_CODE_BLOCKADDRESS:
       return "CST_CODE_BLOCKADDRESS";
       STRINGIFY_CODE(CST_CODE, DATA)
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 32b9a033173e93..aee627bbde0bf5 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -517,7 +517,8 @@ class BitcodeConstant final : public Value,
   static constexpr uint8_t NoCFIOpcode = 252;
   static constexpr uint8_t DSOLocalEquivalentOpcode = 251;
   static constexpr uint8_t BlockAddressOpcode = 250;
-  static constexpr uint8_t FirstSpecialOpcode = BlockAddressOpcode;
+  static constexpr uint8_t ConstantPtrAuthOpcode = 249;
+  static constexpr uint8_t FirstSpecialOpcode = ConstantPtrAuthOpcode;
 
   // Separate struct to make passing different number of parameters to
   // BitcodeConstant::create() more convenient.
@@ -1562,6 +1563,18 @@ Expected<Value *> BitcodeReader::materializeValue(unsigned StartValID,
         C = ConstantExpr::get(BC->Opcode, ConstOps[0], ConstOps[1], BC->Flags);
       } else {
         switch (BC->Opcode) {
+        case BitcodeConstant::ConstantPtrAuthOpcode: {
+          auto *Key = dyn_cast<ConstantInt>(ConstOps[1]);
+          if (!Key)
+            return error("ptrauth key operand must be ConstantInt");
+
+          auto *Disc = dyn_cast<ConstantInt>(ConstOps[2]);
+          if (!Disc)
+            return error("ptrauth disc operand must be ConstantInt");
+
+          C = ConstantPtrAuth::get(ConstOps[0], Key, Disc, ConstOps[3]);
+          break;
+        }
         case BitcodeConstant::NoCFIOpcode: {
           auto *GV = dyn_cast<GlobalValue>(ConstOps[0]);
           if (!GV)
@@ -3644,6 +3657,16 @@ Error BitcodeReader::parseConstants() {
                                   Record[1]);
       break;
     }
+    case bitc::CST_CODE_PTRAUTH: {
+      if (Record.size() < 4)
+        return error("Invalid ptrauth record");
+      // Ptr, Key, Disc, AddrDisc
+      V = BitcodeConstant::create(Alloc, CurTy,
+                                  BitcodeConstant::ConstantPtrAuthOpcode,
+                                  {(unsigned)Record[0], (unsigned)Record[1],
+                                   (unsigned)Record[2], (unsigned)Record[3]});
+      break;
+    }
     }
 
     assert(V->getType() == getTypeByID(CurTyID) && "Incorrect result type ID");
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 3d653fe4458f4b..046dad5721c4ce 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -2848,6 +2848,12 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal,
       Code = bitc::CST_CODE_NO_CFI_VALUE;
       Record.push_back(VE.getTypeID(NC->getGlobalValue()->getType()));
       Record.push_back(VE.getValueID(NC->getGlobalValue()));
+    } else if (const auto *CPA = dyn_cast<ConstantPtrAuth>(C)) {
+      Code = bitc::CST_CODE_PTRAUTH;
+      Record.push_back(VE.getValueID(CPA->getPointer()));
+      Record.push_back(VE.getValueID(CPA->getKey()));
+      Record.push_back(VE.getValueID(CPA->getDiscriminator()));
+      Record.push_back(VE.getValueID(CPA->getAddrDiscriminator()));
     } else {
 #ifndef NDEBUG
       C->dump();
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index ced5d78f994ab5..8b1a21f962b08f 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -1594,6 +1594,27 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
     return;
   }
 
+  if (const ConstantPtrAuth *CPA = dyn_cast<ConstantPtrAuth>(CV)) {
+    Out << "ptrauth (";
+
+    // ptrauth (ptr CST, i32 KEY[, i64 DISC[, ptr ADDRDISC]?]?)
+    unsigned NumOpsToWrite = 2;
+    if (!CPA->getOperand(2)->isNullValue())
+      NumOpsToWrite = 3;
+    if (!CPA->getOperand(3)->isNullValue())
+      NumOpsToWrite = 4;
+
+    ListSeparator LS;
+    for (unsigned i = 0, e = NumOpsToWrite; i != e; ++i) {
+      Out << LS;
+      WriterCtx.TypePrinter->print(CPA->getOperand(i)->getType(), Out);
+      Out << ' ';
+      WriteAsOperandInternal(Out, CPA->getOperand(i), WriterCtx);
+    }
+    Out << ')';
+    return;
+  }
+
   if (const ConstantArray *CA = dyn_cast<ConstantArray>(CV)) {
     Type *ETy = CA->getType()->getElementType();
     Out << '[';
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index cfb89d557db479..119fcb4fa03461 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -550,6 +550,9 @@ void llvm::deleteConstant(Constant *C) {
   case Constant::NoCFIValueVal:
     delete static_cast<NoCFIValue *>(C);
     break;
+  case Constant::ConstantPtrAuthVal:
+    delete static_cast<ConstantPtrAuth *>(C);
+    break;
   case Constant::UndefValueVal:
     delete static_cast<UndefValue *>(C);
     break;
@@ -2015,6 +2018,124 @@ Value *NoCFIValue::handleOperandChangeImpl(Value *From, Value *To) {
   return nullptr;
 }
 
+//---- ConstantPtrAuth::get() implementations.
+//
+
+ConstantPtrAuth *ConstantPtrAuth::get(Constant *Ptr, ConstantInt *Key,
+                                      ConstantInt *Disc, Constant *AddrDisc) {
+  Constant *ArgVec[] = {Ptr, Key, Disc, AddrDisc};
+  ConstantPtrAuthKeyType MapKey(ArgVec);
+  LLVMContextImpl *pImpl = Ptr->getContext().pImpl;
+  return pImpl->ConstantPtrAuths.getOrCreate(Ptr->getType(), MapKey);
+}
+
+ConstantPtrAuth *ConstantPtrAuth::getWithSameSchema(Constant *Pointer) const {
+  return get(Pointer, getKey(), getDiscriminator(), getAddrDiscriminator());
+}
+
+ConstantPtrAuth::ConstantPtrAuth(Constant *Ptr, ConstantInt *Key,
+                                 ConstantInt *Disc, Constant *AddrDisc)
+    : Constant(Ptr->getType(), Value::ConstantPtrAuthVal, &Op<0>(), 4) {
+  assert(Ptr->getType()->isPointerTy());
+  assert(Key->getBitWidth() == 32);
+  assert(Disc->getBitWidth() == 64);
+  assert(AddrDisc->getType()->isPointerTy());
+  setOperand(0, Ptr);
+  setOperand(1, Key);
+  setOperand(2, Disc);
+  setOperand(3, AddrDisc);
+}
+
+/// Remove the constant from the constant table.
+void ConstantPtrAuth::destroyConstantImpl() {
+  getType()->getContext().pImpl->ConstantPtrAuths.remove(this);
+}
+
+Value *ConstantPtrAuth::handleOperandChangeImpl(Value *From, Value *ToV) {
+  assert(isa<Constant>(ToV) && "Cannot make Constant refer to non-constant!");
+  Constant *To = cast<Constant>(ToV);
+
+  SmallVector<Constant *, 4> Values;
+  Values.reserve(getNumOperands());
+
+  unsigned NumUpdated = 0;
+
+  Use *OperandList = getOperandList();
+  unsigned OperandNo = 0;
+  for (Use *O = OperandList, *E = OperandList + getNumOperands(); O != E; ++O) {
+    Constant *Val = cast<Constant>(O->get());
+    if (Val == From) {
+      OperandNo = (O - OperandList);
+      Val = To;
+      ++NumUpdated;
+    }
+    Values.push_back(Val);
+  }
+
+  return getContext().pImpl->ConstantPtrAuths.replaceOperandsInPlace(
+      Values, this, From, To, NumUpdated, OperandNo);
+}
+
+bool ConstantPtrAuth::isKnownCompatibleWith(const Value *Key,
+                                            const Value *Discriminator,
+                                            const DataLayout &DL) const {
+  // If the keys are different, there's no chance for this to be compatible.
+  if (getKey() != Key)
+    return false;
+
+  // We can have 3 kinds of discriminators:
+  // - simple, integer-only:    `i64 x, ptr null` vs. `i64 x`
+  // - address-only:            `i64 0, ptr p` vs. `ptr p`
+  // - blended address/integer: `i64 x, ptr p` vs. `@llvm.ptrauth.blend(p, x)`
+
+  // If this constant has a simple discriminator (integer, no address), easy:
+  // it's compatible iff the provided full discriminator is also a simple
+  // discriminator, identical to our integer discriminator.
+  if (!hasAddressDiscriminator())
+    return getDiscriminator() == Discriminator;
+
+  // Otherwise, we can isolate address and integer discriminator components.
+  const Value *AddrDiscriminator = nullptr;
+
+  // This constant may or may not have an integer discriminator (instead of 0).
+  if (!getDiscriminator()->isNullValue()) {
+    // If it does, there's an implicit blend.  We need to have a matching blend
+    // intrinsic in the provided full discriminator.
+    if (!match(Discriminator,
+               m_Intrinsic<Intrinsic::ptrauth_blend>(
+                   m_Value(AddrDiscriminator), m_Specific(getDiscriminator()))))
+      return false;
+  } else {
+    // Otherwise, interpret the provided full discriminator as address-only.
+    AddrDiscriminator = Discriminator;
+  }
+
+  // Either way, we can now focus on comparing the address discriminators.
+
+  // Discriminators are i64, so the provided addr disc may be a ptrtoint.
+  if (auto *Cast = dyn_cast<PtrToIntOperator>(AddrDiscriminator))
+    AddrDiscriminator = Cast->getPointerOperand();
+
+  // Beyond that, we're only interested in compatible pointers.
+  if (getAddrDiscriminator()->getType() != AddrDiscriminator->getType())
+    return false;
+
+  // These are often the same constant GEP, making them trivially equivalent.
+  if (getAddrDiscriminator() == AddrDiscriminator)
+    return true;
+
+  // Finally, they may be equivalent base+offset expressions.
+  APInt Off1(DL.getIndexTypeSizeInBits(getAddrDiscriminator()->getType()), 0);
+  auto *Base1 = getAddrDiscriminator()->stripAndAccumulateConstantOffsets(
+      DL, Off1, /*AllowNonInbounds=*/true);
+
+  APInt Off2(DL.getIndexTypeSizeInBits(AddrDiscriminator->getType()), 0);
+  auto *Base2 = AddrDiscriminator->stripAndAccumulateConstantOffsets(
+      DL, Off2, /*AllowNonInbounds=*/true);
+
+  return Base1 == Base2 && Off1 == Off2;
+}
+
 //---- ConstantExpr::get() implementations.
 //
 
diff --git a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h
index 7067d0d121117b..5153880b5cab64 100644
--- a/llvm/lib/IR/ConstantsContext.h
+++ b/llvm/lib/IR/ConstantsContext.h
@@ -23,6 +23,7 @@
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -286,6 +287,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CompareConstantExpr, Value)
 template <class ConstantClass> struct ConstantAggrKeyType;
 struct InlineAsmKeyType;
 struct ConstantExprKeyType;
+struct ConstantPtrAuthKeyType;
 
 template <class ConstantClass> struct ConstantInfo;
 template <> struct ConstantInfo<ConstantExpr> {
@@ -308,6 +310,10 @@ template <> struct ConstantInfo<ConstantVector> {
   using ValType = ConstantAggrKeyType<ConstantVector>;
   using TypeClass = VectorType;
 };
+template <> struct ConstantInfo<ConstantPtrAuth> {
+  using ValType = ConstantPtrAuthKeyType;
+  using TypeClass = Type;
+};
 
 template <class ConstantClass> struct ConstantAggrKeyType {
   ArrayRef<Constant *> Operands;
@@ -536,6 +542,47 @@ struct ConstantExprKeyType {
   }
 };
 
+struct ConstantPtrAuthKeyType {
+  ArrayRef<Constant *> Operands;
+
+  ConstantPtrAuthKeyType(ArrayRef<Constant *> Operands) : Operands(Operands) {}
+
+  ConstantPtrAuthKeyType(ArrayRef<Constant *> Operands, const ConstantPtrAuth *)
+      : Operands(Operands) {}
+
+  ConstantPtrAuthKeyType(const ConstantPtrAuth *C,
+                         SmallVectorImpl<Constant *> &Storage) {
+    assert(Storage.empty() && "Expected empty storage");
+    for (unsigned I = 0, E = C->getNumOperands(); I != E; ++I)
+      Storage.push_back(cast<Constant>(C->getOperand(I)));
+    Operands = Storage;
+  }
+
+  bool operator==(const ConstantPtrAuthKeyType &X) const {
+    return Operands == X.Operands;
+  }
+
+  bool operator==(const ConstantPtrAuth *C) const {
+    if (Operands.size() != C->getNumOperands())
+      return false;
+    for (unsigned I = 0, E = Operands.size(); I != E; ++I)
+      if (Operands[I] != C->getOperand(I))
+        return false;
+    return true;
+  }
+
+  unsigned getHash() const {
+    return hash_combine_range(Operands.begin(), Operands.end());
+  }
+
+  using TypeClass = typename ConstantInfo<ConstantPtrAuth>::TypeClass;
+
+  ConstantPtrAuth *create(TypeClass *Ty) const {
+    return new ConstantPtrAuth(Operands[0], cast<ConstantInt>(Operands[1]),
+                               cast<ConstantInt>(Operands[2]), Operands[3]);
+  }
+};
+
 // Free memory for a given constant.  Assumes the constant has already been
 // removed from all relevant maps.
 void deleteConstant(Constant *C);
diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index 399fe0dad26c73..392e0d16f1761e 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -1562,6 +1562,8 @@ class LLVMContextImpl {
 
   DenseMap<const GlobalValue *, NoCFIValue *> NoCFIValues;
 
+  ConstantUniqueMap<ConstantPtrAuth> ConstantPtrAuths;
+
   ConstantUniqueMap<ConstantExpr> ExprConstants;
 
   ConstantUniqueMap<InlineAsm> InlineAsms;
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 50f8d6ec842017..684e54444621b5 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -629,6 +629,7 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
 
   void visitConstantExprsRecursively(const Constant *EntryC);
   void visitConstantExpr(const ConstantExpr *CE);
+  void visitConstantPtrAuth(const ConstantPtrAuth *CPA);
   void verifyInlineAsmCall(const CallBase &Call);
   void verifyStatepoint(const CallBase &Call);
   void verifyFrameRecoverIndices();
@@ -2422,6 +2423,9 @@ void Verifier::visitConstantExprsRecursively(const Constant *EntryC) {
     if (const auto *CE = dyn_cast<ConstantExpr>(C))
       visitConstantExpr(CE);
 
+    if (const auto *CPA = dyn_cast<ConstantPtrAuth>(C))
+      visitConstantPtrAuth(CPA);
+
     if (const auto *GV = dyn_cast<GlobalValue>(C)) {
       // Global Values get visited separately, but we do need to make sure
       // that the global value is in the correct module
@@ -2449,6 +2453,23 @@ void Verifier::visitConstantExpr(const ConstantExpr *CE) {
           "Invalid bitcast", CE);
 }
 
+void Verifier::visitConstantPtrAuth(const ConstantPtrAuth *CPA) {
+  Check(CPA->getPointer()->getType()->isPointerTy(),
+        "signed ptrauth constant base pointer must have pointer type");
+
+  Check(CPA->getType() == CPA->getPointer()->getType(),
+        "signed ptrauth constant must have same type as its base pointer");
+
+  Check(CPA->getKey()->getBitWidth() == 32,
+        "signed ptrauth constant key must be i32 constant integer");
+
+  Check(CPA->getAddrDiscriminator()->getType()->isPointerTy(),
+        "signed ptrauth constant address discriminator must be a pointer");
+
+  Check(CPA->getDiscriminator()->getBitWidth() == 64,
+        "signed ptrauth constant discriminator must be i64 constant integer");
+}
+
 bool Verifier::verifyAttributeCount(AttributeList Attrs, unsigned Params) {
   // There shouldn't be more attribute sets than there are parameters plus the
   // function and return value.
@@ -5090,6 +5111,8 @@ void Verifier::visitInstruction(Instruction &I) {
     } else if (isa<InlineAsm>(I.getOperand(i))) {
       Check(CBI && &CBI->getCalledOperandUse() == &I.getOperandUse(i),
             "Cannot take the address of an inline asm!", &I);
+    } else if (auto *CPA = dyn_cast<ConstantPtrAuth>(I.getOperand(i))) {
+      visitConstantExprsRecursively(CPA);
     } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(I.getOperand(i))) {
       if (CE->getType()->isPtrOrPtrVectorTy()) {
         // If we have a ConstantExpr pointer, we need to see if it came from an
diff --git a/llvm/test/Assembler/invalid-ptrauth-const1.ll b/llvm/test/Assembler/invalid-ptrauth-const1.ll
new file mode 100644
index 00000000000000..fba2e230782382
--- /dev/null
+++ b/llvm/test/Assembler/invalid-ptrauth-const1.ll
@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+@var = global i32 0
+
+; CHECK: error: constant ptrauth base pointer must be a pointer
+@auth_var = global ptr ptrauth (i32 42, i32 0)
diff --git a/llvm/test/Assembler/invalid-ptrauth-const2.ll b/llvm/test/Assembler/invalid-ptrauth-const2.ll
new file mode 100644
index 00000000000000..4499c42601c99e
--- /dev/null
+++ b/llvm/test/Assembler/invalid-ptrauth-const2.ll
@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+@var = global i32 0
+
+; CHECK: error: constant ptrauth key must be i32 constant
+@auth_var = global ptr ptrauth (ptr @var, i32 ptrtoint (ptr @var to i32))
diff --git a/llvm/test/Assembler/invalid-ptrauth-const3.ll b/llvm/test/Assembler/invalid-ptrauth-const3.ll
new file mode 100644
index 00000000000000..3f2688d92a0010
--- /dev/null
+++ b/llvm/test/Assembler/invalid-ptrauth-const3.ll
@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+@var = global i32 0
+
+; CHECK: error: constant ptrauth address discriminator must be a pointer
+@auth_var = global ptr ptrauth (ptr @var, i32 2, i64 65535, i8 0)
diff --git a/llvm/test/Assembler/invalid-ptrauth-const4.ll b/llvm/test/Assembler/invalid-ptrauth-const4.ll
new file mode 100644
index 00000000000000..843a220458a61b
--- /dev/null
+++ b/llvm/test/Assembler/invalid-ptrauth-const4.ll
@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+@var = global i32 0
+
+; CHECK: error: constant ptrauth integer discriminator must be i64 constant
+@auth_var = global ptr ptrauth (ptr @var, i32 2, ptr null, i64 ptrtoint (ptr @var to i64))
diff --git a/llvm/test/Assembler/invalid-ptrauth-const5.ll b/llvm/test/Assembler/invalid-ptrauth-const5.ll
new file mode 100644
index 00000000000000..9b47f6f5f423fc
--- /dev/null
+++ b/llvm/test/Assembler/invalid-ptrauth-const5.ll
@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+@var = global i32 0
+
+; CHECK: error: constant ptrauth integer discriminator must be i64 constant
+@auth_var = global ptr ptrauth (ptr @var, i32 2, ptr @var))
diff --git a/llvm/test/Assembler/ptrauth-const.ll b/llvm/test/Assembler/ptrauth-const.ll
new file mode 100644
index 00000000000000..94d35146d5927b
--- /dev/null
+++ b/llvm/test/Assembler/ptrauth-const.ll
@@ -0,0 +1,24 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+@var = global i32 0
+
+; CHECK: @basic = global ptr ptrauth (ptr @var, i32 0)
+@basic = global ptr ptrauth (ptr @var, i32 0)
+
+; CHECK: @keyed = global ptr ptrauth (ptr @var, i32 3)
+@keyed = global ptr ptrauth (ptr @var, i32 3)
+
+; CHECK: @intdisc = global ptr ptrauth (ptr @var, i32 0, i64 -1)
+@intdisc = global ptr ptrauth (ptr @var, i32 0, i64 -1)
+
+; CHECK: @addrdisc = global ptr ptrauth (ptr @var, i32 2, i64 1234, ptr @addrdisc)
+@addrdisc = global ptr ptrauth (ptr @var, i32 2, i64 1234, ptr @addrdisc)
+
+
+@var1 = addrspace(1) global i32 0
+
+; CHECK: @addrspace = global ptr addrspace(1) ptrauth (ptr addrspace(1) @var1, i32 0)
+@addrspace = global ptr addrspace(1) ptrauth (ptr addrspace(1) @var1, i32 0)
+
+; CHECK: @addrspace_addrdisc = addrspace(2) global ptr addrspace(1) ptrauth (ptr addrspace(1) @var1, i32 2, i64 1234, ptr addrspace(2) @addrspace_addrdisc)
+@addrspace_addrdisc = addrspace(2) global ptr addrspace(1) ptrauth (ptr addrspace(1) @var1, i32 2, i64 1234, ptr addrspace(2) @addrspace_addrdisc)
diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll
index b374924516d665..2a846e036924c7 100644
--- a/llvm/test/Bitcode/compatibility.ll
+++ b/llvm/test/Bitcode/compatibility.ll
@@ -217,6 +217,10 @@ declare void @g.f1()
 ; CHECK: @g.sanitize_address_dyninit = global i32 0, sanitize_address_dyninit
 ; CHECK: @g.sanitize_multiple = global i32 0, sanitize_memtag, sanitize_address_dyninit
 
+; ptrauth constant
+@auth_var = global ptr ptrauth (ptr @g1, i32 0, i64 65535, ptr null)
+; CHECK: @auth_var = global ptr ptrauth (ptr @g1, i32 0, i64 65535)
+
 ;; Aliases
 ; Format: @<Name> = [Linkage] [Visibility] [DLLStorageClass] [ThreadLocal]
 ;                   [unnamed_addr] alias <AliaseeTy> @<Aliasee>
diff --git a/llvm/utils/vim/syntax/llvm.vim b/llvm/utils/vim/syntax/llvm.vim
index d86e3d1ddbc27f..905d696400ca37 100644
--- a/llvm/utils/vim/syntax/llvm.vim
+++ b/llvm/utils/vim/syntax/llvm.vim
@@ -150,6 +150,7 @@ syn keyword llvmKeyword
       \ preallocated
       \ private
       \ protected
+      \ ptrauth
       \ ptx_device
       \ ptx_kernel
       \ readnone

From 6f529aaf666624c26715aa348955b26a684d1250 Mon Sep 17 00:00:00 2001
From: Heejin Ahn <aheejin@gmail.com>
Date: Tue, 28 May 2024 23:37:40 +0000
Subject: [PATCH 038/230] [WebAssembly] Remove IIT_EXNREF

This was added in #93586 but caused a compilation warning and is not
used anyway.
---
 llvm/include/llvm/IR/Intrinsics.td | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index c3ac53837444ef..107442623ab7bd 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -316,7 +316,6 @@ def IIT_PPCF128 : IIT_VT<ppcf128, 52>;
 def IIT_V3 : IIT_Vec<3, 53>;
 def IIT_EXTERNREF : IIT_VT<externref, 54>;
 def IIT_FUNCREF : IIT_VT<funcref, 55>;
-def IIT_EXNREF: IIT_VT<exnref, 56>;
 def IIT_I2 : IIT_Int<2, 57>;
 def IIT_I4 : IIT_Int<4, 58>;
 def IIT_AARCH64_SVCOUNT : IIT_VT<aarch64svcount, 59>;

From bd5cd4b837b67f8d549f072f37dd09295b4bf9f7 Mon Sep 17 00:00:00 2001
From: Eric Fiselier <eric@efcs.ca>
Date: Tue, 28 May 2024 20:01:47 -0400
Subject: [PATCH 039/230] Fix trigger for libc++ job rerunner.

Testing github actions is such a pain. I swear it should match now.
---
 .github/workflows/restart-preempted-libcxx-jobs.yaml | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/restart-preempted-libcxx-jobs.yaml b/.github/workflows/restart-preempted-libcxx-jobs.yaml
index 5682b0a4f52c3d..88924fb3cd7791 100644
--- a/.github/workflows/restart-preempted-libcxx-jobs.yaml
+++ b/.github/workflows/restart-preempted-libcxx-jobs.yaml
@@ -11,18 +11,16 @@ name: Restart Preempted Libc++ Workflow
 
 on:
   workflow_run:
-    workflows:
-      - Build and Test libc\+\+
+    workflows: [Build and Test libc\+\+]
     types:
-      - failure
-      - canceled
+      - completed
 
 permissions:
   contents: read
 
 jobs:
   restart:
-    if: github.repository_owner == 'llvm'
+    if: github.repository_owner == 'llvm' && (github.event.workflow_run.conclusion == 'failure' || github.event.workflow_run.conclusion == 'cancelled')
     name: "Restart Job"
     permissions:
       statuses: read

From 5bfe4b93e15ad38f211c5dec64be0eeaa4c8e914 Mon Sep 17 00:00:00 2001
From: Jakub Kuderski <jakub@nod-labs.com>
Date: Tue, 28 May 2024 20:04:41 -0400
Subject: [PATCH 040/230] [mlir][arith] Disallow casting tensor dimensions
 (#93349)

Tighten the verifier for arith cast ops to disallow changing tensor
dimensions, e.g., static to dynamic. After this change:
* `arith.cast_op %x : tensor<4xi32> to tensor<4xf32>` remains valid
* `arith.cast_op %x : tensor<4xi32> to tensor<?xf32>` becomes invalid
* `arith.cast_op %x : tensor<?xi32> to tensor<4xf32>` becomes invalid

This is mostly to simplify the op semantics. See the discussion thread
for more context:
https://discourse.llvm.org/t/rfc-remove-arith-math-ops-on-tensors/74357/63.
---
 .../include/mlir/Dialect/Arith/IR/ArithOps.td | 19 +++++++--
 mlir/test/Dialect/Arith/canonicalize.mlir     |  8 ----
 mlir/test/Dialect/Arith/invalid.mlir          | 42 ++++++++++++++++++-
 3 files changed, 57 insertions(+), 12 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
index 46248dad3be9e0..81ed0f924a2e2c 100644
--- a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
+++ b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
@@ -83,12 +83,25 @@ class Arith_FloatBinaryOp<string mnemonic, list<Trait> traits = []> :
                           attr-dict `:` type($result) }];
 }
 
+// Checks that tensor input and outputs have identical shapes. This is stricter
+// than the verification done in `SameOperandsAndResultShape` that allows for
+// tensor dimensions to be 'compatible' (e.g., dynamic dimensions being
+// compatible with static ones).
+def SameInputOutputTensorDims : PredOpTrait<
+    "input and output have the same tensor dimensions",
+    AllMatchSameOperatorPred<["in", "out"],
+      "(::llvm::isa<::mlir::TensorType>($_self.getType()) ?"
+      " ::llvm::cast<::mlir::TensorType>($_self.getType()).getShape() :"
+      " ::llvm::ArrayRef<int64_t>{})">>;
+
 // Base class for arithmetic cast operations. Requires a single operand and
-// result. If either is a shaped type, then the other must be of the same shape.
+// result. If either is a shaped type, then the other must be of the same
+// shape.  In the case of tensor types, this also includes the corresponding
+// operand/result dimensions being equal.
 class Arith_CastOp<string mnemonic, TypeConstraint From, TypeConstraint To,
                    list<Trait> traits = []> :
     Arith_Op<mnemonic, traits # [Pure, SameOperandsAndResultShape,
-      DeclareOpInterfaceMethods<CastOpInterface>]>,
+      SameInputOutputTensorDims, DeclareOpInterfaceMethods<CastOpInterface>]>,
     Arguments<(ins From:$in)>,
     Results<(outs To:$out)> {
   let assemblyFormat = "$in attr-dict `:` type($in) `to` type($out)";
@@ -1231,7 +1244,7 @@ def Arith_TruncIOp : Arith_IToICastOp<"trunci"> {
 
 def Arith_TruncFOp :
     Arith_Op<"truncf",
-      [Pure, SameOperandsAndResultShape,
+      [Pure, SameOperandsAndResultShape, SameInputOutputTensorDims,
        DeclareOpInterfaceMethods<ArithRoundingModeInterface>,
        DeclareOpInterfaceMethods<CastOpInterface>]>,
     Arguments<(ins FloatLike:$in,
diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir
index 1a387c20c4b297..e4f95bb0545a20 100644
--- a/mlir/test/Dialect/Arith/canonicalize.mlir
+++ b/mlir/test/Dialect/Arith/canonicalize.mlir
@@ -2950,14 +2950,6 @@ func.func @unsignedExtendConstantResource() -> tensor<i16> {
   return %ext : tensor<i16>
 }
 
-// Just checks that this doesn't crash.
-// CHECK-LABEL: @signedExtendSplatAsDynamicShape
-func.func @signedExtendSplatAsDynamicShape() -> tensor<?xi64> {
-  %splat = arith.constant dense<5> : tensor<2xi16>
-  %extsplat = arith.extsi %splat : tensor<2xi16> to tensor<?xi64>
-  return %extsplat : tensor<?xi64>
-}
-
 // CHECK-LABEL: @extsi_i0
 //       CHECK:   %[[ZERO:.*]] = arith.constant 0 : i16
 //       CHECK:   return %[[ZERO]] : i16
diff --git a/mlir/test/Dialect/Arith/invalid.mlir b/mlir/test/Dialect/Arith/invalid.mlir
index ada849220bb839..652aa738ad3924 100644
--- a/mlir/test/Dialect/Arith/invalid.mlir
+++ b/mlir/test/Dialect/Arith/invalid.mlir
@@ -1,13 +1,21 @@
 // RUN: mlir-opt -split-input-file %s -verify-diagnostics
 
 func.func @test_index_cast_shape_error(%arg0 : tensor<index>) -> tensor<2xi64> {
-  // expected-error @+1 {{'arith.index_cast' op requires the same shape for all operands and results}}
+  // expected-error @+1 {{'arith.index_cast' op failed to verify that input and output have the same tensor dimensions}}
   %0 = arith.index_cast %arg0 : tensor<index> to tensor<2xi64>
   return %0 : tensor<2xi64>
 }
 
 // -----
 
+func.func @test_index_cast_shape_dim_error(%arg0 : tensor<2xindex>) -> tensor<?xi64> {
+  // expected-error @+1 {{'arith.index_cast' op failed to verify that input and output have the same tensor dimensions}}
+  %0 = arith.index_cast %arg0 : tensor<2xindex> to tensor<?xi64>
+  return %0 : tensor<?xi64>
+}
+
+// -----
+
 func.func @test_index_cast_tensor_error(%arg0 : tensor<index>) -> i64 {
   // expected-error @+1 {{'arith.index_cast' op requires the same shape for all operands and results}}
   %0 = arith.index_cast %arg0 : tensor<index> to i64
@@ -655,6 +663,14 @@ func.func @extsi_scalable_to_fl(%arg0 : vector<[4]xi32>) {
 
 // -----
 
+func.func @extsi_tensor_dim(%arg0 : tensor<4xi32>) {
+  // expected-error@+1 {{'arith.extsi' op failed to verify that input and output have the same tensor dimensions}}
+  %0 = arith.extsi %arg0 : tensor<4xi32> to tensor<?xi64>
+  return
+}
+
+// -----
+
 func.func @extf_scalable_to_fl(%arg0 : vector<[4]xf32>) {
   // expected-error@+1 {{'arith.extf' op requires the same shape for all operands and results}}
   %0 = arith.extf %arg0 : vector<[4]xf32> to vector<4xf64>
@@ -703,6 +719,22 @@ func.func @bitcast_scalable_to_fl(%arg0 : vector<[4]xf32>) {
 
 // -----
 
+func.func @bitcast_tensor_dim(%arg0 : tensor<4xf32>) {
+  // expected-error@+1 {{'arith.bitcast' op failed to verify that input and output have the same tensor dimensions}}
+  %0 = arith.bitcast %arg0 : tensor<4xf32> to tensor<?xi32>
+  return
+}
+
+// -----
+
+func.func @bitcast_tensor_dim(%arg0 : tensor<?xf32>) {
+  // expected-error@+1 {{'arith.bitcast' op failed to verify that input and output have the same tensor dimensions}}
+  %0 = arith.bitcast %arg0 : tensor<?xf32> to tensor<4xi32>
+  return
+}
+
+// -----
+
 func.func @trunci_fl_to_scalable(%arg0 : vector<4xi32>) {
   // expected-error@+1 {{'arith.trunci' op requires the same shape for all operands and results}}
   %0 = arith.trunci %arg0 : vector<4xi32> to vector<[4]xi8>
@@ -719,6 +751,14 @@ func.func @truncf_fl_to_scalable(%arg0 : vector<4xf64>) {
 
 // -----
 
+func.func @truncf_tensor_dim(%arg0 : tensor<4xf64>) {
+  // expected-error@+1 {{'arith.truncf' op failed to verify that input and output have the same tensor dimensions}}
+  %0 = arith.truncf %arg0 : tensor<4xf64> to tensor<?xf32>
+  return
+}
+
+// -----
+
 func.func @extui_fl_to_scalable(%arg0 : vector<4xi32>) {
   // expected-error@+1 {{'arith.extui' op requires the same shape for all operands and results}}
   %0 = arith.extui %arg0 : vector<4xi32> to vector<[4]xi64>

From 1c108c80dc5b878452c00e1411cb530a66122ea5 Mon Sep 17 00:00:00 2001
From: Sterling Augustine <saugustine@google.com>
Date: Wed, 29 May 2024 00:27:07 +0000
Subject: [PATCH 041/230] Mark operator== const to avoid errors when asserts
 are enabled

Without this change, the build will fail like so:

llvm-project/lld/MachO/ObjC.cpp:1387:75: error: ISO C++20 considers use of overloaded operator '==' (with operand types 'ObjcCategoryMerger::PointerListInfo' and 'ObjcCategoryMerger::PointerListInfo') to be ambiguous despite there being a unique best viable function [-Werror,-Wambiguous-reversed-operator]
 1387 |       parseProtocolListInfo(classIsec, roClassLayout.baseProtocolsOffset) ==
      |       ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ^
 1388 |           parseProtocolListInfo(metaIsec, roClassLayout.baseProtocolsOffset) &&
      |           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
include/assert.h:100:27: note: expanded from macro 'assert'
  100 |      (static_cast <bool> (expr)                                         \
      |                           ^~~~
llvm-project/lld/MachO/ObjC.cpp:391:17: note: ambiguity is between a regular call to this operator and a call with the argument order reversed
  391 |     inline bool operator==(const PointerListInfo &cmp) {
      |                 ^
llvm-project/lld/MachO/ObjC.cpp:391:17: note: mark 'operator==' as const or add a matching 'operator!=' to resolve the ambiguity
1 error generated.
---
 lld/MachO/ObjC.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lld/MachO/ObjC.cpp b/lld/MachO/ObjC.cpp
index 635ded554497ba..6e857cfcd92f6d 100644
--- a/lld/MachO/ObjC.cpp
+++ b/lld/MachO/ObjC.cpp
@@ -388,7 +388,7 @@ class ObjcCategoryMerger {
         : categoryPrefix(_categoryPrefix),
           pointersPerStruct(_pointersPerStruct) {}
 
-    inline bool operator==(const PointerListInfo &cmp) {
+    inline bool operator==(const PointerListInfo &cmp) const {
       return pointersPerStruct == cmp.pointersPerStruct &&
              structSize == cmp.structSize && structCount == cmp.structCount &&
              allPtrs == cmp.allPtrs;

From 44d4b3b2eebdd5eed95dd78dc3939dd9f5ebc5e6 Mon Sep 17 00:00:00 2001
From: Hui <hui.xie1990@gmail.com>
Date: Wed, 29 May 2024 01:30:30 +0100
Subject: [PATCH 042/230] [libc++][test] Close LWG3382 and add tests (#93039)

---
 libcxx/docs/Status/Cxx20Issues.csv            |  2 +-
 .../sequences/array/lwg3382.compile.pass.cpp  | 25 +++++++++++++++++++
 .../pairs/pairs.pair/lwg3382.compile.pass.cpp | 23 +++++++++++++++++
 3 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 libcxx/test/std/containers/sequences/array/lwg3382.compile.pass.cpp
 create mode 100644 libcxx/test/std/utilities/utility/pairs/pairs.pair/lwg3382.compile.pass.cpp

diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv
index 5f83fa3a92e872..179958854e8cb2 100644
--- a/libcxx/docs/Status/Cxx20Issues.csv
+++ b/libcxx/docs/Status/Cxx20Issues.csv
@@ -285,7 +285,7 @@
 "`3379 <https://wg21.link/LWG3379>`__","""``safe``\ "" in several library names is misleading","Prague","|Complete|","15.0","|ranges|"
 "`3380 <https://wg21.link/LWG3380>`__","``common_type``\  and comparison categories","Prague","|Complete|","15.0","|spaceship|"
 "`3381 <https://wg21.link/LWG3381>`__","``begin``\  and ``data``\  must agree for ``contiguous_range``\ ","Prague","|Nothing To Do|","","|ranges|"
-"`3382 <https://wg21.link/LWG3382>`__","NTTP for ``pair``\  and ``array``\ ","Prague","",""
+"`3382 <https://wg21.link/LWG3382>`__","NTTP for ``pair``\  and ``array``\ ","Prague","|Nothing To Do|",""
 "`3383 <https://wg21.link/LWG3383>`__","|sect|\ [time.zone.leap.nonmembers] ``sys_seconds``\  should be replaced with ``seconds``\ ","Prague","|Complete|","19.0","|chrono|"
 "`3384 <https://wg21.link/LWG3384>`__","``transform_view::*sentinel*``\  has an incorrect ``operator-``\ ","Prague","|Complete|","15.0","|ranges|"
 "`3385 <https://wg21.link/LWG3385>`__","``common_iterator``\  is not sufficiently constrained for non-copyable iterators","Prague","|Complete|","15.0","|ranges|"
diff --git a/libcxx/test/std/containers/sequences/array/lwg3382.compile.pass.cpp b/libcxx/test/std/containers/sequences/array/lwg3382.compile.pass.cpp
new file mode 100644
index 00000000000000..8eed20990cc00b
--- /dev/null
+++ b/libcxx/test/std/containers/sequences/array/lwg3382.compile.pass.cpp
@@ -0,0 +1,25 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <array>
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+#include <array>
+
+template <auto>
+struct Test {};
+
+void test() {
+  // LWG 3382. NTTP for pair and array
+  // https://cplusplus.github.io/LWG/issue3382
+  constexpr std::array<int, 5> a{};
+  [[maybe_unused]] Test<a> test1{};
+
+  constexpr std::array<int, 0> b{};
+  [[maybe_unused]] Test<b> test2{};
+}
diff --git a/libcxx/test/std/utilities/utility/pairs/pairs.pair/lwg3382.compile.pass.cpp b/libcxx/test/std/utilities/utility/pairs/pairs.pair/lwg3382.compile.pass.cpp
new file mode 100644
index 00000000000000..dce9a5df220b21
--- /dev/null
+++ b/libcxx/test/std/utilities/utility/pairs/pairs.pair/lwg3382.compile.pass.cpp
@@ -0,0 +1,23 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+#include <utility>
+
+template <auto>
+struct Test {};
+
+void test() {
+  // LWG 3382. NTTP for pair and array
+  // https://cplusplus.github.io/LWG/issue3382
+#if !defined(_LIBCPP_DEPRECATED_ABI_DISABLE_PAIR_TRIVIAL_COPY_CTOR)
+  constexpr std::pair<int, long> a{};
+  [[maybe_unused]] Test<a> test1{};
+#endif
+}

From d868f097053e19e828d7366f5dbb88add16998a2 Mon Sep 17 00:00:00 2001
From: Hui <hui.xie1990@gmail.com>
Date: Wed, 29 May 2024 01:32:44 +0100
Subject: [PATCH 043/230] [libc++] LWG3233 Broken requirements for shared_ptr
 converting constructors (#93071)

---
 libcxx/docs/Status/Cxx20Issues.csv            |  2 +-
 libcxx/include/__memory/shared_ptr.h          |  7 ++-
 .../nullptr_t_deleter.pass.cpp                | 20 ++++++++
 .../nullptr_t_deleter_allocator.pass.cpp      | 21 ++++++++
 .../pointer_deleter.pass.cpp                  | 44 +++--------------
 .../pointer_deleter_allocator.pass.cpp        | 47 ++++--------------
 .../util.smartptr.shared.const/types.h        | 49 +++++++++++++++++++
 7 files changed, 113 insertions(+), 77 deletions(-)
 create mode 100644 libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/types.h

diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv
index 179958854e8cb2..6fc40270af1580 100644
--- a/libcxx/docs/Status/Cxx20Issues.csv
+++ b/libcxx/docs/Status/Cxx20Issues.csv
@@ -200,7 +200,7 @@
 "`3200 <https://wg21.link/LWG3200>`__","``midpoint``\  should not constrain ``T``\  is complete","Prague","|Nothing To Do|",""
 "`3201 <https://wg21.link/LWG3201>`__","``lerp``\  should be marked as ``noexcept``\ ","Prague","|Complete|",""
 "`3226 <https://wg21.link/LWG3226>`__","``zoned_time``\  constructor from ``string_view``\  should accept ``zoned_time<Duration2, TimeZonePtr2>``\ ","Prague","","","|chrono|"
-"`3233 <https://wg21.link/LWG3233>`__","Broken requirements for ``shared_ptr``\  converting constructors","Prague","",""
+"`3233 <https://wg21.link/LWG3233>`__","Broken requirements for ``shared_ptr``\  converting constructors","Prague","|Complete|","19.0"
 "`3237 <https://wg21.link/LWG3237>`__","LWG 3038 and 3190 have inconsistent PRs","Prague","|Complete|","16.0"
 "`3238 <https://wg21.link/LWG3238>`__","Insufficiently-defined behavior of ``std::function``\  deduction guides","Prague","",""
 "`3242 <https://wg21.link/LWG3242>`__","``std::format``\ : missing rules for ``arg-id``\  in ``width``\  and ``precision``\ ","Prague","|Complete|","14.0","|format|"
diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h
index 992b1ba43f100d..de5707c4a67b0c 100644
--- a/libcxx/include/__memory/shared_ptr.h
+++ b/libcxx/include/__memory/shared_ptr.h
@@ -403,6 +403,9 @@ struct __shared_ptr_deleter_ctor_reqs {
                             __well_formed_deleter<_Dp, _Yp*>::value;
 };
 
+template <class _Dp, class _Tp>
+using __shared_ptr_nullptr_deleter_ctor_reqs = _And<is_move_constructible<_Dp>, __well_formed_deleter<_Dp, nullptr_t> >;
+
 #if defined(_LIBCPP_ABI_ENABLE_SHARED_PTR_TRIVIAL_ABI)
 #  define _LIBCPP_SHARED_PTR_TRIVIAL_ABI __attribute__((__trivial_abi__))
 #else
@@ -498,7 +501,7 @@ class _LIBCPP_SHARED_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS shared_ptr {
 #endif // _LIBCPP_HAS_NO_EXCEPTIONS
   }
 
-  template <class _Dp>
+  template <class _Dp, __enable_if_t<__shared_ptr_nullptr_deleter_ctor_reqs<_Dp, _Tp>::value, int> = 0 >
   _LIBCPP_HIDE_FROM_ABI shared_ptr(nullptr_t __p, _Dp __d) : __ptr_(nullptr) {
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
     try {
@@ -518,7 +521,7 @@ class _LIBCPP_SHARED_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS shared_ptr {
 #endif // _LIBCPP_HAS_NO_EXCEPTIONS
   }
 
-  template <class _Dp, class _Alloc>
+  template <class _Dp, class _Alloc, __enable_if_t<__shared_ptr_nullptr_deleter_ctor_reqs<_Dp, _Tp>::value, int> = 0 >
   _LIBCPP_HIDE_FROM_ABI shared_ptr(nullptr_t __p, _Dp __d, _Alloc __a) : __ptr_(nullptr) {
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
     try {
diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter.pass.cpp
index 49497b6956b9fb..13340ed5294c05 100644
--- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter.pass.cpp
+++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter.pass.cpp
@@ -17,6 +17,7 @@
 #include "test_macros.h"
 #include "deleter_types.h"
 
+#include "types.h"
 struct A
 {
     static int count;
@@ -28,6 +29,25 @@ struct A
 
 int A::count = 0;
 
+// LWG 3233. Broken requirements for shared_ptr converting constructors
+// https://cplusplus.github.io/LWG/issue3233
+static_assert( std::is_constructible<std::shared_ptr<int>,  std::nullptr_t, test_deleter<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int>,  std::nullptr_t, bad_deleter>::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int>,  std::nullptr_t, no_nullptr_deleter>::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int>,  std::nullptr_t, no_move_deleter>::value, "");
+
+#if TEST_STD_VER >= 17
+static_assert( std::is_constructible<std::shared_ptr<int[]>,  std::nullptr_t, test_deleter<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[]>,  std::nullptr_t, bad_deleter>::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[]>,  std::nullptr_t, no_nullptr_deleter>::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[]>,  std::nullptr_t, no_move_deleter>::value, "");
+
+static_assert( std::is_constructible<std::shared_ptr<int[5]>,  std::nullptr_t, test_deleter<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[5]>,  std::nullptr_t, bad_deleter>::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[5]>,  std::nullptr_t, no_nullptr_deleter>::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[5]>,  std::nullptr_t, no_move_deleter>::value, "");
+#endif
+
 int main(int, char**)
 {
     {
diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter_allocator.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter_allocator.pass.cpp
index 4e9fc227b99e81..53ca6fb5b234d4 100644
--- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter_allocator.pass.cpp
+++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter_allocator.pass.cpp
@@ -17,6 +17,8 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
+#include "types.h"
+
 struct A
 {
     static int count;
@@ -28,6 +30,25 @@ struct A
 
 int A::count = 0;
 
+// LWG 3233. Broken requirements for shared_ptr converting constructors
+// https://cplusplus.github.io/LWG/issue3233
+static_assert( std::is_constructible<std::shared_ptr<int>,  std::nullptr_t, test_deleter<int>, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int>,  std::nullptr_t, bad_deleter, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int>,  std::nullptr_t, no_nullptr_deleter, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int>,  std::nullptr_t, no_move_deleter, test_allocator<int> >::value, "");
+
+#if TEST_STD_VER >= 17
+static_assert( std::is_constructible<std::shared_ptr<int[]>,  std::nullptr_t, test_deleter<int>, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[]>,  std::nullptr_t, bad_deleter, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[]>,  std::nullptr_t, no_nullptr_deleter, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[]>,  std::nullptr_t, no_move_deleter, test_allocator<int> >::value, "");
+
+static_assert( std::is_constructible<std::shared_ptr<int[5]>,  std::nullptr_t, test_deleter<int>, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[5]>,  std::nullptr_t, bad_deleter, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[5]>,  std::nullptr_t, no_nullptr_deleter, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[5]>,  std::nullptr_t, no_move_deleter, test_allocator<int> >::value, "");
+#endif
+
 int main(int, char**)
 {
     test_allocator_statistics alloc_stats;
diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp
index 42225a4b0be7ec..9c1e9b72be573c 100644
--- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp
+++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp
@@ -17,6 +17,8 @@
 #include "test_macros.h"
 #include "deleter_types.h"
 
+#include "types.h"
+
 struct A
 {
     static int count;
@@ -28,38 +30,8 @@ struct A
 
 int A::count = 0;
 
-struct bad_ty { };
-
-struct bad_deleter
-{
-    void operator()(bad_ty) { }
-};
-
-struct no_move_deleter
-{
-    no_move_deleter(no_move_deleter const&) = delete;
-    no_move_deleter(no_move_deleter &&) = delete;
-    void operator()(int*) { }
-};
-
-static_assert(!std::is_move_constructible<no_move_deleter>::value, "");
-
-struct Base { };
-struct Derived : Base { };
-
-template<class T>
-class MoveDeleter
-{
-    MoveDeleter();
-    MoveDeleter(MoveDeleter const&);
-public:
-  MoveDeleter(MoveDeleter&&) {}
-
-  explicit MoveDeleter(int) {}
-
-  void operator()(T* ptr) { delete ptr; }
-};
-
+// LWG 3233. Broken requirements for shared_ptr converting constructors
+// https://cplusplus.github.io/LWG/issue3233
 // https://llvm.org/PR60258
 // Invalid constructor SFINAE for std::shared_ptr's array ctors
 static_assert( std::is_constructible<std::shared_ptr<int>,  int*, test_deleter<int> >::value, "");
@@ -68,12 +40,12 @@ static_assert( std::is_constructible<std::shared_ptr<Base>,  Derived*, test_dele
 static_assert(!std::is_constructible<std::shared_ptr<A>,  int*, test_deleter<A> >::value, "");
 
 #if TEST_STD_VER >= 17
-static_assert( std::is_constructible<std::shared_ptr<int[]>,  int*, test_deleter<int>>::value, "");
+static_assert( std::is_constructible<std::shared_ptr<int[]>,  int*, test_deleter<int> >::value, "");
 static_assert(!std::is_constructible<std::shared_ptr<int[]>,  int*, bad_deleter>::value, "");
-static_assert(!std::is_constructible<std::shared_ptr<int[]>,  int(*)[], test_deleter<int>>::value, "");
-static_assert( std::is_constructible<std::shared_ptr<int[5]>, int*, test_deleter<int>>::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[]>,  int(*)[], test_deleter<int> >::value, "");
+static_assert( std::is_constructible<std::shared_ptr<int[5]>, int*, test_deleter<int> >::value, "");
 static_assert(!std::is_constructible<std::shared_ptr<int[5]>, int*, bad_deleter>::value, "");
-static_assert(!std::is_constructible<std::shared_ptr<int[5]>, int(*)[5], test_deleter<int>>::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[5]>, int(*)[5], test_deleter<int> >::value, "");
 #endif
 
 int main(int, char**)
diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter_allocator.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter_allocator.pass.cpp
index a110525b9b922d..9dffbcdd59a735 100644
--- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter_allocator.pass.cpp
+++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter_allocator.pass.cpp
@@ -17,6 +17,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
+#include "types.h"
 struct A
 {
     static int count;
@@ -28,38 +29,8 @@ struct A
 
 int A::count = 0;
 
-struct bad_ty { };
-
-struct bad_deleter
-{
-    void operator()(bad_ty) { }
-};
-
-struct no_move_deleter
-{
-    no_move_deleter(no_move_deleter const&) = delete;
-    no_move_deleter(no_move_deleter &&) = delete;
-    void operator()(int*) { }
-};
-
-static_assert(!std::is_move_constructible<no_move_deleter>::value, "");
-
-struct Base { };
-struct Derived : Base { };
-
-template<class T>
-class MoveDeleter
-{
-    MoveDeleter();
-    MoveDeleter(MoveDeleter const&);
-public:
-  MoveDeleter(MoveDeleter&&) {}
-
-  explicit MoveDeleter(int) {}
-
-  void operator()(T* ptr) { delete ptr; }
-};
-
+// LWG 3233. Broken requirements for shared_ptr converting constructors
+// https://cplusplus.github.io/LWG/issue3233
 // https://llvm.org/PR60258
 // Invalid constructor SFINAE for std::shared_ptr's array ctors
 static_assert( std::is_constructible<std::shared_ptr<int>,  int*, test_deleter<int>, test_allocator<int> >::value, "");
@@ -68,12 +39,12 @@ static_assert( std::is_constructible<std::shared_ptr<Base>,  Derived*, test_dele
 static_assert(!std::is_constructible<std::shared_ptr<A>,  int*, test_deleter<A>, test_allocator<A> >::value, "");
 
 #if TEST_STD_VER >= 17
-static_assert( std::is_constructible<std::shared_ptr<int[]>,  int*, test_deleter<int>, test_allocator<int>>::value, "");
-static_assert(!std::is_constructible<std::shared_ptr<int[]>,  int*, bad_deleter, test_allocator<int>>::value, "");
-static_assert(!std::is_constructible<std::shared_ptr<int[]>,  int(*)[], test_deleter<int>, test_allocator<int>>::value, "");
-static_assert( std::is_constructible<std::shared_ptr<int[5]>, int*, test_deleter<int>, test_allocator<int>>::value, "");
-static_assert(!std::is_constructible<std::shared_ptr<int[5]>, int*, bad_deleter, test_allocator<int>>::value, "");
-static_assert(!std::is_constructible<std::shared_ptr<int[5]>, int(*)[5], test_deleter<int>, test_allocator<int>>::value, "");
+static_assert( std::is_constructible<std::shared_ptr<int[]>,  int*, test_deleter<int>, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[]>,  int*, bad_deleter, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[]>,  int(*)[], test_deleter<int>, test_allocator<int> >::value, "");
+static_assert( std::is_constructible<std::shared_ptr<int[5]>, int*, test_deleter<int>, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[5]>, int*, bad_deleter, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[5]>, int(*)[5], test_deleter<int>, test_allocator<int> >::value, "");
 #endif
 
 
diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/types.h b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/types.h
new file mode 100644
index 00000000000000..5bfb3d70febea0
--- /dev/null
+++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/types.h
@@ -0,0 +1,49 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TEST_STD_UTILITIES_MEMORY_UTIL_SMARTPTR_SHARED_CONST_TYPES_H
+#define TEST_STD_UTILITIES_MEMORY_UTIL_SMARTPTR_SHARED_CONST_TYPES_H
+
+#include <type_traits>
+
+struct bad_ty {};
+
+struct bad_deleter {
+  void operator()(bad_ty) {}
+};
+
+struct no_move_deleter {
+  no_move_deleter(no_move_deleter const&) = delete;
+  no_move_deleter(no_move_deleter&&)      = delete;
+  void operator()(int*) {}
+};
+
+static_assert(!std::is_move_constructible<no_move_deleter>::value, "");
+
+struct no_nullptr_deleter {
+  void operator()(int*) const {}
+  void operator()(std::nullptr_t) const = delete;
+};
+
+struct Base {};
+struct Derived : Base {};
+
+template <class T>
+class MoveDeleter {
+  MoveDeleter();
+  MoveDeleter(MoveDeleter const&);
+
+public:
+  MoveDeleter(MoveDeleter&&) {}
+
+  explicit MoveDeleter(int) {}
+
+  void operator()(T* ptr) { delete ptr; }
+};
+
+#endif // TEST_STD_UTILITIES_MEMORY_UTIL_SMARTPTR_SHARED_CONST_TYPES_H

From 2ae3f7c29c1149098827df7edafa761e3e3eb420 Mon Sep 17 00:00:00 2001
From: Hui <hui.xie1990@gmail.com>
Date: Wed, 29 May 2024 01:34:29 +0100
Subject: [PATCH 044/230] [libc++][test] Close LWG3238 and add tests (#93043)

---
 libcxx/docs/Status/Cxx20Issues.csv            |  2 +-
 .../func.wrap.func.con/deduct_F.pass.cpp      | 30 +++++++++++++++----
 .../func.wrap.func.con/deduct_F.verify.cpp    | 30 -------------------
 3 files changed, 26 insertions(+), 36 deletions(-)
 delete mode 100644 libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.verify.cpp

diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv
index 6fc40270af1580..54517ab002b86b 100644
--- a/libcxx/docs/Status/Cxx20Issues.csv
+++ b/libcxx/docs/Status/Cxx20Issues.csv
@@ -202,7 +202,7 @@
 "`3226 <https://wg21.link/LWG3226>`__","``zoned_time``\  constructor from ``string_view``\  should accept ``zoned_time<Duration2, TimeZonePtr2>``\ ","Prague","","","|chrono|"
 "`3233 <https://wg21.link/LWG3233>`__","Broken requirements for ``shared_ptr``\  converting constructors","Prague","|Complete|","19.0"
 "`3237 <https://wg21.link/LWG3237>`__","LWG 3038 and 3190 have inconsistent PRs","Prague","|Complete|","16.0"
-"`3238 <https://wg21.link/LWG3238>`__","Insufficiently-defined behavior of ``std::function``\  deduction guides","Prague","",""
+"`3238 <https://wg21.link/LWG3238>`__","Insufficiently-defined behavior of ``std::function``\  deduction guides","Prague","|Nothing To Do|",""
 "`3242 <https://wg21.link/LWG3242>`__","``std::format``\ : missing rules for ``arg-id``\  in ``width``\  and ``precision``\ ","Prague","|Complete|","14.0","|format|"
 "`3243 <https://wg21.link/LWG3243>`__","``std::format``\  and negative zeroes","Prague","|Complete|","14.0","|format|"
 "`3247 <https://wg21.link/LWG3247>`__","``ranges::iter_move``\  should perform ADL-only lookup of ``iter_move``\ ","Prague","|Complete|","15.0","|ranges|"
diff --git a/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.pass.cpp b/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.pass.cpp
index ef43ab9b64b5b5..381bcda761700c 100644
--- a/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.pass.cpp
@@ -118,10 +118,14 @@ int main(int, char**) {
 // Make sure we fail in a SFINAE-friendly manner when we try to deduce
 // from a type without a valid call operator.
 template <typename F, typename = decltype(std::function{std::declval<F>()})>
-constexpr bool can_deduce() { return true; }
+constexpr bool can_deduce_test(int) { return true; }
 template <typename F>
-constexpr bool can_deduce(...) { return false; }
+constexpr bool can_deduce_test(...) { return false; }
 
+template <typename F>
+constexpr bool can_deduce = can_deduce_test<F>(0);
+
+struct valid { int operator()() const; };
 struct invalid1 { };
 struct invalid2 {
   template <typename ...Args>
@@ -131,6 +135,22 @@ struct invalid3 {
   void operator()(int);
   void operator()(long);
 };
-static_assert(!can_deduce<invalid1>());
-static_assert(!can_deduce<invalid2>());
-static_assert(!can_deduce<invalid3>());
+static_assert( can_deduce<valid>);
+static_assert(!can_deduce<invalid1>);
+static_assert(!can_deduce<invalid2>);
+static_assert(!can_deduce<invalid3>);
+
+
+// LWG 3238. Insufficiently-defined behavior of std::function deduction guides
+// https://cplusplus.github.io/LWG/issue3238
+// The deduction guides for std::function do not handle rvalue-ref qualified
+// call operators and C-style variadics. It also doesn't deduce from nullptr_t.
+// Make sure we stick to the specification.
+
+struct invalid_rvalue_ref { R operator()() && { return {}; } };
+struct invalid_c_vararg { R operator()(int, ...) { return {}; } };
+
+static_assert(!can_deduce<invalid_rvalue_ref>);
+static_assert(!can_deduce<invalid_c_vararg>);
+static_assert(!can_deduce<std::nullptr_t>);
+
diff --git a/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.verify.cpp b/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.verify.cpp
deleted file mode 100644
index 8a42d3be3571c0..00000000000000
--- a/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.verify.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <functional>
-
-// template<class F>
-// function(F) -> function<see-below>;
-
-// UNSUPPORTED: c++03, c++11, c++14
-
-// The deduction guides for std::function do not handle rvalue-ref qualified
-// call operators and C-style variadics. It also doesn't deduce from nullptr_t.
-// Make sure we stick to the specification.
-
-#include <functional>
-
-struct R { };
-struct f0 { R operator()() && { return {}; } };
-struct f1 { R operator()(int, ...) { return {}; } };
-
-void f() {
-    std::function f = f0{}; // expected-error{{no viable constructor or deduction guide for deduction of template arguments of 'function'}}
-    std::function g = f1{}; // expected-error{{no viable constructor or deduction guide for deduction of template arguments of 'function'}}
-    std::function h = nullptr; // expected-error{{no viable constructor or deduction guide for deduction of template arguments of 'function'}}
-}

From 0380044e16a1c016e001a56c0ca7f4db649a6cae Mon Sep 17 00:00:00 2001
From: Jim Ingham <jingham@apple.com>
Date: Tue, 28 May 2024 17:47:08 -0700
Subject: [PATCH 045/230] Fix the EditLine unittest build on Darwin after PR
 92865

There was a Darwin-only use of setupterm (under USE_SETUPTERM_WORKAROUND)
that required libcurses.dylib.  That was added to the main build, but
not to the unittest.
---
 lldb/unittests/CMakeLists.txt          | 4 +++-
 lldb/unittests/Editline/CMakeLists.txt | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/lldb/unittests/CMakeLists.txt b/lldb/unittests/CMakeLists.txt
index a2585a94b61558..728dec5006d6bf 100644
--- a/lldb/unittests/CMakeLists.txt
+++ b/lldb/unittests/CMakeLists.txt
@@ -51,11 +51,13 @@ if (NOT CMAKE_SYSTEM_NAME MATCHES "Windows")
   # FIXME: APITests.exe is not a valid googletest binary.
   add_subdirectory(API)
 endif()
+if (NOT CMAKE_SYSTEM_NAME MATCHES "Darwin" OR LLDB_ENABLE_CURSES)
+  add_subdirectory(Editline)
+endif()
 add_subdirectory(Breakpoint)
 add_subdirectory(Core)
 add_subdirectory(DataFormatter)
 add_subdirectory(Disassembler)
-add_subdirectory(Editline)
 add_subdirectory(Expression)
 add_subdirectory(Host)
 add_subdirectory(Interpreter)
diff --git a/lldb/unittests/Editline/CMakeLists.txt b/lldb/unittests/Editline/CMakeLists.txt
index 4b2643d15c5fc6..f213bfd1ab5813 100644
--- a/lldb/unittests/Editline/CMakeLists.txt
+++ b/lldb/unittests/Editline/CMakeLists.txt
@@ -5,4 +5,5 @@ add_lldb_unittest(EditlineTests
     lldbHost
     lldbUtility
     LLVMTestingSupport
+    ${CURSES_LIBRARIES}
   )

From d11922ebb26d84d7807be7f6fbf4d7e92c97455d Mon Sep 17 00:00:00 2001
From: Eric Fiselier <eric@efcs.ca>
Date: Tue, 28 May 2024 20:53:58 -0400
Subject: [PATCH 046/230] Remove unneeded debug logging

---
 .github/workflows/restart-preempted-libcxx-jobs.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/restart-preempted-libcxx-jobs.yaml b/.github/workflows/restart-preempted-libcxx-jobs.yaml
index 88924fb3cd7791..43a1b97f1947d1 100644
--- a/.github/workflows/restart-preempted-libcxx-jobs.yaml
+++ b/.github/workflows/restart-preempted-libcxx-jobs.yaml
@@ -45,7 +45,6 @@ jobs:
             check_run_ids = [];
             for (check_run of check_suites.data.check_runs) {
               console.log('Checking check run: ' + check_run.id);
-              console.log(check_run);
               if (check_run.status != 'completed') {
                 console.log('Check run was not completed. Skipping.');
                 continue;

From f0b57b60e3b47bb9f9181d8be68473706b883430 Mon Sep 17 00:00:00 2001
From: "Ruiling, Song" <ruiling.song@amd.com>
Date: Wed, 29 May 2024 08:58:19 +0800
Subject: [PATCH 047/230] [Coroutines] Remove one construction of DominatorTree
 (#93507)

The DominatorTree can be reused if there are no CFG changes.
---
 llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index 38b8dab984db3a..8e829a53aeca27 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -2756,12 +2756,11 @@ static void sinkSpillUsesAfterCoroBegin(Function &F,
 /// after the suspend block. Doing so minimizes the lifetime of each variable,
 /// hence minimizing the amount of data we end up putting on the frame.
 static void sinkLifetimeStartMarkers(Function &F, coro::Shape &Shape,
-                                     SuspendCrossingInfo &Checker) {
+                                     SuspendCrossingInfo &Checker,
+                                     const DominatorTree &DT) {
   if (F.hasOptNone())
     return;
 
-  DominatorTree DT(F);
-
   // Collect all possible basic blocks which may dominate all uses of allocas.
   SmallPtrSet<BasicBlock *, 4> DomSet;
   DomSet.insert(&F.getEntryBlock());
@@ -3149,12 +3148,13 @@ void coro::buildCoroutineFrame(
 
   doRematerializations(F, Checker, MaterializableCallback);
 
+  const DominatorTree DT(F);
   FrameDataInfo FrameData;
   SmallVector<CoroAllocaAllocInst*, 4> LocalAllocas;
   SmallVector<Instruction*, 4> DeadInstructions;
   if (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon &&
       Shape.ABI != coro::ABI::RetconOnce)
-    sinkLifetimeStartMarkers(F, Shape, Checker);
+    sinkLifetimeStartMarkers(F, Shape, Checker, DT);
 
   // Collect the spills for arguments and other not-materializable values.
   for (Argument &A : F.args())
@@ -3162,7 +3162,6 @@ void coro::buildCoroutineFrame(
       if (Checker.isDefinitionAcrossSuspend(A, U))
         FrameData.Spills[&A].push_back(cast<Instruction>(U));
 
-  const DominatorTree DT(F);
   for (Instruction &I : instructions(F)) {
     // Values returned from coroutine structure intrinsics should not be part
     // of the Coroutine Frame.

From e492aa5adbccb9f4025af7c4179f75378fcad41a Mon Sep 17 00:00:00 2001
From: Eric Fiselier <eric@efcs.ca>
Date: Tue, 28 May 2024 21:07:55 -0400
Subject: [PATCH 048/230] Remove one more unneeded debug log line

---
 .github/workflows/restart-preempted-libcxx-jobs.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/restart-preempted-libcxx-jobs.yaml b/.github/workflows/restart-preempted-libcxx-jobs.yaml
index 43a1b97f1947d1..71e27ff2abb9f0 100644
--- a/.github/workflows/restart-preempted-libcxx-jobs.yaml
+++ b/.github/workflows/restart-preempted-libcxx-jobs.yaml
@@ -67,7 +67,6 @@ jobs:
                 check_run_id: check_run_id
               })
               
-              console.log(annotations);
               for (annotation of annotations.data) {
                 if (annotation.annotation_level != 'failure') {
                   continue;

From f9672cb775afc47e5210a111d248a01c23c428fe Mon Sep 17 00:00:00 2001
From: yronglin <yronglin777@gmail.com>
Date: Wed, 29 May 2024 09:09:36 +0800
Subject: [PATCH 049/230] [NFC][libc++] Mark LWG3951 as implemented (#93191)

Since we have already addressed the LWG issue, this PR marks LWG3951 as
implemented.

Signed-off-by: yronglin <yronglin777@gmail.com>
Co-authored-by: A. Jiang <de34@live.cn>
---
 libcxx/docs/Status/Cxx2cIssues.csv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv
index 76717e1d3448a5..8d24457186310c 100644
--- a/libcxx/docs/Status/Cxx2cIssues.csv
+++ b/libcxx/docs/Status/Cxx2cIssues.csv
@@ -29,7 +29,7 @@
 "`3947 <https://wg21.link/LWG3947>`__","Unexpected constraints on ``adjacent_transform_view::base()``","Kona November 2023","","","|ranges|"
 "`3948 <https://wg21.link/LWG3948>`__","``possibly-const-range and as-const-pointer`` should be ``noexcept``","Kona November 2023","","","|ranges|"
 "`3949 <https://wg21.link/LWG3949>`__","``std::atomic<bool>``'s trivial destructor dropped in C++17 spec wording","Kona November 2023","","",""
-"`3951 <https://wg21.link/LWG3951>`__","[expected.object.swap]: Using ``value()`` instead of ``has_value()``","Kona November 2023","","",""
+"`3951 <https://wg21.link/LWG3951>`__","[expected.object.swap]: Using ``value()`` instead of ``has_value()``","Kona November 2023","|Complete|","16.0",""
 "`3953 <https://wg21.link/LWG3953>`__","``iter_move`` for ``common_iterator`` and ``counted_iterator`` should return ``decltype(auto)``","Kona November 2023","","","|ranges|"
 "`3957 <https://wg21.link/LWG3957>`__","[container.alloc.reqmts] The value category of v should be claimed","Kona November 2023","","",""
 "`3965 <https://wg21.link/LWG3965>`__","Incorrect example in [format.string.escaped] p3 for formatting of combining characters","Kona November 2023","|Complete|","19.0","|format|"

From 6abc3876c35bbe8fb5dd6435dc60f2c816b97ef6 Mon Sep 17 00:00:00 2001
From: Jim Ingham <jingham@apple.com>
Date: Tue, 28 May 2024 18:16:13 -0700
Subject: [PATCH 050/230] Revert "Fix the EditLine unittest build on Darwin
 after PR 92865"

This reverts commit 0380044e16a1c016e001a56c0ca7f4db649a6cae.

Reverting while I figure out some mysterious CMake error.
---
 lldb/unittests/CMakeLists.txt          | 4 +---
 lldb/unittests/Editline/CMakeLists.txt | 1 -
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/lldb/unittests/CMakeLists.txt b/lldb/unittests/CMakeLists.txt
index 728dec5006d6bf..a2585a94b61558 100644
--- a/lldb/unittests/CMakeLists.txt
+++ b/lldb/unittests/CMakeLists.txt
@@ -51,13 +51,11 @@ if (NOT CMAKE_SYSTEM_NAME MATCHES "Windows")
   # FIXME: APITests.exe is not a valid googletest binary.
   add_subdirectory(API)
 endif()
-if (NOT CMAKE_SYSTEM_NAME MATCHES "Darwin" OR LLDB_ENABLE_CURSES)
-  add_subdirectory(Editline)
-endif()
 add_subdirectory(Breakpoint)
 add_subdirectory(Core)
 add_subdirectory(DataFormatter)
 add_subdirectory(Disassembler)
+add_subdirectory(Editline)
 add_subdirectory(Expression)
 add_subdirectory(Host)
 add_subdirectory(Interpreter)
diff --git a/lldb/unittests/Editline/CMakeLists.txt b/lldb/unittests/Editline/CMakeLists.txt
index f213bfd1ab5813..4b2643d15c5fc6 100644
--- a/lldb/unittests/Editline/CMakeLists.txt
+++ b/lldb/unittests/Editline/CMakeLists.txt
@@ -5,5 +5,4 @@ add_lldb_unittest(EditlineTests
     lldbHost
     lldbUtility
     LLVMTestingSupport
-    ${CURSES_LIBRARIES}
   )

From 04f01a2b9cedc291fa7dd941de841dc957c75a33 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Tue, 28 May 2024 18:29:11 -0700
Subject: [PATCH 051/230] [libc++] Make the __availability header a sub-header
 of __config (#93083)

In essence, this header has always been related to configuration of
the library but we didn't want to put it inside <__config> due to
complexity reasons. Now that we have sub-headers in <__config>, we
can move <__availability> to it and stop including it everywhere since
we already obtain the required macros via <__config>.
---
 libcxx/CMakeLists.txt                         |  2 +-
 libcxx/include/CMakeLists.txt                 |  3 +-
 libcxx/include/__atomic/atomic_base.h         |  1 -
 libcxx/include/__atomic/atomic_flag.h         |  1 -
 libcxx/include/__atomic/atomic_sync.h         |  1 -
 .../__charconv/to_chars_floating_point.h      |  1 -
 libcxx/include/__chrono/file_clock.h          |  1 -
 libcxx/include/__chrono/tzdb_list.h           |  1 -
 libcxx/include/__config                       | 29 +-----------
 .../availability.h}                           |  9 ++--
 libcxx/include/__configuration/language.h     | 46 +++++++++++++++++++
 libcxx/include/__exception/exception_ptr.h    |  1 -
 .../include/__expected/bad_expected_access.h  |  1 -
 libcxx/include/__filesystem/directory_entry.h |  1 -
 .../include/__filesystem/directory_iterator.h |  1 -
 .../include/__filesystem/filesystem_error.h   |  1 -
 libcxx/include/__filesystem/operations.h      |  1 -
 libcxx/include/__filesystem/path.h            |  1 -
 libcxx/include/__filesystem/path_iterator.h   |  1 -
 .../recursive_directory_iterator.h            |  1 -
 libcxx/include/__filesystem/u8path.h          |  1 -
 libcxx/include/__functional/function.h        |  1 -
 libcxx/include/__fwd/memory_resource.h        |  1 -
 libcxx/include/__fwd/string.h                 |  1 -
 .../__memory_resource/memory_resource.h       |  1 -
 .../monotonic_buffer_resource.h               |  1 -
 .../__memory_resource/polymorphic_allocator.h |  1 -
 .../synchronized_pool_resource.h              |  1 -
 .../unsynchronized_pool_resource.h            |  1 -
 libcxx/include/__ostream/print.h              |  1 -
 libcxx/include/__stop_token/stop_callback.h   |  1 -
 libcxx/include/__stop_token/stop_source.h     |  1 -
 libcxx/include/__stop_token/stop_state.h      |  1 -
 libcxx/include/__stop_token/stop_token.h      |  1 -
 libcxx/include/__thread/jthread.h             |  1 -
 libcxx/include/__thread/poll_with_backoff.h   |  1 -
 libcxx/include/__verbose_abort                |  1 -
 libcxx/include/barrier                        |  1 -
 libcxx/include/condition_variable             |  1 -
 libcxx/include/deque                          |  1 -
 libcxx/include/forward_list                   |  1 -
 libcxx/include/fstream                        |  1 -
 libcxx/include/latch                          |  1 -
 libcxx/include/list                           |  1 -
 libcxx/include/map                            |  1 -
 libcxx/include/module.modulemap               |  6 +--
 libcxx/include/optional                       |  1 -
 libcxx/include/print                          |  1 -
 libcxx/include/regex                          |  1 -
 libcxx/include/semaphore                      |  1 -
 libcxx/include/set                            |  1 -
 libcxx/include/sstream                        |  1 -
 libcxx/include/unordered_map                  |  1 -
 libcxx/include/unordered_set                  |  1 -
 libcxx/include/variant                        |  1 -
 libcxx/include/vector                         |  1 -
 libcxx/include/version                        |  1 -
 libcxx/src/optional.cpp                       |  1 -
 libcxx/src/ostream.cpp                        |  1 -
 ...lity-with-pedantic-errors.compile.pass.cpp |  2 +-
 .../generate_feature_test_macro_components.py |  4 +-
 61 files changed, 60 insertions(+), 94 deletions(-)
 rename libcxx/include/{__availability => __configuration/availability.h} (98%)
 create mode 100644 libcxx/include/__configuration/language.h

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index cb5e0e5e6cdb56..bbde9abc57919e 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -122,7 +122,7 @@ option(LIBCXX_ENABLE_VENDOR_AVAILABILITY_ANNOTATIONS
    on definitions in a shared library. By default, we assume that we're not building
    libc++ for any specific vendor, and we disable those annotations. Vendors wishing
    to provide compile-time errors when using features unavailable on some version of
-   the shared library they shipped should turn this on and see `include/__availability`
+   the shared library they shipped should turn this on and see `include/__configuration/availability.h`
    for more details." OFF)
 
 if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 161d7a7d215bdd..cfe1f44777bcac 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -219,7 +219,6 @@ set(files
   __atomic/kill_dependency.h
   __atomic/memory_order.h
   __atomic/to_gcc_order.h
-  __availability
   __bit/bit_cast.h
   __bit/bit_ceil.h
   __bit/bit_floor.h
@@ -315,7 +314,9 @@ set(files
   __condition_variable/condition_variable.h
   __config
   __configuration/abi.h
+  __configuration/availability.h
   __configuration/compiler.h
+  __configuration/language.h
   __configuration/platform.h
   __coroutine/coroutine_handle.h
   __coroutine/coroutine_traits.h
diff --git a/libcxx/include/__atomic/atomic_base.h b/libcxx/include/__atomic/atomic_base.h
index e9badccc25a620..d7a5b99b546910 100644
--- a/libcxx/include/__atomic/atomic_base.h
+++ b/libcxx/include/__atomic/atomic_base.h
@@ -14,7 +14,6 @@
 #include <__atomic/cxx_atomic_impl.h>
 #include <__atomic/is_always_lock_free.h>
 #include <__atomic/memory_order.h>
-#include <__availability>
 #include <__config>
 #include <__memory/addressof.h>
 #include <__type_traits/is_integral.h>
diff --git a/libcxx/include/__atomic/atomic_flag.h b/libcxx/include/__atomic/atomic_flag.h
index 3ec3366ecaaf98..00b157cdff78b7 100644
--- a/libcxx/include/__atomic/atomic_flag.h
+++ b/libcxx/include/__atomic/atomic_flag.h
@@ -13,7 +13,6 @@
 #include <__atomic/contention_t.h>
 #include <__atomic/cxx_atomic_impl.h>
 #include <__atomic/memory_order.h>
-#include <__availability>
 #include <__chrono/duration.h>
 #include <__config>
 #include <__memory/addressof.h>
diff --git a/libcxx/include/__atomic/atomic_sync.h b/libcxx/include/__atomic/atomic_sync.h
index 175700be54c010..1de5037329f812 100644
--- a/libcxx/include/__atomic/atomic_sync.h
+++ b/libcxx/include/__atomic/atomic_sync.h
@@ -13,7 +13,6 @@
 #include <__atomic/cxx_atomic_impl.h>
 #include <__atomic/memory_order.h>
 #include <__atomic/to_gcc_order.h>
-#include <__availability>
 #include <__chrono/duration.h>
 #include <__config>
 #include <__memory/addressof.h>
diff --git a/libcxx/include/__charconv/to_chars_floating_point.h b/libcxx/include/__charconv/to_chars_floating_point.h
index 08720e1078852b..118f316b21a102 100644
--- a/libcxx/include/__charconv/to_chars_floating_point.h
+++ b/libcxx/include/__charconv/to_chars_floating_point.h
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___CHARCONV_TO_CHARS_FLOATING_POINT_H
 #define _LIBCPP___CHARCONV_TO_CHARS_FLOATING_POINT_H
 
-#include <__availability>
 #include <__charconv/chars_format.h>
 #include <__charconv/to_chars_result.h>
 #include <__config>
diff --git a/libcxx/include/__chrono/file_clock.h b/libcxx/include/__chrono/file_clock.h
index 7d25729fec013a..4dd3f88ce5ba4b 100644
--- a/libcxx/include/__chrono/file_clock.h
+++ b/libcxx/include/__chrono/file_clock.h
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___CHRONO_FILE_CLOCK_H
 #define _LIBCPP___CHRONO_FILE_CLOCK_H
 
-#include <__availability>
 #include <__chrono/duration.h>
 #include <__chrono/system_clock.h>
 #include <__chrono/time_point.h>
diff --git a/libcxx/include/__chrono/tzdb_list.h b/libcxx/include/__chrono/tzdb_list.h
index 62db7e3d2e0b5e..aeef4fe1aba3c1 100644
--- a/libcxx/include/__chrono/tzdb_list.h
+++ b/libcxx/include/__chrono/tzdb_list.h
@@ -16,7 +16,6 @@
 // Enable the contents of the header only when libc++ was built with experimental features enabled.
 #if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
 
-#  include <__availability>
 #  include <__chrono/time_zone.h>
 #  include <__chrono/tzdb.h>
 #  include <__config>
diff --git a/libcxx/include/__config b/libcxx/include/__config
index e048dad52c4664..79422e8f6c5d1b 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -12,6 +12,7 @@
 
 #include <__config_site>
 #include <__configuration/abi.h>
+#include <__configuration/availability.h>
 #include <__configuration/compiler.h>
 #include <__configuration/platform.h>
 
@@ -35,25 +36,6 @@
 #    define _LIBCPP_FREESTANDING
 #  endif
 
-// NOLINTBEGIN(libcpp-cpp-version-check)
-#  ifndef _LIBCPP_STD_VER
-#    if __cplusplus <= 201103L
-#      define _LIBCPP_STD_VER 11
-#    elif __cplusplus <= 201402L
-#      define _LIBCPP_STD_VER 14
-#    elif __cplusplus <= 201703L
-#      define _LIBCPP_STD_VER 17
-#    elif __cplusplus <= 202002L
-#      define _LIBCPP_STD_VER 20
-#    elif __cplusplus <= 202302L
-#      define _LIBCPP_STD_VER 23
-#    else
-// Expected release year of the next C++ standard
-#      define _LIBCPP_STD_VER 26
-#    endif
-#  endif // _LIBCPP_STD_VER
-// NOLINTEND(libcpp-cpp-version-check)
-
 // HARDENING {
 
 // TODO(hardening): deprecate this in LLVM 19.
@@ -364,10 +346,6 @@ typedef __char32_t char32_t;
 
 #  endif
 
-#  if !defined(__cpp_exceptions) || __cpp_exceptions < 199711L
-#    define _LIBCPP_HAS_NO_EXCEPTIONS
-#  endif
-
 #  define _LIBCPP_PREFERRED_ALIGNOF(_Tp) __alignof(_Tp)
 
 #  if defined(_LIBCPP_COMPILER_CLANG_BASED)
@@ -840,11 +818,6 @@ typedef __char32_t char32_t;
 #    define _LIBCPP_CONSTEXPR_SINCE_CXX23
 #  endif
 
-// Try to find out if RTTI is disabled.
-#  if !defined(__cpp_rtti) || __cpp_rtti < 199711L
-#    define _LIBCPP_HAS_NO_RTTI
-#  endif
-
 #  ifndef _LIBCPP_WEAK
 #    define _LIBCPP_WEAK __attribute__((__weak__))
 #  endif
diff --git a/libcxx/include/__availability b/libcxx/include/__configuration/availability.h
similarity index 98%
rename from libcxx/include/__availability
rename to libcxx/include/__configuration/availability.h
index e44ac1962df363..1115431115ec38 100644
--- a/libcxx/include/__availability
+++ b/libcxx/include/__configuration/availability.h
@@ -7,10 +7,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___AVAILABILITY
-#define _LIBCPP___AVAILABILITY
+#ifndef _LIBCPP___CONFIGURATION_AVAILABILITY_H
+#define _LIBCPP___CONFIGURATION_AVAILABILITY_H
 
-#include <__config>
+#include <__configuration/compiler.h>
+#include <__configuration/language.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -374,4 +375,4 @@
 #  define _LIBCPP_AVAILABILITY_INIT_PRIMARY_EXCEPTION
 #endif
 
-#endif // _LIBCPP___AVAILABILITY
+#endif // _LIBCPP___CONFIGURATION_AVAILABILITY_H
diff --git a/libcxx/include/__configuration/language.h b/libcxx/include/__configuration/language.h
new file mode 100644
index 00000000000000..fa62a7b6f5c2a1
--- /dev/null
+++ b/libcxx/include/__configuration/language.h
@@ -0,0 +1,46 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___CONFIGURATION_LANGUAGE_H
+#define _LIBCPP___CONFIGURATION_LANGUAGE_H
+
+#include <__config_site>
+
+#ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER
+#  pragma GCC system_header
+#endif
+
+// NOLINTBEGIN(libcpp-cpp-version-check)
+#ifdef __cplusplus
+#  if __cplusplus <= 201103L
+#    define _LIBCPP_STD_VER 11
+#  elif __cplusplus <= 201402L
+#    define _LIBCPP_STD_VER 14
+#  elif __cplusplus <= 201703L
+#    define _LIBCPP_STD_VER 17
+#  elif __cplusplus <= 202002L
+#    define _LIBCPP_STD_VER 20
+#  elif __cplusplus <= 202302L
+#    define _LIBCPP_STD_VER 23
+#  else
+// Expected release year of the next C++ standard
+#    define _LIBCPP_STD_VER 26
+#  endif
+#endif // __cplusplus
+// NOLINTEND(libcpp-cpp-version-check)
+
+#if !defined(__cpp_rtti) || __cpp_rtti < 199711L
+#  define _LIBCPP_HAS_NO_RTTI
+#endif
+
+#if !defined(__cpp_exceptions) || __cpp_exceptions < 199711L
+#  define _LIBCPP_HAS_NO_EXCEPTIONS
+#endif
+
+#endif // _LIBCPP___CONFIGURATION_LANGUAGE_H
diff --git a/libcxx/include/__exception/exception_ptr.h b/libcxx/include/__exception/exception_ptr.h
index 868fd7c015339c..0a8337fa39de39 100644
--- a/libcxx/include/__exception/exception_ptr.h
+++ b/libcxx/include/__exception/exception_ptr.h
@@ -9,7 +9,6 @@
 #ifndef _LIBCPP___EXCEPTION_EXCEPTION_PTR_H
 #define _LIBCPP___EXCEPTION_EXCEPTION_PTR_H
 
-#include <__availability>
 #include <__config>
 #include <__exception/operations.h>
 #include <__memory/addressof.h>
diff --git a/libcxx/include/__expected/bad_expected_access.h b/libcxx/include/__expected/bad_expected_access.h
index ef29fa50883136..1b734389e8311f 100644
--- a/libcxx/include/__expected/bad_expected_access.h
+++ b/libcxx/include/__expected/bad_expected_access.h
@@ -9,7 +9,6 @@
 #ifndef _LIBCPP___EXPECTED_BAD_EXPECTED_ACCESS_H
 #define _LIBCPP___EXPECTED_BAD_EXPECTED_ACCESS_H
 
-#include <__availability>
 #include <__config>
 #include <__exception/exception.h>
 #include <__utility/move.h>
diff --git a/libcxx/include/__filesystem/directory_entry.h b/libcxx/include/__filesystem/directory_entry.h
index 016ad94a853dc2..96d88dcd90b4c0 100644
--- a/libcxx/include/__filesystem/directory_entry.h
+++ b/libcxx/include/__filesystem/directory_entry.h
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___FILESYSTEM_DIRECTORY_ENTRY_H
 #define _LIBCPP___FILESYSTEM_DIRECTORY_ENTRY_H
 
-#include <__availability>
 #include <__chrono/time_point.h>
 #include <__compare/ordering.h>
 #include <__config>
diff --git a/libcxx/include/__filesystem/directory_iterator.h b/libcxx/include/__filesystem/directory_iterator.h
index a5aa5ff5432dab..e0246d8001e195 100644
--- a/libcxx/include/__filesystem/directory_iterator.h
+++ b/libcxx/include/__filesystem/directory_iterator.h
@@ -11,7 +11,6 @@
 #define _LIBCPP___FILESYSTEM_DIRECTORY_ITERATOR_H
 
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__filesystem/directory_entry.h>
 #include <__filesystem/directory_options.h>
diff --git a/libcxx/include/__filesystem/filesystem_error.h b/libcxx/include/__filesystem/filesystem_error.h
index bfdcc5eaee521f..80a11e3b1932c7 100644
--- a/libcxx/include/__filesystem/filesystem_error.h
+++ b/libcxx/include/__filesystem/filesystem_error.h
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___FILESYSTEM_FILESYSTEM_ERROR_H
 #define _LIBCPP___FILESYSTEM_FILESYSTEM_ERROR_H
 
-#include <__availability>
 #include <__config>
 #include <__filesystem/path.h>
 #include <__memory/shared_ptr.h>
diff --git a/libcxx/include/__filesystem/operations.h b/libcxx/include/__filesystem/operations.h
index 9bb83576f54bc0..f588189ed1d9de 100644
--- a/libcxx/include/__filesystem/operations.h
+++ b/libcxx/include/__filesystem/operations.h
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___FILESYSTEM_OPERATIONS_H
 #define _LIBCPP___FILESYSTEM_OPERATIONS_H
 
-#include <__availability>
 #include <__chrono/time_point.h>
 #include <__config>
 #include <__filesystem/copy_options.h>
diff --git a/libcxx/include/__filesystem/path.h b/libcxx/include/__filesystem/path.h
index 89d319b4b19b57..ff468d517722fe 100644
--- a/libcxx/include/__filesystem/path.h
+++ b/libcxx/include/__filesystem/path.h
@@ -12,7 +12,6 @@
 
 #include <__algorithm/replace.h>
 #include <__algorithm/replace_copy.h>
-#include <__availability>
 #include <__config>
 #include <__functional/unary_function.h>
 #include <__fwd/functional.h>
diff --git a/libcxx/include/__filesystem/path_iterator.h b/libcxx/include/__filesystem/path_iterator.h
index d2d65cd122cab8..f4d486d86cf380 100644
--- a/libcxx/include/__filesystem/path_iterator.h
+++ b/libcxx/include/__filesystem/path_iterator.h
@@ -11,7 +11,6 @@
 #define _LIBCPP___FILESYSTEM_PATH_ITERATOR_H
 
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__filesystem/path.h>
 #include <__iterator/iterator_traits.h>
diff --git a/libcxx/include/__filesystem/recursive_directory_iterator.h b/libcxx/include/__filesystem/recursive_directory_iterator.h
index a8af4f73b14a5f..caa1396eb301fc 100644
--- a/libcxx/include/__filesystem/recursive_directory_iterator.h
+++ b/libcxx/include/__filesystem/recursive_directory_iterator.h
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___FILESYSTEM_RECURSIVE_DIRECTORY_ITERATOR_H
 #define _LIBCPP___FILESYSTEM_RECURSIVE_DIRECTORY_ITERATOR_H
 
-#include <__availability>
 #include <__config>
 #include <__filesystem/directory_entry.h>
 #include <__filesystem/directory_options.h>
diff --git a/libcxx/include/__filesystem/u8path.h b/libcxx/include/__filesystem/u8path.h
index bde878054865e1..dae5823128f028 100644
--- a/libcxx/include/__filesystem/u8path.h
+++ b/libcxx/include/__filesystem/u8path.h
@@ -11,7 +11,6 @@
 #define _LIBCPP___FILESYSTEM_U8PATH_H
 
 #include <__algorithm/unwrap_iter.h>
-#include <__availability>
 #include <__config>
 #include <__filesystem/path.h>
 #include <string>
diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h
index 36057706933d43..244e55be3403ca 100644
--- a/libcxx/include/__functional/function.h
+++ b/libcxx/include/__functional/function.h
@@ -11,7 +11,6 @@
 #define _LIBCPP___FUNCTIONAL_FUNCTION_H
 
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__exception/exception.h>
 #include <__functional/binary_function.h>
diff --git a/libcxx/include/__fwd/memory_resource.h b/libcxx/include/__fwd/memory_resource.h
index 03b78ad2bd3c0c..d68b2c2b631543 100644
--- a/libcxx/include/__fwd/memory_resource.h
+++ b/libcxx/include/__fwd/memory_resource.h
@@ -9,7 +9,6 @@
 #ifndef _LIBCPP___FWD_MEMORY_RESOURCE_H
 #define _LIBCPP___FWD_MEMORY_RESOURCE_H
 
-#include <__availability>
 #include <__config>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__fwd/string.h b/libcxx/include/__fwd/string.h
index 320c4e4c818361..2418e1f9b23d0d 100644
--- a/libcxx/include/__fwd/string.h
+++ b/libcxx/include/__fwd/string.h
@@ -9,7 +9,6 @@
 #ifndef _LIBCPP___FWD_STRING_H
 #define _LIBCPP___FWD_STRING_H
 
-#include <__availability>
 #include <__config>
 #include <__fwd/memory.h>
 #include <__fwd/memory_resource.h>
diff --git a/libcxx/include/__memory_resource/memory_resource.h b/libcxx/include/__memory_resource/memory_resource.h
index e605838bf5ea40..ea85e50cd568bc 100644
--- a/libcxx/include/__memory_resource/memory_resource.h
+++ b/libcxx/include/__memory_resource/memory_resource.h
@@ -9,7 +9,6 @@
 #ifndef _LIBCPP___MEMORY_RESOURCE_MEMORY_RESOURCE_H
 #define _LIBCPP___MEMORY_RESOURCE_MEMORY_RESOURCE_H
 
-#include <__availability>
 #include <__config>
 #include <__fwd/memory_resource.h>
 #include <cstddef>
diff --git a/libcxx/include/__memory_resource/monotonic_buffer_resource.h b/libcxx/include/__memory_resource/monotonic_buffer_resource.h
index 0c83f1ebc8db43..f45b30fdb38616 100644
--- a/libcxx/include/__memory_resource/monotonic_buffer_resource.h
+++ b/libcxx/include/__memory_resource/monotonic_buffer_resource.h
@@ -9,7 +9,6 @@
 #ifndef _LIBCPP___MEMORY_RESOURCE_MONOTONIC_BUFFER_RESOURCE_H
 #define _LIBCPP___MEMORY_RESOURCE_MONOTONIC_BUFFER_RESOURCE_H
 
-#include <__availability>
 #include <__config>
 #include <__memory/addressof.h>
 #include <__memory_resource/memory_resource.h>
diff --git a/libcxx/include/__memory_resource/polymorphic_allocator.h b/libcxx/include/__memory_resource/polymorphic_allocator.h
index 8fda201124387e..a71096d3e47847 100644
--- a/libcxx/include/__memory_resource/polymorphic_allocator.h
+++ b/libcxx/include/__memory_resource/polymorphic_allocator.h
@@ -10,7 +10,6 @@
 #define _LIBCPP___MEMORY_RESOURCE_POLYMORPHIC_ALLOCATOR_H
 
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__fwd/pair.h>
 #include <__memory_resource/memory_resource.h>
diff --git a/libcxx/include/__memory_resource/synchronized_pool_resource.h b/libcxx/include/__memory_resource/synchronized_pool_resource.h
index b261fb0b194a8e..50a673c2861d10 100644
--- a/libcxx/include/__memory_resource/synchronized_pool_resource.h
+++ b/libcxx/include/__memory_resource/synchronized_pool_resource.h
@@ -9,7 +9,6 @@
 #ifndef _LIBCPP___MEMORY_RESOURCE_SYNCHRONIZED_POOL_RESOURCE_H
 #define _LIBCPP___MEMORY_RESOURCE_SYNCHRONIZED_POOL_RESOURCE_H
 
-#include <__availability>
 #include <__config>
 #include <__memory_resource/memory_resource.h>
 #include <__memory_resource/pool_options.h>
diff --git a/libcxx/include/__memory_resource/unsynchronized_pool_resource.h b/libcxx/include/__memory_resource/unsynchronized_pool_resource.h
index 81d5f9ec4da87d..783db84262af72 100644
--- a/libcxx/include/__memory_resource/unsynchronized_pool_resource.h
+++ b/libcxx/include/__memory_resource/unsynchronized_pool_resource.h
@@ -9,7 +9,6 @@
 #ifndef _LIBCPP___MEMORY_RESOURCE_UNSYNCHRONIZED_POOL_RESOURCE_H
 #define _LIBCPP___MEMORY_RESOURCE_UNSYNCHRONIZED_POOL_RESOURCE_H
 
-#include <__availability>
 #include <__config>
 #include <__memory_resource/memory_resource.h>
 #include <__memory_resource/pool_options.h>
diff --git a/libcxx/include/__ostream/print.h b/libcxx/include/__ostream/print.h
index 97680cdab6da3c..8265ac00777e25 100644
--- a/libcxx/include/__ostream/print.h
+++ b/libcxx/include/__ostream/print.h
@@ -9,7 +9,6 @@
 #ifndef _LIBCPP___OSTREAM_PRINT_H
 #define _LIBCPP___OSTREAM_PRINT_H
 
-#include <__availability>
 #include <__config>
 #include <__fwd/ostream.h>
 #include <__iterator/ostreambuf_iterator.h>
diff --git a/libcxx/include/__stop_token/stop_callback.h b/libcxx/include/__stop_token/stop_callback.h
index 7b526820f98a37..760cf2bb55b0ce 100644
--- a/libcxx/include/__stop_token/stop_callback.h
+++ b/libcxx/include/__stop_token/stop_callback.h
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___STOP_TOKEN_STOP_CALLBACK_H
 #define _LIBCPP___STOP_TOKEN_STOP_CALLBACK_H
 
-#include <__availability>
 #include <__concepts/constructible.h>
 #include <__concepts/destructible.h>
 #include <__concepts/invocable.h>
diff --git a/libcxx/include/__stop_token/stop_source.h b/libcxx/include/__stop_token/stop_source.h
index 1080069cf3b8be..70697462784ab4 100644
--- a/libcxx/include/__stop_token/stop_source.h
+++ b/libcxx/include/__stop_token/stop_source.h
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___STOP_TOKEN_STOP_SOURCE_H
 #define _LIBCPP___STOP_TOKEN_STOP_SOURCE_H
 
-#include <__availability>
 #include <__config>
 #include <__stop_token/intrusive_shared_ptr.h>
 #include <__stop_token/stop_state.h>
diff --git a/libcxx/include/__stop_token/stop_state.h b/libcxx/include/__stop_token/stop_state.h
index df07573f878628..b0eed13a143cfc 100644
--- a/libcxx/include/__stop_token/stop_state.h
+++ b/libcxx/include/__stop_token/stop_state.h
@@ -11,7 +11,6 @@
 #define _LIBCPP___STOP_TOKEN_STOP_STATE_H
 
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__stop_token/atomic_unique_lock.h>
 #include <__stop_token/intrusive_list_view.h>
diff --git a/libcxx/include/__stop_token/stop_token.h b/libcxx/include/__stop_token/stop_token.h
index f2eadb990bdeca..1bd75cbbf6f8d8 100644
--- a/libcxx/include/__stop_token/stop_token.h
+++ b/libcxx/include/__stop_token/stop_token.h
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___STOP_TOKEN_STOP_TOKEN_H
 #define _LIBCPP___STOP_TOKEN_STOP_TOKEN_H
 
-#include <__availability>
 #include <__config>
 #include <__stop_token/intrusive_shared_ptr.h>
 #include <__stop_token/stop_state.h>
diff --git a/libcxx/include/__thread/jthread.h b/libcxx/include/__thread/jthread.h
index 253e3a935d9b73..b3d5c25fb71c77 100644
--- a/libcxx/include/__thread/jthread.h
+++ b/libcxx/include/__thread/jthread.h
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___THREAD_JTHREAD_H
 #define _LIBCPP___THREAD_JTHREAD_H
 
-#include <__availability>
 #include <__config>
 #include <__functional/invoke.h>
 #include <__stop_token/stop_source.h>
diff --git a/libcxx/include/__thread/poll_with_backoff.h b/libcxx/include/__thread/poll_with_backoff.h
index d8354e6ca23980..4f961fe3f7629f 100644
--- a/libcxx/include/__thread/poll_with_backoff.h
+++ b/libcxx/include/__thread/poll_with_backoff.h
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___THREAD_POLL_WITH_BACKOFF_H
 #define _LIBCPP___THREAD_POLL_WITH_BACKOFF_H
 
-#include <__availability>
 #include <__chrono/duration.h>
 #include <__chrono/high_resolution_clock.h>
 #include <__config>
diff --git a/libcxx/include/__verbose_abort b/libcxx/include/__verbose_abort
index 259c70dda8fe83..1e2265a6bf7558 100644
--- a/libcxx/include/__verbose_abort
+++ b/libcxx/include/__verbose_abort
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___VERBOSE_ABORT
 #define _LIBCPP___VERBOSE_ABORT
 
-#include <__availability>
 #include <__config>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/barrier b/libcxx/include/barrier
index a6b4d2288309e3..bce67bb5d34250 100644
--- a/libcxx/include/barrier
+++ b/libcxx/include/barrier
@@ -54,7 +54,6 @@ namespace std
 #include <__assert>
 #include <__atomic/atomic_base.h>
 #include <__atomic/memory_order.h>
-#include <__availability>
 #include <__memory/unique_ptr.h>
 #include <__thread/poll_with_backoff.h>
 #include <__thread/timed_backoff_policy.h>
diff --git a/libcxx/include/condition_variable b/libcxx/include/condition_variable
index 4ded1140d46b1b..5195cd6057dd33 100644
--- a/libcxx/include/condition_variable
+++ b/libcxx/include/condition_variable
@@ -118,7 +118,6 @@ public:
 
 */
 
-#include <__availability>
 #include <__chrono/duration.h>
 #include <__chrono/steady_clock.h>
 #include <__chrono/time_point.h>
diff --git a/libcxx/include/deque b/libcxx/include/deque
index 3c33e04e9f05f8..555761aae6afd2 100644
--- a/libcxx/include/deque
+++ b/libcxx/include/deque
@@ -189,7 +189,6 @@ template <class T, class Allocator, class Predicate>
 #include <__algorithm/remove_if.h>
 #include <__algorithm/unwrap_iter.h>
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__debug_utils/sanitizers.h>
 #include <__format/enable_insertable.h>
diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list
index 80dd49fe3d75a0..363931e3f23881 100644
--- a/libcxx/include/forward_list
+++ b/libcxx/include/forward_list
@@ -199,7 +199,6 @@ template <class T, class Allocator, class Predicate>
 #include <__algorithm/lexicographical_compare.h>
 #include <__algorithm/lexicographical_compare_three_way.h>
 #include <__algorithm/min.h>
-#include <__availability>
 #include <__config>
 #include <__iterator/distance.h>
 #include <__iterator/iterator_traits.h>
diff --git a/libcxx/include/fstream b/libcxx/include/fstream
index 7128f72e161193..18f4dd3eed0b23 100644
--- a/libcxx/include/fstream
+++ b/libcxx/include/fstream
@@ -188,7 +188,6 @@ typedef basic_fstream<wchar_t> wfstream;
 
 #include <__algorithm/max.h>
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__fwd/fstream.h>
 #include <__locale>
diff --git a/libcxx/include/latch b/libcxx/include/latch
index 1937617f7dcc61..da8dae149c79f3 100644
--- a/libcxx/include/latch
+++ b/libcxx/include/latch
@@ -50,7 +50,6 @@ namespace std
 #include <__atomic/atomic_base.h>
 #include <__atomic/atomic_sync.h>
 #include <__atomic/memory_order.h>
-#include <__availability>
 #include <cstddef>
 #include <limits>
 #include <version>
diff --git a/libcxx/include/list b/libcxx/include/list
index 610a24e384600e..87f15e144ac8f2 100644
--- a/libcxx/include/list
+++ b/libcxx/include/list
@@ -203,7 +203,6 @@ template <class T, class Allocator, class Predicate>
 #include <__algorithm/lexicographical_compare_three_way.h>
 #include <__algorithm/min.h>
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__format/enable_insertable.h>
 #include <__iterator/distance.h>
diff --git a/libcxx/include/map b/libcxx/include/map
index 1d1c062a0267c0..7efa715e84aa7e 100644
--- a/libcxx/include/map
+++ b/libcxx/include/map
@@ -575,7 +575,6 @@ erase_if(multimap<Key, T, Compare, Allocator>& c, Predicate pred);  // C++20
 #include <__algorithm/lexicographical_compare.h>
 #include <__algorithm/lexicographical_compare_three_way.h>
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__functional/binary_function.h>
 #include <__functional/is_transparent.h>
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index 1f7c2a183f63d0..48391b2a12095d 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -570,10 +570,6 @@ module std_private_assert            [system] {
   header "__assert"
   export *
 }
-module std_private_availability      [system] {
-  header "__availability"
-  export *
-}
 module std_private_bit_reference     [system] {
   header "__bit_reference"
   export *
@@ -584,7 +580,9 @@ module std_private_fwd_bit_reference [system] {
 module std_private_config            [system] {
   textual header "__config"
   textual header "__configuration/abi.h"
+  textual header "__configuration/availability.h"
   textual header "__configuration/compiler.h"
+  textual header "__configuration/language.h"
   textual header "__configuration/platform.h"
   export *
 }
diff --git a/libcxx/include/optional b/libcxx/include/optional
index a16e48502e2509..622e150f7a9f7c 100644
--- a/libcxx/include/optional
+++ b/libcxx/include/optional
@@ -178,7 +178,6 @@ namespace std {
 */
 
 #include <__assert>
-#include <__availability>
 #include <__compare/compare_three_way_result.h>
 #include <__compare/three_way_comparable.h>
 #include <__concepts/invocable.h>
diff --git a/libcxx/include/print b/libcxx/include/print
index e0bcf214ea239b..5bdaa559af7242 100644
--- a/libcxx/include/print
+++ b/libcxx/include/print
@@ -34,7 +34,6 @@ namespace std {
 */
 
 #include <__assert>
-#include <__availability>
 #include <__concepts/same_as.h>
 #include <__config>
 #include <__system_error/system_error.h>
diff --git a/libcxx/include/regex b/libcxx/include/regex
index ce9f34260254a0..b3869d36de1dfb 100644
--- a/libcxx/include/regex
+++ b/libcxx/include/regex
@@ -792,7 +792,6 @@ typedef regex_token_iterator<wstring::const_iterator> wsregex_token_iterator;
 #include <__algorithm/find.h>
 #include <__algorithm/search.h>
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__iterator/back_insert_iterator.h>
 #include <__iterator/default_sentinel.h>
diff --git a/libcxx/include/semaphore b/libcxx/include/semaphore
index cb2f42c106ca85..8d3b04475c092d 100644
--- a/libcxx/include/semaphore
+++ b/libcxx/include/semaphore
@@ -55,7 +55,6 @@ using binary_semaphore = counting_semaphore<1>;
 #include <__atomic/atomic_base.h>
 #include <__atomic/atomic_sync.h>
 #include <__atomic/memory_order.h>
-#include <__availability>
 #include <__chrono/time_point.h>
 #include <__thread/poll_with_backoff.h>
 #include <__thread/support.h>
diff --git a/libcxx/include/set b/libcxx/include/set
index d9377ee6c33224..ab3a4363499af9 100644
--- a/libcxx/include/set
+++ b/libcxx/include/set
@@ -516,7 +516,6 @@ erase_if(multiset<Key, Compare, Allocator>& c, Predicate pred);  // C++20
 #include <__algorithm/lexicographical_compare.h>
 #include <__algorithm/lexicographical_compare_three_way.h>
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__functional/is_transparent.h>
 #include <__functional/operations.h>
diff --git a/libcxx/include/sstream b/libcxx/include/sstream
index 5009fe5c0057be..9ba43ffeb850f2 100644
--- a/libcxx/include/sstream
+++ b/libcxx/include/sstream
@@ -312,7 +312,6 @@ typedef basic_stringstream<wchar_t> wstringstream;
 
 // clang-format on
 
-#include <__availability>
 #include <__config>
 #include <__fwd/sstream.h>
 #include <__ostream/basic_ostream.h>
diff --git a/libcxx/include/unordered_map b/libcxx/include/unordered_map
index c838cd96b1123e..2e25b0f0506956 100644
--- a/libcxx/include/unordered_map
+++ b/libcxx/include/unordered_map
@@ -585,7 +585,6 @@ template <class Key, class T, class Hash, class Pred, class Alloc>
 
 #include <__algorithm/is_permutation.h>
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__functional/is_transparent.h>
 #include <__functional/operations.h>
diff --git a/libcxx/include/unordered_set b/libcxx/include/unordered_set
index 5de1458beb1e6a..c966cc8eb4df1b 100644
--- a/libcxx/include/unordered_set
+++ b/libcxx/include/unordered_set
@@ -533,7 +533,6 @@ template <class Value, class Hash, class Pred, class Alloc>
 
 #include <__algorithm/is_permutation.h>
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__functional/is_transparent.h>
 #include <__functional/operations.h>
diff --git a/libcxx/include/variant b/libcxx/include/variant
index 631ffceab5f68f..7ebd0534b16414 100644
--- a/libcxx/include/variant
+++ b/libcxx/include/variant
@@ -212,7 +212,6 @@ namespace std {
 
 */
 
-#include <__availability>
 #include <__compare/common_comparison_category.h>
 #include <__compare/compare_three_way_result.h>
 #include <__compare/three_way_comparable.h>
diff --git a/libcxx/include/vector b/libcxx/include/vector
index b190557fb7b7e8..cbfc2cefa1fd93 100644
--- a/libcxx/include/vector
+++ b/libcxx/include/vector
@@ -316,7 +316,6 @@ template<class T, class charT> requires is-vector-bool-reference<T> // Since C++
 #include <__algorithm/rotate.h>
 #include <__algorithm/unwrap_iter.h>
 #include <__assert>
-#include <__availability>
 #include <__bit_reference>
 #include <__concepts/same_as.h>
 #include <__config>
diff --git a/libcxx/include/version b/libcxx/include/version
index 140a9a0d870360..d433e1b1c9cea0 100644
--- a/libcxx/include/version
+++ b/libcxx/include/version
@@ -255,7 +255,6 @@ __cpp_lib_void_t                                        201411L <type_traits>
 
 */
 
-#include <__availability>
 #include <__config>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/src/optional.cpp b/libcxx/src/optional.cpp
index 6ba63f2d89f5a5..62b474a312be2d 100644
--- a/libcxx/src/optional.cpp
+++ b/libcxx/src/optional.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <__availability>
 #include <optional>
 #include <stdexcept>
 
diff --git a/libcxx/src/ostream.cpp b/libcxx/src/ostream.cpp
index 443dce9a390bee..e1a9a4bc1de718 100644
--- a/libcxx/src/ostream.cpp
+++ b/libcxx/src/ostream.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <__availability>
 #include <__config>
 #ifndef _LIBCPP_HAS_NO_FILESYSTEM
 #  include <fstream>
diff --git a/libcxx/test/libcxx/vendor/apple/availability-with-pedantic-errors.compile.pass.cpp b/libcxx/test/libcxx/vendor/apple/availability-with-pedantic-errors.compile.pass.cpp
index c55a0a4d6e5d1b..60723bf7b6e971 100644
--- a/libcxx/test/libcxx/vendor/apple/availability-with-pedantic-errors.compile.pass.cpp
+++ b/libcxx/test/libcxx/vendor/apple/availability-with-pedantic-errors.compile.pass.cpp
@@ -15,7 +15,7 @@
 
 // ADDITIONAL_COMPILE_FLAGS: -pedantic-errors
 
-#include <__availability>
+#include <__config>
 
 #if defined(_LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS)
 #  error Availability annotations should be enabled on Apple platforms in the system configuration!
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index 1e79f6c140758c..490ecefc975222 100755
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -61,7 +61,8 @@ def add_version_header(tc):
 #                   just libc++. It may depend on
 #                    * macros defined by the compiler itself, or
 #                    * macros generated by CMake.
-#                   In some cases we add also depend on macros defined in <__availability>.
+#                   In some cases we add also depend on macros defined in
+#                   <__configuration/availability.h>.
 # libcxx_guard      An optional string field. When this field is provided,
 #                   `test_suite_guard` must also be provided. This field is used
 #                   only to guard the feature-test macro in <version>. It may
@@ -1562,7 +1563,6 @@ def produce_version_header():
 
 */
 
-#include <__availability>
 #include <__config>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)

From 633ea41b54bf7b2f10850bbd5ba3c4ab06081595 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Tue, 28 May 2024 18:29:47 -0700
Subject: [PATCH 052/230] [runtimes] Reintroduce a way to select the compiler
 used for the test suite (#93542)

A while back, the cxx_under_test Lit parameter was removed. This patch
reintroduces a Lit parameter called "compiler" which controls the value
of the %{cxx} substitution used in the test suite.

To run the test suite with a different compiler, one can now pass
--param compiler=<path>.
---
 libcxx/test/CMakeLists.txt                 | 2 ++
 libcxx/test/configs/cmake-bridge.cfg.in    | 2 --
 libcxx/utils/libcxx/test/params.py         | 8 ++++++++
 libcxxabi/test/CMakeLists.txt              | 2 ++
 libcxxabi/test/configs/cmake-bridge.cfg.in | 1 -
 libunwind/test/CMakeLists.txt              | 2 ++
 libunwind/test/configs/cmake-bridge.cfg.in | 2 --
 7 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/libcxx/test/CMakeLists.txt b/libcxx/test/CMakeLists.txt
index fd57aa9fe8b375..ee3502d32f7ae5 100644
--- a/libcxx/test/CMakeLists.txt
+++ b/libcxx/test/CMakeLists.txt
@@ -16,6 +16,8 @@ endif()
 set(AUTO_GEN_COMMENT "## Autogenerated by libcxx configuration.\n# Do not edit!")
 set(SERIALIZED_LIT_PARAMS "# Lit parameters serialized here for llvm-lit to pick them up\n")
 
+serialize_lit_string_param(SERIALIZED_LIT_PARAMS compiler "${CMAKE_CXX_COMPILER}")
+
 if (NOT LIBCXX_ENABLE_EXCEPTIONS)
   serialize_lit_param(SERIALIZED_LIT_PARAMS enable_exceptions False)
 endif()
diff --git a/libcxx/test/configs/cmake-bridge.cfg.in b/libcxx/test/configs/cmake-bridge.cfg.in
index 84b3270a8940ac..78d0cb5a257488 100644
--- a/libcxx/test/configs/cmake-bridge.cfg.in
+++ b/libcxx/test/configs/cmake-bridge.cfg.in
@@ -23,8 +23,6 @@ config.recursiveExpansionLimit = 10
 config.test_exec_root = os.path.join('@CMAKE_BINARY_DIR@', 'test')
 
 # Add substitutions for bootstrapping the test suite configuration
-import shlex
-config.substitutions.append(('%{cxx}', shlex.quote('@CMAKE_CXX_COMPILER@')))
 config.substitutions.append(('%{libcxx-dir}', '@LIBCXX_SOURCE_DIR@'))
 config.substitutions.append(('%{include-dir}', '@LIBCXX_GENERATED_INCLUDE_DIR@'))
 config.substitutions.append(('%{target-include-dir}', '@LIBCXX_GENERATED_INCLUDE_TARGET_DIR@'))
diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py
index c2d294e49f4884..4c8590a2135d9e 100644
--- a/libcxx/utils/libcxx/test/params.py
+++ b/libcxx/utils/libcxx/test/params.py
@@ -143,6 +143,14 @@ def getSuitableClangTidy(cfg):
 
 # fmt: off
 DEFAULT_PARAMETERS = [
+    Parameter(
+        name="compiler",
+        type=str,
+        help="The path of the compiler to use for testing.",
+        actions=lambda cxx: [
+            AddSubstitution("%{cxx}", shlex.quote(cxx)),
+        ],
+    ),
     Parameter(
         name="target_triple",
         type=str,
diff --git a/libcxxabi/test/CMakeLists.txt b/libcxxabi/test/CMakeLists.txt
index 586927189cf1dd..cd908a3514cb27 100644
--- a/libcxxabi/test/CMakeLists.txt
+++ b/libcxxabi/test/CMakeLists.txt
@@ -24,6 +24,8 @@ endif()
 set(AUTO_GEN_COMMENT "## Autogenerated by libcxxabi configuration.\n# Do not edit!")
 set(SERIALIZED_LIT_PARAMS "# Lit parameters serialized here for llvm-lit to pick them up\n")
 
+serialize_lit_string_param(SERIALIZED_LIT_PARAMS compiler "${CMAKE_CXX_COMPILER}")
+
 if (NOT LIBCXXABI_ENABLE_EXCEPTIONS)
   serialize_lit_param(SERIALIZED_LIT_PARAMS enable_exceptions False)
 endif()
diff --git a/libcxxabi/test/configs/cmake-bridge.cfg.in b/libcxxabi/test/configs/cmake-bridge.cfg.in
index 1d0f51d37437bd..3fefc6a7fdc88a 100644
--- a/libcxxabi/test/configs/cmake-bridge.cfg.in
+++ b/libcxxabi/test/configs/cmake-bridge.cfg.in
@@ -26,7 +26,6 @@ config.test_exec_root = os.path.join('@CMAKE_BINARY_DIR@', 'test')
 # TODO: This is a non-standard Lit attribute and we should have another way of accessing this.
 config.host_triple = '@LLVM_HOST_TRIPLE@'
 
-config.substitutions.append(('%{cxx}', '@CMAKE_CXX_COMPILER@'))
 config.substitutions.append(('%{libcxx}', '@LIBCXXABI_LIBCXX_PATH@'))
 config.substitutions.append(('%{include}', '@LIBCXXABI_SOURCE_DIR@/include'))
 config.substitutions.append(('%{cxx-include}', '@LIBCXXABI_HEADER_DIR@/include/c++/v1'))
diff --git a/libunwind/test/CMakeLists.txt b/libunwind/test/CMakeLists.txt
index 21dfbb0a84f0a8..bd2e575f2a296a 100644
--- a/libunwind/test/CMakeLists.txt
+++ b/libunwind/test/CMakeLists.txt
@@ -15,6 +15,8 @@ pythonize_bool(LIBUNWIND_USES_ARM_EHABI)
 set(AUTO_GEN_COMMENT "## Autogenerated by libunwind configuration.\n# Do not edit!")
 set(SERIALIZED_LIT_PARAMS "# Lit parameters serialized here for llvm-lit to pick them up\n")
 
+serialize_lit_string_param(SERIALIZED_LIT_PARAMS compiler "${CMAKE_CXX_COMPILER}")
+
 if (LIBUNWIND_EXECUTOR)
   message(DEPRECATION "LIBUNWIND_EXECUTOR is deprecated, please add executor=... to LIBUNWIND_TEST_PARAMS")
   serialize_lit_string_param(SERIALIZED_LIT_PARAMS executor "${LIBUNWIND_EXECUTOR}")
diff --git a/libunwind/test/configs/cmake-bridge.cfg.in b/libunwind/test/configs/cmake-bridge.cfg.in
index c5f34c87abb92a..7fc7a3da424629 100644
--- a/libunwind/test/configs/cmake-bridge.cfg.in
+++ b/libunwind/test/configs/cmake-bridge.cfg.in
@@ -29,7 +29,5 @@ if not @LIBUNWIND_ENABLE_THREADS@:
     config.available_features.add('libunwind-no-threads')
 
 # Add substitutions for bootstrapping the test suite configuration
-import shlex
-config.substitutions.append(('%{cxx}', shlex.quote('@CMAKE_CXX_COMPILER@')))
 config.substitutions.append(('%{include}', '@LIBUNWIND_SOURCE_DIR@/include'))
 config.substitutions.append(('%{lib}', '@LIBUNWIND_LIBRARY_DIR@'))

From bd135c3b9fb57e6346e4a790945809617388ca9b Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Tue, 28 May 2024 18:31:01 -0700
Subject: [PATCH 053/230] [runtimes][CMake] Simplify the propagation of test
 dependencies (#93558)

Instead of using FOO_TEST_DEPS global variables that don't get updated
properly from subdirectories, use targets to propagate the dependencies
across directories.
---
 libcxx/CMakeLists.txt                              |  7 +++----
 libcxx/benchmarks/CMakeLists.txt                   |  6 +-----
 libcxx/modules/CMakeLists.txt                      |  1 +
 libcxx/src/CMakeLists.txt                          |  2 ++
 libcxx/test/CMakeLists.txt                         | 14 --------------
 libcxx/test/tools/clang_tidy_checks/CMakeLists.txt |  2 ++
 libcxxabi/CMakeLists.txt                           |  3 +++
 libcxxabi/src/CMakeLists.txt                       |  1 +
 libcxxabi/test/CMakeLists.txt                      | 13 +------------
 libunwind/test/CMakeLists.txt                      |  2 +-
 10 files changed, 15 insertions(+), 36 deletions(-)

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index bbde9abc57919e..a061fda88b5c62 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -856,15 +856,14 @@ endfunction()
 #===============================================================================
 # Setup Source Code And Tests
 #===============================================================================
+add_custom_target(cxx-test-depends
+  COMMENT "Build dependencies required to run the libc++ test suite.")
+
 add_subdirectory(include)
 add_subdirectory(src)
 add_subdirectory(utils)
 add_subdirectory(modules)
 
-set(LIBCXX_TEST_DEPS "cxx_experimental")
-
-list(APPEND LIBCXX_TEST_DEPS generate-cxx-modules)
-
 if (LIBCXX_INCLUDE_BENCHMARKS)
   add_subdirectory(benchmarks)
 endif()
diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt
index 93b549a316e385..2101f9c71788c1 100644
--- a/libcxx/benchmarks/CMakeLists.txt
+++ b/libcxx/benchmarks/CMakeLists.txt
@@ -252,10 +252,6 @@ endforeach()
 if (LIBCXX_INCLUDE_TESTS)
   include(AddLLVM)
 
-  if (NOT DEFINED LIBCXX_TEST_DEPS)
-    message(FATAL_ERROR "Expected LIBCXX_TEST_DEPS to be defined")
-  endif()
-
   configure_lit_site_cfg(
           ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
           ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py)
@@ -265,6 +261,6 @@ if (LIBCXX_INCLUDE_TESTS)
   add_lit_target(check-cxx-benchmarks
           "Running libcxx benchmarks tests"
           ${CMAKE_CURRENT_BINARY_DIR}
-          DEPENDS cxx-benchmarks ${LIBCXX_TEST_DEPS}
+          DEPENDS cxx-benchmarks cxx-test-depends
           ARGS ${BENCHMARK_LIT_ARGS})
 endif()
diff --git a/libcxx/modules/CMakeLists.txt b/libcxx/modules/CMakeLists.txt
index d47d19a4755317..82cd7b66beb7a9 100644
--- a/libcxx/modules/CMakeLists.txt
+++ b/libcxx/modules/CMakeLists.txt
@@ -202,6 +202,7 @@ add_custom_target(generate-cxx-modules
   ALL DEPENDS
     ${_all_modules}
 )
+add_dependencies(cxx-test-depends generate-cxx-modules)
 
 # Configure the modules manifest.
 # Use the relative path between the installation and the module in the json
diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt
index 8b28d1b8918955..65e6ce2c4da43a 100644
--- a/libcxx/src/CMakeLists.txt
+++ b/libcxx/src/CMakeLists.txt
@@ -322,6 +322,7 @@ endif()
 
 # Add a meta-target for both libraries.
 add_custom_target(cxx DEPENDS ${LIBCXX_BUILD_TARGETS})
+add_dependencies(cxx-test-depends cxx)
 
 set(LIBCXX_EXPERIMENTAL_SOURCES
   experimental/keep.cpp
@@ -366,6 +367,7 @@ set_target_properties(cxx_experimental
 )
 cxx_add_common_build_flags(cxx_experimental)
 target_compile_options(cxx_experimental PUBLIC -D_LIBCPP_ENABLE_EXPERIMENTAL)
+add_dependencies(cxx-test-depends cxx_experimental)
 
 if (LIBCXX_INSTALL_SHARED_LIBRARY)
   install(TARGETS cxx_shared
diff --git a/libcxx/test/CMakeLists.txt b/libcxx/test/CMakeLists.txt
index ee3502d32f7ae5..3c54a4edccff38 100644
--- a/libcxx/test/CMakeLists.txt
+++ b/libcxx/test/CMakeLists.txt
@@ -1,11 +1,5 @@
 include(HandleLitArguments)
 add_subdirectory(tools)
-# When the tools add clang-tidy support, the dependencies need to be updated.
-# This cannot be done in the tools CMakeLists.txt since that does not update
-# the status in this (a parent) directory.
-if(TARGET cxx-tidy)
-  list(APPEND LIBCXX_TEST_DEPS cxx-tidy)
-endif()
 
 # By default, libcxx and libcxxabi share a library directory.
 if (NOT LIBCXX_CXX_ABI_LIBRARY_PATH)
@@ -40,10 +34,6 @@ endif()
 
 serialize_lit_params_list(SERIALIZED_LIT_PARAMS LIBCXX_TEST_PARAMS)
 
-if (NOT DEFINED LIBCXX_TEST_DEPS)
-  message(FATAL_ERROR "Expected LIBCXX_TEST_DEPS to be defined")
-endif()
-
 if (MSVC)
   # Shared code for initializing some parameters used by all
   # llvm-libc++-*-clangcl.cfg.in test configs.
@@ -81,10 +71,6 @@ if (LIBCXX_INCLUDE_TESTS)
     ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg
     MAIN_CONFIG "${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py")
 
-  add_custom_target(cxx-test-depends
-    DEPENDS cxx ${LIBCXX_TEST_DEPS}
-    COMMENT "Builds dependencies required to run the test suite.")
-
   add_lit_testsuite(check-cxx
     "Running libcxx tests"
     ${CMAKE_CURRENT_BINARY_DIR}
diff --git a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
index 28c1dbf8aca3c1..f0289dc44c6625 100644
--- a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
+++ b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
@@ -110,3 +110,5 @@ set_target_properties(cxx-tidy PROPERTIES
 
 set_target_properties(cxx-tidy PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 set(CMAKE_SHARED_MODULE_SUFFIX_CXX .plugin) # Use a portable suffix to simplify how we can find it from Lit
+
+add_dependencies(cxx-test-depends cxx-tidy)
diff --git a/libcxxabi/CMakeLists.txt b/libcxxabi/CMakeLists.txt
index f7673da25d20e0..86fe4a604f30d9 100644
--- a/libcxxabi/CMakeLists.txt
+++ b/libcxxabi/CMakeLists.txt
@@ -443,6 +443,9 @@ if (NOT "${LIBCXXABI_LIBUNWIND_INCLUDES_INTERNAL}" STREQUAL "")
   include_directories("${LIBCXXABI_LIBUNWIND_INCLUDES_INTERNAL}")
 endif()
 
+add_custom_target(cxxabi-test-depends
+  COMMENT "Build dependencies required to run the libc++abi test suite.")
+
 # Add source code. This also contains all of the logic for deciding linker flags
 # soname, etc...
 add_subdirectory(include)
diff --git a/libcxxabi/src/CMakeLists.txt b/libcxxabi/src/CMakeLists.txt
index c8cc93de50777b..c54ced4dc3ea86 100644
--- a/libcxxabi/src/CMakeLists.txt
+++ b/libcxxabi/src/CMakeLists.txt
@@ -304,6 +304,7 @@ endif()
 
 # Add a meta-target for both libraries.
 add_custom_target(cxxabi DEPENDS ${LIBCXXABI_BUILD_TARGETS})
+add_dependencies(cxxabi-test-depends cxxabi cxx)
 
 if (LIBCXXABI_INSTALL_LIBRARY)
   install(TARGETS ${LIBCXXABI_INSTALL_TARGETS}
diff --git a/libcxxabi/test/CMakeLists.txt b/libcxxabi/test/CMakeLists.txt
index cd908a3514cb27..8e3048f2ffe8a1 100644
--- a/libcxxabi/test/CMakeLists.txt
+++ b/libcxxabi/test/CMakeLists.txt
@@ -10,17 +10,6 @@ endmacro()
 
 pythonize_bool(LIBCXXABI_USE_LLVM_UNWINDER)
 
-if (LIBCXXABI_ENABLE_SHARED)
-  set(LIBCXXABI_TEST_DEPS cxxabi_shared)
-else()
-  set(LIBCXXABI_TEST_DEPS cxxabi_static)
-endif()
-
-list(APPEND LIBCXXABI_TEST_DEPS cxx)
-if (LIBCXXABI_USE_LLVM_UNWINDER AND TARGET unwind)
-  list(APPEND LIBCXXABI_TEST_DEPS unwind)
-endif()
-
 set(AUTO_GEN_COMMENT "## Autogenerated by libcxxabi configuration.\n# Do not edit!")
 set(SERIALIZED_LIT_PARAMS "# Lit parameters serialized here for llvm-lit to pick them up\n")
 
@@ -59,4 +48,4 @@ configure_lit_site_cfg(
 
 add_lit_testsuite(check-cxxabi "Running libcxxabi tests"
   ${CMAKE_CURRENT_BINARY_DIR}
-  DEPENDS ${LIBCXXABI_TEST_DEPS})
+  DEPENDS cxxabi-test-depends)
diff --git a/libunwind/test/CMakeLists.txt b/libunwind/test/CMakeLists.txt
index bd2e575f2a296a..19f055f6f93ffc 100644
--- a/libunwind/test/CMakeLists.txt
+++ b/libunwind/test/CMakeLists.txt
@@ -47,4 +47,4 @@ configure_lit_site_cfg(
 
 add_lit_testsuite(check-unwind "Running libunwind tests"
   ${CMAKE_CURRENT_BINARY_DIR}
-  DEPENDS unwind ${LIBUNWIND_TEST_DEPS})
+  DEPENDS unwind)

From 7832769d329ead264aff238c06dce086b3a74922 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Tue, 28 May 2024 19:46:23 -0600
Subject: [PATCH 054/230] Revert "[lld] Support thumb PLTs" (#93631)

Reverts llvm/llvm-project#86223

Windows pre-merge is broken.
---
 lld/ELF/Arch/ARM.cpp                 | 176 ++++++++-------------------
 lld/ELF/Config.h                     |   1 -
 lld/ELF/InputFiles.cpp               |  12 --
 lld/test/ELF/armv8-thumb-plt-reloc.s | 126 -------------------
 4 files changed, 53 insertions(+), 262 deletions(-)
 delete mode 100644 lld/test/ELF/armv8-thumb-plt-reloc.s

diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp
index 3e0efe540e1bf1..687f9499009d5e 100644
--- a/lld/ELF/Arch/ARM.cpp
+++ b/lld/ELF/Arch/ARM.cpp
@@ -231,71 +231,36 @@ static void writePltHeaderLong(uint8_t *buf) {
 // The default PLT header requires the .got.plt to be within 128 Mb of the
 // .plt in the positive direction.
 void ARM::writePltHeader(uint8_t *buf) const {
-  if (config->armThumbPLTs) {
-    // The instruction sequence for thumb:
-    //
-    // 0: b500          push    {lr}
-    // 2: f8df e008     ldr.w   lr, [pc, #0x8]          @ 0xe <func+0xe>
-    // 6: 44fe          add     lr, pc
-    // 8: f85e ff08     ldr     pc, [lr, #8]!
-    // e:               .word   .got.plt - .plt - 16
-    //
-    // At 0x8, we want to jump to .got.plt, the -16 accounts for 8 bytes from
-    // `pc` in the add instruction and 8 bytes for the `lr` adjustment.
-    //
-    uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 16;
-    assert(llvm::isUInt<32>(offset) && "This should always fit into a 32-bit offset");
-    write16(buf + 0, 0xb500);
-    // Split into two halves to support endianness correctly.
-    write16(buf + 2, 0xf8df);
-    write16(buf + 4, 0xe008);
-    write16(buf + 6, 0x44fe);
-    // Split into two halves to support endianness correctly.
-    write16(buf + 8, 0xf85e);
-    write16(buf + 10, 0xff08);
-    write32(buf + 12, offset);
-
-    memcpy(buf + 16, trapInstr.data(), 4);  // Pad to 32-byte boundary
-    memcpy(buf + 20, trapInstr.data(), 4);
-    memcpy(buf + 24, trapInstr.data(), 4);
-    memcpy(buf + 28, trapInstr.data(), 4);
-  } else {
-    // Use a similar sequence to that in writePlt(), the difference is the
-    // calling conventions mean we use lr instead of ip. The PLT entry is
-    // responsible for saving lr on the stack, the dynamic loader is responsible
-    // for reloading it.
-    const uint32_t pltData[] = {
-        0xe52de004, // L1: str lr, [sp,#-4]!
-        0xe28fe600, //     add lr, pc,  #0x0NN00000 &(.got.plt - L1 - 4)
-        0xe28eea00, //     add lr, lr,  #0x000NN000 &(.got.plt - L1 - 4)
-        0xe5bef000, //     ldr pc, [lr, #0x00000NNN] &(.got.plt -L1 - 4)
-    };
-
-    uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 4;
-    if (!llvm::isUInt<27>(offset)) {
-      // We cannot encode the Offset, use the long form.
-      writePltHeaderLong(buf);
-      return;
-    }
-    write32(buf + 0, pltData[0]);
-    write32(buf + 4, pltData[1] | ((offset >> 20) & 0xff));
-    write32(buf + 8, pltData[2] | ((offset >> 12) & 0xff));
-    write32(buf + 12, pltData[3] | (offset & 0xfff));
-    memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary
-    memcpy(buf + 20, trapInstr.data(), 4);
-    memcpy(buf + 24, trapInstr.data(), 4);
-    memcpy(buf + 28, trapInstr.data(), 4);
+  // Use a similar sequence to that in writePlt(), the difference is the calling
+  // conventions mean we use lr instead of ip. The PLT entry is responsible for
+  // saving lr on the stack, the dynamic loader is responsible for reloading
+  // it.
+  const uint32_t pltData[] = {
+      0xe52de004, // L1: str lr, [sp,#-4]!
+      0xe28fe600, //     add lr, pc,  #0x0NN00000 &(.got.plt - L1 - 4)
+      0xe28eea00, //     add lr, lr,  #0x000NN000 &(.got.plt - L1 - 4)
+      0xe5bef000, //     ldr pc, [lr, #0x00000NNN] &(.got.plt -L1 - 4)
+  };
+
+  uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 4;
+  if (!llvm::isUInt<27>(offset)) {
+    // We cannot encode the Offset, use the long form.
+    writePltHeaderLong(buf);
+    return;
   }
+  write32(buf + 0, pltData[0]);
+  write32(buf + 4, pltData[1] | ((offset >> 20) & 0xff));
+  write32(buf + 8, pltData[2] | ((offset >> 12) & 0xff));
+  write32(buf + 12, pltData[3] | (offset & 0xfff));
+  memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary
+  memcpy(buf + 20, trapInstr.data(), 4);
+  memcpy(buf + 24, trapInstr.data(), 4);
+  memcpy(buf + 28, trapInstr.data(), 4);
 }
 
 void ARM::addPltHeaderSymbols(InputSection &isec) const {
-  if (config->armThumbPLTs) {
-    addSyntheticLocal("$t", STT_NOTYPE, 0, 0, isec);
-    addSyntheticLocal("$d", STT_NOTYPE, 12, 0, isec);
-  } else {
-    addSyntheticLocal("$a", STT_NOTYPE, 0, 0, isec);
-    addSyntheticLocal("$d", STT_NOTYPE, 16, 0, isec);
-  }
+  addSyntheticLocal("$a", STT_NOTYPE, 0, 0, isec);
+  addSyntheticLocal("$d", STT_NOTYPE, 16, 0, isec);
 }
 
 // Long form PLT entries that do not have any restrictions on the displacement
@@ -314,65 +279,32 @@ static void writePltLong(uint8_t *buf, uint64_t gotPltEntryAddr,
 // .plt in the positive direction.
 void ARM::writePlt(uint8_t *buf, const Symbol &sym,
                    uint64_t pltEntryAddr) const {
+  // The PLT entry is similar to the example given in Appendix A of ELF for
+  // the Arm Architecture. Instead of using the Group Relocations to find the
+  // optimal rotation for the 8-bit immediate used in the add instructions we
+  // hard code the most compact rotations for simplicity. This saves a load
+  // instruction over the long plt sequences.
+  const uint32_t pltData[] = {
+      0xe28fc600, // L1: add ip, pc,  #0x0NN00000  Offset(&(.got.plt) - L1 - 8
+      0xe28cca00, //     add ip, ip,  #0x000NN000  Offset(&(.got.plt) - L1 - 8
+      0xe5bcf000, //     ldr pc, [ip, #0x00000NNN] Offset(&(.got.plt) - L1 - 8
+  };
 
-  if (!config->armThumbPLTs) {
-    uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 8;
-
-    // The PLT entry is similar to the example given in Appendix A of ELF for
-    // the Arm Architecture. Instead of using the Group Relocations to find the
-    // optimal rotation for the 8-bit immediate used in the add instructions we
-    // hard code the most compact rotations for simplicity. This saves a load
-    // instruction over the long plt sequences.
-    const uint32_t pltData[] = {
-        0xe28fc600, // L1: add ip, pc,  #0x0NN00000  Offset(&(.got.plt) - L1 - 8
-        0xe28cca00, //     add ip, ip,  #0x000NN000  Offset(&(.got.plt) - L1 - 8
-        0xe5bcf000, //     ldr pc, [ip, #0x00000NNN] Offset(&(.got.plt) - L1 - 8
-    };
-    if (!llvm::isUInt<27>(offset)) {
-      // We cannot encode the Offset, use the long form.
-      writePltLong(buf, sym.getGotPltVA(), pltEntryAddr);
-      return;
-    }
-    write32(buf + 0, pltData[0] | ((offset >> 20) & 0xff));
-    write32(buf + 4, pltData[1] | ((offset >> 12) & 0xff));
-    write32(buf + 8, pltData[2] | (offset & 0xfff));
-    memcpy(buf + 12, trapInstr.data(), 4); // Pad to 16-byte boundary
-  } else {
-    uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 12;
-    assert(llvm::isUInt<32>(offset) && "This should always fit into a 32-bit offset");
-
-    // A PLT entry will be:
-    //
-    //       movw ip, #<lower 16 bits>
-    //       movt ip, #<upper 16 bits>
-    //       add ip, pc
-    //   L1: ldr.w pc, [ip]
-    //       b L1
-    //
-    // where ip = r12 = 0xc
-
-    // movw ip, #<lower 16 bits>
-    write16(buf + 2, 0x0c00); // use `ip`
-    relocateNoSym(buf, R_ARM_THM_MOVW_ABS_NC, offset);
-
-    // movt ip, #<upper 16 bits>
-    write16(buf + 6, 0x0c00); // use `ip`
-    relocateNoSym(buf + 4, R_ARM_THM_MOVT_ABS, offset);
-
-    write16(buf + 8, 0x44fc);       // add ip, pc
-    write16(buf + 10, 0xf8dc);      // ldr.w   pc, [ip] (bottom half)
-    write16(buf + 12, 0xf000);      // ldr.w   pc, [ip] (upper half)
-    write16(buf + 14, 0xe7fc);      // Branch to previous instruction
+  uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 8;
+  if (!llvm::isUInt<27>(offset)) {
+    // We cannot encode the Offset, use the long form.
+    writePltLong(buf, sym.getGotPltVA(), pltEntryAddr);
+    return;
   }
+  write32(buf + 0, pltData[0] | ((offset >> 20) & 0xff));
+  write32(buf + 4, pltData[1] | ((offset >> 12) & 0xff));
+  write32(buf + 8, pltData[2] | (offset & 0xfff));
+  memcpy(buf + 12, trapInstr.data(), 4); // Pad to 16-byte boundary
 }
 
 void ARM::addPltSymbols(InputSection &isec, uint64_t off) const {
-  if (config->armThumbPLTs) {
-    addSyntheticLocal("$t", STT_NOTYPE, off, 0, isec);
-  } else {
-    addSyntheticLocal("$a", STT_NOTYPE, off, 0, isec);
-    addSyntheticLocal("$d", STT_NOTYPE, off + 12, 0, isec);
-  }
+  addSyntheticLocal("$a", STT_NOTYPE, off, 0, isec);
+  addSyntheticLocal("$d", STT_NOTYPE, off + 12, 0, isec);
 }
 
 bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file,
@@ -393,8 +325,6 @@ bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file,
   case R_ARM_JUMP24:
     // Source is ARM, all PLT entries are ARM so no interworking required.
     // Otherwise we need to interwork if STT_FUNC Symbol has bit 0 set (Thumb).
-    assert(!config->armThumbPLTs &&
-           "If the source is ARM, we should not need Thumb PLTs");
     if (s.isFunc() && expr == R_PC && (s.getVA() & 1))
       return true;
     [[fallthrough]];
@@ -405,9 +335,9 @@ bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file,
   }
   case R_ARM_THM_JUMP19:
   case R_ARM_THM_JUMP24:
-    // Source is Thumb, when all PLT entries are ARM interworking is required.
+    // Source is Thumb, all PLT entries are ARM so interworking is required.
     // Otherwise we need to interwork if STT_FUNC Symbol has bit 0 clear (ARM).
-    if ((expr == R_PLT_PC && !config->armThumbPLTs) || (s.isFunc() && (s.getVA() & 1) == 0))
+    if (expr == R_PLT_PC || (s.isFunc() && (s.getVA() & 1) == 0))
       return true;
     [[fallthrough]];
   case R_ARM_THM_CALL: {
@@ -617,6 +547,7 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const {
     // STT_FUNC we choose whether to write a BL or BLX depending on the
     // value of bit 0 of Val. With bit 0 == 1 denoting Thumb. If the symbol is
     // not of type STT_FUNC then we must preserve the original instruction.
+    // PLT entries are always ARM state so we know we don't need to interwork.
     assert(rel.sym); // R_ARM_CALL is always reached via relocate().
     bool bit0Thumb = val & 1;
     bool isBlx = (read32(loc) & 0xfe000000) == 0xfa000000;
@@ -675,13 +606,12 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const {
     // PLT entries are always ARM state so we know we need to interwork.
     assert(rel.sym); // R_ARM_THM_CALL is always reached via relocate().
     bool bit0Thumb = val & 1;
-    bool useThumb = bit0Thumb || config->armThumbPLTs;
     bool isBlx = (read16(loc + 2) & 0x1000) == 0;
     // lld 10.0 and before always used bit0Thumb when deciding to write a BLX
-    // even when type not STT_FUNC.
-    if (!rel.sym->isFunc() && !rel.sym->isInPlt() && isBlx == useThumb)
+    // even when type not STT_FUNC. PLT entries generated by LLD are always ARM.
+    if (!rel.sym->isFunc() && !rel.sym->isInPlt() && isBlx == bit0Thumb)
       stateChangeWarning(loc, rel.type, *rel.sym);
-    if ((rel.sym->isFunc() || rel.sym->isInPlt()) ? !useThumb : isBlx) {
+    if (rel.sym->isFunc() || rel.sym->isInPlt() ? !bit0Thumb : isBlx) {
       // We are writing a BLX. Ensure BLX destination is 4-byte aligned. As
       // the BLX instruction may only be two byte aligned. This must be done
       // before overflow check.
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index 883c4a2f84294c..f0dfe7f377de0e 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -217,7 +217,6 @@ struct Config {
   bool allowMultipleDefinition;
   bool fatLTOObjects;
   bool androidPackDynRelocs = false;
-  bool armThumbPLTs = false;
   bool armHasBlx = false;
   bool armHasMovtMovw = false;
   bool armJ1J2BranchEncoding = false;
diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp
index d760dddcf5ec5c..1f496026d3ae20 100644
--- a/lld/ELF/InputFiles.cpp
+++ b/lld/ELF/InputFiles.cpp
@@ -194,18 +194,6 @@ static void updateSupportedARMFeatures(const ARMAttributeParser &attributes) {
   if (arch >= ARMBuildAttrs::CPUArch::v8_M_Base &&
       profile == ARMBuildAttrs::MicroControllerProfile)
     config->armCMSESupport = true;
-
-  // The thumb PLT entries require Thumb2 which can be used on multiple archs.
-  // For now, let's limit it to ones where ARM isn't available and we know have
-  // Thumb2.
-  std::optional<unsigned> armISA =
-      attributes.getAttributeValue(ARMBuildAttrs::ARM_ISA_use);
-  std::optional<unsigned> thumb =
-      attributes.getAttributeValue(ARMBuildAttrs::THUMB_ISA_use);
-  bool noArmISA = !armISA || *armISA == ARMBuildAttrs::Not_Allowed;
-  bool hasThumb2 = thumb && *thumb >= ARMBuildAttrs::AllowThumb32;
-  if (noArmISA && hasThumb2)
-    config->armThumbPLTs = true;
 }
 
 InputFile::InputFile(Kind k, MemoryBufferRef m)
diff --git a/lld/test/ELF/armv8-thumb-plt-reloc.s b/lld/test/ELF/armv8-thumb-plt-reloc.s
deleted file mode 100644
index 47cd5c1b741ee0..00000000000000
--- a/lld/test/ELF/armv8-thumb-plt-reloc.s
+++ /dev/null
@@ -1,126 +0,0 @@
-// REQUIRES: arm
-// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumb --mcpu=cortex-m33 %p/Inputs/arm-plt-reloc.s -o %t1
-// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumb --mcpu=cortex-m33 %s -o %t2
-// RUN: ld.lld %t1 %t2 -o %t
-// RUN: llvm-objdump --no-print-imm-hex -d %t | FileCheck %s
-// RUN: ld.lld -shared %t1 %t2 -o %t.so
-// RUN: llvm-objdump --no-print-imm-hex -d %t.so | FileCheck --check-prefix=DSO %s
-// RUN: llvm-readelf -S -r %t.so | FileCheck -check-prefix=DSOREL %s
-
-// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumbeb --mcpu=cortex-m33 %p/Inputs/arm-plt-reloc.s -o %t1.be
-// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumbeb --mcpu=cortex-m33 %s -o %t2.be
-// RUN: ld.lld %t1.be %t2.be -o %t.be
-// RUN: llvm-objdump --no-print-imm-hex -d %t.be | FileCheck %s
-// RUN: ld.lld -shared %t1.be %t2.be -o %t.so.be
-// RUN: llvm-objdump --no-print-imm-hex -d %t.so.be | FileCheck --check-prefix=DSO %s
-// RUN: llvm-readelf -S -r %t.so.be | FileCheck -check-prefix=DSOREL %s
-
-// RUN: ld.lld --be8 %t1.be %t2.be -o %t.be
-// RUN: llvm-objdump --no-print-imm-hex -d %t.be | FileCheck %s
-// RUN: ld.lld --be8 -shared %t1.be %t2.be -o %t.so.be
-// RUN: llvm-objdump --no-print-imm-hex -d %t.so.be | FileCheck --check-prefix=DSO %s
-// RUN: llvm-readelf -S -r %t.so.be | FileCheck -check-prefix=DSOREL %s
-
-/// Test PLT entry generation
- .text
- .align 2
- .globl _start
- .type  _start,%function
-_start:
- bl func1
- bl func2
- bl func3
- b.w func1
- b.w func2
- b.w func3
- beq.w func1
- beq.w func2
- beq.w func3
-
-/// Executable, expect no PLT
-// CHECK: Disassembly of section .text:
-// CHECK-EMPTY:
-// CHECK-NEXT: <func1>:
-// CHECK-NEXT:   bx      lr
-// CHECK: <func2>:
-// CHECK-NEXT:   bx      lr
-// CHECK: <func3>:
-// CHECK-NEXT:   bx      lr
-// CHECK-NEXT:   d4d4 
-// CHECK: <_start>:
-// CHECK-NEXT:   bl      {{.*}} <func1>
-// CHECK-NEXT:   bl      {{.*}} <func2>
-// CHECK-NEXT:   bl      {{.*}} <func3>
-// CHECK-NEXT:   b.w     {{.*}} <func1>
-// CHECK-NEXT:   b.w     {{.*}} <func2>
-// CHECK-NEXT:   b.w     {{.*}} <func3>
-// CHECK-NEXT:   beq.w	 {{.*}} <func1>
-// CHECK-NEXT:   beq.w	 {{.*}} <func2>
-// CHECK-NEXT:   beq.w	 {{.*}} <func3>
-
-// DSO: Disassembly of section .text:
-// DSO-EMPTY:
-// DSO-NEXT: <func1>:
-// DSO-NEXT:     bx      lr
-// DSO: <func2>:
-// DSO-NEXT:     bx      lr
-// DSO: <func3>:
-// DSO-NEXT:     bx      lr
-// DSO-NEXT:     d4d4 
-// DSO: <_start>:
-/// 0x10260 = PLT func1
-// DSO-NEXT:     bl     0x10260
-/// 0x10270 = PLT func2
-// DSO-NEXT:     bl     0x10270
-/// 0x10280 = PLT func3
-// DSO-NEXT:     bl     0x10280
-/// 0x10260 = PLT func1
-// DSO-NEXT:     b.w    0x10260
-/// 0x10270 = PLT func2
-// DSO-NEXT:     b.w    0x10270
-/// 0x10280 = PLT func3
-// DSO-NEXT:     b.w    0x10280
-/// 0x10260 = PLT func1
-// DSO-NEXT:     beq.w	 0x10260
-/// 0x10270 = PLT func2
-// DSO-NEXT:     beq.w	 0x10270
-/// 0x10280 = PLT func3
-// DSO-NEXT:     beq.w	 0x10280
-// DSO: Disassembly of section .plt:
-// DSO-EMPTY:
-// DSO-NEXT: 10240 <.plt>:
-// DSO-NEXT:     push    {lr}
-// DSO-NEXT:     ldr.w   lr, [pc, #8]
-// DSO-NEXT:     add     lr, pc
-// DSO-NEXT:     ldr     pc, [lr, #8]!
-/// 0x20098 = .got.plt (0x302D8) - pc (0x10238 = .plt + 8) - 8
-// DSO-NEXT:     .word   0x00020098
-// DSO-NEXT:     .word   0xd4d4d4d4
-// DSO-NEXT:     .word   0xd4d4d4d4
-// DSO-NEXT:     .word   0xd4d4d4d4
-// DSO-NEXT:     .word   0xd4d4d4d4
-
-/// 136 + 2 << 16 + 0x1026c = 0x302f4 = got entry 1
-// DSO-NEXT:     10260:       f240 0c88     movw    r12, #136
-// DSO-NEXT:                  f2c0 0c02     movt    r12, #2
-// DSO-NEXT:                  44fc          add     r12, pc
-// DSO-NEXT:                  f8dc f000     ldr.w   pc, [r12]
-// DSO-NEXT:                  e7fc          b       0x1026a
-/// 124 + 2 << 16 + 0x1027c = 0x302f8 = got entry 2
-// DSO-NEXT:     10270:       f240 0c7c     movw    r12, #124
-// DSO-NEXT:                  f2c0 0c02     movt    r12, #2
-// DSO-NEXT:                  44fc          add     r12, pc
-// DSO-NEXT:                  f8dc f000     ldr.w   pc, [r12]
-// DSO-NEXT:                  e7fc          b       0x1027a
-/// 112 + 2 << 16 + 0x1028c = 0x302fc = got entry 3
-// DSO-NEXT:     10280:       f240 0c70     movw    r12, #112
-// DSO-NEXT:                  f2c0 0c02     movt    r12, #2
-// DSO-NEXT:                  44fc          add     r12, pc
-// DSO-NEXT:                  f8dc f000     ldr.w   pc, [r12]
-// DSO-NEXT:                  e7fc          b       0x1028a
-
-// DSOREL: .got.plt PROGBITS 000302e8 {{.*}} 000018 00  WA  0   0  4
-// DSOREL: Relocation section '.rel.plt'
-// DSOREL: 000302f4 {{.*}} R_ARM_JUMP_SLOT {{.*}} func1
-// DSOREL: 000302f8 {{.*}} R_ARM_JUMP_SLOT {{.*}} func2
-// DSOREL: 000302fc {{.*}} R_ARM_JUMP_SLOT {{.*}} func3

From c250aeb9d6c590d9fdbebd84fc259c4e536dace9 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson@amd.com>
Date: Mon, 27 May 2024 19:33:53 +0900
Subject: [PATCH 055/230] [AMDGPU] Fix typo in VIMAGE no sampler opcode usage
 (NFCI)

Opcodes are the same for these instructions in GFX11 and GFX12,
so this typo has no functional impact.
---
 llvm/lib/Target/AMDGPU/MIMGInstructions.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 351263d079768b..24f9a6e375baaf 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -494,7 +494,7 @@ class MIMG_NoSampler_nsa_gfx11<mimgopc op, string opcode,
 class VIMAGE_NoSampler_gfx12<mimgopc op, string opcode,
                              RegisterClass DataRC, int num_addrs,
                              string dns="">
-  : VIMAGE_gfx12<op.GFX11, (outs DataRC:$vdata), num_addrs, dns> {
+  : VIMAGE_gfx12<op.GFX12, (outs DataRC:$vdata), num_addrs, dns> {
   let InOperandList = !con(AddrIns,
                            (ins SReg_256:$rsrc, DMask:$dmask, Dim:$dim,
                                 CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe),

From cbf6e93ceee7b9de2b7c3e7e8cea3a972eda0e75 Mon Sep 17 00:00:00 2001
From: Eli Friedman <efriedma@quicinc.com>
Date: Tue, 28 May 2024 20:47:49 -0700
Subject: [PATCH 056/230] [clang codegen] Delete unnecessary GEP cleanup code.
 (#90303)

There's some code in AggExprEmitter::VisitCXXParenListOrInitListExpr to
try to do early cleanup for GEPs for fields that aren't accessed. But
it's unlikely to actually save significant compile time, and it's subtly
wrong in cases where EmitLValueForFieldInitialization() doesn't create a
GEP. So just delete the code.

Fixes #88077. Fixes #89547.
---
 clang/lib/CodeGen/CGExprAgg.cpp             | 10 ---------
 clang/test/CodeGenCXX/no-unique-address.cpp | 25 +++++++++++++++++++++
 2 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index bba00257fd4f0a..7a92fc3dfb4a43 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -1789,7 +1789,6 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr(
     // Push a destructor if necessary.
     // FIXME: if we have an array of structures, all explicitly
     // initialized, we can end up pushing a linear number of cleanups.
-    bool pushedCleanup = false;
     if (QualType::DestructionKind dtorKind
           = field->getType().isDestructedType()) {
       assert(LV.isSimple());
@@ -1797,17 +1796,8 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr(
         CGF.pushDestroyAndDeferDeactivation(NormalAndEHCleanup, LV.getAddress(),
                                             field->getType(),
                                             CGF.getDestroyer(dtorKind), false);
-        pushedCleanup = true;
       }
     }
-
-    // If the GEP didn't get used because of a dead zero init or something
-    // else, clean it up for -O0 builds and general tidiness.
-    if (!pushedCleanup && LV.isSimple())
-      if (llvm::GetElementPtrInst *GEP =
-              dyn_cast<llvm::GetElementPtrInst>(LV.emitRawPointer(CGF)))
-        if (GEP->use_empty())
-          GEP->eraseFromParent();
   }
 }
 
diff --git a/clang/test/CodeGenCXX/no-unique-address.cpp b/clang/test/CodeGenCXX/no-unique-address.cpp
index 7b4bbbf2a05d51..82532c5e1be82a 100644
--- a/clang/test/CodeGenCXX/no-unique-address.cpp
+++ b/clang/test/CodeGenCXX/no-unique-address.cpp
@@ -101,3 +101,28 @@ struct HasZeroSizedFieldWithNonTrivialInit {
 HasZeroSizedFieldWithNonTrivialInit testHasZeroSizedFieldWithNonTrivialInit = {.a = 1};
 // CHECK-LABEL: define {{.*}}cxx_global_var_init
 // CHECK: call {{.*}}@_ZN14NonTrivialInitC1Ev({{.*}}@testHasZeroSizedFieldWithNonTrivialInit
+
+void *operator new(unsigned long, void *);
+template <class Ty>
+struct _box {
+  [[no_unique_address]] Ty _value;
+};
+// Make sure this doesn't crash.
+// CHECK-LABEL: define {{.*}}placement_new_struct
+void placement_new_struct() {
+  struct set_value_t {};
+
+  // GH88077
+  struct _tuple : _box<set_value_t>, _box<int> {};
+
+  int _storage[1];
+  new (_storage) _tuple{};
+
+  // GH89547
+  struct _tuple2 {
+    _box<set_value_t> a;
+  };
+
+  int _storage2[1];
+  new (_storage2) _tuple2{};
+}

From bb42511f64fd44f2ff1beb0dd38a653a8f2c20df Mon Sep 17 00:00:00 2001
From: Younan Zhang <zyn7109@gmail.com>
Date: Wed, 29 May 2024 12:58:44 +0800
Subject: [PATCH 057/230] [Clang][Sema] Use StructuralValues to model dependent
 NTTP arguments (#93556)

This patch takes Richard's approach of no longer modeling dependent NTTP
arguments with TemplateParamObjectDecls. Clang used to do so, which left
a problem: we might confuse dependent and non-dependent arguments that
boil down to the same canonical type because there's a default argument
on the NTTP.

The "canonical expression" problem is still present because this patch
doesn't touch the profiling part. Note that #92292 seems to be a
different issue.

Fixes https://github.com/llvm/llvm-project/issues/84052
---
 clang/docs/ReleaseNotes.rst                   |  1 +
 clang/lib/AST/TemplateBase.cpp                |  7 +++++-
 .../SemaTemplate/temp_arg_nontype_cxx2c.cpp   | 23 ++++++++++++++++++-
 3 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 9091f6341bd9b8..bd92818f0c09d0 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -810,6 +810,7 @@ Bug Fixes to C++ Support
 - Clang now diagnoses unexpanded parameter packs in attributes. (Fixes #GH93269).
 - Clang now allows ``@$``` in raw string literals. Fixes (#GH93130).
 - Fix an assertion failure when checking invalid ``this`` usage in the wrong context. (Fixes #GH91536).
+- Clang no longer models dependent NTTP arguments as ``TemplateParamObjectDecl`` s. Fixes (#GH84052).
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/AST/TemplateBase.cpp b/clang/lib/AST/TemplateBase.cpp
index a7ee973b7f7d06..b50daf5fbed6a7 100644
--- a/clang/lib/AST/TemplateBase.cpp
+++ b/clang/lib/AST/TemplateBase.cpp
@@ -221,8 +221,13 @@ static const ValueDecl *getAsSimpleValueDeclRef(const ASTContext &Ctx,
 
   // We model class non-type template parameters as their template parameter
   // object declaration.
-  if (V.isStruct() || V.isUnion())
+  if (V.isStruct() || V.isUnion()) {
+    // Dependent types are not supposed to be described as
+    // TemplateParamObjectDecls.
+    if (T->isDependentType() || T->isInstantiationDependentType())
+      return nullptr;
     return Ctx.getTemplateParamObjectDecl(T, V);
+  }
 
   // Pointers and references with an empty path use the special 'Declaration'
   // representation.
diff --git a/clang/test/SemaTemplate/temp_arg_nontype_cxx2c.cpp b/clang/test/SemaTemplate/temp_arg_nontype_cxx2c.cpp
index 9fb6b440b6b2af..e74c031eba4c1c 100644
--- a/clang/test/SemaTemplate/temp_arg_nontype_cxx2c.cpp
+++ b/clang/test/SemaTemplate/temp_arg_nontype_cxx2c.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -std=c++20 -Wconversion -verify %s
+// RUN: %clang_cc1 -fsyntax-only -std=c++2c -Wconversion -verify %s
 
 struct Test {
     int a = 0;
@@ -102,3 +102,24 @@ void bar() {
 }
 
 }
+
+namespace GH84052 {
+
+template <class... T>
+concept C = sizeof(T...[1]) == 1; // #C
+
+struct A {};
+
+template <class T, C<T> auto = A{}> struct Set {}; // #Set
+
+template <class T> void foo() {
+  Set<T> unrelated;
+}
+
+Set<bool> sb;
+Set<float> sf;
+// expected-error@-1 {{constraints not satisfied for class template 'Set'}}
+// expected-note@#Set {{because 'C<decltype(GH84052::A{}), float>' evaluated to false}}
+// expected-note@#C {{evaluated to false}}
+
+} // namespace GH84052

From 465bc5e729fd755880b9a288de42a37ad1206301 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 29 May 2024 07:05:55 +0200
Subject: [PATCH 058/230] AArch64/ARM/PPC/X86: Add some atomic tests (#92933)

FP typed atomic load/store coverage was mostly missing, especially for
half and bfloat.
---
 .../Atomics/aarch64-atomic-load-lse2.ll       | 113 ++++
 .../CodeGen/AArch64/relaxed-fp-atomics.ll     |  90 +++
 llvm/test/CodeGen/ARM/atomic-load-store.ll    | 536 ++++++++++++++++++
 llvm/test/CodeGen/PowerPC/atomics.ll          | 209 +++++++
 llvm/test/CodeGen/X86/atomic-non-integer.ll   |  97 ++++
 5 files changed, 1045 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-lse2.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-lse2.ll
index e7e231bc344d92..3732d4feb0c67b 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-lse2.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-lse2.ll
@@ -566,6 +566,119 @@ define dso_local i128 @load_atomic_i128_unaligned_seq_cst_const(ptr readonly %pt
     %r = load atomic i128, ptr %ptr seq_cst, align 1
     ret i128 %r
 }
+
+define dso_local half @load_atomic_f16_aligned_unordered(ptr %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_unordered:
+; CHECK:    ldrh w8, [x0]
+    %r = load atomic half, ptr %ptr unordered, align 2
+    ret half %r
+}
+
+define dso_local half @load_atomic_f16_aligned_unordered_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_unordered_const:
+; CHECK:    ldrh w8, [x0]
+    %r = load atomic half, ptr %ptr unordered, align 2
+    ret half %r
+}
+
+define dso_local half @load_atomic_f16_aligned_monotonic(ptr %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_monotonic:
+; CHECK:    ldrh w8, [x0]
+    %r = load atomic half, ptr %ptr monotonic, align 2
+    ret half %r
+}
+
+define dso_local half @load_atomic_f16_aligned_monotonic_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_monotonic_const:
+; CHECK:    ldrh w8, [x0]
+    %r = load atomic half, ptr %ptr monotonic, align 2
+    ret half %r
+}
+
+define dso_local half @load_atomic_f16_aligned_acquire(ptr %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_acquire:
+; CHECK:    ldarh w8, [x0]
+    %r = load atomic half, ptr %ptr acquire, align 2
+    ret half %r
+}
+
+define dso_local half @load_atomic_f16_aligned_acquire_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_acquire_const:
+; CHECK:    ldarh w8, [x0]
+    %r = load atomic half, ptr %ptr acquire, align 2
+    ret half %r
+}
+
+define dso_local half @load_atomic_f16_aligned_seq_cst(ptr %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_seq_cst:
+; CHECK:    ldarh w8, [x0]
+    %r = load atomic half, ptr %ptr seq_cst, align 2
+    ret half %r
+}
+
+define dso_local half @load_atomic_f16_aligned_seq_cst_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_seq_cst_const:
+; CHECK:    ldarh w8, [x0]
+    %r = load atomic half, ptr %ptr seq_cst, align 2
+    ret half %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_unordered(ptr %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_unordered:
+; CHECK:    ldrh w8, [x0]
+    %r = load atomic bfloat, ptr %ptr unordered, align 2
+    ret bfloat %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_unordered_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_unordered_const:
+; CHECK:    ldrh w8, [x0]
+    %r = load atomic bfloat, ptr %ptr unordered, align 2
+    ret bfloat %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_monotonic(ptr %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_monotonic:
+; CHECK:    ldrh w8, [x0]
+    %r = load atomic bfloat, ptr %ptr monotonic, align 2
+    ret bfloat %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_monotonic_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_monotonic_const:
+; CHECK:    ldrh w8, [x0]
+    %r = load atomic bfloat, ptr %ptr monotonic, align 2
+    ret bfloat %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_acquire(ptr %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_acquire:
+; CHECK:    ldarh w8, [x0]
+    %r = load atomic bfloat, ptr %ptr acquire, align 2
+    ret bfloat %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_acquire_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_acquire_const:
+; CHECK:    ldarh w8, [x0]
+    %r = load atomic bfloat, ptr %ptr acquire, align 2
+    ret bfloat %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_seq_cst(ptr %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_seq_cst:
+; CHECK:    ldarh w8, [x0]
+    %r = load atomic bfloat, ptr %ptr seq_cst, align 2
+    ret bfloat %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_seq_cst_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_seq_cst_const:
+; CHECK:    ldarh w8, [x0]
+    %r = load atomic bfloat, ptr %ptr seq_cst, align 2
+    ret bfloat %r
+}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; -O0: {{.*}}
 ; -O1: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/relaxed-fp-atomics.ll b/llvm/test/CodeGen/AArch64/relaxed-fp-atomics.ll
index 95abbb6979be89..af664549a472a9 100644
--- a/llvm/test/CodeGen/AArch64/relaxed-fp-atomics.ll
+++ b/llvm/test/CodeGen/AArch64/relaxed-fp-atomics.ll
@@ -91,4 +91,94 @@ define void @atomic_store_relaxed_f64(ptr %p, i32 %off32, i64 %off64, double %va
   ret void
 }
 
+define half @atomic_load_relaxed_f16(ptr %p, i32 %off32, i64 %off64) #0 { ; monotonic/unordered half loads through four address forms
+; CHECK-LABEL: atomic_load_relaxed_f16:
+  %ptr_unsigned = getelementptr half, ptr %p, i32 4095 ; element 4095 of a 2-byte type is byte offset 8190
+  %val_unsigned = load atomic half, ptr %ptr_unsigned monotonic, align 4 ; NOTE(review): align 4 is larger than the 2-byte access size - confirm it is intended
+; CHECK: ldrh {{w[0-9]+}}, [x0, #8190]
+
+  %ptr_regoff = getelementptr half, ptr %p, i32 %off32 ; 32-bit index; expected below as a sign-extended register offset scaled by 2
+  %val_regoff = load atomic half, ptr %ptr_regoff unordered, align 4
+  %tot1 = fadd half %val_unsigned, %val_regoff
+; CHECK: ldrh {{w[0-9]+}}, [x0, w1, sxtw #1]
+
+  %ptr_regoff64 = getelementptr half, ptr %p, i64 %off64 ; 64-bit index; expected below as a register offset shifted left by 1
+  %val_regoff64 = load atomic half, ptr %ptr_regoff64 monotonic, align 4
+  %tot2 = fadd half %tot1, %val_regoff64
+; CHECK: ldrh {{w[0-9]+}}, [x0, x2, lsl #1]
+
+  %ptr_unscaled = getelementptr half, ptr %p, i32 -64 ; element -64 is byte offset -128; expected below as the ldurh form
+  %val_unscaled = load atomic half, ptr %ptr_unscaled unordered, align 4
+  %tot3 = fadd half %tot2, %val_unscaled
+; CHECK: ldurh {{w[0-9]+}}, [x0, #-128]
+
+  ret half %tot3 ; the running fadd total keeps all four loaded values in use
+}
+
+define bfloat @atomic_load_relaxed_bf16(ptr %p, i32 %off32, i64 %off64) #0 { ; monotonic/unordered bfloat loads through four address forms
+; CHECK-LABEL: atomic_load_relaxed_bf16:
+  %ptr_unsigned = getelementptr bfloat, ptr %p, i32 4095 ; element 4095 of a 2-byte type is byte offset 8190
+  %val_unsigned = load atomic bfloat, ptr %ptr_unsigned monotonic, align 4 ; NOTE(review): align 4 is larger than the 2-byte access size - confirm it is intended
+; CHECK: ldrh {{w[0-9]+}}, [x0, #8190]
+
+  %ptr_regoff = getelementptr bfloat, ptr %p, i32 %off32 ; 32-bit index; expected below as a sign-extended register offset scaled by 2
+  %val_regoff = load atomic bfloat, ptr %ptr_regoff unordered, align 4
+  %tot1 = fadd bfloat %val_unsigned, %val_regoff
+; CHECK: ldrh {{w[0-9]+}}, [x0, w1, sxtw #1]
+
+  %ptr_regoff64 = getelementptr bfloat, ptr %p, i64 %off64 ; 64-bit index; expected below as a register offset shifted left by 1
+  %val_regoff64 = load atomic bfloat, ptr %ptr_regoff64 monotonic, align 4
+  %tot2 = fadd bfloat %tot1, %val_regoff64
+; CHECK: ldrh {{w[0-9]+}}, [x0, x2, lsl #1]
+
+  %ptr_unscaled = getelementptr bfloat, ptr %p, i32 -64 ; element -64 is byte offset -128; expected below as the ldurh form
+  %val_unscaled = load atomic bfloat, ptr %ptr_unscaled unordered, align 4
+  %tot3 = fadd bfloat %tot2, %val_unscaled
+; CHECK: ldurh {{w[0-9]+}}, [x0, #-128]
+
+  ret bfloat %tot3 ; the running fadd total keeps all four loaded values in use
+}
+
+define void @atomic_store_relaxed_f16(ptr %p, i32 %off32, i64 %off64, half %val) #0 { ; monotonic/unordered half stores through four address forms
+; CHECK-LABEL: atomic_store_relaxed_f16:
+  %ptr_unsigned = getelementptr half, ptr %p, i32 4095 ; element 4095 of a 2-byte type is byte offset 8190
+  store atomic half %val, ptr %ptr_unsigned monotonic, align 4 ; NOTE(review): align 4 is larger than the 2-byte access size - confirm it is intended
+; CHECK: strh {{w[0-9]+}}, [x0, #8190]
+
+  %ptr_regoff = getelementptr half, ptr %p, i32 %off32 ; 32-bit index; expected below as a sign-extended register offset scaled by 2
+  store atomic half %val, ptr %ptr_regoff unordered, align 4
+; CHECK: strh {{w[0-9]+}}, [x0, w1, sxtw #1]
+
+  %ptr_regoff64 = getelementptr half, ptr %p, i64 %off64 ; 64-bit index; expected below as a register offset shifted left by 1
+  store atomic half %val, ptr %ptr_regoff64 monotonic, align 4
+; CHECK: strh {{w[0-9]+}}, [x0, x2, lsl #1]
+
+  %ptr_unscaled = getelementptr half, ptr %p, i32 -64 ; element -64 is byte offset -128; expected below as the sturh form
+  store atomic half %val, ptr %ptr_unscaled unordered, align 4
+; CHECK: sturh {{w[0-9]+}}, [x0, #-128]
+
+  ret void
+}
+
+define void @atomic_store_relaxed_bf16(ptr %p, i32 %off32, i64 %off64, bfloat %val) #0 { ; monotonic/unordered bfloat stores through four address forms
+; CHECK-LABEL: atomic_store_relaxed_bf16:
+  %ptr_unsigned = getelementptr bfloat, ptr %p, i32 4095 ; element 4095 of a 2-byte type is byte offset 8190
+  store atomic bfloat %val, ptr %ptr_unsigned monotonic, align 4 ; NOTE(review): align 4 is larger than the 2-byte access size - confirm it is intended
+; CHECK: strh {{w[0-9]+}}, [x0, #8190]
+
+  %ptr_regoff = getelementptr bfloat, ptr %p, i32 %off32 ; 32-bit index; expected below as a sign-extended register offset scaled by 2
+  store atomic bfloat %val, ptr %ptr_regoff unordered, align 4
+; CHECK: strh {{w[0-9]+}}, [x0, w1, sxtw #1]
+
+  %ptr_regoff64 = getelementptr bfloat, ptr %p, i64 %off64 ; 64-bit index; expected below as a register offset shifted left by 1
+  store atomic bfloat %val, ptr %ptr_regoff64 monotonic, align 4
+; CHECK: strh {{w[0-9]+}}, [x0, x2, lsl #1]
+
+  %ptr_unscaled = getelementptr bfloat, ptr %p, i32 -64 ; element -64 is byte offset -128; expected below as the sturh form
+  store atomic bfloat %val, ptr %ptr_unscaled unordered, align 4
+; CHECK: sturh {{w[0-9]+}}, [x0, #-128]
+
+  ret void
+}
+
 attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/ARM/atomic-load-store.ll b/llvm/test/CodeGen/ARM/atomic-load-store.ll
index 4f2e63b5f24676..c53fb2f330a792 100644
--- a/llvm/test/CodeGen/ARM/atomic-load-store.ll
+++ b/llvm/test/CodeGen/ARM/atomic-load-store.ll
@@ -439,3 +439,539 @@ define void @test_old_store_64bit(ptr %p, i64 %v) {
   store atomic i64 %v, ptr %p seq_cst, align 8
   ret void
 }
+
+define half @load_atomic_f16__seq_cst(ptr %ptr) { ; seq_cst atomic load of a half; one expectation group per configuration follows
+; ARM-LABEL: load_atomic_f16__seq_cst:
+; ARM:       @ %bb.0:
+; ARM-NEXT:    ldrh r0, [r0]
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    bx lr
+;
+; ARMOPTNONE-LABEL: load_atomic_f16__seq_cst:
+; ARMOPTNONE:       @ %bb.0:
+; ARMOPTNONE-NEXT:    ldrh r0, [r0]
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    bx lr
+;
+; THUMBTWO-LABEL: load_atomic_f16__seq_cst:
+; THUMBTWO:       @ %bb.0:
+; THUMBTWO-NEXT:    ldrh r0, [r0]
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:    bx lr
+;
+; THUMBONE-LABEL: load_atomic_f16__seq_cst:
+; THUMBONE:       @ %bb.0:
+; THUMBONE-NEXT:    push {r7, lr}
+; THUMBONE-NEXT:    movs r1, #0
+; THUMBONE-NEXT:    mov r2, r1
+; THUMBONE-NEXT:    bl __sync_val_compare_and_swap_2
+; THUMBONE-NEXT:    pop {r7, pc}
+;
+; ARMV4-LABEL: load_atomic_f16__seq_cst:
+; ARMV4:       @ %bb.0:
+; ARMV4-NEXT:    push {r11, lr}
+; ARMV4-NEXT:    mov r1, #5
+; ARMV4-NEXT:    bl __atomic_load_2
+; ARMV4-NEXT:    pop {r11, lr}
+; ARMV4-NEXT:    mov pc, lr
+;
+; ARMV6-LABEL: load_atomic_f16__seq_cst:
+; ARMV6:       @ %bb.0:
+; ARMV6-NEXT:    ldrh r0, [r0]
+; ARMV6-NEXT:    mov r1, #0
+; ARMV6-NEXT:    mcr p15, #0, r1, c7, c10, #5
+; ARMV6-NEXT:    bx lr
+;
+; THUMBM-LABEL: load_atomic_f16__seq_cst:
+; THUMBM:       @ %bb.0:
+; THUMBM-NEXT:    ldrh r0, [r0]
+; THUMBM-NEXT:    dmb sy
+; THUMBM-NEXT:    bx lr
+  %val = load atomic half, ptr %ptr seq_cst, align 2 ; expected as an inline ldrh followed by dmb or mcr, or as a __sync_val_compare_and_swap_2 / __atomic_load_2 call
+  ret half %val
+}
+
+define bfloat @load_atomic_bf16__seq_cst(ptr %ptr) { ; seq_cst atomic load of a bfloat; one expectation group per configuration follows
+; ARM-LABEL: load_atomic_bf16__seq_cst:
+; ARM:       @ %bb.0:
+; ARM-NEXT:    ldrh r0, [r0]
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    bx lr
+;
+; ARMOPTNONE-LABEL: load_atomic_bf16__seq_cst:
+; ARMOPTNONE:       @ %bb.0:
+; ARMOPTNONE-NEXT:    ldrh r0, [r0]
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    bx lr
+;
+; THUMBTWO-LABEL: load_atomic_bf16__seq_cst:
+; THUMBTWO:       @ %bb.0:
+; THUMBTWO-NEXT:    ldrh r0, [r0]
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:    bx lr
+;
+; THUMBONE-LABEL: load_atomic_bf16__seq_cst:
+; THUMBONE:       @ %bb.0:
+; THUMBONE-NEXT:    push {r7, lr}
+; THUMBONE-NEXT:    movs r1, #0
+; THUMBONE-NEXT:    mov r2, r1
+; THUMBONE-NEXT:    bl __sync_val_compare_and_swap_2
+; THUMBONE-NEXT:    pop {r7, pc}
+;
+; ARMV4-LABEL: load_atomic_bf16__seq_cst:
+; ARMV4:       @ %bb.0:
+; ARMV4-NEXT:    push {r11, lr}
+; ARMV4-NEXT:    mov r1, #5
+; ARMV4-NEXT:    bl __atomic_load_2
+; ARMV4-NEXT:    pop {r11, lr}
+; ARMV4-NEXT:    mov pc, lr
+;
+; ARMV6-LABEL: load_atomic_bf16__seq_cst:
+; ARMV6:       @ %bb.0:
+; ARMV6-NEXT:    ldrh r0, [r0]
+; ARMV6-NEXT:    mov r1, #0
+; ARMV6-NEXT:    mcr p15, #0, r1, c7, c10, #5
+; ARMV6-NEXT:    bx lr
+;
+; THUMBM-LABEL: load_atomic_bf16__seq_cst:
+; THUMBM:       @ %bb.0:
+; THUMBM-NEXT:    ldrh r0, [r0]
+; THUMBM-NEXT:    dmb sy
+; THUMBM-NEXT:    bx lr
+  %val = load atomic bfloat, ptr %ptr seq_cst, align 2 ; expected as an inline ldrh followed by dmb or mcr, or as a __sync_val_compare_and_swap_2 / __atomic_load_2 call
+  ret bfloat %val
+}
+
+define float @load_atomic_f32__seq_cst(ptr %ptr) { ; seq_cst atomic load of a float; one expectation group per configuration follows
+; ARM-LABEL: load_atomic_f32__seq_cst:
+; ARM:       @ %bb.0:
+; ARM-NEXT:    ldr r0, [r0]
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    bx lr
+;
+; ARMOPTNONE-LABEL: load_atomic_f32__seq_cst:
+; ARMOPTNONE:       @ %bb.0:
+; ARMOPTNONE-NEXT:    ldr r0, [r0]
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    vmov s0, r0
+; ARMOPTNONE-NEXT:    bx lr
+;
+; THUMBTWO-LABEL: load_atomic_f32__seq_cst:
+; THUMBTWO:       @ %bb.0:
+; THUMBTWO-NEXT:    ldr r0, [r0]
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:    bx lr
+;
+; THUMBONE-LABEL: load_atomic_f32__seq_cst:
+; THUMBONE:       @ %bb.0:
+; THUMBONE-NEXT:    push {r7, lr}
+; THUMBONE-NEXT:    movs r1, #0
+; THUMBONE-NEXT:    mov r2, r1
+; THUMBONE-NEXT:    bl __sync_val_compare_and_swap_4
+; THUMBONE-NEXT:    pop {r7, pc}
+;
+; ARMV4-LABEL: load_atomic_f32__seq_cst:
+; ARMV4:       @ %bb.0:
+; ARMV4-NEXT:    push {r11, lr}
+; ARMV4-NEXT:    mov r1, #5
+; ARMV4-NEXT:    bl __atomic_load_4
+; ARMV4-NEXT:    pop {r11, lr}
+; ARMV4-NEXT:    mov pc, lr
+;
+; ARMV6-LABEL: load_atomic_f32__seq_cst:
+; ARMV6:       @ %bb.0:
+; ARMV6-NEXT:    ldr r0, [r0]
+; ARMV6-NEXT:    mov r1, #0
+; ARMV6-NEXT:    mcr p15, #0, r1, c7, c10, #5
+; ARMV6-NEXT:    bx lr
+;
+; THUMBM-LABEL: load_atomic_f32__seq_cst:
+; THUMBM:       @ %bb.0:
+; THUMBM-NEXT:    ldr r0, [r0]
+; THUMBM-NEXT:    dmb sy
+; THUMBM-NEXT:    bx lr
+  %val = load atomic float, ptr %ptr seq_cst, align 4 ; expected as an inline ldr followed by dmb or mcr, or as a __sync_val_compare_and_swap_4 / __atomic_load_4 call
+  ret float %val
+}
+
+define double @load_atomic_f64__seq_cst(ptr %ptr) { ; seq_cst atomic load of a double; one expectation group per configuration follows
+; ARM-LABEL: load_atomic_f64__seq_cst:
+; ARM:       @ %bb.0:
+; ARM-NEXT:    ldrexd r0, r1, [r0]
+; ARM-NEXT:    clrex
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    bx lr
+;
+; ARMOPTNONE-LABEL: load_atomic_f64__seq_cst:
+; ARMOPTNONE:       @ %bb.0:
+; ARMOPTNONE-NEXT:    ldrexd r2, r3, [r0]
+; ARMOPTNONE-NEXT:    mov r1, r3
+; ARMOPTNONE-NEXT:    mov r0, r2
+; ARMOPTNONE-NEXT:    clrex
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    vmov d16, r0, r1
+; ARMOPTNONE-NEXT:    bx lr
+;
+; THUMBTWO-LABEL: load_atomic_f64__seq_cst:
+; THUMBTWO:       @ %bb.0:
+; THUMBTWO-NEXT:    ldrexd r0, r1, [r0]
+; THUMBTWO-NEXT:    clrex
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:    bx lr
+;
+; THUMBONE-LABEL: load_atomic_f64__seq_cst:
+; THUMBONE:       @ %bb.0:
+; THUMBONE-NEXT:    push {r7, lr}
+; THUMBONE-NEXT:    sub sp, #8
+; THUMBONE-NEXT:    movs r2, #0
+; THUMBONE-NEXT:    str r2, [sp]
+; THUMBONE-NEXT:    str r2, [sp, #4]
+; THUMBONE-NEXT:    mov r3, r2
+; THUMBONE-NEXT:    bl __sync_val_compare_and_swap_8
+; THUMBONE-NEXT:    add sp, #8
+; THUMBONE-NEXT:    pop {r7, pc}
+;
+; ARMV4-LABEL: load_atomic_f64__seq_cst:
+; ARMV4:       @ %bb.0:
+; ARMV4-NEXT:    push {r11, lr}
+; ARMV4-NEXT:    mov r1, #5
+; ARMV4-NEXT:    bl __atomic_load_8
+; ARMV4-NEXT:    pop {r11, lr}
+; ARMV4-NEXT:    mov pc, lr
+;
+; ARMV6-LABEL: load_atomic_f64__seq_cst:
+; ARMV6:       @ %bb.0:
+; ARMV6-NEXT:    ldrexd r0, r1, [r0]
+; ARMV6-NEXT:    mov r2, #0
+; ARMV6-NEXT:    mcr p15, #0, r2, c7, c10, #5
+; ARMV6-NEXT:    bx lr
+;
+; THUMBM-LABEL: load_atomic_f64__seq_cst:
+; THUMBM:       @ %bb.0:
+; THUMBM-NEXT:    push {r7, lr}
+; THUMBM-NEXT:    movs r1, #5
+; THUMBM-NEXT:    bl __atomic_load_8
+; THUMBM-NEXT:    pop {r7, pc}
+  %val = load atomic double, ptr %ptr seq_cst, align 8 ; expected as an inline ldrexd, or as a __sync_val_compare_and_swap_8 / __atomic_load_8 call
+  ret double %val
+}
+
+define void @store_atomic_f16__seq_cst(ptr %ptr, half %val1) { ; seq_cst atomic store of a half; one expectation group per configuration follows
+; ARM-LABEL: store_atomic_f16__seq_cst:
+; ARM:       @ %bb.0:
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    strh r1, [r0]
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    bx lr
+;
+; ARMOPTNONE-LABEL: store_atomic_f16__seq_cst:
+; ARMOPTNONE:       @ %bb.0:
+; ARMOPTNONE-NEXT:    sub sp, sp, #4
+; ARMOPTNONE-NEXT:    str r1, [sp] @ 4-byte Spill
+; ARMOPTNONE-NEXT:    mov r1, r0
+; ARMOPTNONE-NEXT:    ldr r0, [sp] @ 4-byte Reload
+; ARMOPTNONE-NEXT:    vmov s0, r0
+; ARMOPTNONE-NEXT:    vmov r0, s0
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    strh r0, [r1]
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    add sp, sp, #4
+; ARMOPTNONE-NEXT:    bx lr
+;
+; THUMBTWO-LABEL: store_atomic_f16__seq_cst:
+; THUMBTWO:       @ %bb.0:
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:    strh r1, [r0]
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:    bx lr
+;
+; THUMBONE-LABEL: store_atomic_f16__seq_cst:
+; THUMBONE:       @ %bb.0:
+; THUMBONE-NEXT:    push {r7, lr}
+; THUMBONE-NEXT:    bl __sync_lock_test_and_set_2
+; THUMBONE-NEXT:    pop {r7, pc}
+;
+; ARMV4-LABEL: store_atomic_f16__seq_cst:
+; ARMV4:       @ %bb.0:
+; ARMV4-NEXT:    push {r11, lr}
+; ARMV4-NEXT:    mov r2, #5
+; ARMV4-NEXT:    bl __atomic_store_2
+; ARMV4-NEXT:    pop {r11, lr}
+; ARMV4-NEXT:    mov pc, lr
+;
+; ARMV6-LABEL: store_atomic_f16__seq_cst:
+; ARMV6:       @ %bb.0:
+; ARMV6-NEXT:    mov r2, #0
+; ARMV6-NEXT:    mcr p15, #0, r2, c7, c10, #5
+; ARMV6-NEXT:    strh r1, [r0]
+; ARMV6-NEXT:    mcr p15, #0, r2, c7, c10, #5
+; ARMV6-NEXT:    bx lr
+;
+; THUMBM-LABEL: store_atomic_f16__seq_cst:
+; THUMBM:       @ %bb.0:
+; THUMBM-NEXT:    dmb sy
+; THUMBM-NEXT:    strh r1, [r0]
+; THUMBM-NEXT:    dmb sy
+; THUMBM-NEXT:    bx lr
+  store atomic half %val1, ptr %ptr seq_cst, align 2 ; expected as an inline strh bracketed by dmb or mcr, or as a __sync_lock_test_and_set_2 / __atomic_store_2 call
+  ret void
+}
+
+define void @store_atomic_bf16__seq_cst(ptr %ptr, bfloat %val1) { ; seq_cst atomic store of a bfloat; one expectation group per configuration follows
+; ARM-LABEL: store_atomic_bf16__seq_cst:
+; ARM:       @ %bb.0:
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    strh r1, [r0]
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    bx lr
+;
+; ARMOPTNONE-LABEL: store_atomic_bf16__seq_cst:
+; ARMOPTNONE:       @ %bb.0:
+; ARMOPTNONE-NEXT:    sub sp, sp, #4
+; ARMOPTNONE-NEXT:    str r1, [sp] @ 4-byte Spill
+; ARMOPTNONE-NEXT:    mov r1, r0
+; ARMOPTNONE-NEXT:    ldr r0, [sp] @ 4-byte Reload
+; ARMOPTNONE-NEXT:    vmov s0, r0
+; ARMOPTNONE-NEXT:    vmov r0, s0
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    strh r0, [r1]
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    add sp, sp, #4
+; ARMOPTNONE-NEXT:    bx lr
+;
+; THUMBTWO-LABEL: store_atomic_bf16__seq_cst:
+; THUMBTWO:       @ %bb.0:
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:    strh r1, [r0]
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:    bx lr
+;
+; THUMBONE-LABEL: store_atomic_bf16__seq_cst:
+; THUMBONE:       @ %bb.0:
+; THUMBONE-NEXT:    push {r7, lr}
+; THUMBONE-NEXT:    bl __sync_lock_test_and_set_2
+; THUMBONE-NEXT:    pop {r7, pc}
+;
+; ARMV4-LABEL: store_atomic_bf16__seq_cst:
+; ARMV4:       @ %bb.0:
+; ARMV4-NEXT:    push {r11, lr}
+; ARMV4-NEXT:    mov r2, #5
+; ARMV4-NEXT:    bl __atomic_store_2
+; ARMV4-NEXT:    pop {r11, lr}
+; ARMV4-NEXT:    mov pc, lr
+;
+; ARMV6-LABEL: store_atomic_bf16__seq_cst:
+; ARMV6:       @ %bb.0:
+; ARMV6-NEXT:    mov r2, #0
+; ARMV6-NEXT:    mcr p15, #0, r2, c7, c10, #5
+; ARMV6-NEXT:    strh r1, [r0]
+; ARMV6-NEXT:    mcr p15, #0, r2, c7, c10, #5
+; ARMV6-NEXT:    bx lr
+;
+; THUMBM-LABEL: store_atomic_bf16__seq_cst:
+; THUMBM:       @ %bb.0:
+; THUMBM-NEXT:    dmb sy
+; THUMBM-NEXT:    strh r1, [r0]
+; THUMBM-NEXT:    dmb sy
+; THUMBM-NEXT:    bx lr
+  store atomic bfloat %val1, ptr %ptr seq_cst, align 2 ; expected as an inline strh bracketed by dmb or mcr, or as a __sync_lock_test_and_set_2 / __atomic_store_2 call
+  ret void
+}
+
+define void @store_atomic_f32__seq_cst(ptr %ptr, float %val1) { ; seq_cst atomic store of a float; one expectation group per configuration follows
+; ARM-LABEL: store_atomic_f32__seq_cst:
+; ARM:       @ %bb.0:
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    str r1, [r0]
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    bx lr
+;
+; ARMOPTNONE-LABEL: store_atomic_f32__seq_cst:
+; ARMOPTNONE:       @ %bb.0:
+; ARMOPTNONE-NEXT:    sub sp, sp, #4
+; ARMOPTNONE-NEXT:    str r1, [sp] @ 4-byte Spill
+; ARMOPTNONE-NEXT:    mov r1, r0
+; ARMOPTNONE-NEXT:    ldr r0, [sp] @ 4-byte Reload
+; ARMOPTNONE-NEXT:    vmov s0, r0
+; ARMOPTNONE-NEXT:    vmov r0, s0
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    str r0, [r1]
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    add sp, sp, #4
+; ARMOPTNONE-NEXT:    bx lr
+;
+; THUMBTWO-LABEL: store_atomic_f32__seq_cst:
+; THUMBTWO:       @ %bb.0:
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:    str r1, [r0]
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:    bx lr
+;
+; THUMBONE-LABEL: store_atomic_f32__seq_cst:
+; THUMBONE:       @ %bb.0:
+; THUMBONE-NEXT:    push {r7, lr}
+; THUMBONE-NEXT:    bl __sync_lock_test_and_set_4
+; THUMBONE-NEXT:    pop {r7, pc}
+;
+; ARMV4-LABEL: store_atomic_f32__seq_cst:
+; ARMV4:       @ %bb.0:
+; ARMV4-NEXT:    push {r11, lr}
+; ARMV4-NEXT:    mov r2, #5
+; ARMV4-NEXT:    bl __atomic_store_4
+; ARMV4-NEXT:    pop {r11, lr}
+; ARMV4-NEXT:    mov pc, lr
+;
+; ARMV6-LABEL: store_atomic_f32__seq_cst:
+; ARMV6:       @ %bb.0:
+; ARMV6-NEXT:    mov r2, #0
+; ARMV6-NEXT:    mcr p15, #0, r2, c7, c10, #5
+; ARMV6-NEXT:    str r1, [r0]
+; ARMV6-NEXT:    mcr p15, #0, r2, c7, c10, #5
+; ARMV6-NEXT:    bx lr
+;
+; THUMBM-LABEL: store_atomic_f32__seq_cst:
+; THUMBM:       @ %bb.0:
+; THUMBM-NEXT:    dmb sy
+; THUMBM-NEXT:    str r1, [r0]
+; THUMBM-NEXT:    dmb sy
+; THUMBM-NEXT:    bx lr
+  store atomic float %val1, ptr %ptr seq_cst, align 4 ; expected as an inline str bracketed by dmb or mcr, or as a __sync_lock_test_and_set_4 / __atomic_store_4 call
+  ret void
+}
+
+define void @store_atomic_f64__seq_cst(ptr %ptr, double %val1) { ; seq_cst atomic store of a double; one expectation group per configuration follows
+; ARM-LABEL: store_atomic_f64__seq_cst:
+; ARM:       @ %bb.0:
+; ARM-NEXT:    push {r4, r5, lr}
+; ARM-NEXT:    mov r3, r2
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    mov r2, r1
+; ARM-NEXT:  LBB13_1: @ %atomicrmw.start
+; ARM-NEXT:    @ =>This Inner Loop Header: Depth=1
+; ARM-NEXT:    ldrexd r4, r5, [r0]
+; ARM-NEXT:    strexd r1, r2, r3, [r0]
+; ARM-NEXT:    cmp r1, #0
+; ARM-NEXT:    bne LBB13_1
+; ARM-NEXT:  @ %bb.2: @ %atomicrmw.end
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    pop {r4, r5, pc}
+;
+; ARMOPTNONE-LABEL: store_atomic_f64__seq_cst:
+; ARMOPTNONE:       @ %bb.0:
+; ARMOPTNONE-NEXT:    push {r4, r5, r7, lr}
+; ARMOPTNONE-NEXT:    add r7, sp, #8
+; ARMOPTNONE-NEXT:    push {r8, r10, r11}
+; ARMOPTNONE-NEXT:    sub sp, sp, #20
+; ARMOPTNONE-NEXT:    str r0, [sp] @ 4-byte Spill
+; ARMOPTNONE-NEXT:    vmov d16, r1, r2
+; ARMOPTNONE-NEXT:    vmov r1, r2, d16
+; ARMOPTNONE-NEXT:    str r2, [sp, #4] @ 4-byte Spill
+; ARMOPTNONE-NEXT:    str r1, [sp, #8] @ 4-byte Spill
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    ldr r1, [r0]
+; ARMOPTNONE-NEXT:    ldr r0, [r0, #4]
+; ARMOPTNONE-NEXT:    str r1, [sp, #12] @ 4-byte Spill
+; ARMOPTNONE-NEXT:    str r0, [sp, #16] @ 4-byte Spill
+; ARMOPTNONE-NEXT:    b LBB13_1
+; ARMOPTNONE-NEXT:  LBB13_1: @ %atomicrmw.start
+; ARMOPTNONE-NEXT:    @ =>This Loop Header: Depth=1
+; ARMOPTNONE-NEXT:    @ Child Loop BB13_2 Depth 2
+; ARMOPTNONE-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
+; ARMOPTNONE-NEXT:    ldr r2, [sp, #12] @ 4-byte Reload
+; ARMOPTNONE-NEXT:    ldr r3, [sp] @ 4-byte Reload
+; ARMOPTNONE-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
+; ARMOPTNONE-NEXT:    ldr r10, [sp, #8] @ 4-byte Reload
+; ARMOPTNONE-NEXT:    @ kill: def $r10 killed $r10 def $r10_r11
+; ARMOPTNONE-NEXT:    mov r11, r0
+; ARMOPTNONE-NEXT:    mov r8, r2
+; ARMOPTNONE-NEXT:    mov r9, r1
+; ARMOPTNONE-NEXT:  LBB13_2: @ %atomicrmw.start
+; ARMOPTNONE-NEXT:    @ Parent Loop BB13_1 Depth=1
+; ARMOPTNONE-NEXT:    @ => This Inner Loop Header: Depth=2
+; ARMOPTNONE-NEXT:    ldrexd r4, r5, [r3]
+; ARMOPTNONE-NEXT:    cmp r4, r8
+; ARMOPTNONE-NEXT:    cmpeq r5, r9
+; ARMOPTNONE-NEXT:    bne LBB13_4
+; ARMOPTNONE-NEXT:  @ %bb.3: @ %atomicrmw.start
+; ARMOPTNONE-NEXT:    @ in Loop: Header=BB13_2 Depth=2
+; ARMOPTNONE-NEXT:    strexd r0, r10, r11, [r3]
+; ARMOPTNONE-NEXT:    cmp r0, #0
+; ARMOPTNONE-NEXT:    bne LBB13_2
+; ARMOPTNONE-NEXT:  LBB13_4: @ %atomicrmw.start
+; ARMOPTNONE-NEXT:    @ in Loop: Header=BB13_1 Depth=1
+; ARMOPTNONE-NEXT:    mov r0, r5
+; ARMOPTNONE-NEXT:    eor r3, r0, r1
+; ARMOPTNONE-NEXT:    mov r1, r4
+; ARMOPTNONE-NEXT:    eor r2, r1, r2
+; ARMOPTNONE-NEXT:    orr r2, r2, r3
+; ARMOPTNONE-NEXT:    cmp r2, #0
+; ARMOPTNONE-NEXT:    str r1, [sp, #12] @ 4-byte Spill
+; ARMOPTNONE-NEXT:    str r0, [sp, #16] @ 4-byte Spill
+; ARMOPTNONE-NEXT:    bne LBB13_1
+; ARMOPTNONE-NEXT:    b LBB13_5
+; ARMOPTNONE-NEXT:  LBB13_5: @ %atomicrmw.end
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    sub sp, r7, #20
+; ARMOPTNONE-NEXT:    pop {r8, r10, r11}
+; ARMOPTNONE-NEXT:    pop {r4, r5, r7, pc}
+;
+; THUMBTWO-LABEL: store_atomic_f64__seq_cst:
+; THUMBTWO:       @ %bb.0:
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:  LBB13_1: @ %atomicrmw.start
+; THUMBTWO-NEXT:    @ =>This Inner Loop Header: Depth=1
+; THUMBTWO-NEXT:    ldrexd r3, r9, [r0]
+; THUMBTWO-NEXT:    strexd r3, r1, r2, [r0]
+; THUMBTWO-NEXT:    cmp r3, #0
+; THUMBTWO-NEXT:    bne LBB13_1
+; THUMBTWO-NEXT:  @ %bb.2: @ %atomicrmw.end
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:    bx lr
+;
+; THUMBONE-LABEL: store_atomic_f64__seq_cst:
+; THUMBONE:       @ %bb.0:
+; THUMBONE-NEXT:    push {r7, lr}
+; THUMBONE-NEXT:    bl __sync_lock_test_and_set_8
+; THUMBONE-NEXT:    pop {r7, pc}
+;
+; ARMV4-LABEL: store_atomic_f64__seq_cst:
+; ARMV4:       @ %bb.0:
+; ARMV4-NEXT:    push {r11, lr}
+; ARMV4-NEXT:    sub sp, sp, #8
+; ARMV4-NEXT:    mov r1, #5
+; ARMV4-NEXT:    str r1, [sp]
+; ARMV4-NEXT:    bl __atomic_store_8
+; ARMV4-NEXT:    add sp, sp, #8
+; ARMV4-NEXT:    pop {r11, lr}
+; ARMV4-NEXT:    mov pc, lr
+;
+; ARMV6-LABEL: store_atomic_f64__seq_cst:
+; ARMV6:       @ %bb.0:
+; ARMV6-NEXT:    push {r4, r5, r11, lr}
+; ARMV6-NEXT:    @ kill: def $r3 killed $r3 killed $r2_r3 def $r2_r3
+; ARMV6-NEXT:    mov r1, #0
+; ARMV6-NEXT:    @ kill: def $r2 killed $r2 killed $r2_r3 def $r2_r3
+; ARMV6-NEXT:    mcr p15, #0, r1, c7, c10, #5
+; ARMV6-NEXT:  .LBB13_1: @ %atomicrmw.start
+; ARMV6-NEXT:    @ =>This Inner Loop Header: Depth=1
+; ARMV6-NEXT:    ldrexd r4, r5, [r0]
+; ARMV6-NEXT:    strexd r1, r2, r3, [r0]
+; ARMV6-NEXT:    cmp r1, #0
+; ARMV6-NEXT:    bne .LBB13_1
+; ARMV6-NEXT:  @ %bb.2: @ %atomicrmw.end
+; ARMV6-NEXT:    mov r0, #0
+; ARMV6-NEXT:    mcr p15, #0, r0, c7, c10, #5
+; ARMV6-NEXT:    pop {r4, r5, r11, pc}
+;
+; THUMBM-LABEL: store_atomic_f64__seq_cst:
+; THUMBM:       @ %bb.0:
+; THUMBM-NEXT:    push {r7, lr}
+; THUMBM-NEXT:    sub sp, #8
+; THUMBM-NEXT:    movs r1, #5
+; THUMBM-NEXT:    str r1, [sp]
+; THUMBM-NEXT:    bl __atomic_store_8
+; THUMBM-NEXT:    add sp, #8
+; THUMBM-NEXT:    pop {r7, pc}
+  store atomic double %val1, ptr %ptr seq_cst, align 8 ; expected as an inline ldrexd/strexd retry loop, or as a __sync_lock_test_and_set_8 / __atomic_store_8 call
+  ret void
+}
diff --git a/llvm/test/CodeGen/PowerPC/atomics.ll b/llvm/test/CodeGen/PowerPC/atomics.ll
index 04cdbe9d7e7859..ff5bec53acd257 100644
--- a/llvm/test/CodeGen/PowerPC/atomics.ll
+++ b/llvm/test/CodeGen/PowerPC/atomics.ll
@@ -462,3 +462,212 @@ define i64 @and_i64_release(ptr %mem, i64 %operand) {
   %val = atomicrmw and ptr %mem, i64 %operand release
   ret i64 %val
 }
+
+define half @load_atomic_f16__seq_cst(ptr %ptr) { ; seq_cst atomic load of a half; one expectation group per configuration follows
+; PPC32-LABEL: load_atomic_f16__seq_cst:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    mflr r0
+; PPC32-NEXT:    stwu r1, -16(r1)
+; PPC32-NEXT:    stw r0, 20(r1)
+; PPC32-NEXT:    .cfi_def_cfa_offset 16
+; PPC32-NEXT:    .cfi_offset lr, 4
+; PPC32-NEXT:    sync
+; PPC32-NEXT:    lhz r3, 0(r3)
+; PPC32-NEXT:    cmpw cr7, r3, r3
+; PPC32-NEXT:    bne- cr7, .+4
+; PPC32-NEXT:    isync
+; PPC32-NEXT:    bl __gnu_h2f_ieee
+; PPC32-NEXT:    lwz r0, 20(r1)
+; PPC32-NEXT:    addi r1, r1, 16
+; PPC32-NEXT:    mtlr r0
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: load_atomic_f16__seq_cst:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    mflr r0
+; PPC64-NEXT:    stdu r1, -112(r1)
+; PPC64-NEXT:    std r0, 128(r1)
+; PPC64-NEXT:    .cfi_def_cfa_offset 112
+; PPC64-NEXT:    .cfi_offset lr, 16
+; PPC64-NEXT:    sync
+; PPC64-NEXT:    lhz r3, 0(r3)
+; PPC64-NEXT:    cmpd cr7, r3, r3
+; PPC64-NEXT:    bne- cr7, .+4
+; PPC64-NEXT:    isync
+; PPC64-NEXT:    bl __gnu_h2f_ieee
+; PPC64-NEXT:    nop
+; PPC64-NEXT:    addi r1, r1, 112
+; PPC64-NEXT:    ld r0, 16(r1)
+; PPC64-NEXT:    mtlr r0
+; PPC64-NEXT:    blr
+  %val = load atomic half, ptr %ptr seq_cst, align 2 ; expected as sync, lhz, compare/bne-/isync, then a call to __gnu_h2f_ieee
+  ret half %val
+}
+
+; FIXME: bf16_to_fp fails to select
+; define bfloat @load_atomic_bf16__seq_cst(ptr %ptr) {
+;   %val = load atomic bfloat, ptr %ptr seq_cst, align 2
+;   ret bfloat %val
+; }
+
+define float @load_atomic_f32__seq_cst(ptr %ptr) { ; seq_cst atomic load of a float; one expectation group per configuration follows
+; PPC32-LABEL: load_atomic_f32__seq_cst:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    stwu r1, -16(r1)
+; PPC32-NEXT:    .cfi_def_cfa_offset 16
+; PPC32-NEXT:    sync
+; PPC32-NEXT:    lwz r3, 0(r3)
+; PPC32-NEXT:    cmpw cr7, r3, r3
+; PPC32-NEXT:    bne- cr7, .+4
+; PPC32-NEXT:    isync
+; PPC32-NEXT:    stw r3, 12(r1)
+; PPC32-NEXT:    lfs f1, 12(r1)
+; PPC32-NEXT:    addi r1, r1, 16
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: load_atomic_f32__seq_cst:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    sync
+; PPC64-NEXT:    lwz r3, 0(r3)
+; PPC64-NEXT:    cmpd cr7, r3, r3
+; PPC64-NEXT:    bne- cr7, .+4
+; PPC64-NEXT:    isync
+; PPC64-NEXT:    stw r3, -4(r1)
+; PPC64-NEXT:    lfs f1, -4(r1)
+; PPC64-NEXT:    blr
+  %val = load atomic float, ptr %ptr seq_cst, align 4 ; expected as sync, lwz, compare/bne-/isync, then stw + lfs to move the value into f1
+  ret float %val
+}
+
+define double @load_atomic_f64__seq_cst(ptr %ptr) { ; seq_cst atomic load of a double; one expectation group per configuration follows
+; PPC32-LABEL: load_atomic_f64__seq_cst:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    mflr r0
+; PPC32-NEXT:    stwu r1, -16(r1)
+; PPC32-NEXT:    stw r0, 20(r1)
+; PPC32-NEXT:    .cfi_def_cfa_offset 16
+; PPC32-NEXT:    .cfi_offset lr, 4
+; PPC32-NEXT:    li r4, 5
+; PPC32-NEXT:    bl __atomic_load_8
+; PPC32-NEXT:    stw r4, 12(r1)
+; PPC32-NEXT:    stw r3, 8(r1)
+; PPC32-NEXT:    lfd f1, 8(r1)
+; PPC32-NEXT:    lwz r0, 20(r1)
+; PPC32-NEXT:    addi r1, r1, 16
+; PPC32-NEXT:    mtlr r0
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: load_atomic_f64__seq_cst:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    sync
+; PPC64-NEXT:    ld r3, 0(r3)
+; PPC64-NEXT:    cmpd cr7, r3, r3
+; PPC64-NEXT:    bne- cr7, .+4
+; PPC64-NEXT:    isync
+; PPC64-NEXT:    std r3, -8(r1)
+; PPC64-NEXT:    lfd f1, -8(r1)
+; PPC64-NEXT:    blr
+  %val = load atomic double, ptr %ptr seq_cst, align 8 ; one configuration is expected to call __atomic_load_8, the other to load inline with ld
+  ret double %val
+}
+
+define void @store_atomic_f16__seq_cst(ptr %ptr, half %val1) { ; seq_cst atomic store of a half; one expectation group per configuration follows
+; PPC32-LABEL: store_atomic_f16__seq_cst:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    mflr r0
+; PPC32-NEXT:    stwu r1, -16(r1)
+; PPC32-NEXT:    stw r0, 20(r1)
+; PPC32-NEXT:    .cfi_def_cfa_offset 16
+; PPC32-NEXT:    .cfi_offset lr, 4
+; PPC32-NEXT:    .cfi_offset r30, -8
+; PPC32-NEXT:    stw r30, 8(r1) # 4-byte Folded Spill
+; PPC32-NEXT:    mr r30, r3
+; PPC32-NEXT:    bl __gnu_f2h_ieee
+; PPC32-NEXT:    sync
+; PPC32-NEXT:    sth r3, 0(r30)
+; PPC32-NEXT:    lwz r30, 8(r1) # 4-byte Folded Reload
+; PPC32-NEXT:    lwz r0, 20(r1)
+; PPC32-NEXT:    addi r1, r1, 16
+; PPC32-NEXT:    mtlr r0
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: store_atomic_f16__seq_cst:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    mflr r0
+; PPC64-NEXT:    stdu r1, -128(r1)
+; PPC64-NEXT:    std r0, 144(r1)
+; PPC64-NEXT:    .cfi_def_cfa_offset 128
+; PPC64-NEXT:    .cfi_offset lr, 16
+; PPC64-NEXT:    .cfi_offset r30, -16
+; PPC64-NEXT:    std r30, 112(r1) # 8-byte Folded Spill
+; PPC64-NEXT:    mr r30, r3
+; PPC64-NEXT:    bl __gnu_f2h_ieee
+; PPC64-NEXT:    nop
+; PPC64-NEXT:    sync
+; PPC64-NEXT:    sth r3, 0(r30)
+; PPC64-NEXT:    ld r30, 112(r1) # 8-byte Folded Reload
+; PPC64-NEXT:    addi r1, r1, 128
+; PPC64-NEXT:    ld r0, 16(r1)
+; PPC64-NEXT:    mtlr r0
+; PPC64-NEXT:    blr
+  store atomic half %val1, ptr %ptr seq_cst, align 2 ; expected as a call to __gnu_f2h_ieee, then sync and sth through the saved pointer in r30
+  ret void
+}
+
+; FIXME: bf16_to_fp fails to select
+; define void @store_atomic_bf16__seq_cst(ptr %ptr, bfloat %val1) {
+;   store atomic bfloat %val1, ptr %ptr seq_cst, align 2
+;   ret void
+; }
+
+define void @store_atomic_f32__seq_cst(ptr %ptr, float %val1) { ; seq_cst atomic store of a float; one expectation group per configuration follows
+; PPC32-LABEL: store_atomic_f32__seq_cst:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    stwu r1, -16(r1)
+; PPC32-NEXT:    .cfi_def_cfa_offset 16
+; PPC32-NEXT:    stfs f1, 12(r1)
+; PPC32-NEXT:    lwz r4, 12(r1)
+; PPC32-NEXT:    sync
+; PPC32-NEXT:    stw r4, 0(r3)
+; PPC32-NEXT:    addi r1, r1, 16
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: store_atomic_f32__seq_cst:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    stfs f1, -4(r1)
+; PPC64-NEXT:    lwz r4, -4(r1)
+; PPC64-NEXT:    sync
+; PPC64-NEXT:    stw r4, 0(r3)
+; PPC64-NEXT:    blr
+  store atomic float %val1, ptr %ptr seq_cst, align 4 ; expected as stfs + lwz to move f1 into r4, then sync and stw
+  ret void
+}
+
+define void @store_atomic_f64__seq_cst(ptr %ptr, double %val1) { ; seq_cst atomic store of a double; one expectation group per configuration follows
+; PPC32-LABEL: store_atomic_f64__seq_cst:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    mflr r0
+; PPC32-NEXT:    stwu r1, -16(r1)
+; PPC32-NEXT:    stw r0, 20(r1)
+; PPC32-NEXT:    .cfi_def_cfa_offset 16
+; PPC32-NEXT:    .cfi_offset lr, 4
+; PPC32-NEXT:    stfd f1, 8(r1)
+; PPC32-NEXT:    li r7, 5
+; PPC32-NEXT:    lwz r5, 8(r1)
+; PPC32-NEXT:    lwz r6, 12(r1)
+; PPC32-NEXT:    bl __atomic_store_8
+; PPC32-NEXT:    lwz r0, 20(r1)
+; PPC32-NEXT:    addi r1, r1, 16
+; PPC32-NEXT:    mtlr r0
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: store_atomic_f64__seq_cst:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    stfd f1, -8(r1)
+; PPC64-NEXT:    ld r4, -8(r1)
+; PPC64-NEXT:    sync
+; PPC64-NEXT:    std r4, 0(r3)
+; PPC64-NEXT:    blr
+  store atomic double %val1, ptr %ptr seq_cst, align 8 ; one configuration is expected to call __atomic_store_8, the other to store inline with sync + std
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll
index 9995e7d3a4d314..d7633cb11e44c1 100644
--- a/llvm/test/CodeGen/X86/atomic-non-integer.ll
+++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll
@@ -787,3 +787,100 @@ define double @load_double_seq_cst(ptr %fptr) {
   %v = load atomic double, ptr %fptr seq_cst, align 8
   ret double %v
 }
+
+define void @store_bfloat(ptr %fptr, bfloat %v) { ; unordered atomic store of a bfloat
+; X86-LABEL: store_bfloat:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %cx, (%eax)
+; X86-NEXT:    retl
+;
+; X64-SSE-LABEL: store_bfloat:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pextrw $0, %xmm0, %eax
+; X64-SSE-NEXT:    movw %ax, (%rdi)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: store_bfloat:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; X64-AVX-NEXT:    movw %ax, (%rdi)
+; X64-AVX-NEXT:    retq
+  store atomic bfloat %v, ptr %fptr unordered, align 2 ; every expectation group above ends in a single 16-bit movw to memory
+  ret void
+}
+
+; Work around issue #92899 by casting to float
+define float @load_bfloat(ptr %fptr) { ; unordered atomic load of a bfloat, returned widened to float
+; X86-SSE1-LABEL: load_bfloat:
+; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    pushl %eax
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movzwl (%eax), %eax
+; X86-SSE1-NEXT:    shll $16, %eax
+; X86-SSE1-NEXT:    movl %eax, (%esp)
+; X86-SSE1-NEXT:    flds (%esp)
+; X86-SSE1-NEXT:    popl %eax
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
+; X86-SSE1-NEXT:    retl
+;
+; X86-SSE2-LABEL: load_bfloat:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    .cfi_def_cfa_offset 8
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movzwl (%eax), %eax
+; X86-SSE2-NEXT:    shll $16, %eax
+; X86-SSE2-NEXT:    movd %eax, %xmm0
+; X86-SSE2-NEXT:    movd %xmm0, (%esp)
+; X86-SSE2-NEXT:    flds (%esp)
+; X86-SSE2-NEXT:    popl %eax
+; X86-SSE2-NEXT:    .cfi_def_cfa_offset 4
+; X86-SSE2-NEXT:    retl
+;
+; X86-AVX-LABEL: load_bfloat:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    pushl %eax
+; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movzwl (%eax), %eax
+; X86-AVX-NEXT:    shll $16, %eax
+; X86-AVX-NEXT:    vmovd %eax, %xmm0
+; X86-AVX-NEXT:    vmovd %xmm0, (%esp)
+; X86-AVX-NEXT:    flds (%esp)
+; X86-AVX-NEXT:    popl %eax
+; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
+; X86-AVX-NEXT:    retl
+;
+; X86-NOSSE-LABEL: load_bfloat:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movzwl (%eax), %eax
+; X86-NOSSE-NEXT:    shll $16, %eax
+; X86-NOSSE-NEXT:    movl %eax, (%esp)
+; X86-NOSSE-NEXT:    flds (%esp)
+; X86-NOSSE-NEXT:    popl %eax
+; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 4
+; X86-NOSSE-NEXT:    retl
+;
+; X64-SSE-LABEL: load_bfloat:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    movzwl (%rdi), %eax
+; X64-SSE-NEXT:    shll $16, %eax
+; X64-SSE-NEXT:    movd %eax, %xmm0
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: load_bfloat:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    movzwl (%rdi), %eax
+; X64-AVX-NEXT:    shll $16, %eax
+; X64-AVX-NEXT:    vmovd %eax, %xmm0
+; X64-AVX-NEXT:    retq
+  %v = load atomic bfloat, ptr %fptr unordered, align 2 ; every expectation group above loads the 16 bits with a single movzwl
+  %ext = fpext bfloat %v to float ; the widening appears above as shll $16 on the loaded bits
+  ret float %ext
+}

From 3613b2683107bd60fda6d9348623be0686f6d7e3 Mon Sep 17 00:00:00 2001
From: Matthew Devereau <matthew.devereau@arm.com>
Date: Wed, 29 May 2024 06:13:02 +0100
Subject: [PATCH 059/230] Constant Fold logf128 calls (#90611)

This is a second attempt to land #84501 which failed on several targets.

This patch adds the HAS_IEE754_FLOAT128 define, which makes the check for
typedef'ing float128 more precise: it verifies that __uint128_t is
available and that the host does not use __ibm128, which is prevalent on
PowerPC targets and replaces IEEE 754 float128 there.
---
 llvm/CMakeLists.txt                           |   2 +
 llvm/cmake/config-ix.cmake                    |  11 ++
 llvm/include/llvm/ADT/APFloat.h               |  13 ++
 llvm/include/llvm/ADT/APInt.h                 |   8 ++
 llvm/include/llvm/Config/llvm-config.h.cmake  |   3 +
 llvm/include/llvm/Support/float128.h          |  26 ++++
 llvm/lib/Analysis/CMakeLists.txt              |   6 +
 llvm/lib/Analysis/ConstantFolding.cpp         |  11 ++
 llvm/lib/Support/APFloat.cpp                  |  24 ++++
 llvm/test/CMakeLists.txt                      |   1 +
 .../InstSimplify/ConstProp/logf128.ll         | 126 ++++++++++++++++++
 llvm/test/lit.cfg.py                          |   3 +
 llvm/test/lit.site.cfg.py.in                  |   1 +
 13 files changed, 235 insertions(+)
 create mode 100644 llvm/include/llvm/Support/float128.h
 create mode 100644 llvm/test/Transforms/InstSimplify/ConstProp/logf128.ll

diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 612e90abd40913..64898ab09772f4 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -560,6 +560,8 @@ set(LLVM_USE_STATIC_ZSTD FALSE CACHE BOOL "Use static version of zstd. Can be TR
 
 set(LLVM_ENABLE_CURL "OFF" CACHE STRING "Use libcurl for the HTTP client if available. Can be ON, OFF, or FORCE_ON")
 
+set(LLVM_HAS_LOGF128 "OFF" CACHE STRING "Use logf128 to constant fold fp128 logarithm calls. Can be ON, OFF, or FORCE_ON")
+
 set(LLVM_ENABLE_HTTPLIB "OFF" CACHE STRING "Use cpp-httplib HTTP server library if available. Can be ON, OFF, or FORCE_ON")
 
 set(LLVM_Z3_INSTALL_DIR "" CACHE STRING "Install directory of the Z3 solver.")
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 8cfb36b0194e85..0aae13e30f2ab4 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -247,6 +247,17 @@ else()
   set(HAVE_LIBEDIT 0)
 endif()
 
+if(LLVM_HAS_LOGF128)
+  include(CheckCXXSymbolExists)
+  check_cxx_symbol_exists(logf128 math.h HAS_LOGF128)
+
+  if(LLVM_HAS_LOGF128 STREQUAL FORCE_ON AND NOT HAS_LOGF128)
+    message(FATAL_ERROR "Failed to configure logf128")
+  endif()
+
+  set(LLVM_HAS_LOGF128 "${HAS_LOGF128}")
+endif()
+
 # function checks
 check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM)
 find_package(Backtrace)
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index deb74cb2fdeb1e..44a301ecc99280 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/FloatingPointMode.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/float128.h"
 #include <memory>
 
 #define APFLOAT_DISPATCH_ON_SEMANTICS(METHOD_CALL)                             \
@@ -354,6 +355,9 @@ class IEEEFloat final : public APFloatBase {
   Expected<opStatus> convertFromString(StringRef, roundingMode);
   APInt bitcastToAPInt() const;
   double convertToDouble() const;
+#ifdef HAS_IEE754_FLOAT128
+  float128 convertToQuad() const;
+#endif
   float convertToFloat() const;
 
   /// @}
@@ -1218,6 +1222,15 @@ class APFloat : public APFloatBase {
   /// shorter semantics, like IEEEsingle and others.
   double convertToDouble() const;
 
+  /// Converts this APFloat to host float value.
+  ///
+  /// \pre The APFloat must be built using semantics, that can be represented by
+  /// the host float type without loss of precision. It can be IEEEquad and
+  /// shorter semantics, like IEEEdouble and others.
+#ifdef HAS_IEE754_FLOAT128
+  float128 convertToQuad() const;
+#endif
+
   /// Converts this APFloat to host float value.
   ///
   /// \pre The APFloat must be built using semantics, that can be represented by
diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h
index 2fd8b7ea636c4a..6cfa6ec6650842 100644
--- a/llvm/include/llvm/ADT/APInt.h
+++ b/llvm/include/llvm/ADT/APInt.h
@@ -17,6 +17,7 @@
 
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/float128.h"
 #include <cassert>
 #include <climits>
 #include <cstring>
@@ -1677,6 +1678,13 @@ class [[nodiscard]] APInt {
   /// any bit width. Exactly 64 bits will be translated.
   double bitsToDouble() const { return llvm::bit_cast<double>(getWord(0)); }
 
+#ifdef HAS_IEE754_FLOAT128
+  float128 bitsToQuad() const {
+    __uint128_t ul = ((__uint128_t)U.pVal[1] << 64) + U.pVal[0];
+    return llvm::bit_cast<float128>(ul);
+  }
+#endif
+
   /// Converts APInt bits to a float
   ///
   /// The conversion does not do a translation from integer to float, it just
diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake
index 6605ea60df99e1..629977cc11d683 100644
--- a/llvm/include/llvm/Config/llvm-config.h.cmake
+++ b/llvm/include/llvm/Config/llvm-config.h.cmake
@@ -198,4 +198,7 @@
 /* Define if plugins enabled */
 #cmakedefine LLVM_ENABLE_PLUGINS
 
+/* Define if logf128 is available */
+#cmakedefine LLVM_HAS_LOGF128
+
 #endif
diff --git a/llvm/include/llvm/Support/float128.h b/llvm/include/llvm/Support/float128.h
new file mode 100644
index 00000000000000..e15a98dc5a6779
--- /dev/null
+++ b/llvm/include/llvm/Support/float128.h
@@ -0,0 +1,26 @@
+//===-- llvm/Support/float128.h - Compiler abstraction support --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_FLOAT128
+#define LLVM_FLOAT128
+
+namespace llvm {
+
+#if defined(__clang__) && defined(__FLOAT128__) &&                             \
+    defined(__SIZEOF_INT128__) && !defined(__LONG_DOUBLE_IBM128__)
+#define HAS_IEE754_FLOAT128
+typedef __float128 float128;
+#elif defined(__FLOAT128__) && defined(__SIZEOF_INT128__) &&                   \
+    !defined(__LONG_DOUBLE_IBM128__) &&                                        \
+    (defined(__GNUC__) || defined(__GNUG__))
+#define HAS_IEE754_FLOAT128
+typedef _Float128 float128;
+#endif
+
+} // namespace llvm
+#endif // LLVM_FLOAT128
diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt
index 474b8d20fde16f..74476cb5440c61 100644
--- a/llvm/lib/Analysis/CMakeLists.txt
+++ b/llvm/lib/Analysis/CMakeLists.txt
@@ -159,3 +159,9 @@ add_llvm_component_library(LLVMAnalysis
   Support
   TargetParser
   )
+
+include(CheckCXXSymbolExists)
+check_cxx_symbol_exists(logf128 math.h HAS_LOGF128)
+if(HAS_LOGF128)
+ target_compile_definitions(LLVMAnalysis PRIVATE HAS_LOGF128)
+endif()
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 705377b97ed903..5febe917126b1d 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -2087,6 +2087,17 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
     if (IntrinsicID == Intrinsic::canonicalize)
       return constantFoldCanonicalize(Ty, Call, U);
 
+#if defined(HAS_IEE754_FLOAT128) && defined(HAS_LOGF128)
+    if (Ty->isFP128Ty()) {
+      switch (IntrinsicID) {
+      default:
+        return nullptr;
+      case Intrinsic::log:
+        return ConstantFP::get(Ty, logf128(Op->getValueAPF().convertToQuad()));
+      }
+    }
+#endif
+
     if (!Ty->isHalfTy() && !Ty->isFloatTy() && !Ty->isDoubleTy())
       return nullptr;
 
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 2a9b3903720be1..283fcc153b33aa 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -3665,6 +3665,15 @@ double IEEEFloat::convertToDouble() const {
   return api.bitsToDouble();
 }
 
+#ifdef HAS_IEE754_FLOAT128
+float128 IEEEFloat::convertToQuad() const {
+  assert(semantics == (const llvm::fltSemantics *)&semIEEEquad &&
+         "Float semantics are not IEEEquads");
+  APInt api = bitcastToAPInt();
+  return api.bitsToQuad();
+}
+#endif
+
 /// Integer bit is explicit in this format.  Intel hardware (387 and later)
 /// does not support these bit patterns:
 ///  exponent = all 1's, integer bit 0, significand 0 ("pseudoinfinity")
@@ -5260,6 +5269,21 @@ double APFloat::convertToDouble() const {
   return Temp.getIEEE().convertToDouble();
 }
 
+#ifdef HAS_IEE754_FLOAT128
+float128 APFloat::convertToQuad() const {
+  if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEquad)
+    return getIEEE().convertToQuad();
+  assert(getSemantics().isRepresentableBy(semIEEEquad) &&
+         "Float semantics is not representable by IEEEquad");
+  APFloat Temp = *this;
+  bool LosesInfo;
+  opStatus St = Temp.convert(semIEEEquad, rmNearestTiesToEven, &LosesInfo);
+  assert(!(St & opInexact) && !LosesInfo && "Unexpected imprecision");
+  (void)St;
+  return Temp.getIEEE().convertToQuad();
+}
+#endif
+
 float APFloat::convertToFloat() const {
   if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEsingle)
     return getIEEE().convertToFloat();
diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt
index c942339e43608e..2f466c258f6771 100644
--- a/llvm/test/CMakeLists.txt
+++ b/llvm/test/CMakeLists.txt
@@ -26,6 +26,7 @@ llvm_canonicalize_cmake_booleans(
   LLVM_TOOL_LLVM_DRIVER_BUILD
   LLVM_INCLUDE_SPIRV_TOOLS_TESTS
   LLVM_APPEND_VC_REV
+  LLVM_HAS_LOGF128
   )
 
 configure_lit_site_cfg(
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/logf128.ll b/llvm/test/Transforms/InstSimplify/ConstProp/logf128.ll
new file mode 100644
index 00000000000000..da56997f693822
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/logf128.ll
@@ -0,0 +1,126 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
+
+; REQUIRES: has_logf128
+declare fp128 @llvm.log.f128(fp128)
+
+define fp128 @log_e_64(){
+; CHECK-LABEL: define fp128 @log_e_64() {
+; CHECK-NEXT:    ret fp128 0xL300000000000000040010A2B23F3BAB7
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000004005000000000000)
+  ret fp128 %A
+}
+
+define fp128 @log_e_smallest_positive_subnormal_number(){
+; CHECK-LABEL: define fp128 @log_e_smallest_positive_subnormal_number() {
+; CHECK-NEXT:    ret fp128 0xL3000000000000000C00C654628220780
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000010000000000000000)
+  ret fp128 %A
+}
+
+define fp128 @log_e_largest_subnormal_number(){
+; CHECK-LABEL: define fp128 @log_e_largest_subnormal_number() {
+; CHECK-NEXT:    ret fp128 0xLD000000000000000C00C62D918CE2421
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xLFFFFFFFFFFFFFFFF0000FFFFFFFFFFFF)
+  ret fp128 %A
+}
+
+define fp128 @log_e_smallest_positive_normal_number(){
+;
+; CHECK-LABEL: define fp128 @log_e_smallest_positive_normal_number() {
+; CHECK-NEXT:    ret fp128 0xLD000000000000000C00C62D918CE2421
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000000001000000000000)
+  ret fp128 %A
+}
+
+define fp128 @log_e_largest_normal_number(){
+; CHECK-LABEL: define fp128 @log_e_largest_normal_number() {
+; CHECK-NEXT:    ret fp128 0xLF000000000000000400C62E42FEFA39E
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xLFFFFFFFFFFFFFFFF7FFEFFFFFFFFFFFF)
+  ret fp128 %A
+}
+
+define fp128 @log_e_largest_number_less_than_one(){
+; CHECK-LABEL: define fp128 @log_e_largest_number_less_than_one() {
+; CHECK-NEXT:    ret fp128 0xL0000000000000000BF8E000000000000
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xLFFFFFFFFFFFFFFFF3FFEFFFFFFFFFFFF)
+  ret fp128 %A
+}
+
+define fp128 @log_e_1(){
+; CHECK-LABEL: define fp128 @log_e_1() {
+; CHECK-NEXT:    ret fp128 0xL00000000000000000000000000000000
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000003FFF000000000000)
+  ret fp128 %A
+}
+
+define fp128 @log_e_smallest_number_larger_than_one(){
+; CHECK-LABEL: define fp128 @log_e_smallest_number_larger_than_one() {
+; CHECK-NEXT:    ret fp128 0xL00000000000000003F8F000000000000
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000013FFF000000000000)
+  ret fp128 %A
+}
+
+define fp128 @log_e_negative_2(){
+; CHECK-LABEL: define fp128 @log_e_negative_2() {
+; CHECK-NEXT:    ret fp128 0xL00000000000000007FFF800000000000
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xL0000000000000000C000000000000000)
+  ret fp128 %A
+}
+
+define fp128 @log_e_0(){
+; CHECK-LABEL: define fp128 @log_e_0() {
+; CHECK-NEXT:    ret fp128 0xL0000000000000000FFFF000000000000
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000000000000000000000)
+  ret fp128 %A
+}
+
+define fp128 @log_e_negative_0(){
+; CHECK-LABEL: define fp128 @log_e_negative_0() {
+; CHECK-NEXT:    ret fp128 0xL0000000000000000FFFF000000000000
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000008000000000000000)
+  ret fp128 %A
+}
+
+define fp128 @log_e_infinity(){
+; CHECK-LABEL: define fp128 @log_e_infinity() {
+; CHECK-NEXT:    ret fp128 0xL00000000000000007FFF000000000000
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000007FFF000000000000)
+  ret fp128 %A
+}
+
+define fp128 @log_e_negative_infinity(){
+; CHECK-LABEL: define fp128 @log_e_negative_infinity() {
+; CHECK-NEXT:    ret fp128 0xL00000000000000007FFF800000000000
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xL0000000000000000FFFF000000000000)
+  ret fp128 %A
+}
+
+define fp128 @log_e_nan(){
+; CHECK-LABEL: define fp128 @log_e_nan() {
+; CHECK-NEXT:    ret fp128 0xL00000000000000007FFF800000000001
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000007FFF000000000001)
+  ret fp128 %A
+}
+
+define <2 x fp128> @log_e_negative_2_vector(){
+; CHECK-LABEL: define <2 x fp128> @log_e_negative_2_vector() {
+; CHECK-NEXT:    ret <2 x fp128> <fp128 0xL00000000000000007FFF800000000000, fp128 0xL00000000000000007FFF800000000000>
+;
+  %A = call <2 x fp128> @llvm.log.v2f128(<2 x fp128> <fp128 0xL0000000000000000C000000000000000, fp128 0xL0000000000000000C000000000000001>)
+  ret <2 x fp128> %A
+}
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index affd87b98c1410..fe1262893212fb 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -617,3 +617,6 @@ def have_ld64_plugin_support():
 # "OBJECT_MODE" to 'any' by default on AIX OS.
 if "system-aix" in config.available_features:
     config.environment["OBJECT_MODE"] = "any"
+
+if config.has_logf128:
+    config.available_features.add("has_logf128")
diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in
index 60a68b0edaf933..0968f6214772d0 100644
--- a/llvm/test/lit.site.cfg.py.in
+++ b/llvm/test/lit.site.cfg.py.in
@@ -63,6 +63,7 @@ config.have_llvm_driver = @LLVM_TOOL_LLVM_DRIVER_BUILD@
 config.spirv_tools_tests = @LLVM_INCLUDE_SPIRV_TOOLS_TESTS@
 config.have_vc_rev = @LLVM_APPEND_VC_REV@
 config.force_vc_rev = "@LLVM_FORCE_VC_REVISION@"
+config.has_logf128 = @LLVM_HAS_LOGF128@
 
 import lit.llvm
 lit.llvm.initialize(lit_config, config)

From b0f10a1dc34aa1b73faeeabdc2d348074a02c75d Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd@linux.alibaba.com>
Date: Wed, 29 May 2024 13:39:57 +0800
Subject: [PATCH 060/230] [C++20] [Modules] Don't generate the definition for
 non-const available external variables (#93530)

Close https://github.com/llvm/llvm-project/issues/93497

The root cause of the problem is that we incorrectly mark variables from
other modules as constant in LLVM. This patch fixes the problem by not
emitting the definition for non-const available_externally variables,
since a non-const available_externally variable is not helpful to
optimization.
---
 clang/lib/CodeGen/CodeGenModule.cpp  |  12 +++
 clang/test/CodeGenCXX/partitions.cpp |   8 +-
 clang/test/Modules/pr93497.cppm      | 106 +++++++++++++++++++++++++++
 3 files changed, 122 insertions(+), 4 deletions(-)
 create mode 100644 clang/test/Modules/pr93497.cppm

diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index e4774a587707ac..0b0b659e1fd490 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -5341,6 +5341,18 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
       !IsDefinitionAvailableExternally &&
       D->needsDestruction(getContext()) == QualType::DK_cxx_destructor;
 
+  // It is helpless to emit the definition for an available_externally variable
+  // which can't be marked as const.
+  // We don't need to check if it needs global ctor or dtor. See the above
+  // comment for ideas.
+  if (IsDefinitionAvailableExternally &&
+      (!D->hasConstantInitialization() ||
+       // TODO: Update this when we have interface to check constexpr
+       // destructor.
+       D->needsDestruction(getContext()) ||
+       !D->getType().isConstantStorage(getContext(), true, true)))
+    return;
+
   const VarDecl *InitDecl;
   const Expr *InitExpr = D->getAnyInitializer(InitDecl);
 
diff --git a/clang/test/CodeGenCXX/partitions.cpp b/clang/test/CodeGenCXX/partitions.cpp
index d283dd071f6b28..e80e68f82974bd 100644
--- a/clang/test/CodeGenCXX/partitions.cpp
+++ b/clang/test/CodeGenCXX/partitions.cpp
@@ -40,12 +40,12 @@ export int use() {
 }
 
 // FIXME: The definition of the variables shouldn't be exported too.
-// CHECK: @_ZW3mod1a = available_externally global
-// CHECK: @_ZW3mod1b = available_externally global
+// CHECK: @_ZW3mod1a = external global
+// CHECK: @_ZW3mod1b = external global
 // CHECK: declare{{.*}} i32 @_ZW3mod3foov
 // CHECK: declare{{.*}} i32 @_ZW3mod3barv
 
-// CHECK-OPT: @_ZW3mod1a = available_externally global
-// CHECK-OPT: @_ZW3mod1b = available_externally global
+// CHECK-OPT: @_ZW3mod1a = external global
+// CHECK-OPT: @_ZW3mod1b = external global
 // CHECK-OPT: declare{{.*}} i32 @_ZW3mod3foov
 // CHECK-OPT: declare{{.*}} i32 @_ZW3mod3barv
diff --git a/clang/test/Modules/pr93497.cppm b/clang/test/Modules/pr93497.cppm
new file mode 100644
index 00000000000000..64a08e2a85e63e
--- /dev/null
+++ b/clang/test/Modules/pr93497.cppm
@@ -0,0 +1,106 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/mod.cppm \
+// RUN:     -emit-module-interface -o %t/mod.pcm
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/use.cpp \
+// RUN:     -fmodule-file=mod=%t/mod.pcm -emit-llvm \
+// RUN:     -o - | opt -S --passes=simplifycfg | FileCheck %t/use.cpp
+
+//--- mod.cppm
+export module mod;
+
+export struct Thing {
+    static const Thing One;
+    explicit Thing(int raw) :raw(raw) { }
+    int raw;
+};
+
+const Thing Thing::One = Thing(1);
+
+export struct C {
+    int value;
+};
+export const C ConstantValue = {1};
+
+export const C *ConstantPtr = &ConstantValue;
+
+C NonConstantValue = {1};
+export const C &ConstantRef = NonConstantValue;
+
+export struct NonConstexprDtor {
+    constexpr NonConstexprDtor(int raw) : raw(raw) {}
+    ~NonConstexprDtor();
+
+    int raw;
+};
+
+export const NonConstexprDtor NonConstexprDtorValue = {1};
+
+//--- use.cpp
+import mod;
+
+int consume(int);
+int consumeC(C);
+
+extern "C" __attribute__((noinline)) inline int unneeded() {
+    return consume(43);
+}
+
+extern "C" __attribute__((noinline)) inline int needed() {
+    return consume(43);
+}
+
+int use() {
+    Thing t1 = Thing::One;
+    return consume(t1.raw);
+}
+
+int use2() {
+    if (ConstantValue.value)
+        return consumeC(ConstantValue);
+    return unneeded();
+}
+
+int use3() {
+    auto Ptr = ConstantPtr;
+    if (Ptr->value)
+        return consumeC(*Ptr);
+    return needed();
+}
+
+int use4() {
+    auto Ref = ConstantRef;
+    if (Ref.value)
+        return consumeC(Ref);
+    return needed();
+}
+
+int use5() {
+    NonConstexprDtor V = NonConstexprDtorValue;
+    if (V.raw)
+        return consume(V.raw);
+    return needed();
+}
+
+// CHECK: @_ZNW3mod5Thing3OneE = external
+// CHECK: @_ZW3mod13ConstantValue ={{.*}}available_externally{{.*}} constant 
+// CHECK: @_ZW3mod11ConstantPtr = external
+// CHECK: @_ZW3mod16NonConstantValue = external
+// CHECK: @_ZW3mod21NonConstexprDtorValue = external
+
+// Check that the middle end can optimize the program by the constant information.
+// CHECK-NOT: @unneeded(
+
+// Check that the use of ConstantPtr won't get optimized incorrectly.
+// CHECK-LABEL: @_Z4use3v(
+// CHECK: @needed(
+
+// Check that the use of ConstantRef won't get optimized incorrectly.
+// CHECK-LABEL: @_Z4use4v(
+// CHECK: @needed(
+
+// Check that the use of NonConstexprDtorValue won't get optimized incorrectly.
+// CHECK-LABEL: @_Z4use5v(
+// CHECK: @needed(

From 70d6b8a358366ec2ef4e73d5809fe23b9abf527d Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Tue, 28 May 2024 22:50:21 -0700
Subject: [PATCH 061/230] MCAsmParser: Amend \+ expansion

Amend 7c956293d856224dd6a1b633820ef53009f7ef1c ("MCAsmParser: Support
\+") to increase Macro.Count per iteration to match the new gas feature
(milestone: 2.43).
---
 llvm/lib/MC/MCParser/AsmParser.cpp                |  3 ++-
 llvm/test/MC/AsmParser/macro-at-pseudo-variable.s | 12 ++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp
index 2cddaf330b3bc5..8014ef9d039487 100644
--- a/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -2580,7 +2580,7 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, MCAsmMacro &Macro,
         OS << NumOfMacroInstantiations;
         Pos += 2;
       } else if (Argument == "+") {
-        OS << Macro.Count++;
+        OS << Macro.Count;
         Pos += 2;
       } else {
         for (; Index < NParameters; ++Index)
@@ -2629,6 +2629,7 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, MCAsmMacro &Macro,
     Body = Body.substr(Pos);
   }
 
+  ++Macro.Count;
   return false;
 }
 
diff --git a/llvm/test/MC/AsmParser/macro-at-pseudo-variable.s b/llvm/test/MC/AsmParser/macro-at-pseudo-variable.s
index a083b17aa54fe4..e1bb2298042096 100644
--- a/llvm/test/MC/AsmParser/macro-at-pseudo-variable.s
+++ b/llvm/test/MC/AsmParser/macro-at-pseudo-variable.s
@@ -74,15 +74,15 @@
 
 #--- b.s
 .rept 2
-  .print "r\+"
+  .print "r\+ \+"
 .endr
 .irpc foo,12
-  .print "\+i"
+  .print "\+\+i"
 .endr
-# CHECK2:      r0
-# CHECK2-NEXT: r1
-# CHECK2-NEXT: 0i
-# CHECK2-NEXT: 1i
+# CHECK2:      r0 0
+# CHECK2-NEXT: r1 1
+# CHECK2-NEXT: 00i
+# CHECK2-NEXT: 11i
 
 .rept 2
   .rept 2

From 5162027c9bb32ddbc8b37770c569a3e5a877d962 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Tue, 28 May 2024 22:47:19 -0700
Subject: [PATCH 062/230] [RISCV] Add test for #93578. NFC

---
 llvm/test/CodeGen/RISCV/rvv/vsra-sdnode.ll | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/llvm/test/CodeGen/RISCV/rvv/vsra-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vsra-sdnode.ll
index 7bae84142d8ae6..8dbb57fd15cf16 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsra-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsra-sdnode.ll
@@ -934,3 +934,22 @@ define <vscale x 8 x i32> @vsra_vi_mask_nxv8i32(<vscale x 8 x i32> %va, <vscale
   %vc = ashr <vscale x 8 x i32> %va, %vs
   ret <vscale x 8 x i32> %vc
 }
+
+; Negative test. We shouldn't look through the vp.trunc as it isn't vlmax like
+; the rest of the code.
+define <vscale x 1 x i8> @vsra_vv_nxv1i8_sext_zext_mixed_trunc(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl) {
+; CHECK-LABEL: vsra_vv_nxv1i8_sext_zext_mixed_trunc:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a0, 7
+; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vmin.vx v9, v8, a0
+; CHECK-NEXT:    vsra.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %sexted_va = sext <vscale x 1 x i8> %va to <vscale x 1 x i32>
+  %zexted_vb = zext <vscale x 1 x i8> %va to <vscale x 1 x i32>
+  %expand = ashr <vscale x 1 x i32> %sexted_va, %zexted_vb
+  %vc = trunc <vscale x 1 x i32> %expand to <vscale x 1 x i16>
+  %vd = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxvi16(<vscale x 1 x i16> %vc, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %vd
+}
+declare <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxvi16(<vscale x 1 x i16>, <vscale x 1 x i1>, i32)

From 4e0bd3fab4b6a54342c9bed14f205895da3cf0d9 Mon Sep 17 00:00:00 2001
From: Pengcheng Wang <wangpengcheng.pp@bytedance.com>
Date: Wed, 29 May 2024 14:10:01 +0800
Subject: [PATCH 063/230] [MachineLICM] Hoist copies of constant physical
 register (#93285)

Previously, we only checked whether the source was a virtual register,
which prevented some potential hoists.

We can see some improvements in AArch64/RISCV tests.
---
 llvm/lib/CodeGen/MachineLICM.cpp              |   5 +-
 .../AArch64/atomicrmw-uinc-udec-wrap.ll       |  10 +-
 .../AArch64/dag-combine-concat-vectors.ll     |  66 ++++----
 .../machine-sink-cache-invalidation.ll        |  10 +-
 .../AArch64/ragreedy-local-interval-cost.ll   | 148 +++++++++---------
 llvm/test/CodeGen/AMDGPU/amdpal-callable.ll   |   7 +-
 llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll |  16 +-
 llvm/test/CodeGen/AVR/shift.ll                |   6 +-
 .../RISCV/machinelicm-constant-phys-reg.ll    |  41 +++++
 .../RISCV/rvv/65704-illegal-instruction.ll    |  21 +--
 .../RISCV/rvv/fold-scalar-load-crash.ll       |  48 +++---
 llvm/test/CodeGen/RISCV/vlenb.ll              |   5 +-
 12 files changed, 218 insertions(+), 165 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll

diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 727a98c41bce4c..86eb259c090152 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -1269,8 +1269,9 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI,
     Register DefReg = MI.getOperand(0).getReg();
     if (DefReg.isVirtual() &&
         all_of(MI.uses(),
-               [](const MachineOperand &UseOp) {
-                 return !UseOp.isReg() || UseOp.getReg().isVirtual();
+               [this](const MachineOperand &UseOp) {
+                 return !UseOp.isReg() || UseOp.getReg().isVirtual() ||
+                        MRI->isConstantPhysReg(UseOp.getReg());
                }) &&
         IsLoopInvariantInst(MI, CurLoop) &&
         any_of(MRI->use_nodbg_instructions(DefReg),
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll
index 5f293e5c7ea34f..66fea3535b1ec3 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll
@@ -55,15 +55,15 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) {
 define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
 ; CHECK-LABEL: atomicrmw_uinc_wrap_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:  .LBB3_1: // %atomicrmw.start
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldaxr x0, [x8]
-; CHECK-NEXT:    cmp x0, x1
-; CHECK-NEXT:    csinc x9, xzr, x0, hs
-; CHECK-NEXT:    stlxr w10, x9, [x8]
+; CHECK-NEXT:    ldaxr x8, [x0]
+; CHECK-NEXT:    cmp x8, x1
+; CHECK-NEXT:    csinc x9, xzr, x8, hs
+; CHECK-NEXT:    stlxr w10, x9, [x0]
 ; CHECK-NEXT:    cbnz w10, .LBB3_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECK-NEXT:    mov x0, x8
 ; CHECK-NEXT:    ret
   %result = atomicrmw uinc_wrap ptr %ptr, i64 %val seq_cst
   ret i64 %result
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
index 83c7f73800af19..dfe0e83649e203 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
@@ -8,57 +8,57 @@ declare void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8>, <vscale x
 define fastcc i8 @allocno_reload_assign() {
 ; CHECK-LABEL: allocno_reload_assign:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.b, #0 // =0x0
-; CHECK-NEXT:    mov z16.d, #0 // =0x0
+; CHECK-NEXT:    fmov d0, xzr
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    mov z16.d, #0 // =0x0
+; CHECK-NEXT:    cmpeq p0.d, p0/z, z0.d, #0
+; CHECK-NEXT:    uzp1 p0.s, p0.s, p0.s
+; CHECK-NEXT:    uzp1 p0.h, p0.h, p0.h
+; CHECK-NEXT:    uzp1 p0.b, p0.b, p0.b
+; CHECK-NEXT:    mov z0.b, p0/z, #1 // =0x1
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov z0.b, #0 // =0x0
+; CHECK-NEXT:    sbfx x8, x8, #0, #1
 ; CHECK-NEXT:    uunpklo z1.h, z0.b
 ; CHECK-NEXT:    uunpkhi z0.h, z0.b
+; CHECK-NEXT:    whilelo p1.b, xzr, x8
+; CHECK-NEXT:    not p0.b, p0/z, p1.b
 ; CHECK-NEXT:    uunpklo z2.s, z1.h
 ; CHECK-NEXT:    uunpkhi z3.s, z1.h
 ; CHECK-NEXT:    uunpklo z5.s, z0.h
 ; CHECK-NEXT:    uunpkhi z7.s, z0.h
+; CHECK-NEXT:    punpklo p1.h, p0.b
+; CHECK-NEXT:    punpkhi p0.h, p0.b
+; CHECK-NEXT:    punpklo p2.h, p1.b
 ; CHECK-NEXT:    uunpklo z0.d, z2.s
 ; CHECK-NEXT:    uunpkhi z1.d, z2.s
+; CHECK-NEXT:    punpkhi p3.h, p1.b
 ; CHECK-NEXT:    uunpklo z2.d, z3.s
 ; CHECK-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-NEXT:    punpklo p5.h, p0.b
 ; CHECK-NEXT:    uunpklo z4.d, z5.s
 ; CHECK-NEXT:    uunpkhi z5.d, z5.s
+; CHECK-NEXT:    punpkhi p7.h, p0.b
 ; CHECK-NEXT:    uunpklo z6.d, z7.s
 ; CHECK-NEXT:    uunpkhi z7.d, z7.s
-; CHECK-NEXT:  .LBB0_1: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    fmov d17, xzr
-; CHECK-NEXT:    cmpeq p2.d, p0/z, z17.d, #0
-; CHECK-NEXT:    uzp1 p2.s, p2.s, p0.s
-; CHECK-NEXT:    uzp1 p2.h, p2.h, p0.h
-; CHECK-NEXT:    uzp1 p2.b, p2.b, p0.b
-; CHECK-NEXT:    mov z17.b, p2/z, #1 // =0x1
-; CHECK-NEXT:    fmov w8, s17
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    whilelo p2.b, xzr, x8
-; CHECK-NEXT:    not p2.b, p1/z, p2.b
-; CHECK-NEXT:    punpklo p3.h, p2.b
-; CHECK-NEXT:    punpkhi p2.h, p2.b
-; CHECK-NEXT:    punpklo p4.h, p3.b
-; CHECK-NEXT:    punpkhi p3.h, p3.b
-; CHECK-NEXT:    punpklo p5.h, p4.b
-; CHECK-NEXT:    punpkhi p4.h, p4.b
-; CHECK-NEXT:    st1b { z0.d }, p5, [z16.d]
-; CHECK-NEXT:    st1b { z1.d }, p4, [z16.d]
-; CHECK-NEXT:    punpklo p4.h, p3.b
+; CHECK-NEXT:    punpklo p0.h, p2.b
+; CHECK-NEXT:    punpkhi p1.h, p2.b
+; CHECK-NEXT:    punpklo p2.h, p3.b
 ; CHECK-NEXT:    punpkhi p3.h, p3.b
-; CHECK-NEXT:    st1b { z2.d }, p4, [z16.d]
+; CHECK-NEXT:    punpklo p4.h, p5.b
+; CHECK-NEXT:    punpkhi p5.h, p5.b
+; CHECK-NEXT:    punpklo p6.h, p7.b
+; CHECK-NEXT:    punpkhi p7.h, p7.b
+; CHECK-NEXT:  .LBB0_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    st1b { z0.d }, p0, [z16.d]
+; CHECK-NEXT:    st1b { z1.d }, p1, [z16.d]
+; CHECK-NEXT:    st1b { z2.d }, p2, [z16.d]
 ; CHECK-NEXT:    st1b { z3.d }, p3, [z16.d]
-; CHECK-NEXT:    punpklo p3.h, p2.b
-; CHECK-NEXT:    punpkhi p2.h, p2.b
-; CHECK-NEXT:    punpklo p4.h, p3.b
-; CHECK-NEXT:    punpkhi p3.h, p3.b
 ; CHECK-NEXT:    st1b { z4.d }, p4, [z16.d]
-; CHECK-NEXT:    st1b { z5.d }, p3, [z16.d]
-; CHECK-NEXT:    punpklo p3.h, p2.b
-; CHECK-NEXT:    punpkhi p2.h, p2.b
-; CHECK-NEXT:    st1b { z6.d }, p3, [z16.d]
-; CHECK-NEXT:    st1b { z7.d }, p2, [z16.d]
+; CHECK-NEXT:    st1b { z5.d }, p5, [z16.d]
+; CHECK-NEXT:    st1b { z6.d }, p6, [z16.d]
+; CHECK-NEXT:    st1b { z7.d }, p7, [z16.d]
 ; CHECK-NEXT:    b .LBB0_1
   br label %1
 
diff --git a/llvm/test/CodeGen/AArch64/machine-sink-cache-invalidation.ll b/llvm/test/CodeGen/AArch64/machine-sink-cache-invalidation.ll
index 6effc63ecc13ce..fe3715341a25b8 100644
--- a/llvm/test/CodeGen/AArch64/machine-sink-cache-invalidation.ll
+++ b/llvm/test/CodeGen/AArch64/machine-sink-cache-invalidation.ll
@@ -9,20 +9,20 @@ target triple = "arm64-apple-macosx13.5.0"
 define i32 @nsis_BZ2_bzDecompress(ptr %pos.i, i1 %cmp661.not3117.i, i1 %exitcond.not.i) {
 ; CHECK-LABEL: nsis_BZ2_bzDecompress:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    b .LBB0_2
 ; CHECK-NEXT:  .LBB0_1: // %while.end671.i
 ; CHECK-NEXT:    // in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    strb w8, [x0]
+; CHECK-NEXT:    strb w9, [x0]
 ; CHECK-NEXT:    tbnz w2, #0, .LBB0_4
 ; CHECK-NEXT:  .LBB0_2: // %for.body653.i
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldrb w8, [x0]
+; CHECK-NEXT:    ldrb w9, [x0]
 ; CHECK-NEXT:    tbnz w1, #0, .LBB0_1
 ; CHECK-NEXT:  // %bb.3: // %while.body663.i
 ; CHECK-NEXT:    // in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    mov x9, xzr
-; CHECK-NEXT:    ldrb w9, [x9]
-; CHECK-NEXT:    strb wzr, [x0, x9]
+; CHECK-NEXT:    ldrb w10, [x8]
+; CHECK-NEXT:    strb wzr, [x0, x10]
 ; CHECK-NEXT:    b .LBB0_1
 ; CHECK-NEXT:  .LBB0_4: // %for.end677.i
 ; CHECK-NEXT:    mov w0, wzr
diff --git a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
index 866b27b81d885f..c91de8f3a0a471 100644
--- a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
+++ b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
@@ -8,36 +8,39 @@
 define dso_local void @run_test() local_unnamed_addr uwtable {
 ; CHECK-LABEL: run_test:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #192
-; CHECK-NEXT:    .cfi_def_cfa_offset 192
+; CHECK-NEXT:    sub sp, sp, #208
+; CHECK-NEXT:    .cfi_def_cfa_offset 208
 ; CHECK-NEXT:    stp d15, d14, [sp, #96] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #112] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #128] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #144] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #160] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #176] // 16-byte Folded Spill
+; CHECK-NEXT:    str x23, [sp, #160] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #176] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #192] // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset w19, -8
 ; CHECK-NEXT:    .cfi_offset w20, -16
 ; CHECK-NEXT:    .cfi_offset w21, -24
 ; CHECK-NEXT:    .cfi_offset w22, -32
-; CHECK-NEXT:    .cfi_offset b8, -40
-; CHECK-NEXT:    .cfi_offset b9, -48
-; CHECK-NEXT:    .cfi_offset b10, -56
-; CHECK-NEXT:    .cfi_offset b11, -64
-; CHECK-NEXT:    .cfi_offset b12, -72
-; CHECK-NEXT:    .cfi_offset b13, -80
-; CHECK-NEXT:    .cfi_offset b14, -88
-; CHECK-NEXT:    .cfi_offset b15, -96
+; CHECK-NEXT:    .cfi_offset w23, -48
+; CHECK-NEXT:    .cfi_offset b8, -56
+; CHECK-NEXT:    .cfi_offset b9, -64
+; CHECK-NEXT:    .cfi_offset b10, -72
+; CHECK-NEXT:    .cfi_offset b11, -80
+; CHECK-NEXT:    .cfi_offset b12, -88
+; CHECK-NEXT:    .cfi_offset b13, -96
+; CHECK-NEXT:    .cfi_offset b14, -104
+; CHECK-NEXT:    .cfi_offset b15, -112
 ; CHECK-NEXT:    movi v2.2d, #0000000000000000
 ; CHECK-NEXT:    // implicit-def: $q1
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    mov x9, xzr
-; CHECK-NEXT:    adrp x10, B+48
-; CHECK-NEXT:    add x10, x10, :lo12:B+48
-; CHECK-NEXT:    adrp x11, A
-; CHECK-NEXT:    add x11, x11, :lo12:A
+; CHECK-NEXT:    adrp x9, B+48
+; CHECK-NEXT:    add x9, x9, :lo12:B+48
+; CHECK-NEXT:    adrp x10, A
+; CHECK-NEXT:    add x10, x10, :lo12:A
+; CHECK-NEXT:    mov x11, xzr
 ; CHECK-NEXT:    // kill: killed $q1
 ; CHECK-NEXT:    // implicit-def: $q1
+; CHECK-NEXT:    mov x12, xzr
 ; CHECK-NEXT:    // implicit-def: $q0
 ; CHECK-NEXT:    // implicit-def: $q3
 ; CHECK-NEXT:    // implicit-def: $q4
@@ -69,103 +72,102 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
 ; CHECK-NEXT:    // kill: killed $q1
 ; CHECK-NEXT:  .LBB0_1: // %for.cond1.preheader
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    str q14, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    ldr q14, [x8]
-; CHECK-NEXT:    mov x12, xzr
-; CHECK-NEXT:    ldr x14, [x12]
 ; CHECK-NEXT:    stp q29, q15, [sp] // 32-byte Folded Spill
-; CHECK-NEXT:    add x19, x11, x8
-; CHECK-NEXT:    fmov x15, d14
-; CHECK-NEXT:    mov x16, v14.d[1]
-; CHECK-NEXT:    ldr q15, [x12]
-; CHECK-NEXT:    ldr q14, [x10], #64
+; CHECK-NEXT:    ldr q15, [x8]
+; CHECK-NEXT:    ldr x15, [x8]
+; CHECK-NEXT:    str q14, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    add x20, x10, x11
 ; CHECK-NEXT:    mov v8.16b, v28.16b
-; CHECK-NEXT:    fmov x13, d15
-; CHECK-NEXT:    mov x18, v15.d[1]
+; CHECK-NEXT:    fmov x2, d15
+; CHECK-NEXT:    mov x17, v15.d[1]
+; CHECK-NEXT:    ldr q14, [x8]
 ; CHECK-NEXT:    mov v28.16b, v24.16b
-; CHECK-NEXT:    mul x17, x15, x14
-; CHECK-NEXT:    mov x12, v14.d[1]
-; CHECK-NEXT:    fmov x4, d14
 ; CHECK-NEXT:    mov v24.16b, v20.16b
 ; CHECK-NEXT:    mov v20.16b, v17.16b
+; CHECK-NEXT:    fmov x13, d14
+; CHECK-NEXT:    mov x16, v14.d[1]
 ; CHECK-NEXT:    mov v17.16b, v5.16b
-; CHECK-NEXT:    mul x1, x16, x14
+; CHECK-NEXT:    mul x3, x2, x15
+; CHECK-NEXT:    ldr q14, [x9], #64
 ; CHECK-NEXT:    ldr q5, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x5, [x8]
-; CHECK-NEXT:    ldr x19, [x19, #128]
+; CHECK-NEXT:    ldr x6, [x8]
+; CHECK-NEXT:    ldr x20, [x20, #128]
+; CHECK-NEXT:    mul x1, x17, x15
+; CHECK-NEXT:    mov x14, v14.d[1]
+; CHECK-NEXT:    fmov x5, d14
 ; CHECK-NEXT:    mov v29.16b, v21.16b
 ; CHECK-NEXT:    mov v21.16b, v0.16b
-; CHECK-NEXT:    mul x0, x13, x14
 ; CHECK-NEXT:    mov v25.16b, v6.16b
+; CHECK-NEXT:    mul x18, x13, x15
 ; CHECK-NEXT:    mov v6.16b, v2.16b
-; CHECK-NEXT:    fmov d15, x17
 ; CHECK-NEXT:    mov v26.16b, v22.16b
+; CHECK-NEXT:    fmov d15, x3
 ; CHECK-NEXT:    mov v22.16b, v18.16b
-; CHECK-NEXT:    mul x2, x18, x14
 ; CHECK-NEXT:    mov v18.16b, v7.16b
+; CHECK-NEXT:    mul x0, x16, x15
 ; CHECK-NEXT:    mov v7.16b, v3.16b
 ; CHECK-NEXT:    mov v16.16b, v4.16b
-; CHECK-NEXT:    add x8, x8, #8
-; CHECK-NEXT:    add x9, x9, #1
+; CHECK-NEXT:    add x11, x11, #8
+; CHECK-NEXT:    add x12, x12, #1
 ; CHECK-NEXT:    mov v15.d[1], x1
-; CHECK-NEXT:    mul x3, x12, x14
-; CHECK-NEXT:    cmp x8, #64
-; CHECK-NEXT:    fmov d14, x0
-; CHECK-NEXT:    mul x14, x4, x14
+; CHECK-NEXT:    mul x4, x14, x15
+; CHECK-NEXT:    cmp x11, #64
+; CHECK-NEXT:    fmov d14, x18
+; CHECK-NEXT:    mul x15, x5, x15
 ; CHECK-NEXT:    add v5.2d, v5.2d, v15.2d
-; CHECK-NEXT:    mul x20, x15, x5
-; CHECK-NEXT:    mov v14.d[1], x2
-; CHECK-NEXT:    mul x15, x15, x19
-; CHECK-NEXT:    fmov d0, x14
+; CHECK-NEXT:    mul x21, x2, x6
+; CHECK-NEXT:    mov v14.d[1], x0
+; CHECK-NEXT:    mul x2, x2, x20
+; CHECK-NEXT:    fmov d0, x15
 ; CHECK-NEXT:    str q5, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    ldr q5, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    mul x21, x13, x19
+; CHECK-NEXT:    mul x22, x13, x20
 ; CHECK-NEXT:    add v5.2d, v5.2d, v14.2d
-; CHECK-NEXT:    fmov d3, x20
-; CHECK-NEXT:    mul x7, x16, x5
-; CHECK-NEXT:    mov v0.d[1], x3
-; CHECK-NEXT:    fmov d1, x15
-; CHECK-NEXT:    mul x16, x16, x19
+; CHECK-NEXT:    fmov d3, x21
+; CHECK-NEXT:    mul x19, x17, x6
+; CHECK-NEXT:    mov v0.d[1], x4
+; CHECK-NEXT:    fmov d1, x2
+; CHECK-NEXT:    mul x17, x17, x20
 ; CHECK-NEXT:    str q5, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    add v5.2d, v13.2d, v14.2d
-; CHECK-NEXT:    fmov d2, x21
+; CHECK-NEXT:    fmov d2, x22
 ; CHECK-NEXT:    ldr q13, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT:    mul x6, x18, x5
+; CHECK-NEXT:    mul x7, x16, x6
 ; CHECK-NEXT:    ldp q15, q14, [sp, #16] // 32-byte Folded Reload
-; CHECK-NEXT:    mov v3.d[1], x7
+; CHECK-NEXT:    mov v3.d[1], x19
 ; CHECK-NEXT:    add v13.2d, v13.2d, v0.2d
-; CHECK-NEXT:    mul x18, x18, x19
-; CHECK-NEXT:    mov v1.d[1], x16
-; CHECK-NEXT:    mul x22, x4, x19
+; CHECK-NEXT:    mul x16, x16, x20
+; CHECK-NEXT:    mov v1.d[1], x17
+; CHECK-NEXT:    mul x23, x5, x20
 ; CHECK-NEXT:    str q13, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov v13.16b, v5.16b
 ; CHECK-NEXT:    mov v5.16b, v17.16b
 ; CHECK-NEXT:    mov v17.16b, v20.16b
 ; CHECK-NEXT:    mov v20.16b, v24.16b
-; CHECK-NEXT:    mul x13, x13, x5
+; CHECK-NEXT:    mul x13, x13, x6
 ; CHECK-NEXT:    mov v24.16b, v28.16b
 ; CHECK-NEXT:    add v11.2d, v11.2d, v3.2d
-; CHECK-NEXT:    mov v2.d[1], x18
+; CHECK-NEXT:    mov v2.d[1], x16
 ; CHECK-NEXT:    add v15.2d, v15.2d, v1.2d
 ; CHECK-NEXT:    add v27.2d, v27.2d, v3.2d
-; CHECK-NEXT:    mul x17, x12, x19
+; CHECK-NEXT:    mul x18, x14, x20
 ; CHECK-NEXT:    add v23.2d, v23.2d, v3.2d
 ; CHECK-NEXT:    add v19.2d, v19.2d, v3.2d
-; CHECK-NEXT:    fmov d4, x22
+; CHECK-NEXT:    fmov d4, x23
 ; CHECK-NEXT:    add v10.2d, v10.2d, v3.2d
-; CHECK-NEXT:    mul x14, x4, x5
+; CHECK-NEXT:    mul x15, x5, x6
 ; CHECK-NEXT:    fmov d0, x13
 ; CHECK-NEXT:    add v14.2d, v14.2d, v2.2d
 ; CHECK-NEXT:    add v2.2d, v6.2d, v3.2d
-; CHECK-NEXT:    mul x12, x12, x5
+; CHECK-NEXT:    mul x14, x14, x6
 ; CHECK-NEXT:    mov v3.16b, v7.16b
 ; CHECK-NEXT:    mov v7.16b, v18.16b
-; CHECK-NEXT:    mov v4.d[1], x17
+; CHECK-NEXT:    mov v4.d[1], x18
 ; CHECK-NEXT:    mov v18.16b, v22.16b
-; CHECK-NEXT:    mov v0.d[1], x6
-; CHECK-NEXT:    fmov d1, x14
+; CHECK-NEXT:    mov v0.d[1], x7
+; CHECK-NEXT:    fmov d1, x15
 ; CHECK-NEXT:    add v28.2d, v8.2d, v4.2d
-; CHECK-NEXT:    mov v1.d[1], x12
+; CHECK-NEXT:    mov v1.d[1], x14
 ; CHECK-NEXT:    add v31.2d, v31.2d, v0.2d
 ; CHECK-NEXT:    add v30.2d, v30.2d, v0.2d
 ; CHECK-NEXT:    add v12.2d, v12.2d, v0.2d
@@ -192,11 +194,12 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
 ; CHECK-NEXT:    adrp x8, C
 ; CHECK-NEXT:    add x8, x8, :lo12:C
 ; CHECK-NEXT:    stp q11, q30, [x8, #80]
-; CHECK-NEXT:    ldp x20, x19, [sp, #176] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x20, x19, [sp, #192] // 16-byte Folded Reload
 ; CHECK-NEXT:    str q1, [x8]
 ; CHECK-NEXT:    ldr q1, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x23, [sp, #160] // 8-byte Folded Reload
 ; CHECK-NEXT:    stp q15, q14, [x8, #144]
-; CHECK-NEXT:    ldp x22, x21, [sp, #160] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #176] // 16-byte Folded Reload
 ; CHECK-NEXT:    stp q1, q13, [x8, #16]
 ; CHECK-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    stp q28, q12, [x8, #176]
@@ -216,12 +219,13 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
 ; CHECK-NEXT:    stp q5, q4, [x8, #432]
 ; CHECK-NEXT:    stp q2, q3, [x8, #464]
 ; CHECK-NEXT:    str q0, [x8, #496]
-; CHECK-NEXT:    add sp, sp, #192
+; CHECK-NEXT:    add sp, sp, #208
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w19
 ; CHECK-NEXT:    .cfi_restore w20
 ; CHECK-NEXT:    .cfi_restore w21
 ; CHECK-NEXT:    .cfi_restore w22
+; CHECK-NEXT:    .cfi_restore w23
 ; CHECK-NEXT:    .cfi_restore b8
 ; CHECK-NEXT:    .cfi_restore b9
 ; CHECK-NEXT:    .cfi_restore b10
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index 9d4f9434aa3146..1a0fda3d54d3f4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -142,7 +142,8 @@ attributes #0 = { nounwind }
 
 ; GCN: amdpal.pipelines:
 ; GCN-NEXT:  - .registers:
-; GCN-NEXT:      '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
+; SDAG-NEXT:     '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
+; GISEL-NEXT:    '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01cb{{$}}
 ; GCN-NEXT:      '0x2e13 (COMPUTE_PGM_RSRC2)': 0x8001{{$}}
 ; GCN-NEXT:    .shader_functions:
 ; GCN-NEXT:      dynamic_stack:
@@ -156,10 +157,10 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:        .backend_stack_size: 0x10{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
 ; SDAG-NEXT:        .sgpr_count:     0x25{{$}}
-; GISEL-NEXT:        .sgpr_count:     0x26{{$}}
+; GISEL-NEXT:        .sgpr_count:     0x27{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
 ; SDAG-NEXT:        .vgpr_count:     0x3{{$}}
-; GISEL-NEXT:        .vgpr_count:     0x4{{$}}
+; GISEL-NEXT:        .vgpr_count:     0x5{{$}}
 ; GCN-NEXT:      multiple_stack:
 ; GCN-NEXT:        .backend_stack_size: 0x24{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index bfc249e9081d22..340f0cdd5d5d07 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -245,6 +245,7 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[PHI2]], %subreg.sub0, killed [[PHI3]], %subreg.sub1
   ; SI-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+  ; SI-NEXT:   [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.3:
   ; SI-NEXT:   successors: %bb.4(0x80000000)
@@ -261,8 +262,7 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun
   ; SI-NEXT:   successors: %bb.3(0x40000000), %bb.5(0x40000000)
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; SI-NEXT:   [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
-  ; SI-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY6]]
+  ; SI-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY6]]
   ; SI-NEXT:   $vgpr0 = COPY killed [[PHI5]]
   ; SI-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
   ; SI-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
@@ -282,6 +282,7 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY1]], %subreg.sub0, killed [[COPY]], %subreg.sub1
   ; SI-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+  ; SI-NEXT:   [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.7:
   ; SI-NEXT:   successors: %bb.8(0x80000000)
@@ -298,8 +299,7 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun
   ; SI-NEXT:   successors: %bb.7(0x40000000), %bb.9(0x40000000)
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; SI-NEXT:   [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
-  ; SI-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY9]]
+  ; SI-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY9]]
   ; SI-NEXT:   $vgpr0 = COPY killed [[PHI7]]
   ; SI-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE3]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
   ; SI-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
@@ -367,6 +367,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[PHI1]], %subreg.sub0, killed [[PHI2]], %subreg.sub1
   ; SI-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+  ; SI-NEXT:   [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.3:
   ; SI-NEXT:   successors: %bb.4(0x80000000)
@@ -382,8 +383,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e
   ; SI-NEXT:   successors: %bb.3(0x40000000), %bb.5(0x40000000)
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; SI-NEXT:   [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
-  ; SI-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY6]]
+  ; SI-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY6]]
   ; SI-NEXT:   $vgpr0 = COPY [[COPY4]]
   ; SI-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
   ; SI-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
@@ -403,6 +403,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY1]], %subreg.sub0, killed [[COPY]], %subreg.sub1
   ; SI-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+  ; SI-NEXT:   [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.7:
   ; SI-NEXT:   successors: %bb.8(0x80000000)
@@ -418,8 +419,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e
   ; SI-NEXT:   successors: %bb.7(0x40000000), %bb.9(0x40000000)
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; SI-NEXT:   [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
-  ; SI-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY9]]
+  ; SI-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY9]]
   ; SI-NEXT:   $vgpr0 = COPY [[COPY4]]
   ; SI-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE3]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
   ; SI-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
diff --git a/llvm/test/CodeGen/AVR/shift.ll b/llvm/test/CodeGen/AVR/shift.ll
index c0abc77c9b14ae..55ea509a8a5b67 100644
--- a/llvm/test/CodeGen/AVR/shift.ll
+++ b/llvm/test/CodeGen/AVR/shift.ll
@@ -60,13 +60,13 @@ define i64 @shift_i64_i64(i64 %a, i64 %b) {
 ; CHECK-NEXT:    breq .LBB3_3
 ; CHECK-NEXT:  ; %bb.1: ; %shift.loop.preheader
 ; CHECK-NEXT:    mov r27, r1
-; CHECK-NEXT:    mov r16, r1
-; CHECK-NEXT:    mov r17, r1
+; CHECK-NEXT:    mov r16, r27
+; CHECK-NEXT:    mov r17, r27
 ; CHECK-NEXT:  .LBB3_2: ; %shift.loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    mov r31, r21
 ; CHECK-NEXT:    lsl r31
-; CHECK-NEXT:    mov r26, r1
+; CHECK-NEXT:    mov r26, r27
 ; CHECK-NEXT:    rol r26
 ; CHECK-NEXT:    lsl r22
 ; CHECK-NEXT:    rol r23
diff --git a/llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll b/llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll
new file mode 100644
index 00000000000000..e30bdfb939471f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O3 < %s -mtriple=riscv64 -mattr=+v | FileCheck %s
+
+declare i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32>)
+
+define i32 @test(ptr %a, i64 %n)  {
+; CHECK-LABEL: test:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v8, zero
+; CHECK-NEXT:  .LBB0_1: # %loop
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vl1re32.v v9, (a0)
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:    vredsum.vs v9, v9, v8
+; CHECK-NEXT:    vmv.x.s a3, v9
+; CHECK-NEXT:    addw a3, a3, a3
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    addi a0, a0, 8
+; CHECK-NEXT:    bnez a1, .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %exit
+; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    ret
+entry:
+  br label %loop
+
+loop:
+  %indvar = phi i64 [ 0, %entry ], [ %indvar.inc, %loop ]
+  %sum = phi i32 [ 0, %entry ], [ %sum.inc, %loop ]
+  %idx = getelementptr inbounds ptr, ptr %a, i64 %indvar
+  %data = load <vscale x 2 x i32>, ptr %idx
+  %reduce = tail call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> %data)
+  %sum.inc = add i32 %reduce, %reduce
+  %indvar.inc = add i64 %indvar, 1
+  %cmp = icmp eq i64 %indvar.inc, %n
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  ret i32 %sum
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll b/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll
index 42d6dac5b07fa3..5ced89c17c4208 100644
--- a/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll
@@ -15,27 +15,30 @@ define void @foo(<vscale x 8 x i8> %0) {
 ; CHECK-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    sd s2, 0(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset ra, -8
 ; CHECK-NEXT:    .cfi_offset s0, -16
 ; CHECK-NEXT:    .cfi_offset s1, -24
+; CHECK-NEXT:    .cfi_offset s2, -32
+; CHECK-NEXT:    li s0, 0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vsetivli zero, 0, e8, m1, tu, ma
 ; CHECK-NEXT:    vslideup.vi v9, v10, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmv.x.s s0, v9
+; CHECK-NEXT:    vmv.x.s s1, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, tu, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmv.x.s s1, v8
+; CHECK-NEXT:    vmv.x.s s2, v8
 ; CHECK-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    li a1, 0
-; CHECK-NEXT:    mv a0, s0
-; CHECK-NEXT:    mv a2, s1
-; CHECK-NEXT:    li a3, 0
-; CHECK-NEXT:    li a4, 0
-; CHECK-NEXT:    li a5, 0
-; CHECK-NEXT:    jalr a1
+; CHECK-NEXT:    mv a0, s1
+; CHECK-NEXT:    mv a1, s0
+; CHECK-NEXT:    mv a2, s2
+; CHECK-NEXT:    mv a3, s0
+; CHECK-NEXT:    mv a4, s0
+; CHECK-NEXT:    mv a5, s0
+; CHECK-NEXT:    jalr s0
 ; CHECK-NEXT:    j .LBB0_1
   %2 = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v16i8(<vscale x 8 x i8> undef, <16 x i8> undef, i64 0)
   %3 = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v16i8(<vscale x 8 x i8> undef, <16 x i8> poison, i64 0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll b/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll
index 9da4d7ec9f2d05..4aa26d6b79ca46 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll
@@ -11,22 +11,22 @@ define i32 @test(i32 %size, ptr %add.ptr, i64 %const) {
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a3
 ; RV32-NEXT:    addi a3, a2, 1
-; RV32-NEXT:    addi a4, a0, 1
+; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    vsetvli zero, a3, e8, mf2, tu, ma
+; RV32-NEXT:    vslideup.vx v8, v9, a2
+; RV32-NEXT:    addi a2, a0, 1
 ; RV32-NEXT:  .LBB0_1: # %for.body
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-NEXT:    th.lrb a0, a1, a0, 0
-; RV32-NEXT:    vmv.s.x v9, zero
-; RV32-NEXT:    vmv1r.v v10, v8
-; RV32-NEXT:    vsetvli zero, a3, e8, mf2, tu, ma
-; RV32-NEXT:    vslideup.vx v10, v9, a2
-; RV32-NEXT:    vsetivli zero, 8, e8, mf2, tu, ma
-; RV32-NEXT:    vmv.s.x v10, a0
-; RV32-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV32-NEXT:    vmseq.vi v9, v10, 0
+; RV32-NEXT:    vmv1r.v v9, v8
+; RV32-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
+; RV32-NEXT:    vmv.s.x v9, a0
+; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT:    vmseq.vi v9, v9, 0
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    andi a5, a0, 255
-; RV32-NEXT:    mv a0, a4
-; RV32-NEXT:    bnez a5, .LBB0_1
+; RV32-NEXT:    andi a3, a0, 255
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    bnez a3, .LBB0_1
 ; RV32-NEXT:  # %bb.2: # %if.then381
 ; RV32-NEXT:    li a0, 0
 ; RV32-NEXT:    ret
@@ -37,23 +37,23 @@ define i32 @test(i32 %size, ptr %add.ptr, i64 %const) {
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v8, a3
 ; RV64-NEXT:    addi a3, a2, 1
-; RV64-NEXT:    addi a4, a0, 1
+; RV64-NEXT:    vmv.s.x v9, zero
+; RV64-NEXT:    vsetvli zero, a3, e8, mf2, tu, ma
+; RV64-NEXT:    vslideup.vx v8, v9, a2
+; RV64-NEXT:    addi a2, a0, 1
 ; RV64-NEXT:  .LBB0_1: # %for.body
 ; RV64-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64-NEXT:    sext.w a0, a0
 ; RV64-NEXT:    th.lrb a0, a1, a0, 0
-; RV64-NEXT:    vmv.s.x v9, zero
-; RV64-NEXT:    vmv1r.v v10, v8
-; RV64-NEXT:    vsetvli zero, a3, e8, mf2, tu, ma
-; RV64-NEXT:    vslideup.vx v10, v9, a2
-; RV64-NEXT:    vsetivli zero, 8, e8, mf2, tu, ma
-; RV64-NEXT:    vmv.s.x v10, a0
-; RV64-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV64-NEXT:    vmseq.vi v9, v10, 0
+; RV64-NEXT:    vmv1r.v v9, v8
+; RV64-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
+; RV64-NEXT:    vmv.s.x v9, a0
+; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT:    vmseq.vi v9, v9, 0
 ; RV64-NEXT:    vmv.x.s a0, v9
-; RV64-NEXT:    andi a5, a0, 255
-; RV64-NEXT:    mv a0, a4
-; RV64-NEXT:    bnez a5, .LBB0_1
+; RV64-NEXT:    andi a3, a0, 255
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    bnez a3, .LBB0_1
 ; RV64-NEXT:  # %bb.2: # %if.then381
 ; RV64-NEXT:    li a0, 0
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/vlenb.ll b/llvm/test/CodeGen/RISCV/vlenb.ll
index 1d6c1b5d1acbdc..26d4f99c3b9792 100644
--- a/llvm/test/CodeGen/RISCV/vlenb.ll
+++ b/llvm/test/CodeGen/RISCV/vlenb.ll
@@ -71,10 +71,13 @@ define void @machine_licm() {
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset ra, -4
+; CHECK-NEXT:    .cfi_offset s0, -8
+; CHECK-NEXT:    csrr s0, vlenb
 ; CHECK-NEXT:  .LBB4_1: # %loop
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    mv a0, s0
 ; CHECK-NEXT:    call use
 ; CHECK-NEXT:    j .LBB4_1
 entry:

From 476a6d81a3648cf638400632c098e9f0ed025f8f Mon Sep 17 00:00:00 2001
From: MagentaTreehouse <99200384+MagentaTreehouse@users.noreply.github.com>
Date: Wed, 29 May 2024 02:12:26 -0400
Subject: [PATCH 064/230] [NFC] Construct Twines before concatenation (#90728)

Construct `Twine`s before concatenation.
---
 clang/lib/Driver/ToolChains/HIPUtility.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/HIPUtility.cpp b/clang/lib/Driver/ToolChains/HIPUtility.cpp
index b1ff697b368b13..f32a23f111e4bf 100644
--- a/clang/lib/Driver/ToolChains/HIPUtility.cpp
+++ b/clang/lib/Driver/ToolChains/HIPUtility.cpp
@@ -106,9 +106,9 @@ class HIPUndefinedFatBinSymbols {
         std::string ID = IA->getId().str();
         if (!ID.empty()) {
           ID = llvm::utohexstr(llvm::MD5Hash(ID), /*LowerCase=*/true);
-          FatBinSymbols.insert(Twine(FatBinPrefix + "_" + ID).str());
+          FatBinSymbols.insert((FatBinPrefix + Twine('_') + ID).str());
           GPUBinHandleSymbols.insert(
-              Twine(GPUBinHandlePrefix + "_" + ID).str());
+              (GPUBinHandlePrefix + Twine('_') + ID).str());
           continue;
         }
         if (IA->getInputArg().getNumValues() == 0)

From 7f58ffd09b29d3ff4f9fa025bd4d05dd8fd9fc38 Mon Sep 17 00:00:00 2001
From: Guray Ozen <guray.ozen@gmail.com>
Date: Wed, 29 May 2024 08:43:13 +0200
Subject: [PATCH 065/230] [mlir][python] Yield results of `scf.for_` (#93610)

Using `for_` is very handy with the Python bindings. Currently, it doesn't
support results, so we had to fall back to the two-line `scf.for` form.

This PR makes `for_` yield the results of `scf.for`.

---------

Co-authored-by: Maksim Levental <maksim.levental@gmail.com>
---
 mlir/python/mlir/dialects/scf.py |  4 +--
 mlir/test/python/dialects/scf.py | 50 ++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/mlir/python/mlir/dialects/scf.py b/mlir/python/mlir/dialects/scf.py
index dad7377987e56c..7025f6e0f1a166 100644
--- a/mlir/python/mlir/dialects/scf.py
+++ b/mlir/python/mlir/dialects/scf.py
@@ -132,8 +132,8 @@ def for_(
     iter_args = tuple(for_op.inner_iter_args)
     with InsertionPoint(for_op.body):
         if len(iter_args) > 1:
-            yield iv, iter_args
+            yield iv, iter_args, for_op.results
         elif len(iter_args) == 1:
-            yield iv, iter_args[0]
+            yield iv, iter_args[0], for_op.results[0]
         else:
             yield iv
diff --git a/mlir/test/python/dialects/scf.py b/mlir/test/python/dialects/scf.py
index ee8d09aa301d98..95a6de86b670d5 100644
--- a/mlir/test/python/dialects/scf.py
+++ b/mlir/test/python/dialects/scf.py
@@ -176,6 +176,56 @@ def range_loop_7(lb, ub, step, memref_v):
             memref.store(add, memref_v, [i])
             scf.yield_([])
 
+    # CHECK:  func.func @loop_yield_1(%[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index, %[[VAL_3:.*]]: memref<10xindex>) {
+    # CHECK:    %[[VAL_4:.*]] = arith.constant 0 : index
+    # CHECK:    %[[VAL_5:.*]] = arith.constant 0 : index
+    # CHECK:    %[[VAL_6:.*]] = arith.constant 0 : index
+    # CHECK:    %[[VAL_7:.*]] = arith.constant 100 : index
+    # CHECK:    %[[VAL_8:.*]] = arith.constant 1 : index
+    # CHECK:    %[[VAL_10:.*]] = scf.for %[[IV:.*]] = %[[VAL_6]] to %[[VAL_7]] step %[[VAL_8]] iter_args(%[[ITER:.*]] = %[[VAL_4]]) -> (index) {
+    # CHECK:      %[[VAL_9:.*]] = arith.addi %[[ITER]], %[[IV]] : index
+    # CHECK:      scf.yield %[[VAL_9]] : index
+    # CHECK:    }
+    # CHECK:    memref.store %[[VAL_10]], %[[VAL_3]]{{\[}}%[[VAL_5]]] : memref<10xindex>
+    # CHECK:    return
+    # CHECK:  }
+    @func.FuncOp.from_py_func(index_type, index_type, index_type, memref_t)
+    def loop_yield_1(lb, ub, step, memref_v):
+        sum = arith.ConstantOp.create_index(0)
+        c0 = arith.ConstantOp.create_index(0)
+        for i, loc_sum, sum in scf.for_(0, 100, 1, [sum]):
+            loc_sum = arith.addi(loc_sum, i)
+            scf.yield_([loc_sum])
+        memref.store(sum, memref_v, [c0])
+
+    # CHECK:  func.func @loop_yield_2(%[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index, %[[VAL_3:.*]]: memref<10xindex>) {
+    # CHECK:    %[[c0:.*]] = arith.constant 0 : index
+    # CHECK:    %[[c2:.*]] = arith.constant 2 : index
+    # CHECK:    %[[REF1:.*]] = arith.constant 0 : index
+    # CHECK:    %[[REF2:.*]] = arith.constant 1 : index
+    # CHECK:    %[[VAL_6:.*]] = arith.constant 0 : index
+    # CHECK:    %[[VAL_7:.*]] = arith.constant 100 : index
+    # CHECK:    %[[VAL_8:.*]] = arith.constant 1 : index
+    # CHECK:    %[[RES:.*]] = scf.for %[[IV:.*]] = %[[VAL_6]] to %[[VAL_7]] step %[[VAL_8]] iter_args(%[[ITER1:.*]] = %[[c0]], %[[ITER2:.*]] = %[[c2]]) -> (index, index) {
+    # CHECK:      %[[VAL_9:.*]] = arith.addi %[[ITER1]], %[[IV]] : index
+    # CHECK:      %[[VAL_10:.*]] = arith.addi %[[ITER2]], %[[IV]] : index
+    # CHECK:      scf.yield %[[VAL_9]], %[[VAL_10]] : index, index
+    # CHECK:    }
+    # CHECK:    return
+    # CHECK:  }
+    @func.FuncOp.from_py_func(index_type, index_type, index_type, memref_t)
+    def loop_yield_2(lb, ub, step, memref_v):
+        sum1 = arith.ConstantOp.create_index(0)
+        sum2 = arith.ConstantOp.create_index(2)
+        c0 = arith.ConstantOp.create_index(0)
+        c1 = arith.ConstantOp.create_index(1)
+        for i, [loc_sum1, loc_sum2], [sum1, sum2] in scf.for_(0, 100, 1, [sum1, sum2]):
+            loc_sum1 = arith.addi(loc_sum1, i)
+            loc_sum2 = arith.addi(loc_sum2, i)
+            scf.yield_([loc_sum1, loc_sum2])
+        memref.store(sum1, memref_v, [c0])
+        memref.store(sum2, memref_v, [c1])
+
 
 @constructAndPrintInModule
 def testOpsAsArguments():

From c2a9a974ca85e4ac4509e368d4b9acae7e67bf71 Mon Sep 17 00:00:00 2001
From: Antonio Frighetto <me@antoniofrighetto.com>
Date: Sat, 18 May 2024 16:46:03 +0200
Subject: [PATCH 066/230] [LICM] Introduce test for PR92655 (NFC)

---
 .../LICM/update-scev-after-hoist.ll           | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 llvm/test/Transforms/LICM/update-scev-after-hoist.ll

diff --git a/llvm/test/Transforms/LICM/update-scev-after-hoist.ll b/llvm/test/Transforms/LICM/update-scev-after-hoist.ll
new file mode 100644
index 00000000000000..f834a74b6f247c
--- /dev/null
+++ b/llvm/test/Transforms/LICM/update-scev-after-hoist.ll
@@ -0,0 +1,24 @@
+; RUN: opt -S -passes='loop-unroll,loop-mssa(licm),print<scalar-evolution>' -unroll-count=4 -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCEV-EXPR
+
+define i16 @main() {
+; SCEV-EXPR:      Classifying expressions for: @main
+; SCEV-EXPR-NEXT:  %mul = phi i16 [ 1, %entry ], [ %mul.n.3, %loop ]
+; SCEV-EXPR-NEXT:  -->  %mul U: full-set S: [-32768,32753)		Exits: 4096		LoopDispositions: { %loop: Variant }
+; SCEV-EXPR-NEXT:  %div = phi i16 [ 32767, %entry ], [ %div.n.3, %loop ]
+; SCEV-EXPR-NEXT:  -->  %div U: [-2048,-32768) S: [-2048,-32768)		Exits: 7		LoopDispositions: { %loop: Variant }
+; SCEV-EXPR-NEXT:  %mul.n = mul i16 %mul, 8
+; SCEV-EXPR-NEXT:  -->  (2 * %mul) U: [0,-1) S: [-32768,32767)		Exits: 8192		LoopDispositions: { %loop: Variant }
+entry:
+  br label %loop
+
+loop:
+  %mul = phi i16 [ 1, %entry ], [ %mul.n, %loop ]
+  %div = phi i16 [ 32767, %entry ], [ %div.n, %loop ]
+  %mul.n = mul i16 %mul, 2
+  %div.n = sdiv i16 %div, 2
+  %cmp = icmp sgt i16 %div, 0
+  br i1 %cmp, label %loop, label %end
+
+end:
+  ret i16 %mul
+}

From 70091dc943ade280d75cea1e5ea5e93d9a8f934a Mon Sep 17 00:00:00 2001
From: Antonio Frighetto <me@antoniofrighetto.com>
Date: Sat, 18 May 2024 16:52:17 +0200
Subject: [PATCH 067/230] [LICM] Invalidate cached SCEV results in
 `hoistMulAddAssociation`

While reassociating expressions, LICM is required to invalidate SCEV
results; otherwise, subsequent passes in the pipeline that leverage
LICM foldings (e.g. IndVars) may reason on invalid expressions and
thus miscompile. This is achieved by rewriting the reassociable
instruction from scratch.

Fixes: https://github.com/llvm/llvm-project/issues/91957.
---
 llvm/lib/Transforms/Scalar/LICM.cpp              | 16 ++++++++++++++--
 .../Transforms/LICM/update-scev-after-hoist.ll   |  6 +++---
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 6aa4188d1cc4d4..5eccf7b4adb65e 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -2751,7 +2751,7 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L,
   IRBuilder<> Builder(Preheader->getTerminator());
   for (auto *U : Changes) {
     assert(L.isLoopInvariant(U->get()));
-    Instruction *Ins = cast<Instruction>(U->getUser());
+    auto *Ins = cast<BinaryOperator>(U->getUser());
     Value *Mul;
     if (I.getType()->isIntOrIntVectorTy()) {
       Mul = Builder.CreateMul(U->get(), Factor, "factor.op.mul");
@@ -2759,8 +2759,20 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L,
       Ins->dropPoisonGeneratingFlags();
     } else
       Mul = Builder.CreateFMulFMF(U->get(), Factor, Ins, "factor.op.fmul");
-    U->set(Mul);
+
+    // Rewrite the reassociable instruction.
+    unsigned OpIdx = U->getOperandNo();
+    auto *LHS = OpIdx == 0 ? Mul : Ins->getOperand(0);
+    auto *RHS = OpIdx == 1 ? Mul : Ins->getOperand(1);
+    auto *NewBO = BinaryOperator::Create(Ins->getOpcode(), LHS, RHS,
+                                         Ins->getName() + ".reass", Ins);
+    NewBO->copyIRFlags(Ins);
+    if (VariantOp == Ins)
+      VariantOp = NewBO;
+    Ins->replaceAllUsesWith(NewBO);
+    eraseInstruction(*Ins, SafetyInfo, MSSAU);
   }
+
   I.replaceAllUsesWith(VariantOp);
   eraseInstruction(I, SafetyInfo, MSSAU);
   return true;
diff --git a/llvm/test/Transforms/LICM/update-scev-after-hoist.ll b/llvm/test/Transforms/LICM/update-scev-after-hoist.ll
index f834a74b6f247c..fc45b8fce1766a 100644
--- a/llvm/test/Transforms/LICM/update-scev-after-hoist.ll
+++ b/llvm/test/Transforms/LICM/update-scev-after-hoist.ll
@@ -3,11 +3,11 @@
 define i16 @main() {
 ; SCEV-EXPR:      Classifying expressions for: @main
 ; SCEV-EXPR-NEXT:  %mul = phi i16 [ 1, %entry ], [ %mul.n.3, %loop ]
-; SCEV-EXPR-NEXT:  -->  %mul U: full-set S: [-32768,32753)		Exits: 4096		LoopDispositions: { %loop: Variant }
+; SCEV-EXPR-NEXT:  -->  %mul U: [0,-15) S: [-32768,32753)		Exits: 4096		LoopDispositions: { %loop: Variant }
 ; SCEV-EXPR-NEXT:  %div = phi i16 [ 32767, %entry ], [ %div.n.3, %loop ]
 ; SCEV-EXPR-NEXT:  -->  %div U: [-2048,-32768) S: [-2048,-32768)		Exits: 7		LoopDispositions: { %loop: Variant }
-; SCEV-EXPR-NEXT:  %mul.n = mul i16 %mul, 8
-; SCEV-EXPR-NEXT:  -->  (2 * %mul) U: [0,-1) S: [-32768,32767)		Exits: 8192		LoopDispositions: { %loop: Variant }
+; SCEV-EXPR-NEXT:  %mul.n.reass.reass = mul i16 %mul, 8
+; SCEV-EXPR-NEXT:  -->  (8 * %mul) U: [0,-7) S: [-32768,32761)		Exits: -32768		LoopDispositions: { %loop: Variant }
 entry:
   br label %loop
 

From 53d79feec93ef99e2ba0ac8cfc6cf2f81d28bf8a Mon Sep 17 00:00:00 2001
From: Pavel Labath <pavel@labath.sk>
Date: Wed, 29 May 2024 09:17:24 +0200
Subject: [PATCH 068/230] [lldb/DWARF] Bypass the compres^Wconstruction of
 DIERefs in debug_names (#93296)

DebugNamesDWARFIndex was jumping through hoops to construct a DIERef
from an index entry only to jump back through them a short while later
to construct a DWARFDIE.

This used to be necessary as the index lookup was a two-stage process,
where we first enumerated all matches, and then examined them (so it was
important that the enumeration was cheap -- i.e., that it did not trigger
unnecessary parsing). However, now that the processing is callback-based,
we are always immediately examining the DWARFDIE right after finding the
entry, and the DIERef just gets in the way.
---
 .../SymbolFile/DWARF/AppleDWARFIndex.cpp      |  8 ++-
 .../Plugins/SymbolFile/DWARF/DWARFIndex.cpp   |  7 +--
 .../Plugins/SymbolFile/DWARF/DWARFIndex.h     |  9 ++-
 .../SymbolFile/DWARF/DebugNamesDWARFIndex.cpp | 58 ++++++++-----------
 .../SymbolFile/DWARF/DebugNamesDWARFIndex.h   |  2 +-
 5 files changed, 37 insertions(+), 47 deletions(-)

diff --git a/lldb/source/Plugins/SymbolFile/DWARF/AppleDWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/AppleDWARFIndex.cpp
index 33537df4f50762..1703597a7cd2fd 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/AppleDWARFIndex.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/AppleDWARFIndex.cpp
@@ -284,8 +284,12 @@ void AppleDWARFIndex::GetFunctions(
   for (const auto &entry : m_apple_names_up->equal_range(name)) {
     DIERef die_ref(std::nullopt, DIERef::Section::DebugInfo,
                    *entry.getDIESectionOffset());
-    if (!ProcessFunctionDIE(lookup_info, die_ref, dwarf, parent_decl_ctx,
-                            callback))
+    DWARFDIE die = dwarf.GetDIE(die_ref);
+    if (!die) {
+      ReportInvalidDIERef(die_ref, name);
+      continue;
+    }
+    if (!ProcessFunctionDIE(lookup_info, die, parent_decl_ctx, callback))
       return;
   }
 }
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.cpp
index 20c07a94b50769..30fb5d5ebdb0df 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.cpp
@@ -24,16 +24,11 @@ using namespace lldb_private::plugin::dwarf;
 DWARFIndex::~DWARFIndex() = default;
 
 bool DWARFIndex::ProcessFunctionDIE(
-    const Module::LookupInfo &lookup_info, DIERef ref, SymbolFileDWARF &dwarf,
+    const Module::LookupInfo &lookup_info, DWARFDIE die,
     const CompilerDeclContext &parent_decl_ctx,
     llvm::function_ref<bool(DWARFDIE die)> callback) {
   llvm::StringRef name = lookup_info.GetLookupName().GetStringRef();
   FunctionNameType name_type_mask = lookup_info.GetNameTypeMask();
-  DWARFDIE die = dwarf.GetDIE(ref);
-  if (!die) {
-    ReportInvalidDIERef(ref, name);
-    return true;
-  }
 
   if (!(name_type_mask & eFunctionNameTypeFull)) {
     ConstString name_to_match_against;
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.h
index 0551b07100a96b..cb3ae8a06d7885 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.h
@@ -81,11 +81,10 @@ class DWARFIndex {
   StatsDuration m_index_time;
 
   /// Helper function implementing common logic for processing function dies. If
-  /// the function given by "ref" matches search criteria given by
-  /// "parent_decl_ctx" and "name_type_mask", it is inserted into the "dies"
-  /// vector.
-  bool ProcessFunctionDIE(const Module::LookupInfo &lookup_info, DIERef ref,
-                          SymbolFileDWARF &dwarf,
+  /// the function given by "die" matches search criteria given by
+  /// "parent_decl_ctx" and "name_type_mask", it calls the callback with the
+  /// given die.
+  bool ProcessFunctionDIE(const Module::LookupInfo &lookup_info, DWARFDIE die,
                           const CompilerDeclContext &parent_decl_ctx,
                           llvm::function_ref<bool(DWARFDIE die)> callback);
 
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp
index c98e5481609dea..56717bab1ecd86 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp
@@ -64,27 +64,25 @@ DebugNamesDWARFIndex::GetNonSkeletonUnit(const DebugNames::Entry &entry) const {
   return cu ? &cu->GetNonSkeletonUnit() : nullptr;
 }
 
-std::optional<DIERef>
-DebugNamesDWARFIndex::ToDIERef(const DebugNames::Entry &entry) const {
+DWARFDIE DebugNamesDWARFIndex::GetDIE(const DebugNames::Entry &entry) const {
   DWARFUnit *unit = GetNonSkeletonUnit(entry);
-  if (!unit)
-    return std::nullopt;
-  if (std::optional<uint64_t> die_offset = entry.getDIEUnitOffset())
-    return DIERef(unit->GetSymbolFileDWARF().GetFileIndex(),
-                  DIERef::Section::DebugInfo, unit->GetOffset() + *die_offset);
-
-  return std::nullopt;
+  std::optional<uint64_t> die_offset = entry.getDIEUnitOffset();
+  if (!unit || !die_offset)
+    return DWARFDIE();
+  if (DWARFDIE die = unit->GetDIE(unit->GetOffset() + *die_offset))
+    return die;
+
+  m_module.ReportErrorIfModifyDetected(
+      "the DWARF debug information has been modified (bad offset {0:x} in "
+      "debug_names section)\n",
+      *die_offset);
+  return DWARFDIE();
 }
 
 bool DebugNamesDWARFIndex::ProcessEntry(
     const DebugNames::Entry &entry,
     llvm::function_ref<bool(DWARFDIE die)> callback) {
-  std::optional<DIERef> ref = ToDIERef(entry);
-  if (!ref)
-    return true;
-  SymbolFileDWARF &dwarf = *llvm::cast<SymbolFileDWARF>(
-      m_module.GetSymbolFile()->GetBackingSymbolFile());
-  DWARFDIE die = dwarf.GetDIE(*ref);
+  DWARFDIE die = GetDIE(entry);
   if (!die)
     return true;
   // Clang erroneously emits index entries for declaration DIEs in case when the
@@ -187,7 +185,7 @@ void DebugNamesDWARFIndex::GetCompleteObjCClass(
     llvm::function_ref<bool(DWARFDIE die)> callback) {
   // Keep a list of incomplete types as fallback for when we don't find the
   // complete type.
-  DIEArray incomplete_types;
+  std::vector<DWARFDIE> incomplete_types;
 
   for (const DebugNames::Entry &entry :
        m_debug_names_up->equal_range(class_name.GetStringRef())) {
@@ -195,19 +193,14 @@ void DebugNamesDWARFIndex::GetCompleteObjCClass(
         entry.tag() != DW_TAG_class_type)
       continue;
 
-    std::optional<DIERef> ref = ToDIERef(entry);
-    if (!ref)
-      continue;
-
-    DWARFUnit *cu = m_debug_info.GetUnit(*ref);
-    if (!cu || !cu->Supports_DW_AT_APPLE_objc_complete_type()) {
-      incomplete_types.push_back(*ref);
+    DWARFDIE die = GetDIE(entry);
+    if (!die) {
+      // Report invalid
       continue;
     }
-
-    DWARFDIE die = m_debug_info.GetDIE(*ref);
-    if (!die) {
-      ReportInvalidDIERef(*ref, class_name.GetStringRef());
+    DWARFUnit *cu = die.GetCU();
+    if (!cu->Supports_DW_AT_APPLE_objc_complete_type()) {
+      incomplete_types.push_back(die);
       continue;
     }
 
@@ -216,12 +209,11 @@ void DebugNamesDWARFIndex::GetCompleteObjCClass(
       callback(die);
       return;
     }
-    incomplete_types.push_back(*ref);
+    incomplete_types.push_back(die);
   }
 
-  auto dierefcallback = DIERefCallback(callback, class_name.GetStringRef());
-  for (DIERef ref : incomplete_types)
-    if (!dierefcallback(ref))
+  for (DWARFDIE die : incomplete_types)
+    if (!callback(die))
       return;
 
   m_fallback.GetCompleteObjCClass(class_name, must_be_implementation, callback);
@@ -383,8 +375,8 @@ void DebugNamesDWARFIndex::GetFunctions(
     if (tag != DW_TAG_subprogram && tag != DW_TAG_inlined_subroutine)
       continue;
 
-    if (std::optional<DIERef> ref = ToDIERef(entry)) {
-      if (!ProcessFunctionDIE(lookup_info, *ref, dwarf, parent_decl_ctx,
+    if (DWARFDIE die = GetDIE(entry)) {
+      if (!ProcessFunctionDIE(lookup_info, die, parent_decl_ctx,
                               [&](DWARFDIE die) {
                                 if (!seen.insert(die.GetDIE()).second)
                                   return true;
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.h b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.h
index 81fb8f88b805af..a27a414ecdd193 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.h
@@ -84,7 +84,7 @@ class DebugNamesDWARFIndex : public DWARFIndex {
   ManualDWARFIndex m_fallback;
 
   DWARFUnit *GetNonSkeletonUnit(const DebugNames::Entry &entry) const;
-  std::optional<DIERef> ToDIERef(const DebugNames::Entry &entry) const;
+  DWARFDIE GetDIE(const DebugNames::Entry &entry) const;
   bool ProcessEntry(const DebugNames::Entry &entry,
                     llvm::function_ref<bool(DWARFDIE die)> callback);
 

From 2cfea14a57ad8443c6898d2310abb4346dc92ad2 Mon Sep 17 00:00:00 2001
From: Pavel Labath <pavel@labath.sk>
Date: Wed, 29 May 2024 09:27:32 +0200
Subject: [PATCH 069/230] [lldb-dap] Add timestamps to protocol logs (#93540)

I've found them very useful as a rudimentary form of benchmark.
---
 lldb/tools/lldb-dap/DAP.cpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp
index c7eb3db4304a90..d419f821999e6c 100644
--- a/lldb/tools/lldb-dap/DAP.cpp
+++ b/lldb/tools/lldb-dap/DAP.cpp
@@ -103,7 +103,9 @@ void DAP::SendJSON(const llvm::json::Value &json) {
   SendJSON(json_str);
 
   if (log) {
-    *log << "<-- " << std::endl
+    auto now = std::chrono::duration<double>(
+        std::chrono::system_clock::now().time_since_epoch());
+    *log << llvm::formatv("{0:f9} <-- ", now.count()).str() << std::endl
          << "Content-Length: " << json_str.size() << "\r\n\r\n"
          << llvm::formatv("{0:2}", json).str() << std::endl;
   }
@@ -130,9 +132,12 @@ std::string DAP::ReadJSON() {
   if (!input.read_full(log.get(), length, json_str))
     return json_str;
 
-  if (log)
-    *log << "--> " << std::endl << "Content-Length: " << length << "\r\n\r\n";
-
+  if (log) {
+    auto now = std::chrono::duration<double>(
+        std::chrono::system_clock::now().time_since_epoch());
+    *log << llvm::formatv("{0:f9} --> ", now.count()).str() << std::endl
+         << "Content-Length: " << length << "\r\n\r\n";
+  }
   return json_str;
 }
 

From 98714866830f505d7bb87de6b92a28f280a34b9b Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 22 Mar 2024 12:04:58 -0500
Subject: [PATCH 070/230] [InstCombine] Add multiuse tests for canonicalizing
 (icmp eq/ne (and x, y), {x,y}); NFC

---
 .../test/Transforms/InstCombine/icmp-of-and-x.ll | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll b/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll
index e95c72b75f97df..75070e5a34f949 100644
--- a/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll
@@ -3,6 +3,7 @@
 
 declare i1 @barrier()
 declare void @llvm.assume(i1)
+declare void @use.i8(i8)
 
 define i1 @icmp_ult_x_y(i8 %x, i8 %y) {
 ; CHECK-LABEL: @icmp_ult_x_y(
@@ -262,6 +263,21 @@ define i1 @icmp_eq_x_invertable_y(i8 %x, i8 %y) {
   ret i1 %r
 }
 
+define i1 @icmp_eq_x_invertable_y_fail_multiuse(i8 %x, i8 %y) {
+; CHECK-LABEL: @icmp_eq_x_invertable_y_fail_multiuse(
+; CHECK-NEXT:    [[YY:%.*]] = xor i8 [[Y:%.*]], -1
+; CHECK-NEXT:    [[AND:%.*]] = and i8 [[YY]], [[X:%.*]]
+; CHECK-NEXT:    call void @use.i8(i8 [[AND]])
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[AND]], [[X]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %yy = xor i8 %y, -1
+  %and = and i8 %x, %yy
+  call void @use.i8(i8 %and)
+  %r = icmp eq i8 %x, %and
+  ret i1 %r
+}
+
 define i1 @icmp_eq_x_invertable_y2_todo(i8 %x, i1 %y) {
 ; CHECK-LABEL: @icmp_eq_x_invertable_y2_todo(
 ; CHECK-NEXT:    [[YY:%.*]] = select i1 [[Y:%.*]], i8 7, i8 24

From 5532ab17327f2887fdac739ffaaae6c341695370 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 13 Sep 2023 13:45:58 -0500
Subject: [PATCH 071/230] [InstCombine] Make the `(icmp eq/ne (and X, Y), X)`
 canonicalization work for non-const operands

We currently do:
    `(icmp eq/ne (and X, Y), Y)` -> `(icmp eq/ne (and ~X, Y), 0)`
if `X` is constant. We can make this more general and do it if `X` is
freely invertible (i.e., say `X = ~Z`).

We can also do:
    `(icmp eq/ne (and X, Y), Y)` -> `(icmp eq/ne (or X, ~Y), -1)`
if `Y` is freely invertible.

Proofs: https://alive2.llvm.org/ce/z/yeWH3E

Differential Revision: https://reviews.llvm.org/D159059

Closes #84688
---
 .../InstCombine/InstCombineCompares.cpp       | 30 +++++-----
 .../ValueTracking/known-power-of-two-urem.ll  | 18 +++---
 ...low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll | 17 +++---
 ...low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll | 17 +++---
 ...low-bit-mask-v3-and-icmp-eq-to-icmp-ule.ll |  8 +--
 ...low-bit-mask-v3-and-icmp-ne-to-icmp-ugt.ll |  8 +--
 .../InstCombine/icmp-and-lowbit-mask.ll       | 59 +++++++++----------
 .../Transforms/InstCombine/icmp-of-and-x.ll   | 20 +++----
 8 files changed, 88 insertions(+), 89 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index c3272d97509f53..89193f8ff94b6e 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -4730,6 +4730,21 @@ static Instruction *foldICmpAndXX(ICmpInst &I, const SimplifyQuery &Q,
   if (Pred == ICmpInst::ICMP_UGE)
     return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
 
+  if (ICmpInst::isEquality(Pred) && Op0->hasOneUse()) {
+    // icmp (X & Y) eq/ne Y --> (X | ~Y) eq/ne -1 if Y is freely invertible and
+    // Y is non-constant. If Y is constant the `X & C == C` form is preferable
+    // so don't do this fold.
+    if (!match(Op1, m_ImmConstant()))
+      if (auto *NotOp1 =
+              IC.getFreelyInverted(Op1, !Op1->hasNUsesOrMore(3), &IC.Builder))
+        return new ICmpInst(Pred, IC.Builder.CreateOr(A, NotOp1),
+                            Constant::getAllOnesValue(Op1->getType()));
+    // icmp (X & Y) eq/ne Y --> (~X & Y) eq/ne 0 if X  is freely invertible.
+    if (auto *NotA = IC.getFreelyInverted(A, A->hasOneUse(), &IC.Builder))
+      return new ICmpInst(Pred, IC.Builder.CreateAnd(Op1, NotA),
+                          Constant::getNullValue(Op1->getType()));
+  }
+
   return nullptr;
 }
 
@@ -5505,21 +5520,6 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) {
     }
   }
 
-  // canoncalize:
-  // (icmp eq/ne (and X, C), X)
-  //    -> (icmp eq/ne (and X, ~C), 0)
-  {
-    Constant *CMask;
-    A = nullptr;
-    if (match(Op0, m_OneUse(m_And(m_Specific(Op1), m_ImmConstant(CMask)))))
-      A = Op1;
-    else if (match(Op1, m_OneUse(m_And(m_Specific(Op0), m_ImmConstant(CMask)))))
-      A = Op0;
-    if (A)
-      return new ICmpInst(Pred, Builder.CreateAnd(A, Builder.CreateNot(CMask)),
-                          Constant::getNullValue(A->getType()));
-  }
-
   if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && (A == Op0 || B == Op0)) {
     // A == (A^B)  ->  B == 0
     Value *OtherVal = A == Op0 ? B : A;
diff --git a/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll b/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll
index 47c4587f6991bd..ba3a484441e9e3 100644
--- a/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll
+++ b/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll
@@ -428,9 +428,9 @@ define i8 @known_power_of_two_lshr_add_one_allow_zero(i8 %x, i8 %y) {
 define i1 @known_power_of_two_lshr_add_one_nuw_deny_zero(i8 %x, i8 %y) {
 ; CHECK-LABEL: @known_power_of_two_lshr_add_one_nuw_deny_zero(
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i8 -1, [[X:%.*]]
-; CHECK-NEXT:    [[P:%.*]] = add nuw i8 [[TMP1]], 1
-; CHECK-NEXT:    [[AND:%.*]] = and i8 [[P]], [[Y:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[AND]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i8 -2, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = or i8 [[TMP2]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[TMP3]], -1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %4 = lshr i8 -1, %x
@@ -445,9 +445,9 @@ define i1 @known_power_of_two_lshr_add_one_nuw_deny_zero(i8 %x, i8 %y) {
 define i1 @negative_known_power_of_two_lshr_add_one_deny_zero(i8 %x, i8 %y) {
 ; CHECK-LABEL: @negative_known_power_of_two_lshr_add_one_deny_zero(
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i8 -1, [[X:%.*]]
-; CHECK-NEXT:    [[P:%.*]] = add i8 [[TMP1]], 1
-; CHECK-NEXT:    [[AND:%.*]] = and i8 [[P]], [[Y:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[AND]], [[P]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i8 -2, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = or i8 [[TMP2]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[TMP3]], -1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %4 = lshr i8 -1, %x
@@ -462,9 +462,9 @@ define i1 @negative_known_power_of_two_lshr_add_one_deny_zero(i8 %x, i8 %y) {
 define i1 @negative_known_power_of_two_lshr_add_one_nsw_deny_zero(i8 %x, i8 %y) {
 ; CHECK-LABEL: @negative_known_power_of_two_lshr_add_one_nsw_deny_zero(
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i8 -1, [[X:%.*]]
-; CHECK-NEXT:    [[P:%.*]] = add nsw i8 [[TMP1]], 1
-; CHECK-NEXT:    [[AND:%.*]] = and i8 [[P]], [[Y:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[AND]], [[P]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i8 -2, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = or i8 [[TMP2]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[TMP3]], -1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %4 = lshr i8 -1, %x
diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll
index 88487b38e2c708..0a7de501ca0225 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll
@@ -144,7 +144,7 @@ define i1 @oneuse0(i8 %x, i8 %y) {
 ; CHECK-LABEL: @oneuse0(
 ; CHECK-NEXT:    [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
 ; CHECK-NEXT:    call void @use8(i8 [[T0]])
-; CHECK-NEXT:    [[X_HIGHBITS:%.*]] = lshr i8 [[X:%.*]], [[Y]]
+; CHECK-NEXT:    [[X_HIGHBITS:%.*]] = and i8 [[T0]], [[X:%.*]]
 ; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[X_HIGHBITS]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
@@ -161,7 +161,8 @@ define i1 @oneuse1(i8 %x, i8 %y) {
 ; CHECK-NEXT:    [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
 ; CHECK-NEXT:    [[T1:%.*]] = xor i8 [[T0]], -1
 ; CHECK-NEXT:    call void @use8(i8 [[T1]])
-; CHECK-NEXT:    [[RET:%.*]] = icmp uge i8 [[T1]], [[X:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[TMP1]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 -1, %y
@@ -195,7 +196,8 @@ define i1 @oneuse3(i8 %x, i8 %y) {
 ; CHECK-NEXT:    call void @use8(i8 [[T0]])
 ; CHECK-NEXT:    [[T1:%.*]] = xor i8 [[T0]], -1
 ; CHECK-NEXT:    call void @use8(i8 [[T1]])
-; CHECK-NEXT:    [[RET:%.*]] = icmp uge i8 [[T1]], [[X:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[TMP1]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 -1, %y
@@ -269,9 +271,8 @@ define i1 @n0(i8 %x, i8 %y, i8 %notx) {
 define i1 @n1(i8 %x, i8 %y) {
 ; CHECK-LABEL: @n1(
 ; CHECK-NEXT:    [[T0:%.*]] = shl nuw i8 1, [[Y:%.*]]
-; CHECK-NEXT:    [[T1:%.*]] = xor i8 [[T0]], -1
-; CHECK-NEXT:    [[T2:%.*]] = and i8 [[T1]], [[X:%.*]]
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[T2]], [[X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[TMP1]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 1, %y ; not -1
@@ -284,9 +285,9 @@ define i1 @n1(i8 %x, i8 %y) {
 define i1 @n2(i8 %x, i8 %y) {
 ; CHECK-LABEL: @n2(
 ; CHECK-NEXT:    [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
-; CHECK-NEXT:    [[T1:%.*]] = xor i8 [[T0]], 1
+; CHECK-NEXT:    [[T1:%.*]] = xor i8 [[T0]], -2
 ; CHECK-NEXT:    [[T2:%.*]] = and i8 [[T1]], [[X:%.*]]
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[T2]], [[X]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[T2]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 -1, %y
diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll
index b717925fd644fc..54ff87676e71d9 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll
@@ -144,7 +144,7 @@ define i1 @oneuse0(i8 %x, i8 %y) {
 ; CHECK-LABEL: @oneuse0(
 ; CHECK-NEXT:    [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
 ; CHECK-NEXT:    call void @use8(i8 [[T0]])
-; CHECK-NEXT:    [[X_HIGHBITS:%.*]] = lshr i8 [[X:%.*]], [[Y]]
+; CHECK-NEXT:    [[X_HIGHBITS:%.*]] = and i8 [[T0]], [[X:%.*]]
 ; CHECK-NEXT:    [[RET:%.*]] = icmp ne i8 [[X_HIGHBITS]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
@@ -161,7 +161,8 @@ define i1 @oneuse1(i8 %x, i8 %y) {
 ; CHECK-NEXT:    [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
 ; CHECK-NEXT:    [[T1:%.*]] = xor i8 [[T0]], -1
 ; CHECK-NEXT:    call void @use8(i8 [[T1]])
-; CHECK-NEXT:    [[RET:%.*]] = icmp ult i8 [[T1]], [[X:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne i8 [[TMP1]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 -1, %y
@@ -195,7 +196,8 @@ define i1 @oneuse3(i8 %x, i8 %y) {
 ; CHECK-NEXT:    call void @use8(i8 [[T0]])
 ; CHECK-NEXT:    [[T1:%.*]] = xor i8 [[T0]], -1
 ; CHECK-NEXT:    call void @use8(i8 [[T1]])
-; CHECK-NEXT:    [[RET:%.*]] = icmp ult i8 [[T1]], [[X:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne i8 [[TMP1]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 -1, %y
@@ -269,9 +271,8 @@ define i1 @n0(i8 %x, i8 %y, i8 %notx) {
 define i1 @n1(i8 %x, i8 %y) {
 ; CHECK-LABEL: @n1(
 ; CHECK-NEXT:    [[T0:%.*]] = shl nuw i8 1, [[Y:%.*]]
-; CHECK-NEXT:    [[T1:%.*]] = xor i8 [[T0]], -1
-; CHECK-NEXT:    [[T2:%.*]] = and i8 [[T1]], [[X:%.*]]
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne i8 [[T2]], [[X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne i8 [[TMP1]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 1, %y ; not -1
@@ -284,9 +285,9 @@ define i1 @n1(i8 %x, i8 %y) {
 define i1 @n2(i8 %x, i8 %y) {
 ; CHECK-LABEL: @n2(
 ; CHECK-NEXT:    [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
-; CHECK-NEXT:    [[T1:%.*]] = xor i8 [[T0]], 1
+; CHECK-NEXT:    [[T1:%.*]] = xor i8 [[T0]], -2
 ; CHECK-NEXT:    [[T2:%.*]] = and i8 [[T1]], [[X:%.*]]
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne i8 [[T2]], [[X]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne i8 [[T2]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 -1, %y
diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-eq-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-eq-to-icmp-ule.ll
index a65be1e9ceeca3..c7c57b601eab38 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-eq-to-icmp-ule.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-eq-to-icmp-ule.ll
@@ -251,9 +251,9 @@ define i1 @n1(i8 %x, i8 %y) {
 ; CHECK-LABEL: @n1(
 ; CHECK-NEXT:    [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
 ; CHECK-NEXT:    call void @use8(i8 [[T0]])
-; CHECK-NEXT:    [[T1:%.*]] = add i8 [[T0]], -1
+; CHECK-NEXT:    [[T1:%.*]] = sub i8 0, [[T0]]
 ; CHECK-NEXT:    [[T2:%.*]] = and i8 [[T1]], [[X:%.*]]
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[T2]], [[X]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[T2]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 -1, %y ; not 1
@@ -268,9 +268,9 @@ define i1 @n2(i8 %x, i8 %y) {
 ; CHECK-LABEL: @n2(
 ; CHECK-NEXT:    [[T0:%.*]] = shl nuw i8 1, [[Y:%.*]]
 ; CHECK-NEXT:    call void @use8(i8 [[T0]])
-; CHECK-NEXT:    [[T1:%.*]] = add nuw i8 [[T0]], 1
+; CHECK-NEXT:    [[T1:%.*]] = sub nuw i8 -2, [[T0]]
 ; CHECK-NEXT:    [[T2:%.*]] = and i8 [[T1]], [[X:%.*]]
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[T2]], [[X]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[T2]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 1, %y
diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-ne-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-ne-to-icmp-ugt.ll
index f156d9bf007cbb..d5826524f1637c 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-ne-to-icmp-ugt.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-ne-to-icmp-ugt.ll
@@ -251,9 +251,9 @@ define i1 @n1(i8 %x, i8 %y) {
 ; CHECK-LABEL: @n1(
 ; CHECK-NEXT:    [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
 ; CHECK-NEXT:    call void @use8(i8 [[T0]])
-; CHECK-NEXT:    [[T1:%.*]] = add i8 [[T0]], -1
+; CHECK-NEXT:    [[T1:%.*]] = sub i8 0, [[T0]]
 ; CHECK-NEXT:    [[T2:%.*]] = and i8 [[T1]], [[X:%.*]]
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne i8 [[T2]], [[X]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne i8 [[T2]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 -1, %y ; not 1
@@ -268,9 +268,9 @@ define i1 @n2(i8 %x, i8 %y) {
 ; CHECK-LABEL: @n2(
 ; CHECK-NEXT:    [[T0:%.*]] = shl nuw i8 1, [[Y:%.*]]
 ; CHECK-NEXT:    call void @use8(i8 [[T0]])
-; CHECK-NEXT:    [[T1:%.*]] = add nuw i8 [[T0]], 1
+; CHECK-NEXT:    [[T1:%.*]] = sub nuw i8 -2, [[T0]]
 ; CHECK-NEXT:    [[T2:%.*]] = and i8 [[T1]], [[X:%.*]]
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne i8 [[T2]], [[X]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne i8 [[T2]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 1, %y
diff --git a/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll b/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll
index 5de3e89d7027ab..8bb7fd0e522cb0 100644
--- a/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll
@@ -5,9 +5,9 @@ declare void @use.i8(i8)
 declare void @use.i16(i16)
 define i1 @src_is_mask_zext(i16 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_mask_zext(
-; CHECK-NEXT:    [[X:%.*]] = xor i16 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[M_IN:%.*]] = lshr i8 -1, [[Y:%.*]]
 ; CHECK-NEXT:    [[MASK:%.*]] = zext i8 [[M_IN]] to i16
+; CHECK-NEXT:    [[X:%.*]] = xor i16 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ule i16 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -22,11 +22,11 @@ define i1 @src_is_mask_zext(i16 %x_in, i8 %y) {
 
 define i1 @src_is_mask_zext_fail_not_mask(i16 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_mask_zext_fail_not_mask(
-; CHECK-NEXT:    [[X:%.*]] = xor i16 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[M_IN:%.*]] = lshr i8 -2, [[Y:%.*]]
 ; CHECK-NEXT:    [[MASK:%.*]] = zext i8 [[M_IN]] to i16
-; CHECK-NEXT:    [[AND:%.*]] = and i16 [[X]], [[MASK]]
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i16 [[AND]], [[X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i16 [[X_IN:%.*]], -124
+; CHECK-NEXT:    [[TMP2:%.*]] = or i16 [[TMP1]], [[MASK]]
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i16 [[TMP2]], -1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x = xor i16 %x_in, 123
@@ -80,10 +80,10 @@ define i1 @src_is_mask_sext_fail_multiuse(i16 %x_in, i8 %y) {
 
 define i1 @src_is_mask_and(i8 %x_in, i8 %y, i8 %z) {
 ; CHECK-LABEL: @src_is_mask_and(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[MY:%.*]] = lshr i8 7, [[Y:%.*]]
 ; CHECK-NEXT:    [[MZ:%.*]] = lshr i8 -1, [[Z:%.*]]
 ; CHECK-NEXT:    [[MASK:%.*]] = and i8 [[MY]], [[MZ]]
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -99,12 +99,12 @@ define i1 @src_is_mask_and(i8 %x_in, i8 %y, i8 %z) {
 
 define i1 @src_is_mask_and_fail_mixed(i8 %x_in, i8 %y, i8 %z) {
 ; CHECK-LABEL: @src_is_mask_and_fail_mixed(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[MY:%.*]] = ashr i8 -8, [[Y:%.*]]
 ; CHECK-NEXT:    [[MZ:%.*]] = lshr i8 -1, [[Z:%.*]]
 ; CHECK-NEXT:    [[MASK:%.*]] = and i8 [[MY]], [[MZ]]
-; CHECK-NEXT:    [[AND:%.*]] = and i8 [[X]], [[MASK]]
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[X]], [[AND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], -124
+; CHECK-NEXT:    [[TMP2:%.*]] = or i8 [[MASK]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[TMP2]], -1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x = xor i8 %x_in, 123
@@ -119,9 +119,9 @@ define i1 @src_is_mask_and_fail_mixed(i8 %x_in, i8 %y, i8 %z) {
 
 define i1 @src_is_mask_or(i8 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_mask_or(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[MY:%.*]] = lshr i8 -1, [[Y:%.*]]
 ; CHECK-NEXT:    [[MASK:%.*]] = and i8 [[MY]], 7
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -136,9 +136,9 @@ define i1 @src_is_mask_or(i8 %x_in, i8 %y) {
 
 define i1 @src_is_mask_xor(i8 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_mask_xor(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1
 ; CHECK-NEXT:    [[MASK:%.*]] = xor i8 [[Y_M1]], [[Y]]
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -152,11 +152,11 @@ define i1 @src_is_mask_xor(i8 %x_in, i8 %y) {
 
 define i1 @src_is_mask_xor_fail_notmask(i8 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_mask_xor_fail_notmask(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i8 0, [[Y:%.*]]
 ; CHECK-NEXT:    [[NOTMASK:%.*]] = xor i8 [[TMP1]], [[Y]]
-; CHECK-NEXT:    [[AND:%.*]] = and i8 [[X]], [[NOTMASK]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[AND]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i8 [[X_IN:%.*]], -124
+; CHECK-NEXT:    [[TMP3:%.*]] = or i8 [[NOTMASK]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[TMP3]], -1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x = xor i8 %x_in, 123
@@ -170,10 +170,10 @@ define i1 @src_is_mask_xor_fail_notmask(i8 %x_in, i8 %y) {
 
 define i1 @src_is_mask_select(i8 %x_in, i8 %y, i1 %cond) {
 ; CHECK-LABEL: @src_is_mask_select(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1
 ; CHECK-NEXT:    [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]]
 ; CHECK-NEXT:    [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[YMASK]], i8 15
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -245,11 +245,11 @@ define i1 @src_is_mask_shl_lshr_fail_not_allones(i8 %x_in, i8 %y, i1 %cond) {
 
 define i1 @src_is_mask_lshr(i8 %x_in, i8 %y, i8 %z, i1 %cond) {
 ; CHECK-LABEL: @src_is_mask_lshr(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1
 ; CHECK-NEXT:    [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]]
 ; CHECK-NEXT:    [[SMASK:%.*]] = select i1 [[COND:%.*]], i8 [[YMASK]], i8 15
 ; CHECK-NEXT:    [[MASK:%.*]] = lshr i8 [[SMASK]], [[Z:%.*]]
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -265,11 +265,11 @@ define i1 @src_is_mask_lshr(i8 %x_in, i8 %y, i8 %z, i1 %cond) {
 
 define i1 @src_is_mask_ashr(i8 %x_in, i8 %y, i8 %z, i1 %cond) {
 ; CHECK-LABEL: @src_is_mask_ashr(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1
 ; CHECK-NEXT:    [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]]
 ; CHECK-NEXT:    [[SMASK:%.*]] = select i1 [[COND:%.*]], i8 [[YMASK]], i8 15
 ; CHECK-NEXT:    [[MASK:%.*]] = ashr i8 [[SMASK]], [[Z:%.*]]
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -285,9 +285,9 @@ define i1 @src_is_mask_ashr(i8 %x_in, i8 %y, i8 %z, i1 %cond) {
 
 define i1 @src_is_mask_p2_m1(i8 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_mask_p2_m1(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[P2ORZ:%.*]] = shl i8 2, [[Y:%.*]]
 ; CHECK-NEXT:    [[MASK:%.*]] = add i8 [[P2ORZ]], -1
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -301,10 +301,10 @@ define i1 @src_is_mask_p2_m1(i8 %x_in, i8 %y) {
 
 define i1 @src_is_mask_umax(i8 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_mask_umax(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1
 ; CHECK-NEXT:    [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]]
 ; CHECK-NEXT:    [[MASK:%.*]] = call i8 @llvm.umax.i8(i8 [[YMASK]], i8 3)
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -320,11 +320,11 @@ define i1 @src_is_mask_umax(i8 %x_in, i8 %y) {
 
 define i1 @src_is_mask_umin(i8 %x_in, i8 %y, i8 %z) {
 ; CHECK-LABEL: @src_is_mask_umin(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1
 ; CHECK-NEXT:    [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]]
 ; CHECK-NEXT:    [[ZMASK:%.*]] = lshr i8 15, [[Z:%.*]]
 ; CHECK-NEXT:    [[MASK:%.*]] = call i8 @llvm.umin.i8(i8 [[YMASK]], i8 [[ZMASK]])
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -341,12 +341,12 @@ define i1 @src_is_mask_umin(i8 %x_in, i8 %y, i8 %z) {
 
 define i1 @src_is_mask_umin_fail_mismatch(i8 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_mask_umin_fail_mismatch(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1
 ; CHECK-NEXT:    [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]]
 ; CHECK-NEXT:    [[MASK:%.*]] = call i8 @llvm.umin.i8(i8 [[YMASK]], i8 -32)
-; CHECK-NEXT:    [[AND:%.*]] = and i8 [[MASK]], [[X]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[AND]], [[X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], -124
+; CHECK-NEXT:    [[TMP2:%.*]] = or i8 [[MASK]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[TMP2]], -1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x = xor i8 %x_in, 123
@@ -361,10 +361,10 @@ define i1 @src_is_mask_umin_fail_mismatch(i8 %x_in, i8 %y) {
 
 define i1 @src_is_mask_smax(i8 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_mask_smax(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1
 ; CHECK-NEXT:    [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]]
 ; CHECK-NEXT:    [[MASK:%.*]] = call i8 @llvm.smax.i8(i8 [[YMASK]], i8 -1)
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -380,10 +380,10 @@ define i1 @src_is_mask_smax(i8 %x_in, i8 %y) {
 
 define i1 @src_is_mask_smin(i8 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_mask_smin(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1
 ; CHECK-NEXT:    [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]]
 ; CHECK-NEXT:    [[MASK:%.*]] = call i8 @llvm.smin.i8(i8 [[YMASK]], i8 0)
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -399,9 +399,9 @@ define i1 @src_is_mask_smin(i8 %x_in, i8 %y) {
 
 define i1 @src_is_mask_bitreverse_not_mask(i8 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_mask_bitreverse_not_mask(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[NMASK:%.*]] = shl nsw i8 -1, [[Y:%.*]]
 ; CHECK-NEXT:    [[MASK:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[NMASK]])
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -417,7 +417,7 @@ define i1 @src_is_mask_bitreverse_not_mask(i8 %x_in, i8 %y) {
 define i1 @src_is_notmask_sext(i16 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_notmask_sext(
 ; CHECK-NEXT:    [[M_IN:%.*]] = shl i8 -8, [[Y:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = xor i16 [[X_IN:%.*]], -124
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i16 [[X_IN:%.*]], -128
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[M_IN]] to i16
 ; CHECK-NEXT:    [[R:%.*]] = icmp uge i16 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    ret i1 [[R]]
@@ -529,12 +529,11 @@ define i1 @src_is_notmask_lshr_shl(i8 %x_in, i8 %y) {
 
 define i1 @src_is_notmask_lshr_shl_fail_mismatch_shifts(i8 %x_in, i8 %y, i8 %z) {
 ; CHECK-LABEL: @src_is_notmask_lshr_shl_fail_mismatch_shifts(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[MASK_SHR:%.*]] = lshr i8 -1, [[Y:%.*]]
 ; CHECK-NEXT:    [[NMASK:%.*]] = shl i8 [[MASK_SHR]], [[Z:%.*]]
-; CHECK-NEXT:    [[MASK:%.*]] = xor i8 [[NMASK]], -1
-; CHECK-NEXT:    [[AND:%.*]] = and i8 [[X]], [[MASK]]
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[AND]], [[X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], 123
+; CHECK-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], [[NMASK]]
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[TMP2]], 0
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x = xor i8 %x_in, 123
diff --git a/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll b/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll
index 75070e5a34f949..0f26be12c39cc6 100644
--- a/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll
@@ -239,9 +239,9 @@ define i1 @icmp_sle_negx_y_fail_maybe_zero(i8 %x, i8 %y) {
 
 define i1 @icmp_eq_x_invertable_y_todo(i8 %x, i1 %y) {
 ; CHECK-LABEL: @icmp_eq_x_invertable_y_todo(
-; CHECK-NEXT:    [[YY:%.*]] = select i1 [[Y:%.*]], i8 7, i8 24
+; CHECK-NEXT:    [[YY:%.*]] = select i1 [[Y:%.*]], i8 -8, i8 -25
 ; CHECK-NEXT:    [[AND:%.*]] = and i8 [[YY]], [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[AND]], [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[AND]], 0
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %yy = select i1 %y, i8 7, i8 24
@@ -252,9 +252,8 @@ define i1 @icmp_eq_x_invertable_y_todo(i8 %x, i1 %y) {
 
 define i1 @icmp_eq_x_invertable_y(i8 %x, i8 %y) {
 ; CHECK-LABEL: @icmp_eq_x_invertable_y(
-; CHECK-NEXT:    [[YY:%.*]] = xor i8 [[Y:%.*]], -1
-; CHECK-NEXT:    [[AND:%.*]] = and i8 [[YY]], [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[AND]], [[X]]
+; CHECK-NEXT:    [[AND:%.*]] = and i8 [[YY:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[AND]], 0
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %yy = xor i8 %y, -1
@@ -280,9 +279,9 @@ define i1 @icmp_eq_x_invertable_y_fail_multiuse(i8 %x, i8 %y) {
 
 define i1 @icmp_eq_x_invertable_y2_todo(i8 %x, i1 %y) {
 ; CHECK-LABEL: @icmp_eq_x_invertable_y2_todo(
-; CHECK-NEXT:    [[YY:%.*]] = select i1 [[Y:%.*]], i8 7, i8 24
-; CHECK-NEXT:    [[AND:%.*]] = and i8 [[YY]], [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[YY]], [[AND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[Y:%.*]], i8 -8, i8 -25
+; CHECK-NEXT:    [[TMP2:%.*]] = or i8 [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[TMP2]], -1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %yy = select i1 %y, i8 7, i8 24
@@ -293,9 +292,8 @@ define i1 @icmp_eq_x_invertable_y2_todo(i8 %x, i1 %y) {
 
 define i1 @icmp_eq_x_invertable_y2(i8 %x, i8 %y) {
 ; CHECK-LABEL: @icmp_eq_x_invertable_y2(
-; CHECK-NEXT:    [[YY:%.*]] = xor i8 [[Y:%.*]], -1
-; CHECK-NEXT:    [[AND:%.*]] = and i8 [[YY]], [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[AND]], [[YY]]
+; CHECK-NEXT:    [[TMP1:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[TMP1]], -1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %yy = xor i8 %y, -1

From a9e8a3a18eb897196f88d3705ccd966f5b52c012 Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan@intel.com>
Date: Wed, 29 May 2024 14:34:34 +0800
Subject: [PATCH 072/230] [X86][CodeGen] Extend X86CompressEVEX for NF
 transform

---
 llvm/lib/Target/X86/X86CompressEVEX.cpp     |  43 ++-
 llvm/lib/Target/X86/X86InstrInfo.cpp        |   8 +
 llvm/lib/Target/X86/X86InstrInfo.h          |   3 +
 llvm/test/CodeGen/X86/apx/add.ll            | 319 +++++++++++++++++++
 llvm/test/CodeGen/X86/apx/and.ll            | 317 +++++++++++++++++++
 llvm/test/CodeGen/X86/apx/compress-evex.mir |  26 +-
 llvm/test/CodeGen/X86/apx/dec.ll            |  64 ++++
 llvm/test/CodeGen/X86/apx/imul.ll           |  62 ++++
 llvm/test/CodeGen/X86/apx/inc.ll            |  95 ++++++
 llvm/test/CodeGen/X86/apx/neg.ll            | 103 +++++++
 llvm/test/CodeGen/X86/apx/or.ll             | 315 +++++++++++++++++++
 llvm/test/CodeGen/X86/apx/shl.ll            | 276 +++++++++++++++++
 llvm/test/CodeGen/X86/apx/shr.ll            | 277 +++++++++++++++++
 llvm/test/CodeGen/X86/apx/sub.ll            | 323 ++++++++++++++++++++
 llvm/test/CodeGen/X86/apx/xor.ll            | 292 ++++++++++++++++++
 15 files changed, 2505 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index 6442cc21933085..cadfda93d4b196 100644
--- a/llvm/lib/Target/X86/X86CompressEVEX.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -14,6 +14,7 @@
 //   b. Promoted instruction (EVEX) -> pre-promotion instruction (legacy/VEX)
 //   c. NDD (EVEX) -> non-NDD (legacy)
 //   d. NF_ND (EVEX) -> NF (EVEX)
+//   e. NonNF (EVEX) -> NF (EVEX)
 //
 // Compression a, b and c can always reduce code size, with some exceptions
 // such as promoted 16-bit CRC32 which is as long as the legacy version.
@@ -30,6 +31,9 @@
 //
 // Compression d can help hardware decode (HW may skip reading the NDD
 // register) although the instruction length remains unchanged.
+//
+// Compression e can help hardware skip updating EFLAGS although the instruction
+// length remains unchanged.
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/X86BaseInfo.h"
@@ -219,25 +223,36 @@ static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) {
     return false;
   // MOVBE*rr is special because it has semantic of NDD but not set EVEX_B.
   bool IsNDLike = IsND || Opc == X86::MOVBE32rr || Opc == X86::MOVBE64rr;
-  if (IsNDLike && !isRedundantNewDataDest(MI, ST))
+  bool IsRedundantNDD = IsNDLike ? isRedundantNewDataDest(MI, ST) : false;
+  // NonNF -> NF only if NF is available, this is not an NDD instruction that
+  // can be compressed to non-NDD, and the EFLAGS def is dead.
+  unsigned NFOpc = (ST.hasNF() && !IsRedundantNDD &&
+                    MI.registerDefIsDead(X86::EFLAGS, /*TRI=*/nullptr))
+                       ? X86::getNFVariant(Opc)
+                       : 0U;
+  if (IsNDLike && !IsRedundantNDD && !NFOpc)
     return false;
 
-  ArrayRef<X86TableEntry> Table = ArrayRef(X86CompressEVEXTable);
-
-  Opc = MI.getOpcode();
-  const auto *I = llvm::lower_bound(Table, Opc);
-  if (I == Table.end() || I->OldOpc != Opc) {
-    assert(!IsNDLike && "Missing entry for ND-like instruction");
-    return false;
-  }
+  unsigned NewOpc = NFOpc;
+  if (!NewOpc) {
+    ArrayRef<X86TableEntry> Table = ArrayRef(X86CompressEVEXTable);
 
-  if (!IsNDLike) {
-    if (usesExtendedRegister(MI) || !checkPredicate(I->NewOpc, &ST) ||
-        !performCustomAdjustments(MI, I->NewOpc))
+    Opc = MI.getOpcode();
+    const auto I = llvm::lower_bound(Table, Opc);
+    if (I == Table.end() || I->OldOpc != Opc) {
+      assert(!IsNDLike && "Missing entry for ND-like instruction");
       return false;
+    }
+
+    if (!IsNDLike) {
+      if (usesExtendedRegister(MI) || !checkPredicate(I->NewOpc, &ST) ||
+          !performCustomAdjustments(MI, I->NewOpc))
+        return false;
+    }
+    NewOpc = I->NewOpc;
   }
 
-  const MCInstrDesc &NewDesc = ST.getInstrInfo()->get(I->NewOpc);
+  const MCInstrDesc &NewDesc = ST.getInstrInfo()->get(NewOpc);
   MI.setDesc(NewDesc);
   unsigned AsmComment;
   switch (NewDesc.TSFlags & X86II::EncodingMask) {
@@ -256,7 +271,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) {
     llvm_unreachable("Unknown EVEX compression");
   }
   MI.setAsmPrinterFlag(AsmComment);
-  if (IsNDLike)
+  if (IsRedundantNDD)
     MI.tieOperands(0, 1);
 
   return true;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 7d05f950b6fe99..3e391da807889f 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3221,6 +3221,14 @@ int X86::getCCMPCondFlagsFromCondCode(X86::CondCode CC) {
   }
 }
 
+#define GET_X86_NF_TRANSFORM_TABLE
+#include "X86GenInstrMapping.inc"
+unsigned X86::getNFVariant(unsigned Opc) {
+  ArrayRef<X86TableEntry> Table = ArrayRef(X86NFTransformTable);
+  const auto I = llvm::lower_bound(Table, Opc);
+  return (I == Table.end() || I->OldOpc != Opc) ? 0U : I->NewOpc;
+}
+
 /// Return the inverse of the specified condition,
 /// e.g. turning COND_E to COND_NE.
 X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 295fac60c6e406..9eb2bd56b2ab5c 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -77,6 +77,9 @@ CondCode getCondFromCCMP(const MachineInstr &MI);
 // Turn condition code into condition flags for CCMP/CTEST.
 int getCCMPCondFlagsFromCondCode(CondCode CC);
 
+// Get the opcode of the corresponding NF variant, or 0 if Opc has none.
+unsigned getNFVariant(unsigned Opc);
+
 /// GetOppositeBranchCondition - Return the inverse of the specified cond,
 /// e.g. turning COND_E to COND_NE.
 CondCode GetOppositeBranchCondition(CondCode CC);
diff --git a/llvm/test/CodeGen/X86/apx/add.ll b/llvm/test/CodeGen/X86/apx/add.ll
index d3301ecdb72d0f..7779ae599f2004 100644
--- a/llvm/test/CodeGen/X86/apx/add.ll
+++ b/llvm/test/CodeGen/X86/apx/add.ll
@@ -1,11 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
 
 define i8 @add8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-LABEL: add8rr:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x00,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add8rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addb %sil, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x00,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = add i8 %a, %b
     ret i8 %add
@@ -17,6 +23,12 @@ define i16 @add16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK-NEXT:    addl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf7]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add16rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x01,0xf7]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = add i16 %a, %b
     ret i16 %add
@@ -27,6 +39,11 @@ define i32 @add32rr(i32 noundef %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add32rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x01,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = add i32 %a, %b
     ret i32 %add
@@ -37,6 +54,11 @@ define i64 @add64rr(i64 noundef %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addq %rsi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x01,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add64rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addq %rsi, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x01,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = add i64 %a, %b
     ret i64 %add
@@ -47,6 +69,11 @@ define i8 @add8rm(i8 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x02,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add8rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addb (%rsi), %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x02,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %b = load i8, ptr %ptr
     %add = add i8 %a, %b
@@ -58,6 +85,11 @@ define i16 @add16rm(i16 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x03,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add16rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addw (%rsi), %di, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x03,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %b = load i16, ptr %ptr
     %add = add i16 %a, %b
@@ -69,6 +101,11 @@ define i32 @add32rm(i32 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x03,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add32rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl (%rsi), %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x03,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %b = load i32, ptr %ptr
     %add = add i32 %a, %b
@@ -80,6 +117,11 @@ define i64 @add64rm(i64 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x03,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add64rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addq (%rsi), %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x03,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %b = load i64, ptr %ptr
     %add = add i64 %a, %b
@@ -92,6 +134,12 @@ define i16 @add16ri8(i16 noundef %a) {
 ; CHECK-NEXT:    addl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xc7,0x7b]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add16ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xc7,0x7b]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = add i16 %a, 123
     ret i16 %add
@@ -102,6 +150,11 @@ define i32 @add32ri8(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xc7,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add32ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xc7,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = add i32 %a, 123
     ret i32 %add
@@ -112,6 +165,11 @@ define i64 @add64ri8(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addq $123, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xc7,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add64ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addq $123, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0xc7,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = add i64 %a, 123
     ret i64 %add
@@ -122,6 +180,11 @@ define i8 @add8ri(i8 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb $123, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xc7,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add8ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addb $123, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0xc7,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = add i8 %a, 123
     ret i8 %add
@@ -134,6 +197,13 @@ define i16 @add16ri(i16 noundef %a) {
 ; CHECK-NEXT:    # imm = 0x4D2
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add16ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl $1234, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xc7,0xd2,0x04,0x00,0x00]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = add i16 %a, 1234
     ret i16 %add
@@ -145,6 +215,12 @@ define i32 @add32ri(i32 noundef %a) {
 ; CHECK-NEXT:    addl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add32ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl $123456, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xc7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = add i32 %a, 123456
     ret i32 %add
@@ -156,6 +232,12 @@ define i64 @add64ri(i64 noundef %a) {
 ; CHECK-NEXT:    addq $123456, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add64ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addq $123456, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0xc7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = add i64 %a, 123456
     ret i64 %add
@@ -166,6 +248,11 @@ define i8 @add8mr(ptr %a, i8 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb %sil, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x00,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add8mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addb %sil, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x00,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %add = add nsw i8 %t, %b
@@ -177,6 +264,11 @@ define i16 @add16mr(ptr %a, i16 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addw %si, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x01,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add16mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addw %si, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x01,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %add = add nsw i16 %t, %b
@@ -188,6 +280,11 @@ define i32 @add32mr(ptr %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addl %esi, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add32mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl %esi, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x01,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %add = add nsw i32 %t, %b
@@ -199,6 +296,11 @@ define i64 @add64mr(ptr %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addq %rsi, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x01,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add64mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addq %rsi, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x01,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %add = add nsw i64 %t, %b
@@ -212,6 +314,13 @@ define i16 @add16mi8(ptr %a) {
 ; CHECK-NEXT:    addl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc0,0x7b]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add16mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    addl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc0,0x7b]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %add = add nsw i16 %t, 123
@@ -223,6 +332,11 @@ define i32 @add32mi8(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addl $123, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x07,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add32mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl $123, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0x07,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %add = add nsw i32 %t, 123
@@ -234,6 +348,11 @@ define i64 @add64mi8(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addq $123, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x07,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add64mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addq $123, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0x07,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %add = add nsw i64 %t, 123
@@ -245,6 +364,11 @@ define i8 @add8mi(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb $123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x07,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add8mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addb $123, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0x07,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %add = add nsw i8 %t, 123
@@ -259,6 +383,14 @@ define i16 @add16mi(ptr %a) {
 ; CHECK-NEXT:    # imm = 0x4D2
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add16mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    addl $1234, %eax # EVEX TO LEGACY Compression encoding: [0x05,0xd2,0x04,0x00,0x00]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %add = add nsw i16 %t, 1234
@@ -271,6 +403,12 @@ define i32 @add32mi(ptr %a) {
 ; CHECK-NEXT:    addl $123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x07,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add32mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl $123456, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0x07,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %add = add nsw i32 %t, 123456
@@ -283,6 +421,12 @@ define i64 @add64mi(ptr %a) {
 ; CHECK-NEXT:    addq $123456, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x07,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add64mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addq $123456, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0x07,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %add = add nsw i64 %t, 123456
@@ -303,6 +447,15 @@ define i8 @addflag8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag8rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x00,0xf7]
+; NF-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
+; NF-NEXT:    movl $255, %ecx # encoding: [0xb9,0xff,0x00,0x00,0x00]
+; NF-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i8 @llvm.uadd.sat.i8(i8 %a, i8 %b)
     ret i8 %add
@@ -317,6 +470,15 @@ define i16 @addflag16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag16rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addw %si, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x01,0xf7]
+; NF-NEXT:    movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00]
+; NF-NEXT:    # imm = 0xFFFF
+; NF-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i16 @llvm.uadd.sat.i16(i16 %a, i16 %b)
     ret i16 %add
@@ -329,6 +491,13 @@ define i32 @addflag32rr(i32 noundef %a, i32 noundef %b) {
 ; CHECK-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
 ; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag32rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf7]
+; NF-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
+; NF-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
     ret i32 %add
@@ -341,6 +510,13 @@ define i64 @addflag64rr(i64 noundef %a, i64 noundef %b) {
 ; CHECK-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
 ; CHECK-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag64rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addq %rsi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x01,0xf7]
+; NF-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
+; NF-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i64 @llvm.uadd.sat.i64(i64 %a, i64 %b)
     ret i64 %add
@@ -355,6 +531,15 @@ define i8 @addflag8rm(i8 noundef %a, ptr %b) {
 ; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag8rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x02,0x3e]
+; NF-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
+; NF-NEXT:    movl $255, %ecx # encoding: [0xb9,0xff,0x00,0x00,0x00]
+; NF-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i8, ptr %b
     %add = call i8 @llvm.uadd.sat.i8(i8 %a, i8 %t)
@@ -370,6 +555,15 @@ define i16 @addflag16rm(i16 noundef %a, ptr %b) {
 ; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag16rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x03,0x3e]
+; NF-NEXT:    movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00]
+; NF-NEXT:    # imm = 0xFFFF
+; NF-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i16, ptr %b
     %add = call i16 @llvm.uadd.sat.i16(i16 %a, i16 %t)
@@ -383,6 +577,13 @@ define i32 @addflag32rm(i32 noundef %a, ptr %b) {
 ; CHECK-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
 ; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag32rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x03,0x3e]
+; NF-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
+; NF-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i32, ptr %b
     %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %t)
@@ -396,6 +597,13 @@ define i64 @addflag64rm(i64 noundef %a, ptr %b) {
 ; CHECK-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
 ; CHECK-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag64rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x03,0x3e]
+; NF-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
+; NF-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i64, ptr %b
     %add = call i64 @llvm.uadd.sat.i64(i64 %a, i64 %t)
@@ -411,6 +619,15 @@ define i16 @addflag16ri8(i16 noundef %a) {
 ; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag16ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addw $123, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0xc7,0x7b]
+; NF-NEXT:    movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00]
+; NF-NEXT:    # imm = 0xFFFF
+; NF-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i16 @llvm.uadd.sat.i16(i16 %a, i16 123)
     ret i16 %add
@@ -423,6 +640,13 @@ define i32 @addflag32ri8(i32 noundef %a) {
 ; CHECK-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
 ; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag32ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xc7,0x7b]
+; NF-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
+; NF-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 123)
     ret i32 %add
@@ -435,6 +659,13 @@ define i64 @addflag64ri8(i64 noundef %a) {
 ; CHECK-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
 ; CHECK-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag64ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addq $123, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xc7,0x7b]
+; NF-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
+; NF-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i64 @llvm.uadd.sat.i64(i64 %a, i64 123)
     ret i64 %add
@@ -449,6 +680,15 @@ define i8 @addflag8ri(i8 noundef %a) {
 ; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag8ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addb $123, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xc7,0x7b]
+; NF-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
+; NF-NEXT:    movl $255, %ecx # encoding: [0xb9,0xff,0x00,0x00,0x00]
+; NF-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i8 @llvm.uadd.sat.i8(i8 %a, i8 123)
     ret i8 %add
@@ -464,6 +704,16 @@ define i16 @addflag16ri(i16 noundef %a) {
 ; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag16ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addw $1234, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0xc7,0xd2,0x04]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00]
+; NF-NEXT:    # imm = 0xFFFF
+; NF-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i16 @llvm.uadd.sat.i16(i16 %a, i16 1234)
     ret i16 %add
@@ -477,6 +727,14 @@ define i32 @addflag32ri(i32 noundef %a) {
 ; CHECK-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
 ; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag32ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
+; NF-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 123456)
     ret i32 %add
@@ -490,6 +748,14 @@ define i64 @addflag64ri(i64 noundef %a) {
 ; CHECK-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
 ; CHECK-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag64ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addq $123456, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
+; NF-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i64 @llvm.uadd.sat.i64(i64 %a, i64 123456)
     ret i64 %add
@@ -507,6 +773,16 @@ define i1 @add64ri_reloc(i16 %k) {
 ; CHECK-NEXT:    # fixup A - offset: 2, value: val, kind: reloc_signed_4byte
 ; CHECK-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add64ri_reloc:
+; NF:       # %bb.0:
+; NF-NEXT:    # kill: def $edi killed $edi def $rdi
+; NF-NEXT:    movswq %di, %rax # encoding: [0x48,0x0f,0xbf,0xc7]
+; NF-NEXT:    addq %rax, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xc0]
+; NF-NEXT:    addq $val, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x05,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: val, kind: reloc_signed_4byte
+; NF-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
+; NF-NEXT:    retq # encoding: [0xc3]
   %g = getelementptr inbounds i16, ptr @val, i16 %k
   %cmp = icmp ne ptr %g, null
   ret i1 %cmp
@@ -517,6 +793,11 @@ define void @add8mr_legacy(ptr %a, i8 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb %sil, (%rdi) # encoding: [0x40,0x00,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add8mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addb %sil, (%rdi) # encoding: [0x40,0x00,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %add = add i8 %t, %b
@@ -529,6 +810,11 @@ define void @add16mr_legacy(ptr %a, i16 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addw %si, (%rdi) # encoding: [0x66,0x01,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add16mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addw %si, (%rdi) # encoding: [0x66,0x01,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %add = add i16 %t, %b
@@ -541,6 +827,11 @@ define void @add32mr_legacy(ptr %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addl %esi, (%rdi) # encoding: [0x01,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add32mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addl %esi, (%rdi) # encoding: [0x01,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %add = add i32 %t, %b
@@ -553,6 +844,11 @@ define void @add64mr_legacy(ptr %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addq %rsi, (%rdi) # encoding: [0x48,0x01,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add64mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addq %rsi, (%rdi) # encoding: [0x48,0x01,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %add = add i64 %t, %b
@@ -565,6 +861,11 @@ define void @add8mi_legacy(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb $123, (%rdi) # encoding: [0x80,0x07,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add8mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addb $123, (%rdi) # encoding: [0x80,0x07,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %add = add nsw i8 %t, 123
@@ -578,6 +879,12 @@ define void @add16mi_legacy(ptr %a) {
 ; CHECK-NEXT:    addw $1234, (%rdi) # encoding: [0x66,0x81,0x07,0xd2,0x04]
 ; CHECK-NEXT:    # imm = 0x4D2
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add16mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addw $1234, (%rdi) # encoding: [0x66,0x81,0x07,0xd2,0x04]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %add = add nsw i16 %t, 1234
@@ -591,6 +898,12 @@ define void @add32mi_legacy(ptr %a) {
 ; CHECK-NEXT:    addl $123456, (%rdi) # encoding: [0x81,0x07,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add32mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addl $123456, (%rdi) # encoding: [0x81,0x07,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %add = add nsw i32 %t, 123456
@@ -604,6 +917,12 @@ define void @add64mi_legacy(ptr %a) {
 ; CHECK-NEXT:    addq $123456, (%rdi) # encoding: [0x48,0x81,0x07,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add64mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addq $123456, (%rdi) # encoding: [0x48,0x81,0x07,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %add = add nsw i64 %t, 123456
diff --git a/llvm/test/CodeGen/X86/apx/and.ll b/llvm/test/CodeGen/X86/apx/and.ll
index af8f4119ac054d..58f54fbe50a524 100644
--- a/llvm/test/CodeGen/X86/apx/and.ll
+++ b/llvm/test/CodeGen/X86/apx/and.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
 
 define i8 @and8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-LABEL: and8rr:
@@ -7,6 +8,12 @@ define i8 @and8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-NEXT:    andl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x21,0xf7]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and8rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x21,0xf7]
+; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i8 %a, %b
     ret i8 %and
@@ -18,6 +25,12 @@ define i16 @and16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK-NEXT:    andl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x21,0xf7]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and16rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x21,0xf7]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i16 %a, %b
     ret i16 %and
@@ -28,6 +41,11 @@ define i32 @and32rr(i32 noundef %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x21,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and32rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x21,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i32 %a, %b
     ret i32 %and
@@ -38,6 +56,11 @@ define i64 @and64rr(i64 noundef %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andq %rsi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x21,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and64rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andq %rsi, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x21,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i64 %a, %b
     ret i64 %and
@@ -48,6 +71,11 @@ define i8 @and8rm(i8 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x22,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and8rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andb (%rsi), %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x22,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i8, ptr %b
     %and = and i8 %a, %t
@@ -59,6 +87,11 @@ define i16 @and16rm(i16 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x23,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and16rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andw (%rsi), %di, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x23,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i16, ptr %b
     %and = and i16 %a, %t
@@ -70,6 +103,11 @@ define i32 @and32rm(i32 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x23,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and32rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl (%rsi), %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x23,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i32, ptr %b
     %and = and i32 %a, %t
@@ -81,6 +119,11 @@ define i64 @and64rm(i64 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x23,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and64rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andq (%rsi), %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x23,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i64, ptr %b
     %and = and i64 %a, %t
@@ -93,6 +136,12 @@ define i16 @and16ri8(i16 noundef %a) {
 ; CHECK-NEXT:    andl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xe7,0x7b]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and16ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xe7,0x7b]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i16 %a, 123
     ret i16 %and
@@ -103,6 +152,11 @@ define i32 @and32ri8(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xe7,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and32ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xe7,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i32 %a, 123
     ret i32 %and
@@ -113,6 +167,11 @@ define i64 @and64ri8(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xe7,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and64ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xe7,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i64 %a, 123
     ret i64 %and
@@ -123,6 +182,11 @@ define i8 @and8ri(i8 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andb $123, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xe7,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and8ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andb $123, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0xe7,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i8 %a, 123
     ret i8 %and
@@ -135,6 +199,13 @@ define i16 @and16ri(i16 noundef %a) {
 ; CHECK-NEXT:    # imm = 0x4D2
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and16ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl $1234, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xe7,0xd2,0x04,0x00,0x00]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i16 %a, 1234
     ret i16 %and
@@ -146,6 +217,12 @@ define i32 @and32ri(i32 noundef %a) {
 ; CHECK-NEXT:    andl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xe7,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and32ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl $123456, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xe7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i32 %a, 123456
     ret i32 %and
@@ -157,6 +234,12 @@ define i64 @and64ri(i64 noundef %a) {
 ; CHECK-NEXT:    andl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xe7,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and64ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl $123456, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xe7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i64 %a, 123456
     ret i64 %and
@@ -167,6 +250,11 @@ define i8 @and8mr(ptr %a, i8 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andb %sil, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x20,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and8mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andb %sil, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x20,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %and = and i8 %t, %b
@@ -178,6 +266,11 @@ define i16 @and16mr(ptr %a, i16 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andw %si, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x21,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and16mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andw %si, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x21,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %and = and i16 %t, %b
@@ -189,6 +282,11 @@ define i32 @and32mr(ptr %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andl %esi, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x21,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and32mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl %esi, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x21,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %and = and i32 %t, %b
@@ -200,6 +298,11 @@ define i64 @and64mr(ptr %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andq %rsi, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x21,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and64mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andq %rsi, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x21,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %and = and i64 %t, %b
@@ -213,6 +316,13 @@ define i16 @and16mi8(ptr %a) {
 ; CHECK-NEXT:    andl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe0,0x7b]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and16mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    andl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe0,0x7b]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %and = and i16 %t, 123
@@ -224,6 +334,11 @@ define i32 @and32mi8(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andl $123, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x27,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and32mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl $123, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0x27,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %and = and i32 %t, 123
@@ -236,6 +351,12 @@ define i64 @and64mi8(ptr %a) {
 ; CHECK-NEXT:    movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
 ; CHECK-NEXT:    andl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe0,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and64mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT:    andl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe0,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %and = and i64 %t, 123
@@ -247,6 +368,11 @@ define i8 @and8mi(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andb $123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x27,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and8mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andb $123, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0x27,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %and = and i8 %t, 123
@@ -261,6 +387,14 @@ define i16 @and16mi(ptr %a) {
 ; CHECK-NEXT:    # imm = 0x4D2
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and16mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    andl $1234, %eax # EVEX TO LEGACY Compression encoding: [0x25,0xd2,0x04,0x00,0x00]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %and = and i16 %t, 1234
@@ -273,6 +407,12 @@ define i32 @and32mi(ptr %a) {
 ; CHECK-NEXT:    andl $123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x27,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and32mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl $123456, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0x27,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %and = and i32 %t, 123456
@@ -286,6 +426,13 @@ define i64 @and64mi(ptr %a) {
 ; CHECK-NEXT:    andl $123456, %eax # EVEX TO LEGACY Compression encoding: [0x25,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and64mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT:    andl $123456, %eax # EVEX TO LEGACY Compression encoding: [0x25,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %and = and i64 %t, 123456
@@ -303,6 +450,15 @@ define i1 @andflag8rr(i8 %a, i8 %b) {
 ; CHECK-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag8rr:
+; NF:       # %bb.0:
+; NF-NEXT:    notb %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd6]
+; NF-NEXT:    andb %al, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x20,0xc7]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i8 %b, -1
   %v0 = and i8 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i8 %v0, 0
@@ -319,6 +475,15 @@ define i1 @andflag16rr(i16 %a, i16 %b) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag16rr:
+; NF:       # %bb.0:
+; NF-NEXT:    notl %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xf7,0xd6]
+; NF-NEXT:    andw %ax, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x21,0xc7]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i16 %b, -1
   %v0 = and i16 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i16 %v0, 0
@@ -334,6 +499,14 @@ define i1 @andflag32rr(i32 %a, i32 %b) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag32rr:
+; NF:       # %bb.0:
+; NF-NEXT:    andl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x21,0xf7]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = and i32 %a, %b  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
   store i32 %v0, ptr @d64
@@ -348,6 +521,14 @@ define i1 @andflag64rr(i64 %a, i64 %b) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag64rr:
+; NF:       # %bb.0:
+; NF-NEXT:    andq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x21,0xf7]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = and i64 %a, %b  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
   store i64 %v0, ptr @d64
@@ -363,6 +544,15 @@ define i1 @andflag8rm(ptr %ptr, i8 %b) {
 ; CHECK-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag8rm:
+; NF:       # %bb.0:
+; NF-NEXT:    notb %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd6]
+; NF-NEXT:    andb (%rdi), %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x22,0x07]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i8, ptr %ptr
   %xor = xor i8 %b, -1
   %v0 = and i8 %a, %xor  ; 0xff << 50
@@ -380,6 +570,15 @@ define i1 @andflag16rm(ptr %ptr, i16 %b) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag16rm:
+; NF:       # %bb.0:
+; NF-NEXT:    notl %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xf7,0xd6]
+; NF-NEXT:    andw (%rdi), %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x23,0x07]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i16, ptr %ptr
   %xor = xor i16 %b, -1
   %v0 = and i16 %a, %xor  ; 0xff << 50
@@ -396,6 +595,14 @@ define i1 @andflag32rm(ptr %ptr, i32 %b) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag32rm:
+; NF:       # %bb.0:
+; NF-NEXT:    andl (%rdi), %esi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x23,0x37]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i32, ptr %ptr
   %v0 = and i32 %a, %b  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
@@ -411,6 +618,14 @@ define i1 @andflag64rm(ptr %ptr, i64 %b) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag64rm:
+; NF:       # %bb.0:
+; NF-NEXT:    andq (%rdi), %rsi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x23,0x37]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i64, ptr %ptr
   %v0 = and i64 %a, %b  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
@@ -426,6 +641,14 @@ define i1 @andflag8ri(i8 %a) {
 ; CHECK-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag8ri:
+; NF:       # %bb.0:
+; NF-NEXT:    andb $-124, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xe7,0x84]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i8 123, -1
   %v0 = and i8 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i8 %v0, 0
@@ -442,6 +665,15 @@ define i1 @andflag16ri(i16 %a) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag16ri:
+; NF:       # %bb.0:
+; NF-NEXT:    andw $-1235, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x81,0xe7,0x2d,0xfb]
+; NF-NEXT:    # imm = 0xFB2D
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i16 1234, -1
   %v0 = and i16 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i16 %v0, 0
@@ -458,6 +690,15 @@ define i1 @andflag32ri(i32 %a) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag32ri:
+; NF:       # %bb.0:
+; NF-NEXT:    andl $123456, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x81,0xe7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = and i32 %a, 123456  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
   store i32 %v0, ptr @d64
@@ -473,6 +714,15 @@ define i1 @andflag64ri(i64 %a) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag64ri:
+; NF:       # %bb.0:
+; NF-NEXT:    andq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xe7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = and i64 %a, 123456  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
   store i64 %v0, ptr @d64
@@ -487,6 +737,14 @@ define i1 @andflag16ri8(i16 %a) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag16ri8:
+; NF:       # %bb.0:
+; NF-NEXT:    andw $-124, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xe7,0x84]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i16 123, -1
   %v0 = and i16 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i16 %v0, 0
@@ -502,6 +760,14 @@ define i1 @andflag32ri8(i32 %a) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag32ri8:
+; NF:       # %bb.0:
+; NF-NEXT:    andl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xe7,0x7b]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = and i32 %a, 123  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
   store i32 %v0, ptr @d64
@@ -516,6 +782,14 @@ define i1 @andflag64ri8(i64 %a) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag64ri8:
+; NF:       # %bb.0:
+; NF-NEXT:    andq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xe7,0x7b]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = and i64 %a, 123  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
   store i64 %v0, ptr @d64
@@ -527,6 +801,11 @@ define void @and8mr_legacy(ptr %a, i8 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andb %sil, (%rdi) # encoding: [0x40,0x20,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and8mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    andb %sil, (%rdi) # encoding: [0x40,0x20,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %and = and i8 %t, %b
@@ -539,6 +818,11 @@ define void @and16mr_legacy(ptr %a, i16 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andw %si, (%rdi) # encoding: [0x66,0x21,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and16mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    andw %si, (%rdi) # encoding: [0x66,0x21,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %and = and i16 %t, %b
@@ -551,6 +835,11 @@ define void @and32mr_legacy(ptr %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andl %esi, (%rdi) # encoding: [0x21,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and32mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    andl %esi, (%rdi) # encoding: [0x21,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %and = and i32 %t, %b
@@ -563,6 +852,11 @@ define void @and64mr_legacy(ptr %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andq %rsi, (%rdi) # encoding: [0x48,0x21,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and64mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    andq %rsi, (%rdi) # encoding: [0x48,0x21,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %and = and i64 %t, %b
@@ -575,6 +869,11 @@ define void @and8mi_legacy(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andb $123, (%rdi) # encoding: [0x80,0x27,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and8mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    andb $123, (%rdi) # encoding: [0x80,0x27,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %and = and i8 %t, 123
@@ -588,6 +887,12 @@ define void @and16mi_legacy(ptr %a) {
 ; CHECK-NEXT:    andw $1234, (%rdi) # encoding: [0x66,0x81,0x27,0xd2,0x04]
 ; CHECK-NEXT:    # imm = 0x4D2
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and16mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    andw $1234, (%rdi) # encoding: [0x66,0x81,0x27,0xd2,0x04]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %and = and i16 %t, 1234
@@ -601,6 +906,12 @@ define void @and32mi_legacy(ptr %a) {
 ; CHECK-NEXT:    andl $123456, (%rdi) # encoding: [0x81,0x27,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and32mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    andl $123456, (%rdi) # encoding: [0x81,0x27,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %and = and i32 %t, 123456
@@ -614,6 +925,12 @@ define void @and64mi_legacy(ptr %a) {
 ; CHECK-NEXT:    andq $123456, (%rdi) # encoding: [0x48,0x81,0x27,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and64mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    andq $123456, (%rdi) # encoding: [0x48,0x81,0x27,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %and = and i64 %t, 123456
diff --git a/llvm/test/CodeGen/X86/apx/compress-evex.mir b/llvm/test/CodeGen/X86/apx/compress-evex.mir
index d8bef886e234f9..626904a7a692c1 100644
--- a/llvm/test/CodeGen/X86/apx/compress-evex.mir
+++ b/llvm/test/CodeGen/X86/apx/compress-evex.mir
@@ -1,4 +1,5 @@
-# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=x86-compress-evex -show-mc-encoding -o - | FileCheck %s
+# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=x86-compress-evex -show-mc-encoding -o - | FileCheck --check-prefixes=CHECK,NDD %s
+# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr,+nf -start-before=x86-compress-evex -show-mc-encoding -o - | FileCheck --check-prefixes=CHECK,NDD-NF %s
 
 ...
 ---
@@ -46,7 +47,8 @@ name:            ndd_2_non_ndd_incommutable
 body:             |
   bb.0.entry:
     liveins: $rdi, $rsi
-    ; CHECK: subq    %rax, %rsi, %rax                # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xc6]
+    ; NDD:     subq    %rax, %rsi, %rax              # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xc6]
+    ; NDD-NF: {nf} subq    %rax, %rsi, %rax          # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x29,0xc6]
     renamable $rax = ADD64rr_ND killed renamable $rdi, renamable $rsi, implicit-def dead $eflags
     renamable $rax = SUB64rr_ND killed renamable $rsi, killed renamable $rax, implicit-def dead $eflags
     RET64 $rax
@@ -55,7 +57,8 @@ body:             |
 name:            ndd_2_non_ndd_mem
 body:             |
   bb.0.entry:
-    ; CHECK: addq    $123456, (%rax), %rax           # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x00,0x40,0xe2,0x01,0x00]
+    ; NDD:    addq    $123456, (%rax), %rax          # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x00,0x40,0xe2,0x01,0x00]
+    ; NDD-NF: {nf} addq $123456, (%rax), %rax        # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0x00,0x40,0xe2,0x01,0x00]
     renamable $rax = MOV64rm $noreg, 1, $noreg, 0, $fs
     renamable $rax = nsw ADD64mi32_ND killed renamable $rax, 1, $noreg, 0, $noreg, 123456, implicit-def dead $eflags
     RET64 $rax
@@ -88,5 +91,20 @@ body:             |
     ; CHECK: bswapq  %rax                            # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0xc8]
     renamable $rax = MOVBE64rr killed renamable $rax
     RET64 killed $rax
-
+...
+---
+name:            non_nf_2_nf
+body:             |
+  bb.0.entry:
+    liveins: $rdi, $r16
+    ; CHECK:  addq %r16, %rdi                        # encoding: [0xd5,0x48,0x01,0xc7]
+    ; NDD:    xorq %r16, %rdi, %rax                  # encoding: [0x62,0xe4,0xfc,0x18,0x31,0xc7]
+    ; NDD-NF: {nf} xorq %r16, %rdi, %rax             # EVEX TO EVEX Compression encoding: [0x62,0xe4,0xfc,0x1c,0x31,0xc7]
+    ; CHECK:  addq %r16, %rax, %rdi                  # encoding: [0x62,0xe4,0xc4,0x18,0x01,0xc0]
+    ; CHECK:  adcq %rdi, %r16, %rax                  # encoding: [0x62,0xfc,0xfc,0x18,0x11,0xf8]
+    $rdi = ADD64rr $rdi, $r16, implicit-def dead $eflags
+    $rax = XOR64rr_ND $rdi, $r16, implicit-def dead $eflags
+    $rdi = ADD64rr_ND $rax, $r16, implicit-def $eflags
+    $rax = ADC64rr_ND $r16, $rdi, implicit-def dead $eflags, implicit $eflags
+    RET64 $rax
 ...
diff --git a/llvm/test/CodeGen/X86/apx/dec.ll b/llvm/test/CodeGen/X86/apx/dec.ll
index fcb2cae3b5cad8..a18ed2ace603ab 100644
--- a/llvm/test/CodeGen/X86/apx/dec.ll
+++ b/llvm/test/CodeGen/X86/apx/dec.ll
@@ -1,11 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs | FileCheck --check-prefix=NF %s
 
 define i8 @dec8r(i8 noundef %a) {
 ; CHECK-LABEL: dec8r:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    decb %dil, %al
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec8r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} decb %dil, %al
+; NF-NEXT:    retq
 entry:
   %dec = sub i8 %a, 1
   ret i8 %dec
@@ -17,6 +23,12 @@ define i16 @dec16r(i16 noundef %a) {
 ; CHECK-NEXT:    decl %edi, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec16r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} decl %edi, %eax
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq
 entry:
   %dec = sub i16 %a, 1
   ret i16 %dec
@@ -27,6 +39,11 @@ define i32 @dec32r(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    decl %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec32r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} decl %edi, %eax
+; NF-NEXT:    retq
 entry:
   %dec = sub i32 %a, 1
   ret i32 %dec
@@ -37,6 +54,11 @@ define i64 @dec64r(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    decq %rdi, %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec64r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} decq %rdi, %rax
+; NF-NEXT:    retq
 entry:
   %dec = sub i64 %a, 1
   ret i64 %dec
@@ -47,6 +69,11 @@ define i8 @dec8m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    decb (%rdi), %al
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec8m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} decb (%rdi), %al
+; NF-NEXT:    retq
 entry:
   %a = load i8, ptr %ptr
   %dec = sub i8 %a, 1
@@ -60,6 +87,13 @@ define i16 @dec16m(ptr %ptr) {
 ; CHECK-NEXT:    decl %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec16m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax
+; NF-NEXT:    decl %eax
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq
 entry:
   %a = load i16, ptr %ptr
   %dec = sub i16 %a, 1
@@ -71,6 +105,11 @@ define i32 @dec32m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    decl (%rdi), %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec32m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} decl (%rdi), %eax
+; NF-NEXT:    retq
 entry:
   %a = load i32, ptr %ptr
   %dec = sub i32 %a, 1
@@ -82,6 +121,11 @@ define i64 @dec64m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    decq (%rdi), %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec64m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} decq (%rdi), %rax
+; NF-NEXT:    retq
 entry:
   %a = load i64, ptr %ptr
   %dec = sub i64 %a, 1
@@ -93,6 +137,11 @@ define void @dec8m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    decb (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec8m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    decb (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i8, ptr %ptr
   %dec = sub i8 %a, 1
@@ -105,6 +154,11 @@ define void @dec16m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    decw (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec16m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    decw (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i16, ptr %ptr
   %dec = sub i16 %a, 1
@@ -117,6 +171,11 @@ define void @dec32m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    decl (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec32m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    decl (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i32, ptr %ptr
   %dec = sub i32 %a, 1
@@ -129,6 +188,11 @@ define void @dec64m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    decq (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec64m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    decq (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i64, ptr %ptr
   %dec = sub i64 %a, 1
diff --git a/llvm/test/CodeGen/X86/apx/imul.ll b/llvm/test/CodeGen/X86/apx/imul.ll
index 2963a6477be4c1..d97b2c0baec5e2 100644
--- a/llvm/test/CodeGen/X86/apx/imul.ll
+++ b/llvm/test/CodeGen/X86/apx/imul.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs | FileCheck --check-prefix=NF %s
 
 define i16 @mul16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK-LABEL: mul16rr:
@@ -7,6 +8,12 @@ define i16 @mul16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK-NEXT:    imull %esi, %edi, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: mul16rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imull %esi, %edi, %eax
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq
 entry:
   %mul = mul i16 %a, %b
   ret i16 %mul
@@ -17,6 +24,11 @@ define i32 @mul32rr(i32 noundef %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    imull %esi, %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: mul32rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imull %esi, %edi, %eax
+; NF-NEXT:    retq
 entry:
   %mul = mul i32 %a, %b
   ret i32 %mul
@@ -27,6 +39,11 @@ define i64 @mul64rr(i64 noundef %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    imulq %rsi, %rdi, %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: mul64rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imulq %rsi, %rdi, %rax
+; NF-NEXT:    retq
 entry:
   %mul = mul i64 %a, %b
   ret i64 %mul
@@ -37,6 +54,11 @@ define i16 @smul16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    imulw %si, %di, %ax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: smul16rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imulw %si, %di, %ax
+; NF-NEXT:    retq
 entry:
   %t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a, i16 %b)
   %mul = extractvalue {i16, i1} %t, 0
@@ -48,6 +70,11 @@ define i32 @smul32rr(i32 noundef %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    imull %esi, %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: smul32rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imull %esi, %edi, %eax
+; NF-NEXT:    retq
 entry:
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a, i32 %b)
   %mul = extractvalue {i32, i1} %t, 0
@@ -59,6 +86,11 @@ define i64 @smul64rr(i64 noundef %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    imulq %rsi, %rdi, %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: smul64rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imulq %rsi, %rdi, %rax
+; NF-NEXT:    retq
 entry:
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %a, i64 %b)
   %mul = extractvalue {i64, i1} %t, 0
@@ -70,6 +102,11 @@ define i16 @mul16rm(i16 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    imulw (%rsi), %di, %ax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: mul16rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imulw (%rsi), %di, %ax
+; NF-NEXT:    retq
 entry:
   %b = load i16, ptr %ptr
   %mul = mul i16 %a, %b
@@ -81,6 +118,11 @@ define i32 @mul32rm(i32 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    imull (%rsi), %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: mul32rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imull (%rsi), %edi, %eax
+; NF-NEXT:    retq
 entry:
   %b = load i32, ptr %ptr
   %mul = mul i32 %a, %b
@@ -92,6 +134,11 @@ define i64 @mul64rm(i64 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    imulq (%rsi), %rdi, %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: mul64rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imulq (%rsi), %rdi, %rax
+; NF-NEXT:    retq
 entry:
   %b = load i64, ptr %ptr
   %mul = mul i64 %a, %b
@@ -103,6 +150,11 @@ define i16 @smul16rm(i16 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    imulw (%rsi), %di, %ax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: smul16rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imulw (%rsi), %di, %ax
+; NF-NEXT:    retq
 entry:
   %b = load i16, ptr %ptr
   %t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a, i16 %b)
@@ -115,6 +167,11 @@ define i32 @smul32rm(i32 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    imull (%rsi), %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: smul32rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imull (%rsi), %edi, %eax
+; NF-NEXT:    retq
 entry:
   %b = load i32, ptr %ptr
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a, i32 %b)
@@ -127,6 +184,11 @@ define i64 @smul64rm(i64 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    imulq (%rsi), %rdi, %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: smul64rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imulq (%rsi), %rdi, %rax
+; NF-NEXT:    retq
 entry:
   %b = load i64, ptr %ptr
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %a, i64 %b)
diff --git a/llvm/test/CodeGen/X86/apx/inc.ll b/llvm/test/CodeGen/X86/apx/inc.ll
index a9c6d740cf2cee..8d31badb997797 100644
--- a/llvm/test/CodeGen/X86/apx/inc.ll
+++ b/llvm/test/CodeGen/X86/apx/inc.ll
@@ -1,11 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs | FileCheck --check-prefix=NF %s
 
 define i8 @inc8r(i8 noundef %a) {
 ; CHECK-LABEL: inc8r:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    incb %dil, %al
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc8r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} incb %dil, %al
+; NF-NEXT:    retq
 entry:
   %inc = add i8 %a, 1
   ret i8 %inc
@@ -17,6 +23,12 @@ define i16 @inc16r(i16 noundef %a) {
 ; CHECK-NEXT:    incl %edi, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc16r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} incl %edi, %eax
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq
 entry:
   %inc = add i16 %a, 1
   ret i16 %inc
@@ -27,6 +39,11 @@ define i32 @inc32r(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    incl %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc32r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} incl %edi, %eax
+; NF-NEXT:    retq
 entry:
   %inc = add i32 %a, 1
   ret i32 %inc
@@ -37,6 +54,11 @@ define i64 @inc64r(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    incq %rdi, %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc64r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} incq %rdi, %rax
+; NF-NEXT:    retq
 entry:
   %inc = add i64 %a, 1
   ret i64 %inc
@@ -47,6 +69,11 @@ define i8 @inc8m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    incb (%rdi), %al
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc8m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} incb (%rdi), %al
+; NF-NEXT:    retq
 entry:
   %a = load i8, ptr %ptr
   %inc = add i8 %a, 1
@@ -60,6 +87,13 @@ define i16 @inc16m(ptr %ptr) {
 ; CHECK-NEXT:    incl %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc16m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax
+; NF-NEXT:    incl %eax
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq
 entry:
   %a = load i16, ptr %ptr
   %inc = add i16 %a, 1
@@ -71,6 +105,11 @@ define i32 @inc32m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    incl (%rdi), %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc32m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} incl (%rdi), %eax
+; NF-NEXT:    retq
 entry:
   %a = load i32, ptr %ptr
   %inc = add i32 %a, 1
@@ -82,6 +121,11 @@ define i64 @inc64m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    incq (%rdi), %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc64m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} incq (%rdi), %rax
+; NF-NEXT:    retq
 entry:
   %a = load i64, ptr %ptr
   %inc = add i64 %a, 1
@@ -97,6 +141,15 @@ define i8 @uinc8r(i8 noundef %a) {
 ; CHECK-NEXT:    cmovel %ecx, %eax
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uinc8r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    incb %dil, %al
+; NF-NEXT:    movzbl %al, %eax
+; NF-NEXT:    movl $255, %ecx
+; NF-NEXT:    cmovel %ecx, %eax
+; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    retq
 entry:
   %inc = call i8 @llvm.uadd.sat.i8(i8 %a, i8 1)
   ret i8 %inc
@@ -110,6 +163,14 @@ define i16 @uinc16r(i16 noundef %a) {
 ; CHECK-NEXT:    cmovel %ecx, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uinc16r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    incw %di, %ax
+; NF-NEXT:    movl $65535, %ecx # imm = 0xFFFF
+; NF-NEXT:    cmovel %ecx, %eax
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq
 entry:
   %inc = call i16 @llvm.uadd.sat.i16(i16 %a, i16 1)
   ret i16 %inc
@@ -122,6 +183,13 @@ define i32 @uinc32r(i32 noundef %a) {
 ; CHECK-NEXT:    movl $-1, %ecx
 ; CHECK-NEXT:    cmovel %ecx, %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uinc32r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    incl %edi, %eax
+; NF-NEXT:    movl $-1, %ecx
+; NF-NEXT:    cmovel %ecx, %eax
+; NF-NEXT:    retq
 entry:
   %inc = call i32 @llvm.uadd.sat.i32(i32 %a, i32 1)
   ret i32 %inc
@@ -134,6 +202,13 @@ define i64 @uinc64r(i64 noundef %a) {
 ; CHECK-NEXT:    movq $-1, %rcx
 ; CHECK-NEXT:    cmoveq %rcx, %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uinc64r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    incq %rdi, %rax
+; NF-NEXT:    movq $-1, %rcx
+; NF-NEXT:    cmoveq %rcx, %rax
+; NF-NEXT:    retq
 entry:
   %inc = call i64 @llvm.uadd.sat.i64(i64 %a, i64 1)
   ret i64 %inc
@@ -149,6 +224,11 @@ define void @inc8m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    incb (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc8m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    incb (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i8, ptr %ptr
   %inc = add i8 %a, 1
@@ -161,6 +241,11 @@ define void @inc16m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    incw (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc16m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    incw (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i16, ptr %ptr
   %inc = add i16 %a, 1
@@ -173,6 +258,11 @@ define void @inc32m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    incl (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc32m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    incl (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i32, ptr %ptr
   %inc = add i32 %a, 1
@@ -185,6 +275,11 @@ define void @inc64m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    incq (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc64m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    incq (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i64, ptr %ptr
   %inc = add i64 %a, 1
diff --git a/llvm/test/CodeGen/X86/apx/neg.ll b/llvm/test/CodeGen/X86/apx/neg.ll
index c1c53fbdaebd82..5e033e33cb8b2a 100644
--- a/llvm/test/CodeGen/X86/apx/neg.ll
+++ b/llvm/test/CodeGen/X86/apx/neg.ll
@@ -1,11 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs | FileCheck --check-prefix=NF %s
 
 define i8 @neg8r(i8 noundef %a) {
 ; CHECK-LABEL: neg8r:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negb %dil, %al
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg8r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negb %dil, %al
+; NF-NEXT:    retq
 entry:
   %neg = sub i8 0, %a
   ret i8 %neg
@@ -17,6 +23,12 @@ define i16 @neg16r(i16 noundef %a) {
 ; CHECK-NEXT:    negl %edi, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg16r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negl %edi, %eax
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq
 entry:
   %neg = sub i16 0, %a
   ret i16 %neg
@@ -27,6 +39,11 @@ define i32 @neg32r(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negl %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg32r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negl %edi, %eax
+; NF-NEXT:    retq
 entry:
   %neg = sub i32 0, %a
   ret i32 %neg
@@ -37,6 +54,11 @@ define i64 @neg64r(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negq %rdi, %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg64r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negq %rdi, %rax
+; NF-NEXT:    retq
 entry:
   %neg = sub i64 0, %a
   ret i64 %neg
@@ -47,6 +69,11 @@ define i8 @neg8m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negb (%rdi), %al
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg8m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negb (%rdi), %al
+; NF-NEXT:    retq
 entry:
   %a = load i8, ptr %ptr
   %neg = sub i8 0, %a
@@ -58,6 +85,11 @@ define i16 @neg16m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negw (%rdi), %ax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg16m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negw (%rdi), %ax
+; NF-NEXT:    retq
 entry:
   %a = load i16, ptr %ptr
   %neg = sub i16 0, %a
@@ -69,6 +101,11 @@ define i32 @neg32m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negl (%rdi), %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg32m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negl (%rdi), %eax
+; NF-NEXT:    retq
 entry:
   %a = load i32, ptr %ptr
   %neg = sub i32 0, %a
@@ -80,6 +117,11 @@ define i64 @neg64m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negq (%rdi), %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg64m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negq (%rdi), %rax
+; NF-NEXT:    retq
 entry:
   %a = load i64, ptr %ptr
   %neg = sub i64 0, %a
@@ -91,6 +133,11 @@ define i8 @uneg8r(i8 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negb %dil, %al
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uneg8r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negb %dil, %al
+; NF-NEXT:    retq
 entry:
   %t = call {i8, i1} @llvm.usub.with.overflow.i8(i8 0, i8 %a)
   %neg = extractvalue {i8, i1} %t, 0
@@ -103,6 +150,12 @@ define i16 @uneg16r(i16 noundef %a) {
 ; CHECK-NEXT:    negl %edi, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uneg16r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negl %edi, %eax
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq
 entry:
   %t = call {i16, i1} @llvm.usub.with.overflow.i16(i16 0, i16 %a)
   %neg = extractvalue {i16, i1} %t, 0
@@ -114,6 +167,11 @@ define i32 @uneg32r(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negl %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uneg32r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negl %edi, %eax
+; NF-NEXT:    retq
 entry:
   %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 0, i32 %a)
   %neg = extractvalue {i32, i1} %t, 0
@@ -125,6 +183,11 @@ define i64 @uneg64r(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negq %rdi, %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uneg64r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negq %rdi, %rax
+; NF-NEXT:    retq
 entry:
   %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 0, i64 %a)
   %neg = extractvalue {i64, i1} %t, 0
@@ -136,6 +199,11 @@ define i8 @uneg8m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negb (%rdi), %al
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uneg8m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negb (%rdi), %al
+; NF-NEXT:    retq
 entry:
   %a = load i8, ptr %ptr
   %t = call {i8, i1} @llvm.usub.with.overflow.i8(i8 0, i8 %a)
@@ -148,6 +216,11 @@ define i16 @uneg16m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negw (%rdi), %ax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uneg16m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negw (%rdi), %ax
+; NF-NEXT:    retq
 entry:
   %a = load i16, ptr %ptr
   %t = call {i16, i1} @llvm.usub.with.overflow.i16(i16 0, i16 %a)
@@ -160,6 +233,11 @@ define i32 @uneg32m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negl (%rdi), %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uneg32m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negl (%rdi), %eax
+; NF-NEXT:    retq
 entry:
   %a = load i32, ptr %ptr
   %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 0, i32 %a)
@@ -172,6 +250,11 @@ define i64 @uneg64m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negq (%rdi), %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uneg64m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negq (%rdi), %rax
+; NF-NEXT:    retq
 entry:
   %a = load i64, ptr %ptr
   %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 0, i64 %a)
@@ -189,6 +272,11 @@ define void @neg8m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negb (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg8m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    negb (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i8, ptr %ptr
   %neg = sub i8 0, %a
@@ -201,6 +289,11 @@ define void @neg16m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negw (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg16m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    negw (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i16, ptr %ptr
   %neg = sub i16 0, %a
@@ -213,6 +306,11 @@ define void @neg32m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negl (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg32m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    negl (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i32, ptr %ptr
   %neg = sub i32 0, %a
@@ -225,6 +323,11 @@ define void @neg64m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negq (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg64m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    negq (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i64, ptr %ptr
   %neg = sub i64 0, %a
diff --git a/llvm/test/CodeGen/X86/apx/or.ll b/llvm/test/CodeGen/X86/apx/or.ll
index 3d024e962400fa..d404279e14f7ab 100644
--- a/llvm/test/CodeGen/X86/apx/or.ll
+++ b/llvm/test/CodeGen/X86/apx/or.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
 
 define i8 @or8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-LABEL: or8rr:
@@ -7,6 +8,12 @@ define i8 @or8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-NEXT:    orl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x09,0xf7]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or8rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x09,0xf7]
+; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i8 %a, %b
     ret i8 %or
@@ -18,6 +25,12 @@ define i16 @or16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK-NEXT:    orl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x09,0xf7]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or16rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x09,0xf7]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i16 %a, %b
     ret i16 %or
@@ -28,6 +41,11 @@ define i32 @or32rr(i32 noundef %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x09,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or32rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x09,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i32 %a, %b
     ret i32 %or
@@ -38,6 +56,11 @@ define i64 @or64rr(i64 noundef %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orq %rsi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x09,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or64rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orq %rsi, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x09,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i64 %a, %b
     ret i64 %or
@@ -48,6 +71,11 @@ define i8 @or8rm(i8 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x0a,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or8rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orb (%rsi), %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x0a,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i8, ptr %b
     %or = or i8 %a, %t
@@ -59,6 +87,11 @@ define i16 @or16rm(i16 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x0b,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or16rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orw (%rsi), %di, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x0b,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i16, ptr %b
     %or = or i16 %a, %t
@@ -70,6 +103,11 @@ define i32 @or32rm(i32 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x0b,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or32rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orl (%rsi), %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x0b,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i32, ptr %b
     %or = or i32 %a, %t
@@ -81,6 +119,11 @@ define i64 @or64rm(i64 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x0b,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or64rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orq (%rsi), %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x0b,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i64, ptr %b
     %or = or i64 %a, %t
@@ -93,6 +136,12 @@ define i16 @or16ri8(i16 noundef %a) {
 ; CHECK-NEXT:    orl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xcf,0x7b]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or16ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xcf,0x7b]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i16 %a, 123
     ret i16 %or
@@ -103,6 +152,11 @@ define i32 @or32ri8(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xcf,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or32ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xcf,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i32 %a, 123
     ret i32 %or
@@ -113,6 +167,11 @@ define i64 @or64ri8(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orq $123, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xcf,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or64ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orq $123, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0xcf,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i64 %a, 123
     ret i64 %or
@@ -123,6 +182,11 @@ define i8 @or8ri(i8 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orb $123, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xcf,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or8ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orb $123, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0xcf,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i8 %a, 123
     ret i8 %or
@@ -135,6 +199,13 @@ define i16 @or16ri(i16 noundef %a) {
 ; CHECK-NEXT:    # imm = 0x4D2
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or16ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orl $1234, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xcf,0xd2,0x04,0x00,0x00]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i16 %a, 1234
     ret i16 %or
@@ -146,6 +217,12 @@ define i32 @or32ri(i32 noundef %a) {
 ; CHECK-NEXT:    orl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xcf,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or32ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orl $123456, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xcf,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i32 %a, 123456
     ret i32 %or
@@ -157,6 +234,12 @@ define i64 @or64ri(i64 noundef %a) {
 ; CHECK-NEXT:    orq $123456, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xcf,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or64ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orq $123456, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0xcf,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i64 %a, 123456
     ret i64 %or
@@ -167,6 +250,11 @@ define i8 @or8mr(ptr %a, i8 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orb %sil, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x08,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or8mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orb %sil, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x08,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %or = or i8 %t, %b
@@ -178,6 +266,11 @@ define i16 @or16mr(ptr %a, i16 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orw %si, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x09,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or16mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orw %si, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x09,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %or = or i16 %t, %b
@@ -189,6 +282,11 @@ define i32 @or32mr(ptr %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orl %esi, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x09,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or32mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orl %esi, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x09,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %or = or i32 %t, %b
@@ -200,6 +298,11 @@ define i64 @or64mr(ptr %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orq %rsi, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x09,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or64mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orq %rsi, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x09,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %or = or i64 %t, %b
@@ -213,6 +316,13 @@ define i16 @or16mi8(ptr %a) {
 ; CHECK-NEXT:    orl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc8,0x7b]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or16mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    orl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc8,0x7b]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %or = or i16 %t, 123
@@ -224,6 +334,11 @@ define i32 @or32mi8(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orl $123, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x0f,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or32mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orl $123, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0x0f,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %or = or i32 %t, 123
@@ -235,6 +350,11 @@ define i64 @or64mi8(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orq $123, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x0f,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or64mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orq $123, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0x0f,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %or = or i64 %t, 123
@@ -246,6 +366,11 @@ define i8 @or8mi(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orb $123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x0f,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or8mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orb $123, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0x0f,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %or = or i8 %t, 123
@@ -260,6 +385,14 @@ define i16 @or16mi(ptr %a) {
 ; CHECK-NEXT:    # imm = 0x4D2
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or16mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    orl $1234, %eax # EVEX TO LEGACY Compression encoding: [0x0d,0xd2,0x04,0x00,0x00]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %or = or i16 %t, 1234
@@ -272,6 +405,12 @@ define i32 @or32mi(ptr %a) {
 ; CHECK-NEXT:    orl $123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x0f,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or32mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orl $123456, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0x0f,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %or = or i32 %t, 123456
@@ -284,6 +423,12 @@ define i64 @or64mi(ptr %a) {
 ; CHECK-NEXT:    orq $123456, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x0f,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or64mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orq $123456, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0x0f,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %or = or i64 %t, 123456
@@ -301,6 +446,15 @@ define i1 @orflag8rr(i8 %a, i8 %b) {
 ; CHECK-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag8rr:
+; NF:       # %bb.0:
+; NF-NEXT:    notb %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd6]
+; NF-NEXT:    orb %al, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x08,0xc7]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i8 %b, -1
   %v0 = or i8 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i8 %v0, 0
@@ -317,6 +471,15 @@ define i1 @orflag16rr(i16 %a, i16 %b) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag16rr:
+; NF:       # %bb.0:
+; NF-NEXT:    notl %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xf7,0xd6]
+; NF-NEXT:    orw %ax, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x09,0xc7]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i16 %b, -1
   %v0 = or i16 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i16 %v0, 0
@@ -332,6 +495,14 @@ define i1 @orflag32rr(i32 %a, i32 %b) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag32rr:
+; NF:       # %bb.0:
+; NF-NEXT:    orl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x09,0xf7]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = or i32 %a, %b  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
   store i32 %v0, ptr @d64
@@ -346,6 +517,14 @@ define i1 @orflag64rr(i64 %a, i64 %b) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag64rr:
+; NF:       # %bb.0:
+; NF-NEXT:    orq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x09,0xf7]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = or i64 %a, %b  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
   store i64 %v0, ptr @d64
@@ -361,6 +540,15 @@ define i1 @orflag8rm(ptr %ptr, i8 %b) {
 ; CHECK-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag8rm:
+; NF:       # %bb.0:
+; NF-NEXT:    notb %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd6]
+; NF-NEXT:    orb (%rdi), %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x0a,0x07]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i8, ptr %ptr
   %xor = xor i8 %b, -1
   %v0 = or i8 %a, %xor  ; 0xff << 50
@@ -378,6 +566,15 @@ define i1 @orflag16rm(ptr %ptr, i16 %b) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag16rm:
+; NF:       # %bb.0:
+; NF-NEXT:    notl %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xf7,0xd6]
+; NF-NEXT:    orw (%rdi), %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x0b,0x07]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i16, ptr %ptr
   %xor = xor i16 %b, -1
   %v0 = or i16 %a, %xor  ; 0xff << 50
@@ -394,6 +591,14 @@ define i1 @orflag32rm(ptr %ptr, i32 %b) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag32rm:
+; NF:       # %bb.0:
+; NF-NEXT:    orl (%rdi), %esi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x0b,0x37]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i32, ptr %ptr
   %v0 = or i32 %a, %b  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
@@ -409,6 +614,14 @@ define i1 @orflag64rm(ptr %ptr, i64 %b) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag64rm:
+; NF:       # %bb.0:
+; NF-NEXT:    orq (%rdi), %rsi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x0b,0x37]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i64, ptr %ptr
   %v0 = or i64 %a, %b  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
@@ -424,6 +637,14 @@ define i1 @orflag8ri(i8 %a) {
 ; CHECK-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag8ri:
+; NF:       # %bb.0:
+; NF-NEXT:    orb $-124, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xcf,0x84]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i8 123, -1
   %v0 = or i8 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i8 %v0, 0
@@ -440,6 +661,15 @@ define i1 @orflag16ri(i16 %a) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag16ri:
+; NF:       # %bb.0:
+; NF-NEXT:    orw $-1235, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x81,0xcf,0x2d,0xfb]
+; NF-NEXT:    # imm = 0xFB2D
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i16 1234, -1
   %v0 = or i16 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i16 %v0, 0
@@ -456,6 +686,15 @@ define i1 @orflag32ri(i32 %a) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag32ri:
+; NF:       # %bb.0:
+; NF-NEXT:    orl $123456, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x81,0xcf,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = or i32 %a, 123456  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
   store i32 %v0, ptr @d64
@@ -471,6 +710,15 @@ define i1 @orflag64ri(i64 %a) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag64ri:
+; NF:       # %bb.0:
+; NF-NEXT:    orq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xcf,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = or i64 %a, 123456  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
   store i64 %v0, ptr @d64
@@ -485,6 +733,14 @@ define i1 @orflag16ri8(i16 %a) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag16ri8:
+; NF:       # %bb.0:
+; NF-NEXT:    orw $-124, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xcf,0x84]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i16 123, -1
   %v0 = or i16 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i16 %v0, 0
@@ -500,6 +756,14 @@ define i1 @orflag32ri8(i32 %a) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag32ri8:
+; NF:       # %bb.0:
+; NF-NEXT:    orl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xcf,0x7b]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = or i32 %a, 123  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
   store i32 %v0, ptr @d64
@@ -514,6 +778,14 @@ define i1 @orflag64ri8(i64 %a) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag64ri8:
+; NF:       # %bb.0:
+; NF-NEXT:    orq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xcf,0x7b]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = or i64 %a, 123  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
   store i64 %v0, ptr @d64
@@ -525,6 +797,11 @@ define void @or8mr_legacy(ptr %a, i8 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orb %sil, (%rdi) # encoding: [0x40,0x08,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or8mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    orb %sil, (%rdi) # encoding: [0x40,0x08,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %or = or i8 %t, %b
@@ -537,6 +814,11 @@ define void @or16mr_legacy(ptr %a, i16 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orw %si, (%rdi) # encoding: [0x66,0x09,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or16mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    orw %si, (%rdi) # encoding: [0x66,0x09,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %or = or i16 %t, %b
@@ -549,6 +831,11 @@ define void @or32mr_legacy(ptr %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orl %esi, (%rdi) # encoding: [0x09,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or32mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    orl %esi, (%rdi) # encoding: [0x09,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %or = or i32 %t, %b
@@ -561,6 +848,11 @@ define void @or64mr_legacy(ptr %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orq %rsi, (%rdi) # encoding: [0x48,0x09,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or64mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    orq %rsi, (%rdi) # encoding: [0x48,0x09,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %or = or i64 %t, %b
@@ -573,6 +865,11 @@ define void @or8mi_legacy(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orb $123, (%rdi) # encoding: [0x80,0x0f,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or8mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    orb $123, (%rdi) # encoding: [0x80,0x0f,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %or = or i8 %t, 123
@@ -586,6 +883,12 @@ define void @or16mi_legacy(ptr %a) {
 ; CHECK-NEXT:    orw $1234, (%rdi) # encoding: [0x66,0x81,0x0f,0xd2,0x04]
 ; CHECK-NEXT:    # imm = 0x4D2
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or16mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    orw $1234, (%rdi) # encoding: [0x66,0x81,0x0f,0xd2,0x04]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %or = or i16 %t, 1234
@@ -599,6 +902,12 @@ define void @or32mi_legacy(ptr %a) {
 ; CHECK-NEXT:    orl $123456, (%rdi) # encoding: [0x81,0x0f,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or32mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    orl $123456, (%rdi) # encoding: [0x81,0x0f,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %or = or i32 %t, 123456
@@ -612,6 +921,12 @@ define void @or64mi_legacy(ptr %a) {
 ; CHECK-NEXT:    orq $123456, (%rdi) # encoding: [0x48,0x81,0x0f,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or64mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    orq $123456, (%rdi) # encoding: [0x48,0x81,0x0f,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %or = or i64 %t, 123456
diff --git a/llvm/test/CodeGen/X86/apx/shl.ll b/llvm/test/CodeGen/X86/apx/shl.ll
index 869caf932ff920..35b6cb27254b2e 100644
--- a/llvm/test/CodeGen/X86/apx/shl.ll
+++ b/llvm/test/CodeGen/X86/apx/shl.ll
@@ -1,11 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
 
 define i8 @shl8ri(i8 noundef %a) {
 ; CHECK-LABEL: shl8ri:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shlb $4, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0xe7,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl8ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shlb $4, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc0,0xe7,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i8 %a, 4
   ret i8 %shl
@@ -17,6 +23,12 @@ define i16 @shl16ri(i16 noundef %a) {
 ; CHECK-NEXT:    shll $4, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0xe7,0x04]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl16ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shll $4, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc1,0xe7,0x04]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i16 %a, 4
   ret i16 %shl
@@ -27,6 +39,11 @@ define i32 @shl32ri(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shll $4, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0xe7,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl32ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shll $4, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc1,0xe7,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i32 %a, 4
   ret i32 %shl
@@ -37,6 +54,11 @@ define i64 @shl64ri(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shlq $4, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0xe7,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl64ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shlq $4, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xc1,0xe7,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i64 %a, 4
   ret i64 %shl
@@ -48,6 +70,12 @@ define i8 @shl8m1(ptr %ptr) {
 ; CHECK-NEXT:    movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
 ; CHECK-NEXT:    addb %al, %al # EVEX TO LEGACY Compression encoding: [0x00,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl8m1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NF-NEXT:    addb %al, %al # EVEX TO LEGACY Compression encoding: [0x00,0xc0]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shl = shl i8 %a, 1
@@ -61,6 +89,13 @@ define i16 @shl16m1(ptr %ptr) {
 ; CHECK-NEXT:    addl %eax, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xc0]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl16m1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    addl %eax, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xc0]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shl = shl i16 %a, 1
@@ -73,6 +108,12 @@ define i32 @shl32m1(ptr %ptr) {
 ; CHECK-NEXT:    movl (%rdi), %eax # encoding: [0x8b,0x07]
 ; CHECK-NEXT:    addl %eax, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl32m1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NF-NEXT:    addl %eax, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xc0]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shl = shl i32 %a, 1
@@ -85,6 +126,12 @@ define i64 @shl64m1(ptr %ptr) {
 ; CHECK-NEXT:    movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
 ; CHECK-NEXT:    addq %rax, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl64m1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT:    addq %rax, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xc0]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shl = shl i64 %a, 1
@@ -98,6 +145,13 @@ define i8 @shl8mcl(ptr %ptr, i8 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shlb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl8mcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shlb %cl, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shl = shl i8 %a, %cl
@@ -111,6 +165,13 @@ define i8 @shl8mcl_mask(ptr %ptr, i8 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shlb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl8mcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shlb %cl, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shamt = and i8 %cl, 31
@@ -127,6 +188,15 @@ define i16 @shl16mcl(ptr %ptr, i16 %cl) {
 ; CHECK-NEXT:    shll %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe0]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl16mcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shll %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe0]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shl = shl i16 %a, %cl
@@ -142,6 +212,15 @@ define i16 @shl16mcl_mask(ptr %ptr, i16 %cl) {
 ; CHECK-NEXT:    shll %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe0]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl16mcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shll %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe0]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shamt = and i16 %cl, 31
@@ -156,6 +235,13 @@ define i32 @shl32mcl(ptr %ptr, i32 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shll %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl32mcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shll %cl, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shl = shl i32 %a, %cl
@@ -169,6 +255,13 @@ define i32 @shl32mcl_mask(ptr %ptr, i32 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shll %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl32mcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shll %cl, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shamt = and i32 %cl, 31
@@ -183,6 +276,13 @@ define i64 @shl64mcl(ptr %ptr, i64 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    shlq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl64mcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $rcx
+; NF-NEXT:    {nf} shlq %cl, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shl = shl i64 %a, %cl
@@ -196,6 +296,13 @@ define i64 @shl64mcl_mask(ptr %ptr, i64 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    shlq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl64mcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $rcx
+; NF-NEXT:    {nf} shlq %cl, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shamt = and i64 %cl, 63
@@ -208,6 +315,11 @@ define i8 @shl8mi(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shlb $4, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x27,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl8mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shlb $4, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc0,0x27,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shl = shl i8 %a, 4
@@ -221,6 +333,13 @@ define i16 @shl16mi(ptr %ptr) {
 ; CHECK-NEXT:    shll $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe0,0x04]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl16mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    shll $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe0,0x04]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shl = shl i16 %a, 4
@@ -232,6 +351,11 @@ define i32 @shl32mi(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shll $4, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x27,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl32mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shll $4, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc1,0x27,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shl = shl i32 %a, 4
@@ -243,6 +367,11 @@ define i64 @shl64mi(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shlq $4, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x27,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl64mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shlq $4, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xc1,0x27,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shl = shl i64 %a, 4
@@ -254,6 +383,11 @@ define i8 @shl8r1(i8 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb %dil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x00,0xff]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl8r1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addb %dil, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x00,0xff]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i8 %a, 1
   ret i8 %shl
@@ -265,6 +399,12 @@ define i16 @shl16r1(i16 noundef %a) {
 ; CHECK-NEXT:    addl %edi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xff]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl16r1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl %edi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x01,0xff]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i16 %a, 1
   ret i16 %shl
@@ -275,6 +415,11 @@ define i32 @shl32r1(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addl %edi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xff]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl32r1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl %edi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x01,0xff]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i32 %a, 1
   ret i32 %shl
@@ -285,6 +430,11 @@ define i64 @shl64r1(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addq %rdi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x01,0xff]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl64r1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addq %rdi, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x01,0xff]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i64 %a, 1
   ret i64 %shl
@@ -297,6 +447,13 @@ define i8 @shl8rcl(i8 noundef %a, i8 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shlb %cl, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0xe7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl8rcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shlb %cl, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0xe7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i8 %a, %cl
   ret i8 %shl
@@ -309,6 +466,13 @@ define i8 @shl8rcl_mask(i8 noundef %a, i8 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shlb %cl, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0xe7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl8rcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shlb %cl, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0xe7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shamt = and i8 %cl, 31
   %shl = shl i8 %a, %shamt
@@ -323,6 +487,14 @@ define i16 @shl16rcl(i16 noundef %a, i16 %cl) {
 ; CHECK-NEXT:    shll %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xe7]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl16rcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shll %cl, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0xe7]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i16 %a, %cl
   ret i16 %shl
@@ -336,6 +508,14 @@ define i16 @shl16rcl_mask(i16 noundef %a, i16 %cl) {
 ; CHECK-NEXT:    shll %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xe7]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl16rcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shll %cl, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0xe7]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shamt = and i16 %cl, 31
   %shl = shl i16 %a, %shamt
@@ -349,6 +529,13 @@ define i32 @shl32rcl(i32 noundef %a, i32 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shll %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xe7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl32rcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shll %cl, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0xe7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i32 %a, %cl
   ret i32 %shl
@@ -361,6 +548,13 @@ define i32 @shl32rcl_mask(i32 noundef %a, i32 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shll %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xe7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl32rcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shll %cl, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0xe7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shamt = and i32 %cl, 31
   %shl = shl i32 %a, %shamt
@@ -374,6 +568,13 @@ define i64 @shl64rcl(i64 noundef %a, i64 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    shlq %cl, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0xe7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl64rcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $rcx
+; NF-NEXT:    {nf} shlq %cl, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0xe7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i64 %a, %cl
   ret i64 %shl
@@ -386,6 +587,13 @@ define i64 @shl64rcl_mask(i64 noundef %a, i64 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    shlq %cl, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0xe7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl64rcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $rcx
+; NF-NEXT:    {nf} shlq %cl, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0xe7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shamt = and i64 %cl, 63
   %shl = shl i64 %a, %shamt
@@ -397,6 +605,11 @@ define void @shl8m1_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shlb (%rdi) # encoding: [0xd0,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl8m1_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shlb (%rdi) # encoding: [0xd0,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shl = shl i8 %a, 1
@@ -409,6 +622,11 @@ define void @shl16m1_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shlw (%rdi) # encoding: [0x66,0xd1,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl16m1_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shlw (%rdi) # encoding: [0x66,0xd1,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shl = shl i16 %a, 1
@@ -421,6 +639,11 @@ define void @shl32m1_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shll (%rdi) # encoding: [0xd1,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl32m1_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shll (%rdi) # encoding: [0xd1,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shl = shl i32 %a, 1
@@ -433,6 +656,11 @@ define void @shl64m1_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shlq (%rdi) # encoding: [0x48,0xd1,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl64m1_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shlq (%rdi) # encoding: [0x48,0xd1,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shl = shl i64 %a, 1
@@ -445,6 +673,11 @@ define void @shl8mi_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shlb $4, (%rdi) # encoding: [0xc0,0x27,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl8mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shlb $4, (%rdi) # encoding: [0xc0,0x27,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shl = shl i8 %a, 4
@@ -457,6 +690,11 @@ define void @shl16mi_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shlw $4, (%rdi) # encoding: [0x66,0xc1,0x27,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl16mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shlw $4, (%rdi) # encoding: [0x66,0xc1,0x27,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shl = shl i16 %a, 4
@@ -469,6 +707,11 @@ define void @shl32mi_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shll $4, (%rdi) # encoding: [0xc1,0x27,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl32mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shll $4, (%rdi) # encoding: [0xc1,0x27,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shl = shl i32 %a, 4
@@ -481,6 +724,11 @@ define void @shl64mi_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shlq $4, (%rdi) # encoding: [0x48,0xc1,0x27,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl64mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shlq $4, (%rdi) # encoding: [0x48,0xc1,0x27,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shl = shl i64 %a, 4
@@ -495,6 +743,13 @@ define void @shl8mcl_legacy(ptr %ptr, i8 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shlb %cl, (%rdi) # encoding: [0xd2,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl8mcl_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shlb %cl, (%rdi) # encoding: [0xd2,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shl = shl i8 %a, %cl
@@ -509,6 +764,13 @@ define void @shl16mcl_legacy(ptr %ptr, i16 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shlw %cl, (%rdi) # encoding: [0x66,0xd3,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl16mcl_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shlw %cl, (%rdi) # encoding: [0x66,0xd3,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shl = shl i16 %a, %cl
@@ -523,6 +785,13 @@ define void @shl32mcl_legacy(ptr %ptr, i32 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shll %cl, (%rdi) # encoding: [0xd3,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl32mcl_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shll %cl, (%rdi) # encoding: [0xd3,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shl = shl i32 %a, %cl
@@ -537,6 +806,13 @@ define void @shl64mcl_legacy(ptr %ptr, i64 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    shlq %cl, (%rdi) # encoding: [0x48,0xd3,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl64mcl_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $rcx
+; NF-NEXT:    shlq %cl, (%rdi) # encoding: [0x48,0xd3,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shl = shl i64 %a, %cl
diff --git a/llvm/test/CodeGen/X86/apx/shr.ll b/llvm/test/CodeGen/X86/apx/shr.ll
index a7e02d8586f49d..b5b91b02fedffb 100644
--- a/llvm/test/CodeGen/X86/apx/shr.ll
+++ b/llvm/test/CodeGen/X86/apx/shr.ll
@@ -1,11 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
 
 define i8 @shr8m1(ptr %ptr) {
 ; CHECK-LABEL: shr8m1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrb (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd0,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr8m1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrb (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd0,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shr = lshr i8 %a, 1
@@ -19,6 +25,13 @@ define i16 @shr16m1(ptr %ptr) {
 ; CHECK-NEXT:    shrl %eax # EVEX TO LEGACY Compression encoding: [0xd1,0xe8]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr16m1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    shrl %eax # EVEX TO LEGACY Compression encoding: [0xd1,0xe8]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shr = lshr i16 %a, 1
@@ -30,6 +43,11 @@ define i32 @shr32m1(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrl (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd1,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr32m1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrl (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd1,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shr = lshr i32 %a, 1
@@ -41,6 +59,11 @@ define i64 @shr64m1(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrq (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd1,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr64m1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrq (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd1,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shr = lshr i64 %a, 1
@@ -54,6 +77,13 @@ define i8 @shr8mcl(ptr %ptr, i8 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr8mcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shrb %cl, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shr = lshr i8 %a, %cl
@@ -67,6 +97,13 @@ define i8 @shr8mcl_mask(ptr %ptr, i8 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr8mcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shrb %cl, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shamt = and i8 %cl, 31
@@ -83,6 +120,15 @@ define i16 @shr16mcl(ptr %ptr, i16 %cl) {
 ; CHECK-NEXT:    shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr16mcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shr = lshr i16 %a, %cl
@@ -98,6 +144,15 @@ define i16 @shr16mcl_mask(ptr %ptr, i16 %cl) {
 ; CHECK-NEXT:    shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr16mcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shamt = and i16 %cl, 31
@@ -112,6 +167,13 @@ define i32 @shr32mcl(ptr %ptr, i32 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr32mcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shrl %cl, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shr = lshr i32 %a, %cl
@@ -125,6 +187,13 @@ define i32 @shr32mcl_mask(ptr %ptr, i32 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr32mcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shrl %cl, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shamt = and i32 %cl, 31
@@ -139,6 +208,13 @@ define i64 @shr64mcl(ptr %ptr, i64 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    shrq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr64mcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $rcx
+; NF-NEXT:    {nf} shrq %cl, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shr = lshr i64 %a, %cl
@@ -152,6 +228,13 @@ define i64 @shr64mcl_mask(ptr %ptr, i64 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    shrq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr64mcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $rcx
+; NF-NEXT:    {nf} shrq %cl, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shamt = and i64 %cl, 63
@@ -164,6 +247,11 @@ define i8 @shr8mi(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrb $4, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x2f,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr8mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrb $4, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc0,0x2f,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shr = lshr i8 %a, 4
@@ -177,6 +265,13 @@ define i16 @shr16mi(ptr %ptr) {
 ; CHECK-NEXT:    shrl $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe8,0x04]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr16mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    shrl $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe8,0x04]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shr = lshr i16 %a, 4
@@ -188,6 +283,11 @@ define i32 @shr32mi(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrl $4, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x2f,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr32mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrl $4, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc1,0x2f,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shr = lshr i32 %a, 4
@@ -199,6 +299,11 @@ define i64 @shr64mi(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrq $4, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x2f,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr64mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrq $4, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xc1,0x2f,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shr = lshr i64 %a, 4
@@ -210,6 +315,11 @@ define i8 @shr8r1(i8 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrb %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd0,0xef]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr8r1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrb %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd0,0xef]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i8 %a, 1
   ret i8 %shr
@@ -222,6 +332,13 @@ define i16 @shr16r1(i16 noundef %a) {
 ; CHECK-NEXT:    shrl %eax # EVEX TO LEGACY Compression encoding: [0xd1,0xe8]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr16r1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl %di, %eax # encoding: [0x0f,0xb7,0xc7]
+; NF-NEXT:    shrl %eax # EVEX TO LEGACY Compression encoding: [0xd1,0xe8]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i16 %a, 1
   ret i16 %shr
@@ -232,6 +349,11 @@ define i32 @shr32r1(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrl %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd1,0xef]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr32r1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrl %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd1,0xef]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i32 %a, 1
   ret i32 %shr
@@ -242,6 +364,11 @@ define i64 @shr64r1(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrq %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd1,0xef]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr64r1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrq %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd1,0xef]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i64 %a, 1
   ret i64 %shr
@@ -254,6 +381,13 @@ define i8 @shr8rcl(i8 noundef %a, i8 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrb %cl, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0xef]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr8rcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shrb %cl, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0xef]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i8 %a, %cl
   ret i8 %shr
@@ -266,6 +400,13 @@ define i8 @shr8rcl_mask(i8 noundef %a, i8 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrb %cl, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0xef]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr8rcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shrb %cl, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0xef]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shamt = and i8 %cl, 31
   %shr = lshr i8 %a, %shamt
@@ -281,6 +422,15 @@ define i16 @shr16rcl(i16 noundef %a, i16 %cl) {
 ; CHECK-NEXT:    shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr16rcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    movzwl %di, %eax # encoding: [0x0f,0xb7,0xc7]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i16 %a, %cl
   ret i16 %shr
@@ -295,6 +445,15 @@ define i16 @shr16rcl_mask(i16 noundef %a, i16 %cl) {
 ; CHECK-NEXT:    shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr16rcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    movzwl %di, %eax # encoding: [0x0f,0xb7,0xc7]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shamt = and i16 %cl, 31
   %shr = lshr i16 %a, %shamt
@@ -308,6 +467,13 @@ define i32 @shr32rcl(i32 noundef %a, i32 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrl %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xef]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr32rcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shrl %cl, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0xef]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i32 %a, %cl
   ret i32 %shr
@@ -320,6 +486,13 @@ define i32 @shr32rcl_mask(i32 noundef %a, i32 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrl %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xef]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr32rcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shrl %cl, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0xef]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shamt = and i32 %cl, 31
   %shr = lshr i32 %a, %shamt
@@ -333,6 +506,13 @@ define i64 @shr64rcl(i64 noundef %a, i64 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    shrq %cl, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0xef]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr64rcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $rcx
+; NF-NEXT:    {nf} shrq %cl, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0xef]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i64 %a, %cl
   ret i64 %shr
@@ -345,6 +525,13 @@ define i64 @shr64rcl_mask(i64 noundef %a, i64 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    shrq %cl, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0xef]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr64rcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $rcx
+; NF-NEXT:    {nf} shrq %cl, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0xef]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shamt = and i64 %cl, 63
   %shr = lshr i64 %a, %shamt
@@ -356,6 +543,11 @@ define i8 @shr8ri(i8 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrb $4, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0xef,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr8ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrb $4, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc0,0xef,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i8 %a, 4
   ret i8 %shr
@@ -368,6 +560,13 @@ define i16 @shr16ri(i16 noundef %a) {
 ; CHECK-NEXT:    shrl $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe8,0x04]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr16ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl %di, %eax # encoding: [0x0f,0xb7,0xc7]
+; NF-NEXT:    shrl $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe8,0x04]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i16 %a, 4
   ret i16 %shr
@@ -378,6 +577,11 @@ define i32 @shr32ri(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrl $4, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0xef,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr32ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrl $4, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc1,0xef,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i32 %a, 4
   ret i32 %shr
@@ -388,6 +592,11 @@ define i64 @shr64ri(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrq $4, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0xef,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr64ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrq $4, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xc1,0xef,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i64 %a, 4
   ret i64 %shr
@@ -398,6 +607,11 @@ define void @shr8m1_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrb (%rdi) # encoding: [0xd0,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr8m1_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shrb (%rdi) # encoding: [0xd0,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shr = lshr i8 %a, 1
@@ -410,6 +624,11 @@ define void @shr16m1_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrw (%rdi) # encoding: [0x66,0xd1,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr16m1_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shrw (%rdi) # encoding: [0x66,0xd1,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shr = lshr i16 %a, 1
@@ -422,6 +641,11 @@ define void @shr32m1_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrl (%rdi) # encoding: [0xd1,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr32m1_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shrl (%rdi) # encoding: [0xd1,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shr = lshr i32 %a, 1
@@ -434,6 +658,11 @@ define void @shr64m1_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrq (%rdi) # encoding: [0x48,0xd1,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr64m1_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shrq (%rdi) # encoding: [0x48,0xd1,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shr = lshr i64 %a, 1
@@ -446,6 +675,11 @@ define void @shr8mi_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrb $4, (%rdi) # encoding: [0xc0,0x2f,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr8mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shrb $4, (%rdi) # encoding: [0xc0,0x2f,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shr = lshr i8 %a, 4
@@ -458,6 +692,11 @@ define void @shr16mi_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrw $4, (%rdi) # encoding: [0x66,0xc1,0x2f,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr16mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shrw $4, (%rdi) # encoding: [0x66,0xc1,0x2f,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shr = lshr i16 %a, 4
@@ -470,6 +709,11 @@ define void @shr32mi_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrl $4, (%rdi) # encoding: [0xc1,0x2f,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr32mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shrl $4, (%rdi) # encoding: [0xc1,0x2f,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shr = lshr i32 %a, 4
@@ -482,6 +726,11 @@ define void @shr64mi_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrq $4, (%rdi) # encoding: [0x48,0xc1,0x2f,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr64mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shrq $4, (%rdi) # encoding: [0x48,0xc1,0x2f,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shr = lshr i64 %a, 4
@@ -496,6 +745,13 @@ define void @shr8mcl_legacy(ptr %ptr, i8 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrb %cl, (%rdi) # encoding: [0xd2,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr8mcl_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shrb %cl, (%rdi) # encoding: [0xd2,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shr = lshr i8 %a, %cl
@@ -510,6 +766,13 @@ define void @shr16mcl_legacy(ptr %ptr, i16 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrw %cl, (%rdi) # encoding: [0x66,0xd3,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr16mcl_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shrw %cl, (%rdi) # encoding: [0x66,0xd3,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shr = lshr i16 %a, %cl
@@ -524,6 +787,13 @@ define void @shr32mcl_legacy(ptr %ptr, i32 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrl %cl, (%rdi) # encoding: [0xd3,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr32mcl_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shrl %cl, (%rdi) # encoding: [0xd3,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shr = lshr i32 %a, %cl
@@ -538,6 +808,13 @@ define void @shr64mcl_legacy(ptr %ptr, i64 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    shrq %cl, (%rdi) # encoding: [0x48,0xd3,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr64mcl_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $rcx
+; NF-NEXT:    shrq %cl, (%rdi) # encoding: [0x48,0xd3,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shr = lshr i64 %a, %cl
diff --git a/llvm/test/CodeGen/X86/apx/sub.ll b/llvm/test/CodeGen/X86/apx/sub.ll
index be0914c90b9faf..a38d09587ba919 100644
--- a/llvm/test/CodeGen/X86/apx/sub.ll
+++ b/llvm/test/CodeGen/X86/apx/sub.ll
@@ -1,11 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
 
 define i8 @sub8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-LABEL: sub8rr:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub8rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subb %sil, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x28,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = sub i8 %a, %b
     ret i8 %sub
@@ -17,6 +23,12 @@ define i16 @sub16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK-NEXT:    subl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xf7]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub16rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x29,0xf7]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = sub i16 %a, %b
     ret i16 %sub
@@ -27,6 +39,11 @@ define i32 @sub32rr(i32 noundef %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub32rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x29,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = sub i32 %a, %b
     ret i32 %sub
@@ -37,6 +54,11 @@ define i64 @sub64rr(i64 noundef %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subq %rsi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub64rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subq %rsi, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x29,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = sub i64 %a, %b
     ret i64 %sub
@@ -47,6 +69,11 @@ define i8 @sub8rm(i8 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x2a,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub8rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subb (%rsi), %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x2a,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %b = load i8, ptr %ptr
     %sub = sub i8 %a, %b
@@ -58,6 +85,11 @@ define i16 @sub16rm(i16 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x2b,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub16rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subw (%rsi), %di, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x2b,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %b = load i16, ptr %ptr
     %sub = sub i16 %a, %b
@@ -69,6 +101,11 @@ define i32 @sub32rm(i32 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x2b,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub32rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subl (%rsi), %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x2b,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %b = load i32, ptr %ptr
     %sub = sub i32 %a, %b
@@ -80,6 +117,11 @@ define i64 @sub64rm(i64 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x2b,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub64rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subq (%rsi), %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x2b,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %b = load i64, ptr %ptr
     %sub = sub i64 %a, %b
@@ -92,6 +134,12 @@ define i16 @sub16ri8(i16 noundef %a) {
 ; CHECK-NEXT:    subl $-128, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xef,0x80]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub16ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subl $-128, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xef,0x80]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = sub i16 %a, -128
     ret i16 %sub
@@ -102,6 +150,11 @@ define i32 @sub32ri8(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subl $-128, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xef,0x80]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub32ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subl $-128, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xef,0x80]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = sub i32 %a, -128
     ret i32 %sub
@@ -112,6 +165,11 @@ define i64 @sub64ri8(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subq $-128, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xef,0x80]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub64ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subq $-128, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0xef,0x80]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = sub i64 %a, -128
     ret i64 %sub
@@ -122,6 +180,11 @@ define i8 @sub8ri(i8 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb $-123, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xc7,0x85]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub8ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addb $-123, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0xc7,0x85]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = sub i8 %a, 123
     ret i8 %sub
@@ -134,6 +197,13 @@ define i16 @sub16ri(i16 noundef %a) {
 ; CHECK-NEXT:    # imm = 0xFB2E
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub16ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl $-1234, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xc7,0x2e,0xfb,0xff,0xff]
+; NF-NEXT:    # imm = 0xFB2E
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = sub i16 %a, 1234
     ret i16 %sub
@@ -145,6 +215,12 @@ define i32 @sub32ri(i32 noundef %a) {
 ; CHECK-NEXT:    addl $-123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xc7,0xc0,0x1d,0xfe,0xff]
 ; CHECK-NEXT:    # imm = 0xFFFE1DC0
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub32ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl $-123456, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xc7,0xc0,0x1d,0xfe,0xff]
+; NF-NEXT:    # imm = 0xFFFE1DC0
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = sub i32 %a, 123456
     ret i32 %sub
@@ -156,6 +232,12 @@ define i64 @sub64ri(i64 noundef %a) {
 ; CHECK-NEXT:    subq $-2147483648, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xef,0x00,0x00,0x00,0x80]
 ; CHECK-NEXT:    # imm = 0x80000000
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub64ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subq $-2147483648, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0xef,0x00,0x00,0x00,0x80]
+; NF-NEXT:    # imm = 0x80000000
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = sub i64 %a, -2147483648
     ret i64 %sub
@@ -166,6 +248,11 @@ define i8 @sub8mr(ptr %a, i8 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subb %sil, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub8mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subb %sil, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x28,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %sub = sub nsw i8 %t, %b
@@ -179,6 +266,13 @@ define i16 @sub16mr(ptr %a, i16 noundef %b) {
 ; CHECK-NEXT:    subl %esi, %eax # EVEX TO LEGACY Compression encoding: [0x29,0xf0]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub16mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    subl %esi, %eax # EVEX TO LEGACY Compression encoding: [0x29,0xf0]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %sub = sub nsw i16 %t, %b
@@ -190,6 +284,11 @@ define i32 @sub32mr(ptr %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subl %esi, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub32mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subl %esi, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x29,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %sub = sub nsw i32 %t, %b
@@ -201,6 +300,11 @@ define i64 @sub64mr(ptr %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subq %rsi, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub64mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subq %rsi, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x29,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %sub = sub nsw i64 %t, %b
@@ -214,6 +318,13 @@ define i16 @sub16mi8(ptr %a) {
 ; CHECK-NEXT:    subl $-128, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe8,0x80]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub16mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    subl $-128, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe8,0x80]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %sub = sub nsw i16 %t, -128
@@ -225,6 +336,11 @@ define i32 @sub32mi8(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subl $-128, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x2f,0x80]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub32mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subl $-128, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0x2f,0x80]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %sub = sub nsw i32 %t, -128
@@ -236,6 +352,11 @@ define i64 @sub64mi8(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subq $-128, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x2f,0x80]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub64mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subq $-128, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0x2f,0x80]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %sub = sub nsw i64 %t, -128
@@ -247,6 +368,11 @@ define i8 @sub8mi(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb $-123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x07,0x85]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub8mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addb $-123, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0x07,0x85]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %sub = sub nsw i8 %t, 123
@@ -261,6 +387,14 @@ define i16 @sub16mi(ptr %a) {
 ; CHECK-NEXT:    # imm = 0xFB2E
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub16mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    addl $-1234, %eax # EVEX TO LEGACY Compression encoding: [0x05,0x2e,0xfb,0xff,0xff]
+; NF-NEXT:    # imm = 0xFB2E
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %sub = sub nsw i16 %t, 1234
@@ -273,6 +407,12 @@ define i32 @sub32mi(ptr %a) {
 ; CHECK-NEXT:    addl $-123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x07,0xc0,0x1d,0xfe,0xff]
 ; CHECK-NEXT:    # imm = 0xFFFE1DC0
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub32mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl $-123456, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0x07,0xc0,0x1d,0xfe,0xff]
+; NF-NEXT:    # imm = 0xFFFE1DC0
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %sub = sub nsw i32 %t, 123456
@@ -285,6 +425,12 @@ define i64 @sub64mi(ptr %a) {
 ; CHECK-NEXT:    subq $-2147483648, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x2f,0x00,0x00,0x00,0x80]
 ; CHECK-NEXT:    # imm = 0x80000000
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub64mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subq $-2147483648, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0x2f,0x00,0x00,0x00,0x80]
+; NF-NEXT:    # imm = 0x80000000
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %sub = sub nsw i64 %t, -2147483648
@@ -305,6 +451,15 @@ define i8 @subflag8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag8rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subb %sil, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x28,0xf7]
+; NF-NEXT:    movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
+; NF-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i8 @llvm.usub.sat.i8(i8 %a, i8 %b)
     ret i8 %sub
@@ -318,6 +473,14 @@ define i16 @subflag16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag16rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subw %si, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x29,0xf7]
+; NF-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i16 @llvm.usub.sat.i16(i16 %a, i16 %b)
     ret i16 %sub
@@ -330,6 +493,13 @@ define i32 @subflag32rr(i32 noundef %a, i32 noundef %b) {
 ; CHECK-NEXT:    subl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x29,0xf7]
 ; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag32rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x29,0xf7]
+; NF-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i32 @llvm.usub.sat.i32(i32 %a, i32 %b)
     ret i32 %sub
@@ -342,6 +512,13 @@ define i64 @subflag64rr(i64 noundef %a, i64 noundef %b) {
 ; CHECK-NEXT:    subq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x29,0xf7]
 ; CHECK-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag64rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x29,0xf7]
+; NF-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i64 @llvm.usub.sat.i64(i64 %a, i64 %b)
     ret i64 %sub
@@ -356,6 +533,15 @@ define i8 @subflag8rm(i8 noundef %a, ptr %b) {
 ; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag8rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subb (%rsi), %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x2a,0x3e]
+; NF-NEXT:    movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
+; NF-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i8, ptr %b
     %sub = call i8 @llvm.usub.sat.i8(i8 %a, i8 %t)
@@ -370,6 +556,14 @@ define i16 @subflag16rm(i16 noundef %a, ptr %b) {
 ; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag16rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subw (%rsi), %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x2b,0x3e]
+; NF-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i16, ptr %b
     %sub = call i16 @llvm.usub.sat.i16(i16 %a, i16 %t)
@@ -383,6 +577,13 @@ define i32 @subflag32rm(i32 noundef %a, ptr %b) {
 ; CHECK-NEXT:    subl (%rsi), %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x2b,0x3e]
 ; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag32rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subl (%rsi), %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x2b,0x3e]
+; NF-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i32, ptr %b
     %sub = call i32 @llvm.usub.sat.i32(i32 %a, i32 %t)
@@ -396,6 +597,13 @@ define i64 @subflag64rm(i64 noundef %a, ptr %b) {
 ; CHECK-NEXT:    subq (%rsi), %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x2b,0x3e]
 ; CHECK-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag64rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subq (%rsi), %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x2b,0x3e]
+; NF-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i64, ptr %b
     %sub = call i64 @llvm.usub.sat.i64(i64 %a, i64 %t)
@@ -410,6 +618,14 @@ define i16 @subflag16ri8(i16 noundef %a) {
 ; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag16ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subw $123, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xef,0x7b]
+; NF-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i16 @llvm.usub.sat.i16(i16 %a, i16 123)
     ret i16 %sub
@@ -422,6 +638,13 @@ define i32 @subflag32ri8(i32 noundef %a) {
 ; CHECK-NEXT:    subl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xef,0x7b]
 ; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag32ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xef,0x7b]
+; NF-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i32 @llvm.usub.sat.i32(i32 %a, i32 123)
     ret i32 %sub
@@ -434,6 +657,13 @@ define i64 @subflag64ri8(i64 noundef %a) {
 ; CHECK-NEXT:    subq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xef,0x7b]
 ; CHECK-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag64ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xef,0x7b]
+; NF-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i64 @llvm.usub.sat.i64(i64 %a, i64 123)
     ret i64 %sub
@@ -448,6 +678,15 @@ define i8 @subflag8ri(i8 noundef %a) {
 ; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag8ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subb $123, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xef,0x7b]
+; NF-NEXT:    movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
+; NF-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i8 @llvm.usub.sat.i8(i8 %a, i8 123)
     ret i8 %sub
@@ -462,6 +701,15 @@ define i16 @subflag16ri(i16 noundef %a) {
 ; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag16ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subw $1234, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x81,0xef,0xd2,0x04]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i16 @llvm.usub.sat.i16(i16 %a, i16 1234)
     ret i16 %sub
@@ -475,6 +723,14 @@ define i32 @subflag32ri(i32 noundef %a) {
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag32ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subl $123456, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x81,0xef,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i32 @llvm.usub.sat.i32(i32 %a, i32 123456)
     ret i32 %sub
@@ -488,6 +744,14 @@ define i64 @subflag64ri(i64 noundef %a) {
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag64ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xef,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i64 @llvm.usub.sat.i64(i64 %a, i64 123456)
     ret i64 %sub
@@ -513,6 +777,22 @@ define void @sub64ri_reloc(i64 %val) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:  .LBB41_2: # %f
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub64ri_reloc:
+; NF:       # %bb.0:
+; NF-NEXT:    cmpq $val, %rdi # encoding: [0x48,0x81,0xff,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: val, kind: reloc_signed_4byte
+; NF-NEXT:    jbe .LBB41_2 # encoding: [0x76,A]
+; NF-NEXT:    # fixup A - offset: 1, value: .LBB41_2-1, kind: FK_PCRel_1
+; NF-NEXT:  # %bb.1: # %t
+; NF-NEXT:    pushq %rax # encoding: [0x50]
+; NF-NEXT:    .cfi_def_cfa_offset 16
+; NF-NEXT:    callq f@PLT # encoding: [0xe8,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 1, value: f@PLT-4, kind: FK_PCRel_4
+; NF-NEXT:    popq %rax # encoding: [0x58]
+; NF-NEXT:    .cfi_def_cfa_offset 8
+; NF-NEXT:  .LBB41_2: # %f
+; NF-NEXT:    retq # encoding: [0xc3]
   %cmp = icmp ugt i64 %val, ptrtoint (ptr @val to i64)
   br i1 %cmp, label %t, label %f
 
@@ -529,6 +809,11 @@ define void @sub8mr_legacy(ptr %a, i8 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subb %sil, (%rdi) # encoding: [0x40,0x28,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub8mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    subb %sil, (%rdi) # encoding: [0x40,0x28,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %sub = sub i8 %t, %b
@@ -541,6 +826,11 @@ define void @sub16mr_legacy(ptr %a, i16 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subw %si, (%rdi) # encoding: [0x66,0x29,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub16mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    subw %si, (%rdi) # encoding: [0x66,0x29,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %sub = sub i16 %t, %b
@@ -553,6 +843,11 @@ define void @sub32mr_legacy(ptr %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subl %esi, (%rdi) # encoding: [0x29,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub32mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    subl %esi, (%rdi) # encoding: [0x29,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %sub = sub i32 %t, %b
@@ -565,6 +860,11 @@ define void @sub64mr_legacy(ptr %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subq %rsi, (%rdi) # encoding: [0x48,0x29,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub64mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    subq %rsi, (%rdi) # encoding: [0x48,0x29,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %sub = sub i64 %t, %b
@@ -577,6 +877,11 @@ define void @sub8mi_legacy(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb $-123, (%rdi) # encoding: [0x80,0x07,0x85]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub8mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addb $-123, (%rdi) # encoding: [0x80,0x07,0x85]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %sub = sub nsw i8 %t, 123
@@ -590,6 +895,12 @@ define void @sub16mi_legacy(ptr %a) {
 ; CHECK-NEXT:    addw $-1234, (%rdi) # encoding: [0x66,0x81,0x07,0x2e,0xfb]
 ; CHECK-NEXT:    # imm = 0xFB2E
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub16mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addw $-1234, (%rdi) # encoding: [0x66,0x81,0x07,0x2e,0xfb]
+; NF-NEXT:    # imm = 0xFB2E
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %sub = sub nsw i16 %t, 1234
@@ -603,6 +914,12 @@ define void @sub32mi_legacy(ptr %a) {
 ; CHECK-NEXT:    addl $-123456, (%rdi) # encoding: [0x81,0x07,0xc0,0x1d,0xfe,0xff]
 ; CHECK-NEXT:    # imm = 0xFFFE1DC0
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub32mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addl $-123456, (%rdi) # encoding: [0x81,0x07,0xc0,0x1d,0xfe,0xff]
+; NF-NEXT:    # imm = 0xFFFE1DC0
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %sub = sub nsw i32 %t, 123456
@@ -616,6 +933,12 @@ define void @sub64mi_legacy(ptr %a) {
 ; CHECK-NEXT:    addq $-123456, (%rdi) # encoding: [0x48,0x81,0x07,0xc0,0x1d,0xfe,0xff]
 ; CHECK-NEXT:    # imm = 0xFFFE1DC0
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub64mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addq $-123456, (%rdi) # encoding: [0x48,0x81,0x07,0xc0,0x1d,0xfe,0xff]
+; NF-NEXT:    # imm = 0xFFFE1DC0
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %sub = sub nsw i64 %t, 123456
diff --git a/llvm/test/CodeGen/X86/apx/xor.ll b/llvm/test/CodeGen/X86/apx/xor.ll
index d203fbb02782ab..436b16b4292dfc 100644
--- a/llvm/test/CodeGen/X86/apx/xor.ll
+++ b/llvm/test/CodeGen/X86/apx/xor.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
 
 define i8 @xor8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-LABEL: xor8rr:
@@ -7,6 +8,12 @@ define i8 @xor8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-NEXT:    xorl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x31,0xf7]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor8rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x31,0xf7]
+; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i8 %a, %b
     ret i8 %xor
@@ -18,6 +25,12 @@ define i16 @xor16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK-NEXT:    xorl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x31,0xf7]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor16rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x31,0xf7]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i16 %a, %b
     ret i16 %xor
@@ -28,6 +41,11 @@ define i32 @xor32rr(i32 noundef %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x31,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor32rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x31,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i32 %a, %b
     ret i32 %xor
@@ -38,6 +56,11 @@ define i64 @xor64rr(i64 noundef %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorq %rsi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x31,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor64rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorq %rsi, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x31,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i64 %a, %b
     ret i64 %xor
@@ -48,6 +71,11 @@ define i8 @xor8rm(i8 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x32,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor8rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorb (%rsi), %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x32,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i8, ptr %b
     %xor = xor i8 %a, %t
@@ -59,6 +87,11 @@ define i16 @xor16rm(i16 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x33,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor16rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorw (%rsi), %di, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x33,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i16, ptr %b
     %xor = xor i16 %a, %t
@@ -70,6 +103,11 @@ define i32 @xor32rm(i32 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x33,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor32rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorl (%rsi), %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x33,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i32, ptr %b
     %xor = xor i32 %a, %t
@@ -81,6 +119,11 @@ define i64 @xor64rm(i64 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x33,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor64rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorq (%rsi), %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x33,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i64, ptr %b
     %xor = xor i64 %a, %t
@@ -93,6 +136,12 @@ define i16 @xor16ri8(i16 noundef %a) {
 ; CHECK-NEXT:    xorl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xf7,0x7b]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor16ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xf7,0x7b]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i16 %a, 123
     ret i16 %xor
@@ -103,6 +152,11 @@ define i32 @xor32ri8(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xf7,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor32ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xf7,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i32 %a, 123
     ret i32 %xor
@@ -113,6 +167,11 @@ define i64 @xor64ri8(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorq $123, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xf7,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor64ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorq $123, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0xf7,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i64 %a, 123
     ret i64 %xor
@@ -123,6 +182,11 @@ define i8 @xor8ri(i8 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorb $123, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xf7,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor8ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorb $123, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0xf7,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i8 %a, 123
     ret i8 %xor
@@ -135,6 +199,13 @@ define i16 @xor16ri(i16 noundef %a) {
 ; CHECK-NEXT:    # imm = 0x4D2
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor16ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorl $1234, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xf7,0xd2,0x04,0x00,0x00]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i16 %a, 1234
     ret i16 %xor
@@ -146,6 +217,12 @@ define i32 @xor32ri(i32 noundef %a) {
 ; CHECK-NEXT:    xorl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xf7,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor32ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorl $123456, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xf7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i32 %a, 123456
     ret i32 %xor
@@ -157,6 +234,12 @@ define i64 @xor64ri(i64 noundef %a) {
 ; CHECK-NEXT:    xorq $123456, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xf7,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor64ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorq $123456, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0xf7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i64 %a, 123456
     ret i64 %xor
@@ -167,6 +250,11 @@ define i8 @xor8mr(ptr %a, i8 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorb %sil, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x30,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor8mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorb %sil, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x30,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %xor = xor i8 %t, %b
@@ -178,6 +266,11 @@ define i16 @xor16mr(ptr %a, i16 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorw %si, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x31,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor16mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorw %si, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x31,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %xor = xor i16 %t, %b
@@ -189,6 +282,11 @@ define i32 @xor32mr(ptr %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorl %esi, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x31,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor32mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorl %esi, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x31,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %xor = xor i32 %t, %b
@@ -200,6 +298,11 @@ define i64 @xor64mr(ptr %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorq %rsi, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x31,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor64mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorq %rsi, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x31,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %xor = xor i64 %t, %b
@@ -213,6 +316,13 @@ define i16 @xor16mi8(ptr %a) {
 ; CHECK-NEXT:    xorl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xf0,0x7b]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor16mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    xorl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xf0,0x7b]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %xor = xor i16 %t, 123
@@ -224,6 +334,11 @@ define i32 @xor32mi8(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorl $123, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x37,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor32mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorl $123, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0x37,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %xor = xor i32 %t, 123
@@ -235,6 +350,11 @@ define i64 @xor64mi8(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorq $123, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x37,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor64mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorq $123, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0x37,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %xor = xor i64 %t, 123
@@ -246,6 +366,11 @@ define i8 @xor8mi(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorb $123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x37,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor8mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorb $123, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0x37,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %xor = xor i8 %t, 123
@@ -260,6 +385,14 @@ define i16 @xor16mi(ptr %a) {
 ; CHECK-NEXT:    # imm = 0x4D2
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor16mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    xorl $1234, %eax # EVEX TO LEGACY Compression encoding: [0x35,0xd2,0x04,0x00,0x00]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %xor = xor i16 %t, 1234
@@ -272,6 +405,12 @@ define i32 @xor32mi(ptr %a) {
 ; CHECK-NEXT:    xorl $123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x37,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor32mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorl $123456, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0x37,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %xor = xor i32 %t, 123456
@@ -284,6 +423,12 @@ define i64 @xor64mi(ptr %a) {
 ; CHECK-NEXT:    xorq $123456, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x37,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor64mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorq $123456, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0x37,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %xor = xor i64 %t, 123456
@@ -301,6 +446,15 @@ define i1 @xorflag8rr(i8 %a, i8 %b) {
 ; CHECK-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag8rr:
+; NF:       # %bb.0:
+; NF-NEXT:    {nf} xorl %edi, %esi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x31,0xfe]
+; NF-NEXT:    xorb $-1, %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf0,0xff]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i8 %b, -1
   %v0 = xor i8 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i8 %v0, 0
@@ -317,6 +471,15 @@ define i1 @xorflag16rr(i16 %a, i16 %b) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag16rr:
+; NF:       # %bb.0:
+; NF-NEXT:    {nf} xorl %edi, %esi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x31,0xfe]
+; NF-NEXT:    xorw $-1, %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xf0,0xff]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i16 %b, -1
   %v0 = xor i16 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i16 %v0, 0
@@ -332,6 +495,14 @@ define i1 @xorflag32rr(i32 %a, i32 %b) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag32rr:
+; NF:       # %bb.0:
+; NF-NEXT:    xorl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x31,0xf7]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = xor i32 %a, %b  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
   store i32 %v0, ptr @d64
@@ -346,6 +517,14 @@ define i1 @xorflag64rr(i64 %a, i64 %b) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag64rr:
+; NF:       # %bb.0:
+; NF-NEXT:    xorq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x31,0xf7]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = xor i64 %a, %b  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
   store i64 %v0, ptr @d64
@@ -361,6 +540,15 @@ define i1 @xorflag8rm(ptr %ptr, i8 %b) {
 ; CHECK-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag8rm:
+; NF:       # %bb.0:
+; NF-NEXT:    {nf} xorb (%rdi), %sil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x32,0x37]
+; NF-NEXT:    xorb $-1, %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf0,0xff]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i8, ptr %ptr
   %xor = xor i8 %b, -1
   %v0 = xor i8 %a, %xor  ; 0xff << 50
@@ -378,6 +566,15 @@ define i1 @xorflag16rm(ptr %ptr, i16 %b) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag16rm:
+; NF:       # %bb.0:
+; NF-NEXT:    {nf} xorw (%rdi), %si, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x33,0x37]
+; NF-NEXT:    xorw $-1, %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xf0,0xff]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i16, ptr %ptr
   %xor = xor i16 %b, -1
   %v0 = xor i16 %a, %xor  ; 0xff << 50
@@ -394,6 +591,14 @@ define i1 @xorflag32rm(ptr %ptr, i32 %b) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag32rm:
+; NF:       # %bb.0:
+; NF-NEXT:    xorl (%rdi), %esi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x33,0x37]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i32, ptr %ptr
   %v0 = xor i32 %a, %b  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
@@ -409,6 +614,14 @@ define i1 @xorflag64rm(ptr %ptr, i64 %b) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag64rm:
+; NF:       # %bb.0:
+; NF-NEXT:    xorq (%rdi), %rsi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x33,0x37]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i64, ptr %ptr
   %v0 = xor i64 %a, %b  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
@@ -424,6 +637,14 @@ define i1 @xorflag8ri(i8 %a) {
 ; CHECK-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag8ri:
+; NF:       # %bb.0:
+; NF-NEXT:    xorb $-124, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf7,0x84]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i8 123, -1
   %v0 = xor i8 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i8 %v0, 0
@@ -440,6 +661,15 @@ define i1 @xorflag16ri(i16 %a) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag16ri:
+; NF:       # %bb.0:
+; NF-NEXT:    xorw $-1235, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x81,0xf7,0x2d,0xfb]
+; NF-NEXT:    # imm = 0xFB2D
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i16 1234, -1
   %v0 = xor i16 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i16 %v0, 0
@@ -456,6 +686,15 @@ define i1 @xorflag32ri(i32 %a) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag32ri:
+; NF:       # %bb.0:
+; NF-NEXT:    xorl $123456, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x81,0xf7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = xor i32 %a, 123456  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
   store i32 %v0, ptr @d64
@@ -471,6 +710,15 @@ define i1 @xorflag64ri(i64 %a) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag64ri:
+; NF:       # %bb.0:
+; NF-NEXT:    xorq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xf7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = xor i64 %a, 123456  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
   store i64 %v0, ptr @d64
@@ -485,6 +733,14 @@ define i1 @xorflag16ri8(i16 %a) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag16ri8:
+; NF:       # %bb.0:
+; NF-NEXT:    xorw $-124, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xf7,0x84]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i16 123, -1
   %v0 = xor i16 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i16 %v0, 0
@@ -500,6 +756,14 @@ define i1 @xorflag32ri8(i32 %a) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag32ri8:
+; NF:       # %bb.0:
+; NF-NEXT:    xorl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xf7,0x7b]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = xor i32 %a, 123  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
   store i32 %v0, ptr @d64
@@ -514,6 +778,14 @@ define i1 @xorflag64ri8(i64 %a) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag64ri8:
+; NF:       # %bb.0:
+; NF-NEXT:    xorq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xf7,0x7b]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = xor i64 %a, 123  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
   store i64 %v0, ptr @d64
@@ -525,6 +797,11 @@ define void @xor8mr_legacy(ptr %a, i8 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorb %sil, (%rdi) # encoding: [0x40,0x30,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor8mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorb %sil, (%rdi) # encoding: [0x40,0x30,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %xor = xor i8 %t, %b
@@ -537,6 +814,11 @@ define void @xor16mr_legacy(ptr %a, i16 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorw %si, (%rdi) # encoding: [0x66,0x31,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor16mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorw %si, (%rdi) # encoding: [0x66,0x31,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %xor = xor i16 %t, %b
@@ -549,6 +831,11 @@ define void @xor32mr_legacy(ptr %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorl %esi, (%rdi) # encoding: [0x31,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor32mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %esi, (%rdi) # encoding: [0x31,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %xor = xor i32 %t, %b
@@ -561,6 +848,11 @@ define void @xor64mr_legacy(ptr %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorq %rsi, (%rdi) # encoding: [0x48,0x31,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor64mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorq %rsi, (%rdi) # encoding: [0x48,0x31,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %xor = xor i64 %t, %b

From 76e1a535fd7d8e9451414c76b55d82166c4c5409 Mon Sep 17 00:00:00 2001
From: Christian Sigg <csigg@google.com>
Date: Wed, 29 May 2024 09:45:51 +0200
Subject: [PATCH 073/230] [llvm][bazel] Fix llvm-config after
 3613b2683107bd60fda6d9348623be0686f6d7e3.

---
 utils/bazel/llvm_configs/llvm-config.h.cmake | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/utils/bazel/llvm_configs/llvm-config.h.cmake b/utils/bazel/llvm_configs/llvm-config.h.cmake
index 6605ea60df99e1..629977cc11d683 100644
--- a/utils/bazel/llvm_configs/llvm-config.h.cmake
+++ b/utils/bazel/llvm_configs/llvm-config.h.cmake
@@ -198,4 +198,7 @@
 /* Define if plugins enabled */
 #cmakedefine LLVM_ENABLE_PLUGINS
 
+/* Define if logf128 is available */
+#cmakedefine LLVM_HAS_LOGF128
+
 #endif

From 1c6746e2db58ab7c7a5fb44cd5efa852ce932f84 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Wed, 29 May 2024 08:56:41 +0100
Subject: [PATCH 074/230] [VectorCombine] Add support for zext/sext/trunc to
 shuffleToIdentity (#92696)

This is one of the simple additions to shuffleToIdentity that help it
look through intermediate zext/sext/trunc instructions.
---
 .../Transforms/Vectorize/VectorCombine.cpp    |  5 +-
 .../AArch64/shuffletoidentity.ll              | 51 ++++---------------
 2 files changed, 15 insertions(+), 41 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 056f0d6b3ee6c5..c3c4ee8479766e 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1746,6 +1746,9 @@ static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
     return Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
   if (auto *SI = dyn_cast<SelectInst>(I))
     return Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
+  if (auto *CI = dyn_cast<CastInst>(I))
+    return Builder.CreateCast((Instruction::CastOps)CI->getOpcode(), Ops[0],
+                              DstTy);
   if (II)
     return Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
   assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
@@ -1847,7 +1850,7 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
         isa<CmpInst>(FrontV)) {
       Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
       Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
-    } else if (isa<UnaryOperator>(FrontV)) {
+    } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst>(FrontV)) {
       Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
     } else if (isa<SelectInst>(FrontV)) {
       Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
index 5cbda8a1e112ea..62fb0e6c7c11d9 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
@@ -570,19 +570,10 @@ define <8 x i16> @not_bitcast2(<4 x i32> %x, <8 x i16> %y) {
 
 define void @exttrunc(<8 x i32> %a, <8 x i32> %b, ptr %p) {
 ; CHECK-LABEL: @exttrunc(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[BB:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[BT:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[AB1:%.*]] = zext <4 x i32> [[AB]] to <4 x i64>
-; CHECK-NEXT:    [[AT1:%.*]] = zext <4 x i32> [[AT]] to <4 x i64>
-; CHECK-NEXT:    [[BB1:%.*]] = sext <4 x i32> [[BB]] to <4 x i64>
-; CHECK-NEXT:    [[BT1:%.*]] = sext <4 x i32> [[BT]] to <4 x i64>
-; CHECK-NEXT:    [[ABB:%.*]] = add <4 x i64> [[AB1]], [[BB1]]
-; CHECK-NEXT:    [[ABT:%.*]] = add <4 x i64> [[AT1]], [[BT1]]
-; CHECK-NEXT:    [[ABB1:%.*]] = trunc <4 x i64> [[ABB]] to <4 x i32>
-; CHECK-NEXT:    [[ABT1:%.*]] = trunc <4 x i64> [[ABT]] to <4 x i32>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[ABB1]], <4 x i32> [[ABT1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <8 x i32> [[A:%.*]] to <8 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <8 x i32> [[B:%.*]] to <8 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = trunc <8 x i64> [[TMP3]] to <8 x i32>
 ; CHECK-NEXT:    store <8 x i32> [[R]], ptr [[P:%.*]], align 32
 ; CHECK-NEXT:    ret void
 ;
@@ -605,17 +596,9 @@ define void @exttrunc(<8 x i32> %a, <8 x i32> %b, ptr %p) {
 
 define void @zext(<8 x i16> %a, <8 x i16> %b, ptr %p) {
 ; CHECK-LABEL: @zext(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[BB:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[BT:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[AB1:%.*]] = zext <4 x i16> [[AB]] to <4 x i32>
-; CHECK-NEXT:    [[AT1:%.*]] = zext <4 x i16> [[AT]] to <4 x i32>
-; CHECK-NEXT:    [[BB1:%.*]] = zext <4 x i16> [[BB]] to <4 x i32>
-; CHECK-NEXT:    [[BT1:%.*]] = zext <4 x i16> [[BT]] to <4 x i32>
-; CHECK-NEXT:    [[ABB:%.*]] = add <4 x i32> [[AB1]], [[BB1]]
-; CHECK-NEXT:    [[ABT:%.*]] = add <4 x i32> [[AT1]], [[BT1]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[ABB]], <4 x i32> [[ABT]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <8 x i16> [[A:%.*]] to <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[B:%.*]] to <8 x i32>
+; CHECK-NEXT:    [[R:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    store <8 x i32> [[R]], ptr [[P:%.*]], align 32
 ; CHECK-NEXT:    ret void
 ;
@@ -636,17 +619,9 @@ define void @zext(<8 x i16> %a, <8 x i16> %b, ptr %p) {
 
 define void @sext(<8 x i16> %a, <8 x i16> %b, ptr %p) {
 ; CHECK-LABEL: @sext(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[BB:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[BT:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[AB1:%.*]] = sext <4 x i16> [[AB]] to <4 x i32>
-; CHECK-NEXT:    [[AT1:%.*]] = sext <4 x i16> [[AT]] to <4 x i32>
-; CHECK-NEXT:    [[BB1:%.*]] = sext <4 x i16> [[BB]] to <4 x i32>
-; CHECK-NEXT:    [[BT1:%.*]] = sext <4 x i16> [[BT]] to <4 x i32>
-; CHECK-NEXT:    [[ABB:%.*]] = add <4 x i32> [[AB1]], [[BB1]]
-; CHECK-NEXT:    [[ABT:%.*]] = add <4 x i32> [[AT1]], [[BT1]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[ABB]], <4 x i32> [[ABT]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <8 x i16> [[B:%.*]] to <8 x i32>
+; CHECK-NEXT:    [[R:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    store <8 x i32> [[R]], ptr [[P:%.*]], align 32
 ; CHECK-NEXT:    ret void
 ;
@@ -705,11 +680,7 @@ define void @zext_types(<8 x i16> %a, <8 x i32> %b, ptr %p) {
 
 define void @trunc(<8 x i64> %a, <8 x i64> %b, ptr %p) {
 ; CHECK-LABEL: @trunc(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i64> [[A:%.*]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i64> [[A]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[ABB1:%.*]] = trunc <4 x i64> [[AB]] to <4 x i32>
-; CHECK-NEXT:    [[ABT1:%.*]] = trunc <4 x i64> [[AT]] to <4 x i32>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[ABB1]], <4 x i32> [[ABT1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = trunc <8 x i64> [[A:%.*]] to <8 x i32>
 ; CHECK-NEXT:    store <8 x i32> [[R]], ptr [[P:%.*]], align 32
 ; CHECK-NEXT:    ret void
 ;

From 850f30c3ba378321538233b3cfbd93ae2efef77f Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Wed, 29 May 2024 09:08:32 +0100
Subject: [PATCH 075/230] [ARM][MVE] Don't allow tail-predication with else
 predicates

The test case contains a vpt block with an else-predicated instruction. This
might not be very realistic, but it currently crashes due to not being able to
handle the else. The instruction would need to be removed. This patch adds some
extra checks that none of the instructions in the vpt block are
else-predicated, and otherwise leaves the loop using vctp.
---
 llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp   |  11 +-
 .../CodeGen/Thumb2/mve-tailpred-vptblock.ll   | 197 ++++++++++++++++++
 2 files changed, 206 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll

diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index a3144109b72040..a46c383115e2d6 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -251,6 +251,9 @@ namespace {
       SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI];
       if (Exclusive && Predicates.size() != 1)
         return false;
+      // We do not know how to convert an else predicate of a VCTP.
+      if (getVPTInstrPredicate(*MI) == ARMVCC::Else)
+        return false;
       return llvm::any_of(Predicates, isVCTP);
     }
 
@@ -305,8 +308,12 @@ namespace {
       // isn't predicated on entry, check whether the vctp is within the block
       // and that all other instructions are then predicated on it.
       for (auto &Block : Blocks) {
-        if (isEntryPredicatedOnVCTP(Block, false) ||
-            hasImplicitlyValidVPT(Block, RDA))
+        if (isEntryPredicatedOnVCTP(Block, false) &&
+            !any_of(drop_begin(Block.getInsts()), [](const MachineInstr *MI) {
+              return getVPTInstrPredicate(*MI) == ARMVCC::Else;
+            }))
+          continue;
+        if (hasImplicitlyValidVPT(Block, RDA))
           continue;
 
         SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
diff --git a/llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll b/llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll
new file mode 100644
index 00000000000000..f9b3757bb6d2ce
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll
@@ -0,0 +1,197 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
+
+; This loop has a vpt block that should not block tailpredication
+define void @convert_vptblock(ptr %pchTarget, i16 signext %iTargetStride, ptr %pwLineMask, ptr %ptCopySize, i8 zeroext %chColour, i8 zeroext %chOpacity) {
+; CHECK-LABEL: convert_vptblock:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldrsh.w r12, [r3, #2]
+; CHECK-NEXT:    cmp.w r12, #1
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    bxlt lr
+; CHECK-NEXT:  .LBB0_1: @ %for.body.lr.ph
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    ldrsh.w r10, [r3]
+; CHECK-NEXT:    mov.w r8, #0
+; CHECK-NEXT:    ldrd r4, r5, [sp, #88]
+; CHECK-NEXT:    mov r7, r0
+; CHECK-NEXT:    cmp.w r10, #8
+; CHECK-NEXT:    mov.w r0, #1
+; CHECK-NEXT:    mov r3, r10
+; CHECK-NEXT:    mov.w r11, #0
+; CHECK-NEXT:    it ge
+; CHECK-NEXT:    movge r3, #8
+; CHECK-NEXT:    vidup.u16 q0, r8, #4
+; CHECK-NEXT:    sub.w r3, r10, r3
+; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    adds r3, #7
+; CHECK-NEXT:    vmov.i16 q2, #0x100
+; CHECK-NEXT:    vmov.i16 q3, #0xff
+; CHECK-NEXT:    add.w r9, r0, r3, lsr #3
+; CHECK-NEXT:  .LBB0_2: @ %for.body
+; CHECK-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB0_3 Depth 2
+; CHECK-NEXT:    mov r3, r10
+; CHECK-NEXT:    vmov q4, q0
+; CHECK-NEXT:    mov r6, r8
+; CHECK-NEXT:    mov r0, r7
+; CHECK-NEXT:    dls lr, r9
+; CHECK-NEXT:  .LBB0_3: @ %do.body
+; CHECK-NEXT:    @ Parent Loop BB0_2 Depth=1
+; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    vctp.16 r3
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrbt.u16 q5, [r2, q4]
+; CHECK-NEXT:    vmul.i16 q4, q5, r5
+; CHECK-NEXT:    vshr.u16 q4, q4, #8
+; CHECK-NEXT:    vsub.i16 q5, q2, q4
+; CHECK-NEXT:    vpt.i16 eq, q4, q3
+; CHECK-NEXT:    vmovt q5, q1
+; CHECK-NEXT:    vctp.16 r3
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrbt.u16 q6, [r0]
+; CHECK-NEXT:    vsub.i16 q4, q2, q5
+; CHECK-NEXT:    subs r3, #8
+; CHECK-NEXT:    vmul.i16 q5, q5, q6
+; CHECK-NEXT:    vmla.i16 q5, q4, r4
+; CHECK-NEXT:    vshr.u16 q4, q5, #8
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrbt.16 q4, [r0], #8
+; CHECK-NEXT:    vidup.u16 q4, r6, #4
+; CHECK-NEXT:    le lr, .LBB0_3
+; CHECK-NEXT:  @ %bb.4: @ %do.end
+; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    add.w r0, r11, #1
+; CHECK-NEXT:    add r7, r1
+; CHECK-NEXT:    sxth.w r11, r0
+; CHECK-NEXT:    cmp r11, r12
+; CHECK-NEXT:    blt .LBB0_2
+; CHECK-NEXT:  @ %bb.5:
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    bx lr
+entry:
+  %iHeight1 = getelementptr inbounds i8, ptr %ptCopySize, i32 2
+  %0 = load i16, ptr %iHeight1, align 2
+  %cmp28 = icmp sgt i16 %0, 0
+  br i1 %cmp28, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %1 = load i16, ptr %ptCopySize, align 2
+  %conv5 = sext i16 %1 to i32
+  %2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32 0, i32 4)
+  %conv6 = zext i8 %chOpacity to i16
+  %.splatinsert = insertelement <8 x i16> poison, i16 %conv6, i64 0
+  %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> poison, <8 x i32> zeroinitializer
+  %conv7 = zext i8 %chColour to i16
+  %.splatinsert.i = insertelement <8 x i16> poison, i16 %conv7, i64 0
+  %.splat.i = shufflevector <8 x i16> %.splatinsert.i, <8 x i16> poison, <8 x i32> zeroinitializer
+  %conv11 = sext i16 %iTargetStride to i32
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %do.end, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.lr.ph, %do.end
+  %pchTarget.addr.030 = phi ptr [ %pchTarget, %for.body.lr.ph ], [ %add.ptr12, %do.end ]
+  %y.029 = phi i16 [ 0, %for.body.lr.ph ], [ %inc, %do.end ]
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %for.body
+  %blkCnt.0 = phi i32 [ %conv5, %for.body ], [ %sub8, %do.body ]
+  %.pn = phi { <8 x i16>, i32 } [ %2, %for.body ], [ %13, %do.body ]
+  %pchTargetLine.0 = phi ptr [ %pchTarget.addr.030, %for.body ], [ %add.ptr, %do.body ]
+  %vStride4Offs.0 = extractvalue { <8 x i16>, i32 } %.pn, 0
+  %incr.0 = extractvalue { <8 x i16>, i32 } %.pn, 1
+  %3 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %blkCnt.0)
+  %4 = tail call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0.v8i16.v8i1(ptr %pwLineMask, <8 x i16> %vStride4Offs.0, i32 8, i32 0, i32 1, <8 x i1> %3)
+  %5 = mul <8 x i16> %4, %.splat
+  %shr = lshr <8 x i16> %5, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %6 = icmp eq <8 x i16> %shr, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %7 = sub nuw nsw <8 x i16> <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>, %shr
+  %sub = select <8 x i1> %6, <8 x i16> zeroinitializer, <8 x i16> %7
+  %8 = tail call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %pchTargetLine.0, i32 1, <8 x i1> %3, <8 x i8> zeroinitializer)
+  %9 = zext <8 x i8> %8 to <8 x i16>
+  %sub.i = sub nsw <8 x i16> <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>, %sub
+  %10 = mul <8 x i16> %sub.i, %.splat.i
+  %11 = mul <8 x i16> %sub, %9
+  %add.i = add <8 x i16> %10, %11
+  %shr.i = lshr <8 x i16> %add.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %12 = trunc nuw <8 x i16> %shr.i to <8 x i8>
+  tail call void @llvm.masked.store.v8i8.p0(<8 x i8> %12, ptr %pchTargetLine.0, i32 1, <8 x i1> %3)
+  %13 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32 %incr.0, i32 4)
+  %add.ptr = getelementptr inbounds i8, ptr %pchTargetLine.0, i32 8
+  %sub8 = add nsw i32 %blkCnt.0, -8
+  %cmp9 = icmp sgt i32 %blkCnt.0, 8
+  br i1 %cmp9, label %do.body, label %do.end
+
+do.end:                                           ; preds = %do.body
+  %add.ptr12 = getelementptr inbounds i8, ptr %pchTarget.addr.030, i32 %conv11
+  %inc = add nuw nsw i16 %y.029, 1
+  %cmp = icmp slt i16 %inc, %0
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; This loop has an else predicate on the vqshl, which is not very realistic but
+; prevents us from converting to a vptblock without being able to remove it.
+define i32 @else(ptr %s1, ptr %s2, i32 %x, ptr %d, i32 %n) {
+; CHECK-LABEL: else:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    ldr r2, [sp, #8]
+; CHECK-NEXT:    cmp r2, #4
+; CHECK-NEXT:    mov r3, r2
+; CHECK-NEXT:    it ge
+; CHECK-NEXT:    movge r3, #4
+; CHECK-NEXT:    subs r3, r2, r3
+; CHECK-NEXT:    add.w r12, r3, #3
+; CHECK-NEXT:    movs r3, #1
+; CHECK-NEXT:    add.w r12, r3, r12, lsr #2
+; CHECK-NEXT:    movs r3, #98
+; CHECK-NEXT:    dls lr, r12
+; CHECK-NEXT:  .LBB1_1: @ %do.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vctp.32 r2
+; CHECK-NEXT:    subs r2, #4
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vldrwt.u32 q1, [r1], #16
+; CHECK-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-NEXT:    vmov q2, q1
+; CHECK-NEXT:    vpstet
+; CHECK-NEXT:    vqdmlsdht.s32 q2, q1, q0
+; CHECK-NEXT:    vqshle.u32 q2, r3
+; CHECK-NEXT:    vstrwt.32 q2, [r0], #16
+; CHECK-NEXT:    le lr, .LBB1_1
+; CHECK-NEXT:  @ %bb.2: @ %do.end
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %n.addr.0 = phi i32 [ %n, %entry ], [ %sub, %do.body ]
+  %s2.addr.0 = phi ptr [ %s2, %entry ], [ %add.ptr1, %do.body ]
+  %s1.addr.0 = phi ptr [ %s1, %entry ], [ %add.ptr, %do.body ]
+  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %n.addr.0)
+  %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s1.addr.0, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
+  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s2.addr.0, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
+  %3 = tail call <4 x i32> @llvm.arm.mve.vqdmlad.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %2, <4 x i32> %1, i32 0, i32 0, i32 1, <4 x i1> %0)
+  %4 = xor <4 x i1> %0, <i1 true, i1 true, i1 true, i1 true>
+  %5 = tail call <4 x i32> @llvm.arm.mve.vshl.scalar.predicated.v4i32.v4i1(<4 x i32> %3, i32 98, i32 1, i32 0, i32 1, <4 x i1> %4)
+  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %5, ptr %s1.addr.0, i32 4, <4 x i1> %0)
+  %add.ptr = getelementptr inbounds i8, ptr %s1.addr.0, i32 16
+  %add.ptr1 = getelementptr inbounds i8, ptr %s2.addr.0, i32 16
+  %sub = add nsw i32 %n.addr.0, -4
+  %cmp = icmp sgt i32 %n.addr.0, 4
+  br i1 %cmp, label %do.body, label %do.end
+
+do.end:                                           ; preds = %do.body
+  ret i32 0
+}

From 5aba0ded6c0415bc267a80469c8ea3661e012dc6 Mon Sep 17 00:00:00 2001
From: jeanPerier <jperier@nvidia.com>
Date: Wed, 29 May 2024 10:18:22 +0200
Subject: [PATCH 076/230] [flang] lower assumed-rank variables specification
 expressions (#93477)

Enable lowering of assumed-ranks in specification parts under a debug
flag. I am using a debug flag because many cryptic TODOs/issues may be
hit until more support is added. The development should not take too
long, so I want to stay away from the noise of adding an actual
experimental flag to flang-new.
---
 flang/lib/Lower/ConvertVariable.cpp           | 33 +++++++--
 .../HLFIR/convert-variable-assumed-rank.f90   | 70 +++++++++++++++++++
 2 files changed, 98 insertions(+), 5 deletions(-)
 create mode 100644 flang/test/Lower/HLFIR/convert-variable-assumed-rank.f90

diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index 075d0634fd1eee..8e9c1d640c3302 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -41,9 +41,15 @@
 #include "flang/Optimizer/Support/Utils.h"
 #include "flang/Semantics/runtime-type-info.h"
 #include "flang/Semantics/tools.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include <optional>
 
+static llvm::cl::opt<bool> allowAssumedRank(
+    "allow-assumed-rank",
+    llvm::cl::desc("Enable assumed rank lowering - experimental"),
+    llvm::cl::init(false));
+
 #define DEBUG_TYPE "flang-lower-variable"
 
 /// Helper to lower a scalar expression using a specific symbol mapping.
@@ -1885,7 +1891,8 @@ void Fortran::lower::mapSymbolAttributes(
     return;
   }
 
-  if (Fortran::evaluate::IsAssumedRank(sym))
+  const bool isAssumedRank = Fortran::evaluate::IsAssumedRank(sym);
+  if (isAssumedRank && !allowAssumedRank)
     TODO(loc, "assumed-rank variable in procedure implemented in Fortran");
 
   Fortran::lower::BoxAnalyzer ba;
@@ -1894,6 +1901,8 @@ void Fortran::lower::mapSymbolAttributes(
   // First deal with pointers and allocatables, because their handling here
   // is the same regardless of their rank.
   if (Fortran::semantics::IsAllocatableOrPointer(sym)) {
+    if (isAssumedRank)
+      TODO(loc, "assumed-rank pointer or allocatable");
     // Get address of fir.box describing the entity.
     // global
     mlir::Value boxAlloc = preAlloc;
@@ -1942,7 +1951,7 @@ void Fortran::lower::mapSymbolAttributes(
         if (mlir::Value len =
                 lowerExplicitCharLen(converter, loc, ba, symMap, stmtCtx))
           explicitParams.push_back(len);
-        if (sym.Rank() == 0) {
+        if (!isAssumedRank && sym.Rank() == 0) {
           // Do not keep scalar characters as fir.box (even when optional).
           // Lowering and FIR is not meant to deal with scalar characters as
           // fir.box outside of calls.
@@ -1987,9 +1996,11 @@ void Fortran::lower::mapSymbolAttributes(
         }
       }
       // TODO: derived type length parameters.
-      lowerExplicitLowerBounds(converter, loc, ba, lbounds, symMap, stmtCtx);
-      lowerExplicitExtents(converter, loc, ba, lbounds, explicitExtents, symMap,
-                           stmtCtx);
+      if (!isAssumedRank) {
+        lowerExplicitLowerBounds(converter, loc, ba, lbounds, symMap, stmtCtx);
+        lowerExplicitExtents(converter, loc, ba, lbounds, explicitExtents,
+                             symMap, stmtCtx);
+      }
       genBoxDeclare(converter, symMap, sym, dummyArg, lbounds, explicitParams,
                     explicitExtents, replace);
       return;
@@ -2021,6 +2032,11 @@ void Fortran::lower::mapSymbolAttributes(
     if (isUnusedEntryDummy) {
       assert(!Fortran::semantics::IsAllocatableOrPointer(sym) &&
              "handled above");
+      // Need to add support for allocatable assumed-rank to use
+      // logic below, or to simplify it and add codegen for fir.zero
+      // !fir.box<> instead.
+      if (isAssumedRank)
+        TODO(loc, "assumed rank in ENTRY");
       // The box is read right away because lowering code does not expect
       // a non pointer/allocatable symbol to be mapped to a MutableBox.
       mlir::Type ty = converter.genType(var);
@@ -2042,6 +2058,13 @@ void Fortran::lower::mapSymbolAttributes(
     return false;
   };
 
+  if (isAssumedRank) {
+    assert(isUnusedEntryDummy && "assumed rank must be pointers/allocatables "
+                                 "or descriptor dummy arguments");
+    genUnusedEntryPointBox();
+    return;
+  }
+
   // Helper to generate scalars for the symbol properties.
   auto genValue = [&](const Fortran::lower::SomeExpr &expr) {
     return genScalarValue(converter, loc, expr, symMap, stmtCtx);
diff --git a/flang/test/Lower/HLFIR/convert-variable-assumed-rank.f90 b/flang/test/Lower/HLFIR/convert-variable-assumed-rank.f90
new file mode 100644
index 00000000000000..748c15be84496c
--- /dev/null
+++ b/flang/test/Lower/HLFIR/convert-variable-assumed-rank.f90
@@ -0,0 +1,70 @@
+! Test lowering of assumed-rank variables
+! RUN: bbc -emit-hlfir %s -allow-assumed-rank -o - | FileCheck %s
+
+module assumed_rank_tests
+interface
+subroutine takes_real(x)
+  real :: x(..)
+end subroutine
+subroutine takes_char(x)
+  character(*) :: x(..)
+end subroutine
+end interface
+contains
+
+subroutine test_intrinsic(x)
+  real :: x(..)
+  call takes_real(x)
+end subroutine
+
+subroutine test_character_explicit_len(x, n)
+  integer(8) :: n
+  character(n) :: x(..)
+  call takes_char(x)
+end subroutine
+
+subroutine test_character_assumed_len(x)
+  character(*) :: x(..)
+  call takes_char(x)
+end subroutine
+
+subroutine test_with_attrs(x)
+  real, target, optional :: x(..)
+  call takes_real(x)
+end subroutine
+! CHECK-LABEL:   func.func @_QMassumed_rank_testsPtest_intrinsic(
+! CHECK-SAME:                                                    %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QMassumed_rank_testsFtest_intrinsicEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK:           fir.call @_QPtakes_real(%[[VAL_2]]#0) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> ()
+! CHECK:           return
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QMassumed_rank_testsPtest_character_explicit_len(
+! CHECK-SAME:                                                                 %[[VAL_0:.*]]: !fir.box<!fir.array<*:!fir.char<1,?>>> {fir.bindc_name = "x"},
+! CHECK-SAME:                                                                 %[[VAL_1:.*]]: !fir.ref<i64> {fir.bindc_name = "n"}) {
+! CHECK:           %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QMassumed_rank_testsFtest_character_explicit_lenEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:           %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i64>
+! CHECK:           %[[VAL_5:.*]] = arith.constant 0 : i64
+! CHECK:           %[[VAL_6:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_5]] : i64
+! CHECK:           %[[VAL_7:.*]] = arith.select %[[VAL_6]], %[[VAL_4]], %[[VAL_5]] : i64
+! CHECK:           %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] dummy_scope %[[VAL_2]] {uniq_name = "_QMassumed_rank_testsFtest_character_explicit_lenEx"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, i64, !fir.dscope) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>)
+! CHECK:           fir.call @_QPtakes_char(%[[VAL_8]]#0) fastmath<contract> : (!fir.box<!fir.array<*:!fir.char<1,?>>>) -> ()
+! CHECK:           return
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QMassumed_rank_testsPtest_character_assumed_len(
+! CHECK-SAME:                                                                %[[VAL_0:.*]]: !fir.box<!fir.array<*:!fir.char<1,?>>> {fir.bindc_name = "x"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QMassumed_rank_testsFtest_character_assumed_lenEx"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>)
+! CHECK:           fir.call @_QPtakes_char(%[[VAL_2]]#0) fastmath<contract> : (!fir.box<!fir.array<*:!fir.char<1,?>>>) -> ()
+! CHECK:           return
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QMassumed_rank_testsPtest_with_attrs(
+! CHECK-SAME:                                                     %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x", fir.optional, fir.target}) {
+! CHECK:           %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<optional, target>, uniq_name = "_QMassumed_rank_testsFtest_with_attrsEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK:           fir.call @_QPtakes_real(%[[VAL_2]]#0) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> ()
+end module

From 326f58d7d68c33cfbb6ad54123ab9b56114de502 Mon Sep 17 00:00:00 2001
From: jeanPerier <jperier@nvidia.com>
Date: Wed, 29 May 2024 10:19:07 +0200
Subject: [PATCH 077/230] [flang][HLFIR] lower hlfir.declare of assumed-ranks
 (#93468)

hlfir.declare is in charge of ensuring that the lower bounds of its
"hlfir entity" output are the ones of the source program. For
non-allocatable/non-pointer assumed-ranks where the input descriptor
lower bounds may not be ones, the hlfir.declare needs to be lowered to
a fir.rebox_assumed_rank to set the lower bounds to ones.
---
 .../Optimizer/HLFIR/Transforms/ConvertToFIR.cpp | 17 +++++++++++------
 flang/test/HLFIR/declare-codegen.fir            |  9 +++++++++
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp
index b8823bfa59f8f2..b48b993ddc5aff 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp
@@ -348,7 +348,17 @@ class DeclareOpConversion : public mlir::OpRewritePattern<hlfir::DeclareOp> {
       // Helper to generate the hlfir fir.box with the local lower bounds and
       // type parameters.
       auto genHlfirBox = [&]() -> mlir::Value {
-        if (!mlir::isa<fir::BaseBoxType>(firBase.getType())) {
+        if (auto baseBoxType =
+                mlir::dyn_cast<fir::BaseBoxType>(firBase.getType())) {
+          // Rebox so that lower bounds are correct.
+          if (baseBoxType.isAssumedRank())
+            return builder.create<fir::ReboxAssumedRankOp>(
+                loc, hlfirBaseType, firBase,
+                fir::LowerBoundModifierAttribute::SetToOnes);
+          return builder.create<fir::ReboxOp>(loc, hlfirBaseType, firBase,
+                                              declareOp.getShape(),
+                                              /*slice=*/mlir::Value{});
+        } else {
           llvm::SmallVector<mlir::Value> typeParams;
           auto maybeCharType = mlir::dyn_cast<fir::CharacterType>(
               fir::unwrapSequenceType(fir::unwrapPassByRefType(hlfirBaseType)));
@@ -358,11 +368,6 @@ class DeclareOpConversion : public mlir::OpRewritePattern<hlfir::DeclareOp> {
           return builder.create<fir::EmboxOp>(
               loc, hlfirBaseType, firBase, declareOp.getShape(),
               /*slice=*/mlir::Value{}, typeParams);
-        } else {
-          // Rebox so that lower bounds are correct.
-          return builder.create<fir::ReboxOp>(loc, hlfirBaseType, firBase,
-                                              declareOp.getShape(),
-                                              /*slice=*/mlir::Value{});
         }
       };
       if (!mlir::cast<fir::FortranVariableOpInterface>(declareOp.getOperation())
diff --git a/flang/test/HLFIR/declare-codegen.fir b/flang/test/HLFIR/declare-codegen.fir
index 9f51d0fbc7afd7..bd0d61a2559dbd 100644
--- a/flang/test/HLFIR/declare-codegen.fir
+++ b/flang/test/HLFIR/declare-codegen.fir
@@ -210,3 +210,12 @@ func.func @dummy_scope(%arg0: !fir.ref<f32>) {
 // CHECK-SAME:    %[[VAL_0:.*]]: !fir.ref<f32>) {
 // CHECK:         %[[SCOPE:.*]] = fir.dummy_scope : !fir.dscope
 // CHECK:         %[[VAL_1:.*]] = fir.declare %[[VAL_0]] dummy_scope %[[SCOPE]] {uniq_name = "x"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+
+func.func @assumed_rank_declare(%arg0: !fir.box<!fir.array<*:f32>>) {
+  %0:2 = hlfir.declare %arg0 {uniq_name = "x"} : (!fir.box<!fir.array<*:f32>>) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+  return
+}
+// CHECK-LABEL:  func.func @assumed_rank_declare(
+// CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>>) {
+// CHECK:    %[[VAL_1:.*]] = fir.declare %[[VAL_0]] {uniq_name = "x"} : (!fir.box<!fir.array<*:f32>>) -> !fir.box<!fir.array<*:f32>>
+// CHECK:    %[[VAL_2:.*]] = fir.rebox_assumed_rank %[[VAL_1]] lbs ones : (!fir.box<!fir.array<*:f32>>) -> !fir.box<!fir.array<*:f32>>

From 6957c00a8ccd36d990ebeb3b672621ba237bd9d8 Mon Sep 17 00:00:00 2001
From: Alastair Houghton <ahoughton@apple.com>
Date: Wed, 29 May 2024 09:27:30 +0100
Subject: [PATCH 078/230] [RuntimeDyld][ELF][AArch64] Fix
 resolveAArch64ShortBranch. (#92245)

We don't know the load addresses when this function is called, so it
shouldn't be trying to use them to determine whether or not the branch
is short. Notably, this will fail in the case where the code is being
loaded into a target in such a way that the section offsets differ
between the process generating the code and the target process.

rdar://127673408
---
 .../RuntimeDyld/RuntimeDyldELF.cpp            | 30 ++++++++++++-------
 .../AArch64/ELF_ARM64_xsec_branch.s           | 20 +++++++++++++
 2 files changed, 40 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_xsec_branch.s

diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index eaf8c35142defe..0046220611203c 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -1129,7 +1129,8 @@ uint32_t RuntimeDyldELF::getMatchingLoRelocation(uint32_t RelType,
 bool RuntimeDyldELF::resolveAArch64ShortBranch(
     unsigned SectionID, relocation_iterator RelI,
     const RelocationValueRef &Value) {
-  uint64_t Address;
+  uint64_t TargetOffset;
+  unsigned TargetSectionID;
   if (Value.SymbolName) {
     auto Loc = GlobalSymbolTable.find(Value.SymbolName);
 
@@ -1138,23 +1139,32 @@ bool RuntimeDyldELF::resolveAArch64ShortBranch(
       return false;
 
     const auto &SymInfo = Loc->second;
-    Address =
-        uint64_t(Sections[SymInfo.getSectionID()].getLoadAddressWithOffset(
-            SymInfo.getOffset()));
+
+    TargetSectionID = SymInfo.getSectionID();
+    TargetOffset = SymInfo.getOffset();
   } else {
-    Address = uint64_t(Sections[Value.SectionID].getLoadAddress());
+    TargetSectionID = Value.SectionID;
+    TargetOffset = 0;
   }
-  uint64_t Offset = RelI->getOffset();
-  uint64_t SourceAddress = Sections[SectionID].getLoadAddressWithOffset(Offset);
+
+  // We don't actually know the load addresses at this point, so if the
+  // branch is cross-section, we don't know exactly how far away it is.
+  if (TargetSectionID != SectionID)
+    return false;
+
+  uint64_t SourceOffset = RelI->getOffset();
 
   // R_AARCH64_CALL26 requires immediate to be in range -2^27 <= imm < 2^27
   // If distance between source and target is out of range then we should
   // create thunk.
-  if (!isInt<28>(Address + Value.Addend - SourceAddress))
+  if (!isInt<28>(TargetOffset + Value.Addend - SourceOffset))
     return false;
 
-  resolveRelocation(Sections[SectionID], Offset, Address, RelI->getType(),
-                    Value.Addend);
+  RelocationEntry RE(SectionID, SourceOffset, RelI->getType(), Value.Addend);
+  if (Value.SymbolName)
+    addRelocationForSymbol(RE, Value.SymbolName);
+  else
+    addRelocationForSection(RE, Value.SectionID);
 
   return true;
 }
diff --git a/llvm/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_xsec_branch.s b/llvm/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_xsec_branch.s
new file mode 100644
index 00000000000000..fd04f569526b9f
--- /dev/null
+++ b/llvm/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_xsec_branch.s
@@ -0,0 +1,20 @@
+# RUN: llvm-mc -triple=arm64-none-linux-gnu -filetype=obj -o %t %s
+# RUN: llvm-rtdyld -triple=arm64-none-linux-gnu -verify -check=%s %t
+
+.globl _main
+.weak _label1
+
+.section .text.label1,"ax"
+_label1:
+        nop
+
+.section .text.main,"ax"
+_main:
+        b _label1
+
+# Branch must be to stub in .text.main, *not* back to _label1, because
+# in general sections could be loaded at arbitrary addresses in target memory,
+# and when initially processing locations and generating stubs we don't know
+# the final layout yet, so we can't tell if the branch offset is within range.
+
+# rtdyld-check: *{4}(_main) = 0x14000001

From 4ad2f415f6e30ceb116466bf81515d3765402a0f Mon Sep 17 00:00:00 2001
From: AnastasiyaChernikova <anastasiya.chernikova@syntacore.com>
Date: Wed, 29 May 2024 11:28:00 +0300
Subject: [PATCH 079/230] [Exegesis] Changing non-standard CHECK in tests to
 more compliant way (#93222)

Replaced the non-standard `CHECK-LAST` directives in tests with
`CHECK-DAG`. First found in PR89047
(https://github.com/llvm/llvm-project/pull/89047#discussion_r1608909489)
---
 .../test/tools/llvm-exegesis/AArch64/latency-by-opcode-name.s | 2 +-
 llvm/test/tools/llvm-exegesis/Mips/latency-GPR64.s            | 2 +-
 llvm/test/tools/llvm-exegesis/Mips/latency-by-opcode-name.s   | 2 +-
 .../test/tools/llvm-exegesis/PowerPC/latency-by-opcode-name.s | 2 +-
 llvm/test/tools/llvm-exegesis/X86/latency/latency-CMOV32rr.s  | 2 +-
 llvm/test/tools/llvm-exegesis/X86/latency/latency-IN16rr.s    | 2 +-
 llvm/test/tools/llvm-exegesis/X86/latency/latency-SBB8rr.s    | 2 +-
 llvm/test/tools/llvm-exegesis/X86/latency/latency-SQRTSSr.s   | 2 +-
 .../tools/llvm-exegesis/X86/latency/latency-by-opcode-name.s  | 2 +-
 llvm/test/tools/llvm-exegesis/X86/latency/max-configs.test    | 4 ++--
 llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s               | 2 +-
 .../X86/uops/uops-CMOV16rm-noreg-serialization.s              | 2 +-
 12 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/llvm/test/tools/llvm-exegesis/AArch64/latency-by-opcode-name.s b/llvm/test/tools/llvm-exegesis/AArch64/latency-by-opcode-name.s
index 653f544e36ce26..1db28a84e2ff62 100644
--- a/llvm/test/tools/llvm-exegesis/AArch64/latency-by-opcode-name.s
+++ b/llvm/test/tools/llvm-exegesis/AArch64/latency-by-opcode-name.s
@@ -10,4 +10,4 @@ CHECK-NEXT: config: ''
 CHECK-NEXT: register_initial_values:
 CHECK-DAG: - '[[REG2]]=0x0'
 # We don't check REG3 because in the case that REG2=REG3 the check would fail
-CHECK-LAST: ...
+CHECK-DAG: ...
diff --git a/llvm/test/tools/llvm-exegesis/Mips/latency-GPR64.s b/llvm/test/tools/llvm-exegesis/Mips/latency-GPR64.s
index f9b4860c3f4a09..cc2cf20ce05f46 100644
--- a/llvm/test/tools/llvm-exegesis/Mips/latency-GPR64.s
+++ b/llvm/test/tools/llvm-exegesis/Mips/latency-GPR64.s
@@ -9,4 +9,4 @@ CHECK-NEXT:     AND64
 CHECK-NEXT: config: ''
 CHECK-NEXT: register_initial_values:
 CHECK-DAG: - '[[REG1:[A-Z0-9]+_64]]=0x0'
-CHECK-LAST: ...
+CHECK-DAG: ...
diff --git a/llvm/test/tools/llvm-exegesis/Mips/latency-by-opcode-name.s b/llvm/test/tools/llvm-exegesis/Mips/latency-by-opcode-name.s
index f3853eaa62ea7d..dcbbd3cf7fc355 100644
--- a/llvm/test/tools/llvm-exegesis/Mips/latency-by-opcode-name.s
+++ b/llvm/test/tools/llvm-exegesis/Mips/latency-by-opcode-name.s
@@ -9,4 +9,4 @@ CHECK-NEXT:     ADD
 CHECK-NEXT: config: ''
 CHECK-NEXT: register_initial_values:
 CHECK-DAG: - '[[REG1:[A-Z0-9]+]]=0x0'
-CHECK-LAST: ...
+CHECK-DAG: ...
diff --git a/llvm/test/tools/llvm-exegesis/PowerPC/latency-by-opcode-name.s b/llvm/test/tools/llvm-exegesis/PowerPC/latency-by-opcode-name.s
index 3d457aeb59276a..c4d9fcf2e0613a 100644
--- a/llvm/test/tools/llvm-exegesis/PowerPC/latency-by-opcode-name.s
+++ b/llvm/test/tools/llvm-exegesis/PowerPC/latency-by-opcode-name.s
@@ -8,4 +8,4 @@ CHECK-NEXT:     ADD8
 CHECK-NEXT: config: ''
 CHECK-NEXT: register_initial_values:
 CHECK-DAG: - '[[REG1:[A-Z0-9]+]]=0x0'
-CHECK-LAST: ...
+CHECK-DAG: ...
diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/latency-CMOV32rr.s b/llvm/test/tools/llvm-exegesis/X86/latency/latency-CMOV32rr.s
index 9cdd9bf029d023..384f9f1d8cf9e8 100644
--- a/llvm/test/tools/llvm-exegesis/X86/latency/latency-CMOV32rr.s
+++ b/llvm/test/tools/llvm-exegesis/X86/latency/latency-CMOV32rr.s
@@ -8,4 +8,4 @@ CHECK-NEXT: key:
 CHECK-NEXT:   instructions:
 CHECK-NEXT:     'CMOV32rr {{.*}} i_0x{{[0-9a-f]}}'
 CHECK-NEXT: config: ''
-CHECK-LAST: ...
+CHECK-DAG: ...
diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/latency-IN16rr.s b/llvm/test/tools/llvm-exegesis/X86/latency/latency-IN16rr.s
index 8b4f42dd320153..c82f5c884b9928 100644
--- a/llvm/test/tools/llvm-exegesis/X86/latency/latency-IN16rr.s
+++ b/llvm/test/tools/llvm-exegesis/X86/latency/latency-IN16rr.s
@@ -12,4 +12,4 @@ CHECK-NEXT:     - {{.*}}
 CHECK-NEXT: config: ''
 CHECK-NEXT: register_initial_values:
 CHECK-DAG: - '[[REG1:[A-Z0-9]+]]=0x0'
-CHECK-LAST: ...
+CHECK-DAG: ...
diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/latency-SBB8rr.s b/llvm/test/tools/llvm-exegesis/X86/latency/latency-SBB8rr.s
index c20e687cf20d21..26c4391bc99d6b 100644
--- a/llvm/test/tools/llvm-exegesis/X86/latency/latency-SBB8rr.s
+++ b/llvm/test/tools/llvm-exegesis/X86/latency/latency-SBB8rr.s
@@ -9,4 +9,4 @@ CHECK-NEXT:     SBB8rr
 CHECK-NEXT: config: ''
 CHECK-NEXT: register_initial_values:
 CHECK-DAG: - '[[REG1:[A-Z0-9]+]]=0x0'
-CHECK-LAST: ...
+CHECK-DAG: ...
diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/latency-SQRTSSr.s b/llvm/test/tools/llvm-exegesis/X86/latency/latency-SQRTSSr.s
index 7e67a4343f4e68..bf97a40c4bf0da 100644
--- a/llvm/test/tools/llvm-exegesis/X86/latency/latency-SQRTSSr.s
+++ b/llvm/test/tools/llvm-exegesis/X86/latency/latency-SQRTSSr.s
@@ -10,4 +10,4 @@ CHECK-NEXT:     SQRTSSr
 CHECK-NEXT: config: ''
 CHECK-NEXT: register_initial_values:
 CHECK-NOT: crashed
-CHECK-LAST: ...
+CHECK-DAG: ...
diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/latency-by-opcode-name.s b/llvm/test/tools/llvm-exegesis/X86/latency/latency-by-opcode-name.s
index 4fee6fe927097a..08beccfe7704f4 100644
--- a/llvm/test/tools/llvm-exegesis/X86/latency/latency-by-opcode-name.s
+++ b/llvm/test/tools/llvm-exegesis/X86/latency/latency-by-opcode-name.s
@@ -9,4 +9,4 @@ CHECK-NEXT:     ADD32rr
 CHECK-NEXT: config: ''
 CHECK-NEXT: register_initial_values:
 CHECK-DAG: - '[[REG1:[A-Z0-9]+]]=0x0'
-CHECK-LAST: ...
+CHECK-DAG: ...
diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/max-configs.test b/llvm/test/tools/llvm-exegesis/X86/latency/max-configs.test
index 382e742144ac45..f27101d8966080 100644
--- a/llvm/test/tools/llvm-exegesis/X86/latency/max-configs.test
+++ b/llvm/test/tools/llvm-exegesis/X86/latency/max-configs.test
@@ -9,7 +9,7 @@ CHECK-NEXT:     SBB8rr
 CHECK-NEXT: config: ''
 CHECK-NEXT: register_initial_values:
 CHECK-DAG: - '[[REG1:[A-Z0-9]+]]=0x0'
-CHECK-LAST: ...
+CHECK-DAG: ...
 
 CHECK1-NOT: SBB8rr
 
@@ -21,4 +21,4 @@ CHECK2-NEXT:     SBB8rr
 CHECK2-NEXT: config: ''
 CHECK2-NEXT: register_initial_values:
 CHECK2-DAG: - '[[REG1:[A-Z0-9]+]]=0x0'
-CHECK2-LAST: ...
+CHECK2-DAG: ...
diff --git a/llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s b/llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s
index af1662d93a7440..2a8cc8e34450ad 100644
--- a/llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s
+++ b/llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s
@@ -16,4 +16,4 @@ CHECK-NEXT: {{.*}}
 CHECK-NEXT: num_repetitions: 10000
 CHECK-NEXT: measurements:
 CHECK-NEXT: {{.*}} value: 0.0001, per_snippet_value: 0.0002 {{.*}}
-CHECK-LAST: ...
+CHECK-DAG: ...
diff --git a/llvm/test/tools/llvm-exegesis/X86/uops/uops-CMOV16rm-noreg-serialization.s b/llvm/test/tools/llvm-exegesis/X86/uops/uops-CMOV16rm-noreg-serialization.s
index 302c2b0ee722b0..1e673e806da212 100644
--- a/llvm/test/tools/llvm-exegesis/X86/uops/uops-CMOV16rm-noreg-serialization.s
+++ b/llvm/test/tools/llvm-exegesis/X86/uops/uops-CMOV16rm-noreg-serialization.s
@@ -8,4 +8,4 @@ CHECK-YAML-NEXT: mode:            uops
 CHECK-YAML-NEXT: key:
 CHECK-YAML-NEXT:   instructions:
 CHECK-YAML-NEXT:     - 'CMOV16rm {{[A-Z0-9]+}} {{[A-Z0-9]+}} {{[A-Z0-9]+}} i_0x1 %noreg i_0x0 %noreg i_0x{{[0-9a-f]}}'
-CHECK-YAML-LAST: ...
+CHECK-YAML-DAG: ...

From 93d8d74ae6717c8e7c8b25ad5a6cfa212d3a4d37 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Wed, 29 May 2024 09:36:53 +0100
Subject: [PATCH 080/230] [VectorCombine] Remove requirement for Instructions
 in shuffleToIdentity (#93543)

This removes the check that both operands of the original shuffle are
instructions, which is a relic from a previous version that held more
variables as Instructions.
---
 .../Transforms/Vectorize/VectorCombine.cpp    |  3 +-
 .../AArch64/shuffletoidentity.ll              | 29 +++++++------------
 2 files changed, 11 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index c3c4ee8479766e..7ecfe5218ef67c 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1760,8 +1760,7 @@ static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
 // do so.
 bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
   auto *Ty = dyn_cast<FixedVectorType>(I.getType());
-  if (!Ty || !isa<Instruction>(I.getOperand(0)) ||
-      !isa<Instruction>(I.getOperand(1)))
+  if (!Ty)
     return false;
 
   SmallVector<InstLane> Start(Ty->getNumElements());
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
index 62fb0e6c7c11d9..c2e9be56889678 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
@@ -15,9 +15,7 @@ define <8 x i8> @trivial(<8 x i8> %a) {
 
 define <4 x i32> @add_same_operands(<4 x i32> %x) {
 ; CHECK-LABEL: @add_same_operands(
-; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[SHUF]], [[SHUF]]
-; CHECK-NEXT:    [[REVSHUF:%.*]] = shufflevector <4 x i32> [[ADD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[REVSHUF:%.*]] = add <4 x i32> [[X:%.*]], [[X]]
 ; CHECK-NEXT:    ret <4 x i32> [[REVSHUF]]
 ;
   %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -364,8 +362,7 @@ define <8 x i8> @inner_shuffle(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
 define <4 x i32> @extrause_add_same_operands(<4 x i32> %x) {
 ; CHECK-LABEL: @extrause_add_same_operands(
 ; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[SHUF]], [[SHUF]]
-; CHECK-NEXT:    [[REVSHUF:%.*]] = shufflevector <4 x i32> [[ADD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[REVSHUF:%.*]] = add <4 x i32> [[X]], [[X]]
 ; CHECK-NEXT:    [[ADD2:%.*]] = add <4 x i32> [[SHUF]], [[REVSHUF]]
 ; CHECK-NEXT:    ret <4 x i32> [[ADD2]]
 ;
@@ -513,9 +510,7 @@ define <8 x half> @fma(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 
 define <4 x i64> @single_zext(<4 x i32> %x) {
 ; CHECK-LABEL: @single_zext(
-; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[ZEXT:%.*]] = zext <4 x i32> [[SHUF]] to <4 x i64>
-; CHECK-NEXT:    [[REVSHUF:%.*]] = shufflevector <4 x i64> [[ZEXT]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[REVSHUF:%.*]] = zext <4 x i32> [[X:%.*]] to <4 x i64>
 ; CHECK-NEXT:    ret <4 x i64> [[REVSHUF]]
 ;
   %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -695,10 +690,8 @@ define void @trunc(<8 x i64> %a, <8 x i64> %b, ptr %p) {
 
 define <4 x i64> @zext_chain(<4 x i16> %x) {
 ; CHECK-LABEL: @zext_chain(
-; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i16> [[X:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[ZEXT:%.*]] = zext <4 x i16> [[SHUF]] to <4 x i32>
-; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i32> [[ZEXT]] to <4 x i64>
-; CHECK-NEXT:    [[REVSHUF:%.*]] = shufflevector <4 x i64> [[SEXT]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i16> [[X:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[REVSHUF:%.*]] = sext <4 x i32> [[TMP1]] to <4 x i64>
 ; CHECK-NEXT:    ret <4 x i64> [[REVSHUF]]
 ;
   %shuf = shufflevector <4 x i16> %x, <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -899,13 +892,11 @@ entry:
 
 define <4 x i8> @singleop(<4 x i8> %a, <4 x i8> %b) {
 ; CHECK-LABEL: @singleop(
-; CHECK-NEXT:    [[A1:%.*]] = shufflevector <4 x i8> [[A:%.*]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[B1:%.*]] = shufflevector <4 x i8> [[B:%.*]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[A2:%.*]] = zext <4 x i8> [[A1]] to <4 x i16>
-; CHECK-NEXT:    [[B2:%.*]] = zext <4 x i8> [[B1]] to <4 x i16>
-; CHECK-NEXT:    [[AB:%.*]] = add <4 x i16> [[A2]], [[B2]]
-; CHECK-NEXT:    [[T:%.*]] = trunc <4 x i16> [[AB]] to <4 x i8>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i8> [[T]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i8> [[B:%.*]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[A:%.*]] to <4 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = trunc <4 x i16> [[TMP4]] to <4 x i8>
 ; CHECK-NEXT:    ret <4 x i8> [[R]]
 ;
   %a1 = shufflevector <4 x i8> %a, <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>

From fa649df8e54c2aa8921a42ad8d10e1e45700e5d7 Mon Sep 17 00:00:00 2001
From: Daniel Grumberg <dgrumberg@apple.com>
Date: Wed, 29 May 2024 09:47:23 +0100
Subject: [PATCH 081/230] [clang][ExtractAPI] Flatten all enum cases from
 anonymous enums at top level (#93559)

rdar://128863241
---
 .../clang/ExtractAPI/ExtractAPIVisitor.h      |  65 +++++-----
 .../ExtractAPI/anonymous_record_no_typedef.c  |  42 ++-----
 clang/test/ExtractAPI/enum.c                  | 112 ------------------
 clang/tools/libclang/CXExtractAPI.cpp         |   3 +
 4 files changed, 54 insertions(+), 168 deletions(-)

diff --git a/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h b/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h
index 8ccebe457ed530..76d7fd798bed3a 100644
--- a/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h
+++ b/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h
@@ -21,6 +21,7 @@
 #include "clang/AST/DeclTemplate.h"
 #include "clang/AST/ParentMapContext.h"
 #include "clang/AST/RecursiveASTVisitor.h"
+#include "clang/Basic/LLVM.h"
 #include "clang/Basic/Module.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Basic/Specifiers.h"
@@ -127,7 +128,7 @@ class ExtractAPIVisitorBase : public RecursiveASTVisitor<Derived> {
 protected:
   /// Collect API information for the enum constants and associate with the
   /// parent enum.
-  void recordEnumConstants(EnumRecord *EnumRecord,
+  void recordEnumConstants(SymbolReference Container,
                            const EnumDecl::enumerator_range Constants);
 
   /// Collect API information for the Objective-C methods and associate with the
@@ -248,12 +249,8 @@ class ExtractAPIVisitorBase : public RecursiveASTVisitor<Derived> {
     clang::index::generateUSRForDecl(Tag, TagUSR);
     if (auto *Record = llvm::dyn_cast_if_present<TagRecord>(
             API.findRecordForUSR(TagUSR))) {
-      if (Record->IsEmbeddedInVarDeclarator) {
+      if (Record->IsEmbeddedInVarDeclarator)
         NewRecordContext->stealRecordChain(*Record);
-        auto *NewRecord = cast<APIRecord>(NewRecordContext);
-        if (NewRecord->Comment.empty())
-          NewRecord->Comment = Record->Comment;
-      }
     }
   }
 };
@@ -394,17 +391,6 @@ bool ExtractAPIVisitorBase<Derived>::VisitEnumDecl(const EnumDecl *Decl) {
   if (!getDerivedExtractAPIVisitor().shouldDeclBeIncluded(Decl))
     return true;
 
-  SmallString<128> QualifiedNameBuffer;
-  // Collect symbol information.
-  StringRef Name = Decl->getName();
-  if (Name.empty())
-    Name = getTypedefName(Decl);
-  if (Name.empty()) {
-    llvm::raw_svector_ostream OS(QualifiedNameBuffer);
-    Decl->printQualifiedName(OS);
-    Name = QualifiedNameBuffer;
-  }
-
   SmallString<128> USR;
   index::generateUSRForDecl(Decl, USR);
   PresumedLoc Loc =
@@ -420,13 +406,29 @@ bool ExtractAPIVisitorBase<Derived>::VisitEnumDecl(const EnumDecl *Decl) {
       DeclarationFragmentsBuilder::getFragmentsForEnum(Decl);
   DeclarationFragments SubHeading =
       DeclarationFragmentsBuilder::getSubHeading(Decl);
-  auto *ER = API.createRecord<EnumRecord>(
-      USR, Name, createHierarchyInformationForDecl(*Decl), Loc,
-      AvailabilityInfo::createFromDecl(Decl), Comment, Declaration, SubHeading,
-      isInSystemHeader(Decl), isEmbeddedInVarDeclarator(*Decl));
+
+  // Collect symbol information.
+  SymbolReference ParentContainer;
+
+  if (Decl->hasNameForLinkage()) {
+    StringRef Name = Decl->getName();
+    if (Name.empty())
+      Name = getTypedefName(Decl);
+
+    auto *ER = API.createRecord<EnumRecord>(
+        USR, Name, createHierarchyInformationForDecl(*Decl), Loc,
+        AvailabilityInfo::createFromDecl(Decl), Comment, Declaration,
+        SubHeading, isInSystemHeader(Decl), false);
+    ParentContainer = SymbolReference(ER);
+  } else {
+    // If this is an anonymous enum then the parent scope of the constants is
+    // the top-level namespace.
+    ParentContainer = {};
+  }
 
   // Now collect information about the enumerators in this enum.
-  getDerivedExtractAPIVisitor().recordEnumConstants(ER, Decl->enumerators());
+  getDerivedExtractAPIVisitor().recordEnumConstants(ParentContainer,
+                                                    Decl->enumerators());
 
   return true;
 }
@@ -1197,7 +1199,7 @@ bool ExtractAPIVisitorBase<Derived>::VisitObjCCategoryDecl(
 /// parent enum.
 template <typename Derived>
 void ExtractAPIVisitorBase<Derived>::recordEnumConstants(
-    EnumRecord *EnumRecord, const EnumDecl::enumerator_range Constants) {
+    SymbolReference Container, const EnumDecl::enumerator_range Constants) {
   for (const auto *Constant : Constants) {
     // Collect symbol information.
     StringRef Name = Constant->getName();
@@ -1218,9 +1220,8 @@ void ExtractAPIVisitorBase<Derived>::recordEnumConstants(
         DeclarationFragmentsBuilder::getSubHeading(Constant);
 
     API.createRecord<EnumConstantRecord>(
-        USR, Name, createHierarchyInformationForDecl(*Constant), Loc,
-        AvailabilityInfo::createFromDecl(Constant), Comment, Declaration,
-        SubHeading, isInSystemHeader(Constant));
+        USR, Name, Container, Loc, AvailabilityInfo::createFromDecl(Constant),
+        Comment, Declaration, SubHeading, isInSystemHeader(Constant));
   }
 }
 
@@ -1469,7 +1470,17 @@ class ExtractAPIVisitor
 
   bool shouldDeclBeIncluded(const Decl *D) const { return true; }
   const RawComment *fetchRawCommentForDecl(const Decl *D) const {
-    return this->Context.getRawCommentForDeclNoCache(D);
+    if (const auto *Comment = this->Context.getRawCommentForDeclNoCache(D))
+      return Comment;
+
+    if (const auto *Declarator = dyn_cast<DeclaratorDecl>(D)) {
+      const auto *TagTypeDecl = Declarator->getType()->getAsTagDecl();
+      if (TagTypeDecl && TagTypeDecl->isEmbeddedInDeclarator() &&
+          TagTypeDecl->isCompleteDefinition())
+        return this->Context.getRawCommentForDeclNoCache(TagTypeDecl);
+    }
+
+    return nullptr;
   }
 };
 
diff --git a/clang/test/ExtractAPI/anonymous_record_no_typedef.c b/clang/test/ExtractAPI/anonymous_record_no_typedef.c
index 71e460afb12833..789316ca8930b8 100644
--- a/clang/test/ExtractAPI/anonymous_record_no_typedef.c
+++ b/clang/test/ExtractAPI/anonymous_record_no_typedef.c
@@ -84,21 +84,15 @@ struct Vehicle {
     // TYPE: "text": "The type of vehicle."
     // TYPE: "title": "type"
 
-    // BICYCLE: "!testRelLabel": "memberOf $ c:@S@Vehicle@E@anonymous_record_no_typedef.c@{{[0-9]+}}@Bicycle $ c:@S@Vehicle@FI@type"
     // BICYCLE-LABEL: "!testLabel": "c:@S@Vehicle@E@anonymous_record_no_typedef.c@{{[0-9]+}}@Bicycle"
     // BICYCLE: "title": "Bicycle"
     // BICYCLE:      "pathComponents": [
-    // BICYCLE-NEXT:   "Vehicle",
-    // BICYCLE-NEXT:   "type",
     // BICYCLE-NEXT:   "Bicycle"
     // BICYCLE-NEXT: ]
 
-    // CAR: "!testRelLabel": "memberOf $ c:@S@Vehicle@E@anonymous_record_no_typedef.c@{{[0-9]+}}@Car $ c:@S@Vehicle@FI@type"
     // CAR-LABEL: "!testLabel": "c:@S@Vehicle@E@anonymous_record_no_typedef.c@{{[0-9]+}}@Car"
     // CAR: "title": "Car"
     // CAR:      "pathComponents": [
-    // CAR-NEXT:   "Vehicle",
-    // CAR-NEXT:   "type",
     // CAR-NEXT:   "Car"
     // CAR-NEXT: ]
 
@@ -151,32 +145,22 @@ struct Vehicle {
     // NAME-NEXT: ]
 };
 
-// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix GLOBALENUM
+// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix GLOBALCASE
+// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix GLOBALOTHERCASE
 enum {
   GlobalCase,
   GlobalOtherCase
 };
-// GLOBALENUM-DAG: "!testRelLabel": "memberOf $ c:@Ea@GlobalCase@GlobalCase $ c:@Ea@GlobalCase"
-// GLOBALENUM-DAG: "!testRelLabel": "memberOf $ c:@Ea@GlobalCase@GlobalOtherCase $ c:@Ea@GlobalCase"
-// GLOBALENUM-LABEL: "!testLabel": "c:@Ea@GlobalCase"
-// GLOBALENUM:      "declarationFragments": [
-// GLOBALENUM-NEXT:   {
-// GLOBALENUM-NEXT:     "kind": "keyword",
-// GLOBALENUM-NEXT:     "spelling": "enum"
-// GLOBALENUM-NEXT:   },
-// GLOBALENUM-NEXT:   {
-// GLOBALENUM-NEXT:     "kind": "text",
-// GLOBALENUM-NEXT:     "spelling": " : "
-// GLOBALENUM-NEXT:   },
-// GLOBALENUM-NEXT:   {
-// GLOBALENUM-NEXT:     "kind": "typeIdentifier",
-// GLOBALENUM-NEXT:     "preciseIdentifier": "c:i",
-// GLOBALENUM-NEXT:     "spelling": "unsigned int"
-// GLOBALENUM-NEXT:   },
-// GLOBALENUM-NEXT:   {
-// GLOBALENUM-NEXT:     "kind": "text",
-// GLOBALENUM-NEXT:     "spelling": " { ... };"
-// GLOBALENUM-NEXT:   }
-// GLOBALENUM-NEXT: ]
+// GLOBALCASE-LABEL: "!testLabel": "c:@Ea@GlobalCase@GlobalCase"
+// GLOBALCASE: "title": "GlobalCase"
+// GLOBALCASE:      "pathComponents": [
+// GLOBALCASE-NEXT:   "GlobalCase"
+// GLOBALCASE-NEXT: ]
+
+// GLOBALOTHERCASE-LABEL: "!testLabel": "c:@Ea@GlobalCase@GlobalOtherCase"
+// GLOBALOTHERCASE: "title": "GlobalOtherCase"
+// GLOBALOTHERCASE:      "pathComponents": [
+// GLOBALOTHERCASE-NEXT:   "GlobalOtherCase"
+// GLOBALOTHERCASE-NEXT: ]
 
 // expected-no-diagnostics
diff --git a/clang/test/ExtractAPI/enum.c b/clang/test/ExtractAPI/enum.c
index 67e003834a7d58..58170aa0e1d906 100644
--- a/clang/test/ExtractAPI/enum.c
+++ b/clang/test/ExtractAPI/enum.c
@@ -115,18 +115,6 @@ enum {
       "source": "c:@E@Direction@West",
       "target": "c:@E@Direction",
       "targetFallback": "Direction"
-    },
-    {
-      "kind": "memberOf",
-      "source": "c:@Ea@Constant@Constant",
-      "target": "c:@Ea@Constant",
-      "targetFallback": "enum (unnamed)"
-    },
-    {
-      "kind": "memberOf",
-      "source": "c:@Ea@OtherConstant@OtherConstant",
-      "target": "c:@Ea@OtherConstant",
-      "targetFallback": "enum (unnamed)"
     }
   ],
   "symbols": [
@@ -677,55 +665,6 @@ enum {
         "West"
       ]
     },
-    {
-      "accessLevel": "public",
-      "declarationFragments": [
-        {
-          "kind": "keyword",
-          "spelling": "enum"
-        },
-        {
-          "kind": "text",
-          "spelling": " : "
-        },
-        {
-          "kind": "typeIdentifier",
-          "preciseIdentifier": "c:i",
-          "spelling": "unsigned int"
-        },
-        {
-          "kind": "text",
-          "spelling": " { ... };"
-        }
-      ],
-      "identifier": {
-        "interfaceLanguage": "c",
-        "precise": "c:@Ea@Constant"
-      },
-      "kind": {
-        "displayName": "Enumeration",
-        "identifier": "c.enum"
-      },
-      "location": {
-        "position": {
-          "character": 0,
-          "line": 16
-        },
-        "uri": "file://INPUT_DIR/input.h"
-      },
-      "names": {
-        "navigator": [
-          {
-            "kind": "identifier",
-            "spelling": "enum (unnamed)"
-          }
-        ],
-        "title": "enum (unnamed)"
-      },
-      "pathComponents": [
-        "enum (unnamed)"
-      ]
-    },
     {
       "accessLevel": "public",
       "declarationFragments": [
@@ -765,59 +704,9 @@ enum {
         "title": "Constant"
       },
       "pathComponents": [
-        "enum (unnamed)",
         "Constant"
       ]
     },
-    {
-      "accessLevel": "public",
-      "declarationFragments": [
-        {
-          "kind": "keyword",
-          "spelling": "enum"
-        },
-        {
-          "kind": "text",
-          "spelling": " : "
-        },
-        {
-          "kind": "typeIdentifier",
-          "preciseIdentifier": "c:i",
-          "spelling": "unsigned int"
-        },
-        {
-          "kind": "text",
-          "spelling": " { ... };"
-        }
-      ],
-      "identifier": {
-        "interfaceLanguage": "c",
-        "precise": "c:@Ea@OtherConstant"
-      },
-      "kind": {
-        "displayName": "Enumeration",
-        "identifier": "c.enum"
-      },
-      "location": {
-        "position": {
-          "character": 0,
-          "line": 20
-        },
-        "uri": "file://INPUT_DIR/input.h"
-      },
-      "names": {
-        "navigator": [
-          {
-            "kind": "identifier",
-            "spelling": "enum (unnamed)"
-          }
-        ],
-        "title": "enum (unnamed)"
-      },
-      "pathComponents": [
-        "enum (unnamed)"
-      ]
-    },
     {
       "accessLevel": "public",
       "declarationFragments": [
@@ -857,7 +746,6 @@ enum {
         "title": "OtherConstant"
       },
       "pathComponents": [
-        "enum (unnamed)",
         "OtherConstant"
       ]
     }
diff --git a/clang/tools/libclang/CXExtractAPI.cpp b/clang/tools/libclang/CXExtractAPI.cpp
index d74f3740406c5c..c35558e66fcb96 100644
--- a/clang/tools/libclang/CXExtractAPI.cpp
+++ b/clang/tools/libclang/CXExtractAPI.cpp
@@ -45,6 +45,9 @@ struct LibClangExtractAPIVisitor
       : ExtractAPIVisitor<LibClangExtractAPIVisitor>(Context, API) {}
 
   const RawComment *fetchRawCommentForDecl(const Decl *D) const {
+    if (const auto *Comment = Base::fetchRawCommentForDecl(D))
+      return Comment;
+
     return Context.getRawCommentForAnyRedecl(D);
   }
 

From f6ace2bc15bfde4cc9bd140859fa92618568a006 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen@arm.com>
Date: Wed, 29 May 2024 09:51:05 +0100
Subject: [PATCH 082/230] [AArch64] Expand vector ops when NEON and SVE are
 unavailable. (#90833)

Unlike `+noneon` we must assume that vector types are available, i.e.
it is valid to pass/return vector arguments to and from functions.
However, the compiler must make sure to scalarize any vector
operations.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |   79 +-
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |    6 +-
 ...streaming-mode-fixed-length-and-combine.ll |  226 +-
 ...treaming-mode-fixed-length-bit-counting.ll | 2167 +++++++-
 ...sve-streaming-mode-fixed-length-bitcast.ll |   30 +-
 ...e-streaming-mode-fixed-length-bitselect.ll |   32 +-
 .../sve-streaming-mode-fixed-length-concat.ll |  119 +-
 ...e-streaming-mode-fixed-length-ext-loads.ll |  338 +-
 ...ing-mode-fixed-length-extract-subvector.ll |   50 +-
 ...ng-mode-fixed-length-extract-vector-elt.ll |   54 +-
 ...e-streaming-mode-fixed-length-fcopysign.ll |  846 ++-
 ...ve-streaming-mode-fixed-length-fp-arith.ll | 3177 ++++++++---
 ...streaming-mode-fixed-length-fp-compares.ll | 4788 +++++++++--------
 ...-streaming-mode-fixed-length-fp-convert.ll |   29 +-
 ...aming-mode-fixed-length-fp-extend-trunc.ll |  732 ++-
 .../sve-streaming-mode-fixed-length-fp-fma.ll |  569 +-
 ...e-streaming-mode-fixed-length-fp-minmax.ll | 2040 ++++---
 ...eaming-mode-fixed-length-fp-reduce-fa64.ll |   26 +-
 ...e-streaming-mode-fixed-length-fp-reduce.ll | 1438 +++--
 ...streaming-mode-fixed-length-fp-rounding.ll | 2030 ++++++-
 ...e-streaming-mode-fixed-length-fp-select.ll |  305 +-
 ...e-streaming-mode-fixed-length-fp-to-int.ll | 2254 ++++++--
 ...-streaming-mode-fixed-length-fp-vselect.ll |  511 +-
 ...ing-mode-fixed-length-insert-vector-elt.ll |  367 +-
 ...e-streaming-mode-fixed-length-int-arith.ll | 2132 +++++++-
 ...treaming-mode-fixed-length-int-compares.ll | 1048 +++-
 ...sve-streaming-mode-fixed-length-int-div.ll | 2044 +++----
 ...streaming-mode-fixed-length-int-extends.ll | 3716 ++++++++++---
 ...eaming-mode-fixed-length-int-immediates.ll | 3425 +++++++++++-
 ...sve-streaming-mode-fixed-length-int-log.ll | 1503 +++++-
 ...-streaming-mode-fixed-length-int-minmax.ll | 2404 ++++++++-
 ...ing-mode-fixed-length-int-mla-neon-fa64.ll |   47 +-
 ...ve-streaming-mode-fixed-length-int-mulh.ll | 1664 +++++-
 ...-streaming-mode-fixed-length-int-reduce.ll | 1642 +++++-
 ...sve-streaming-mode-fixed-length-int-rem.ll | 2654 ++++-----
 ...-streaming-mode-fixed-length-int-select.ll |  581 +-
 ...-streaming-mode-fixed-length-int-shifts.ll | 1632 +++++-
 ...e-streaming-mode-fixed-length-int-to-fp.ll | 1895 +++++--
 ...streaming-mode-fixed-length-int-vselect.ll |  817 ++-
 ...-streaming-mode-fixed-length-ld2-alloca.ll |  118 +-
 ...reaming-mode-fixed-length-limit-duplane.ll |  145 +-
 .../sve-streaming-mode-fixed-length-loads.ll  |   33 +-
 ...-streaming-mode-fixed-length-log-reduce.ll |  888 ++-
 ...streaming-mode-fixed-length-masked-load.ll | 3314 +++++++++---
 ...treaming-mode-fixed-length-masked-store.ll |  806 ++-
 ...eaming-mode-fixed-length-optimize-ptrue.ll |  937 +++-
 ...streaming-mode-fixed-length-permute-rev.ll |  472 +-
 ...g-mode-fixed-length-permute-zip-uzp-trn.ll | 1261 ++++-
 .../sve-streaming-mode-fixed-length-ptest.ll  |  399 +-
 .../sve-streaming-mode-fixed-length-rev.ll    |  936 +++-
 ...e-streaming-mode-fixed-length-sdiv-pow2.ll |  768 ++-
 ...sve-streaming-mode-fixed-length-shuffle.ll |   72 +-
 ...treaming-mode-fixed-length-splat-vector.ll |  245 +-
 .../sve-streaming-mode-fixed-length-stores.ll |   60 +-
 ...e-streaming-mode-fixed-length-subvector.ll |    8 +-
 ...treaming-mode-fixed-length-trunc-stores.ll |   64 +-
 .../sve-streaming-mode-fixed-length-trunc.ll  | 2789 +++++++++-
 ...eaming-mode-fixed-length-vector-shuffle.ll |  339 +-
 .../sve-streaming-mode-test-register-mov.ll   |    6 +-
 59 files changed, 49850 insertions(+), 13227 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 25ba8d8500306f..814bbe27049820 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -360,24 +360,24 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   if (Subtarget->hasNEON()) {
     addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
     addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
-    // Someone set us up the NEON.
-    addDRTypeForNEON(MVT::v2f32);
-    addDRTypeForNEON(MVT::v8i8);
-    addDRTypeForNEON(MVT::v4i16);
-    addDRTypeForNEON(MVT::v2i32);
-    addDRTypeForNEON(MVT::v1i64);
-    addDRTypeForNEON(MVT::v1f64);
-    addDRTypeForNEON(MVT::v4f16);
-    addDRTypeForNEON(MVT::v4bf16);
-
-    addQRTypeForNEON(MVT::v4f32);
-    addQRTypeForNEON(MVT::v2f64);
-    addQRTypeForNEON(MVT::v16i8);
-    addQRTypeForNEON(MVT::v8i16);
-    addQRTypeForNEON(MVT::v4i32);
-    addQRTypeForNEON(MVT::v2i64);
-    addQRTypeForNEON(MVT::v8f16);
-    addQRTypeForNEON(MVT::v8bf16);
+
+    addDRType(MVT::v2f32);
+    addDRType(MVT::v8i8);
+    addDRType(MVT::v4i16);
+    addDRType(MVT::v2i32);
+    addDRType(MVT::v1i64);
+    addDRType(MVT::v1f64);
+    addDRType(MVT::v4f16);
+    addDRType(MVT::v4bf16);
+
+    addQRType(MVT::v4f32);
+    addQRType(MVT::v2f64);
+    addQRType(MVT::v16i8);
+    addQRType(MVT::v8i16);
+    addQRType(MVT::v4i32);
+    addQRType(MVT::v2i64);
+    addQRType(MVT::v8f16);
+    addQRType(MVT::v8bf16);
   }
 
   if (Subtarget->hasSVEorSME()) {
@@ -1125,7 +1125,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
 
-  if (Subtarget->hasNEON()) {
+  if (Subtarget->isNeonAvailable()) {
     // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
     // silliness like this:
     for (auto Op :
@@ -1337,6 +1337,24 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     // FADDP custom lowering
     for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
       setOperationAction(ISD::FADD, VT, Custom);
+  } else /* !isNeonAvailable */ {
+    for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
+      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
+        setOperationAction(Op, VT, Expand);
+
+      if (VT.is128BitVector() || VT.is64BitVector()) {
+        setOperationAction(ISD::LOAD, VT, Legal);
+        setOperationAction(ISD::STORE, VT, Legal);
+        setOperationAction(ISD::BITCAST, VT,
+                           Subtarget->isLittleEndian() ? Legal : Expand);
+      }
+      for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
+        setTruncStoreAction(VT, InnerVT, Expand);
+        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+      }
+    }
   }
 
   if (Subtarget->hasSME()) {
@@ -2020,14 +2038,16 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   setOperationAction(ISD::ZERO_EXTEND, VT, Default);
 }
 
-void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
+void AArch64TargetLowering::addDRType(MVT VT) {
   addRegisterClass(VT, &AArch64::FPR64RegClass);
-  addTypeForNEON(VT);
+  if (Subtarget->isNeonAvailable())
+    addTypeForNEON(VT);
 }
 
-void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
+void AArch64TargetLowering::addQRType(MVT VT) {
   addRegisterClass(VT, &AArch64::FPR128RegClass);
-  addTypeForNEON(VT);
+  if (Subtarget->isNeonAvailable())
+    addTypeForNEON(VT);
 }
 
 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
@@ -9445,7 +9465,8 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
                                               SelectionDAG &DAG) const {
-  if (!Subtarget->hasNEON())
+  if (!Subtarget->isNeonAvailable() &&
+      !Subtarget->useSVEForFixedLengthVectors())
     return SDValue();
 
   EVT VT = Op.getValueType();
@@ -14141,6 +14162,13 @@ SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
 }
 
+bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
+    EVT VT, unsigned DefinedValues) const {
+  if (!Subtarget->isNeonAvailable())
+    return false;
+  return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
+}
+
 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
   // Currently no fixed length shuffles that require SVE are legal.
   if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
@@ -19838,7 +19866,8 @@ performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
 // help, for example, to produce ssra from sshr+add.
 static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
-  if (VT != MVT::i64)
+  if (VT != MVT::i64 ||
+      DAG.getTargetLoweringInfo().isOperationExpand(N->getOpcode(), MVT::v1i64))
     return SDValue();
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index a44a3d35d2f9c8..73bc9ad53bb8a3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1017,8 +1017,10 @@ class AArch64TargetLowering : public TargetLowering {
 
   void addTypeForNEON(MVT VT);
   void addTypeForFixedLengthSVE(MVT VT);
-  void addDRTypeForNEON(MVT VT);
-  void addQRTypeForNEON(MVT VT);
+  void addDRType(MVT VT);
+  void addQRType(MVT VT);
+
+  bool shouldExpandBuildVectorWithShuffles(EVT, unsigned) const override;
 
   unsigned allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
                                   SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
index ed3222529a3bb9..4cdb175f55c9cc 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
@@ -18,8 +18,15 @@ define <4 x i8> @vls_sve_and_4xi8(<4 x i8> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_4xi8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0xff000000ff0000
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <4 x i8> %b, <i8 0, i8 255, i8 0, i8 255>
  ret <4 x i8> %c
@@ -37,8 +44,21 @@ define <8 x i8> @vls_sve_and_8xi8(<8 x i8> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_8xi8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0xff00ff00ff00ff00
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <8 x i8> %b, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
  ret <8 x i8> %c
@@ -56,8 +76,33 @@ define <16 x i8> @vls_sve_and_16xi8(<16 x i8> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_16xi8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.2d, #0xff00ff00ff00ff00
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #30]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #28]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <16 x i8> %b, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
  ret <16 x i8> %c
@@ -78,9 +123,57 @@ define <32 x i8> @vls_sve_and_32xi8(<32 x i8> %ap) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_32xi8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v2.2d, #0xff00ff00ff00ff00
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #62]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #60]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #58]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #56]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #54]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #52]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #50]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
  %b = and <32 x i8> %ap, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255,
                          i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
@@ -102,9 +195,11 @@ define <2 x i16> @vls_sve_and_2xi16(<2 x i16> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_2xi16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov v0.s[0], wzr
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <2 x i16> %b, <i16 0, i16 65535>
  ret <2 x i16> %c
@@ -122,8 +217,15 @@ define <4 x i16> @vls_sve_and_4xi16(<4 x i16> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_4xi16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0xffff0000ffff0000
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <4 x i16> %b, <i16 0, i16 65535, i16 0, i16 65535>
  ret <4 x i16> %c
@@ -141,8 +243,21 @@ define <8 x i16> @vls_sve_and_8xi16(<8 x i16> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_8xi16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.2d, #0xffff0000ffff0000
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <8 x i16> %b, <i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535>
  ret <8 x i16> %c
@@ -163,9 +278,33 @@ define <16 x i16> @vls_sve_and_16xi16(<16 x i16> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_16xi16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v2.2d, #0xffff0000ffff0000
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #60]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #52]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <16 x i16> %b, <i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535>
  ret <16 x i16> %c
@@ -183,9 +322,11 @@ define <2 x i32> @vls_sve_and_2xi32(<2 x i32> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_2xi32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov v0.s[0], wzr
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <2 x i32> %b, <i32 0, i32 4294967295>
  ret <2 x i32> %c
@@ -203,8 +344,13 @@ define <4 x i32> @vls_sve_and_4xi32(<4 x i32> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_4xi32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.2d, #0xffffffff00000000
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <4 x i32> %b, <i32 0, i32 4294967295, i32 0, i32 4294967295>
  ret <4 x i32> %c
@@ -225,9 +371,17 @@ define <8 x i32> @vls_sve_and_8xi32(<8 x i32> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_8xi32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v2.2d, #0xffffffff00000000
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <8 x i32> %b, <i32 0, i32 4294967295, i32 0, i32 4294967295, i32 0, i32 4294967295, i32 0, i32 4294967295>
  ret <8 x i32> %c
@@ -245,7 +399,11 @@ define <2 x i64> @vls_sve_and_2xi64(<2 x i64> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_2xi64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov v0.d[0], xzr
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp xzr, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <2 x i64> %b, <i64 0, i64 18446744073709551615>
  ret <2 x i64> %c
@@ -265,8 +423,16 @@ define <4 x i64> @vls_sve_and_4xi64(<4 x i64> %b) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: vls_sve_and_4xi64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov v0.d[0], xzr
-; NONEON-NOSVE-NEXT:    mov v1.d[0], xzr
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp xzr, x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp xzr, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
  %c = and <4 x i64> %b, <i64 0, i64 18446744073709551615, i64 0, i64 18446744073709551615>
  ret <4 x i64> %c
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
index cd6c2b489efe4c..f920efeb4892d1 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
@@ -22,12 +22,26 @@ define <4 x i8> @ctlz_v4i8(<4 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    mov w8, #8 // =0x8
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    dup v1.4h, w8
-; NONEON-NOSVE-NEXT:    clz v0.4h, v0.4h
-; NONEON-NOSVE-NEXT:    sub v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w9
+; NONEON-NOSVE-NEXT:    clz w10, w10
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    sub w9, w9, #24
+; NONEON-NOSVE-NEXT:    sub w10, w10, #24
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    clz w8, w11
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.ctlz.v4i8(<4 x i8> %op)
   ret <4 x i8> %res
@@ -44,7 +58,42 @@ define <8 x i8> @ctlz_v8i8(<8 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    clz v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
@@ -61,7 +110,74 @@ define <16 x i8> @ctlz_v16i8(<16 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    clz v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
@@ -79,10 +195,140 @@ define void @ctlz_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    clz v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    clz v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %op)
@@ -103,12 +349,17 @@ define <2 x i16> @ctlz_v2i16(<2 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    mov w8, #16 // =0x10
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    dup v1.2s, w8
-; NONEON-NOSVE-NEXT:    clz v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    sub v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w9
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    sub w9, w9, #16
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
@@ -125,7 +376,26 @@ define <4 x i16> @ctlz_v4i16(<4 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    clz v0.4h, v0.4h
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
@@ -142,7 +412,42 @@ define <8 x i16> @ctlz_v8i16(<8 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    clz v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
@@ -160,10 +465,76 @@ define void @ctlz_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    clz v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    clz v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %op)
@@ -182,7 +553,15 @@ define <2 x i32> @ctlz_v2i32(<2 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    clz v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
@@ -199,7 +578,20 @@ define <4 x i32> @ctlz_v4i32(<4 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    clz v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
@@ -217,10 +609,32 @@ define void @ctlz_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    clz v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    clz v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %op)
@@ -239,23 +653,13 @@ define <1 x i64> @ctlz_v1i64(<1 x i64> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushr d1, d0, #1
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushr d1, d0, #2
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushr d1, d0, #4
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushr d1, d0, #8
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushr d1, d0, #16
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushr d1, d0, #32
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    mvn v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.2s, v0.4h
-; NONEON-NOSVE-NEXT:    uaddlp v0.1d, v0.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.ctlz.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
@@ -272,23 +676,15 @@ define <2 x i64> @ctlz_v2i64(<2 x i64> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #1
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #2
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #4
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #8
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #16
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #32
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
@@ -306,42 +702,22 @@ define void @ctlz_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ctlz_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #1
-; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #1
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #2
-; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #2
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #4
-; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #4
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #8
-; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #8
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #16
-; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #16
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #32
-; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #32
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    mvn v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v1.8h, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    uaddlp v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
-; NONEON-NOSVE-NEXT:    uaddlp v1.2d, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    clz x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %op)
@@ -365,10 +741,37 @@ define <4 x i8> @ctpop_v4i8(<4 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #66]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    fmov d1, x9
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    fmov d2, x10
+; NONEON-NOSVE-NEXT:    fmov d3, x8
 ; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
+; NONEON-NOSVE-NEXT:    cnt v1.8b, v1.8b
+; NONEON-NOSVE-NEXT:    cnt v2.8b, v2.8b
+; NONEON-NOSVE-NEXT:    cnt v3.8b, v3.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h1, v1.8b
+; NONEON-NOSVE-NEXT:    uaddlv h2, v2.8b
+; NONEON-NOSVE-NEXT:    uaddlv h3, v3.8b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %op)
   ret <4 x i8> %res
@@ -385,7 +788,67 @@ define <8 x i8> @ctpop_v8i8(<8 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #144
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
+; NONEON-NOSVE-NEXT:    str d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #135]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #134]
 ; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #133]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #131]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #129]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #143]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #141]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #139]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #137]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #136]
+; NONEON-NOSVE-NEXT:    add sp, sp, #144
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
@@ -402,7 +865,126 @@ define <16 x i8> @ctpop_v16i8(<16 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #304
+; NONEON-NOSVE-NEXT:    str x29, [sp, #288] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 304
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #288] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #271]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #270]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #240]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #269]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #208]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #267]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #266]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #265]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #160]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #263]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #262]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #261]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #259]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #258]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #257]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #240]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #287]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #286]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #285]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #283]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #282]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #281]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #279]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #278]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #277]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #275]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #274]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #273]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #272]
+; NONEON-NOSVE-NEXT:    add sp, sp, #304
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
@@ -420,10 +1002,240 @@ define void @ctpop_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #576
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 592
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #512]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #543]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #542]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #240]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #541]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #540]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #208]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #539]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #538]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #537]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #160]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #536]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #535]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #534]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #533]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #532]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #531]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #530]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #529]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #528]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #527]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #526]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #496]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #525]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #480]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #524]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #464]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #523]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #448]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #522]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #432]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #521]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #416]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #520]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #400]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #519]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #384]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #518]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #368]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #517]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #352]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #516]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #336]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #515]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #320]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #514]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #304]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #513]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #288]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #512]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #272]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #240]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #575]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #574]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #573]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #572]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #571]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #570]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #569]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #568]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #567]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #566]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #565]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #564]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #563]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #562]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #561]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #560]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #496]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #559]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #480]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #558]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #464]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #557]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #448]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #556]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #432]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #555]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #416]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #554]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #400]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #553]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #384]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #552]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #368]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #551]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #352]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #550]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #549]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #548]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #547]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #546]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #545]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #544]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #544]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #576
+; NONEON-NOSVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %op)
@@ -443,11 +1255,23 @@ define <2 x i16> @ctpop_v2i16(<2 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    str d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    fmov d1, x9
 ; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.2s, v0.4h
+; NONEON-NOSVE-NEXT:    cnt v1.8b, v1.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h1, v1.8b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
@@ -464,8 +1288,39 @@ define <4 x i16> @ctpop_v4i16(<4 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #64]
 ; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
@@ -482,8 +1337,67 @@ define <8 x i16> @ctpop_v8i16(<8 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
@@ -501,12 +1415,128 @@ define void @ctpop_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v1.8h, v1.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #336
+; NONEON-NOSVE-NEXT:    str x29, [sp, #320] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 336
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #320] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #286]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #282]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #278]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #274]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #270]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #240]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #266]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #208]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #262]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #258]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #160]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #318]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #314]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #310]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #306]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #240]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #302]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #298]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #294]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #290]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #288]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #336
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %op)
@@ -525,9 +1555,24 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    str d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
 ; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.2s, v0.4h
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
@@ -544,9 +1589,37 @@ define <4 x i32> @ctpop_v4i32(<4 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
@@ -564,14 +1637,65 @@ define void @ctpop_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v1.8h, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    uaddlp v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #192
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 192
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #160]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #192
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %op)
@@ -590,10 +1714,15 @@ define <1 x i64> @ctpop_v1i64(<1 x i64> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.2s, v0.4h
-; NONEON-NOSVE-NEXT:    uaddlp v0.1d, v0.2s
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
@@ -610,10 +1739,23 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
@@ -631,16 +1773,37 @@ define void @ctpop_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v1.8h, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    uaddlp v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
-; NONEON-NOSVE-NEXT:    uaddlp v1.2d, v1.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #88]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str x8, [sp, #120]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str x8, [sp, #112]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str x8, [sp, #104]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %op)
@@ -665,17 +1828,30 @@ define <4 x i8> @cttz_v4i8(<4 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #256 // =0x100
-; NONEON-NOSVE-NEXT:    dup v1.4h, w8
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    dup v2.4h, w8
-; NONEON-NOSVE-NEXT:    mov w8, #16 // =0x10
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    sub v1.4h, v0.4h, v2.4h
-; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    dup v1.4h, w8
-; NONEON-NOSVE-NEXT:    clz v0.4h, v0.4h
-; NONEON-NOSVE-NEXT:    sub v0.4h, v1.4h, v0.4h
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %op)
   ret <4 x i8> %res
@@ -693,10 +1869,50 @@ define <8 x i8> @cttz_v8i8(<8 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.8b, #1
-; NONEON-NOSVE-NEXT:    sub v1.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
@@ -714,10 +1930,90 @@ define <16 x i8> @cttz_v16i8(<16 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.16b, #1
-; NONEON-NOSVE-NEXT:    sub v1.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    bic v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
@@ -737,15 +2033,172 @@ define void @cttz_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #1
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    sub v3.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    sub v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bic v1.16b, v3.16b, v1.16b
-; NONEON-NOSVE-NEXT:    bic v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %op)
@@ -766,17 +2219,19 @@ define <2 x i16> @cttz_v2i16(<2 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #65536 // =0x10000
-; NONEON-NOSVE-NEXT:    dup v1.2s, w8
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    dup v2.2s, w8
-; NONEON-NOSVE-NEXT:    mov w8, #32 // =0x20
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    sub v1.2s, v0.2s, v2.2s
-; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    dup v1.2s, w8
-; NONEON-NOSVE-NEXT:    clz v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    sub v0.2s, v1.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
@@ -794,14 +2249,30 @@ define <4 x i16> @cttz_v4i16(<4 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    dup v1.4h, w8
-; NONEON-NOSVE-NEXT:    mov w8, #16 // =0x10
-; NONEON-NOSVE-NEXT:    sub v1.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    dup v1.4h, w8
-; NONEON-NOSVE-NEXT:    clz v0.4h, v0.4h
-; NONEON-NOSVE-NEXT:    sub v0.4h, v1.4h, v0.4h
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
@@ -819,14 +2290,50 @@ define <8 x i16> @cttz_v8i16(<8 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    dup v1.8h, w8
-; NONEON-NOSVE-NEXT:    mov w8, #16 // =0x10
-; NONEON-NOSVE-NEXT:    sub v1.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    bic v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    dup v1.8h, w8
-; NONEON-NOSVE-NEXT:    clz v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    sub v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
@@ -846,20 +2353,92 @@ define void @cttz_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    mov w8, #16 // =0x10
-; NONEON-NOSVE-NEXT:    sub v3.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    sub v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    bic v1.16b, v3.16b, v1.16b
-; NONEON-NOSVE-NEXT:    bic v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    dup v2.8h, w8
-; NONEON-NOSVE-NEXT:    clz v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    clz v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    sub v1.8h, v2.8h, v1.8h
-; NONEON-NOSVE-NEXT:    sub v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %op)
@@ -879,14 +2458,17 @@ define <2 x i32> @cttz_v2i32(<2 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    dup v1.2s, w8
-; NONEON-NOSVE-NEXT:    mov w8, #32 // =0x20
-; NONEON-NOSVE-NEXT:    sub v1.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    dup v1.2s, w8
-; NONEON-NOSVE-NEXT:    clz v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    sub v0.2s, v1.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
@@ -904,14 +2486,24 @@ define <4 x i32> @cttz_v4i32(<4 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    dup v1.4s, w8
-; NONEON-NOSVE-NEXT:    mov w8, #32 // =0x20
-; NONEON-NOSVE-NEXT:    sub v1.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    bic v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    dup v1.4s, w8
-; NONEON-NOSVE-NEXT:    clz v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sub v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
@@ -931,20 +2523,40 @@ define void @cttz_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    mov w8, #32 // =0x20
-; NONEON-NOSVE-NEXT:    sub v3.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sub v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    bic v1.16b, v3.16b, v1.16b
-; NONEON-NOSVE-NEXT:    bic v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    dup v2.4s, w8
-; NONEON-NOSVE-NEXT:    clz v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    clz v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sub v1.4s, v2.4s, v1.4s
-; NONEON-NOSVE-NEXT:    sub v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %op)
@@ -964,14 +2576,14 @@ define <1 x i64> @cttz_v1i64(<1 x i64> %op) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    sub d1, d0, d1
-; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlp v0.2s, v0.4h
-; NONEON-NOSVE-NEXT:    uaddlp v0.1d, v0.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
@@ -989,14 +2601,17 @@ define <2 x i64> @cttz_v2i64(<2 x i64> %op) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    dup v1.2d, x8
-; NONEON-NOSVE-NEXT:    sub v1.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    bic v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    clz x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
@@ -1016,22 +2631,26 @@ define void @cttz_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: cttz_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    sub v3.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    sub v0.2d, v2.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bic v1.16b, v3.16b, v1.16b
-; NONEON-NOSVE-NEXT:    bic v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v1.8h, v1.16b
-; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
-; NONEON-NOSVE-NEXT:    uaddlp v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    uaddlp v1.2d, v1.4s
-; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    clz x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    clz x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %op)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
index 7e93ee99ed7494..41065b36020038 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
@@ -15,8 +15,14 @@ define void @bitcast_v4i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: bitcast_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr w8, [x0]
-; NONEON-NOSVE-NEXT:    str w8, [x1]
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0]
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #2]
+; NONEON-NOSVE-NEXT:    ldrb w11, [x0, #3]
+; NONEON-NOSVE-NEXT:    strb w11, [x1, #3]
+; NONEON-NOSVE-NEXT:    strb w10, [x1, #2]
+; NONEON-NOSVE-NEXT:    strb w9, [x1, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [x1]
 ; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <4 x i8>, ptr %a
   %cast = bitcast <4 x i8> %load to <4 x i8>
@@ -102,12 +108,22 @@ define void @bitcast_v2i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: bitcast_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0, #2]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #4]
 ; NONEON-NOSVE-NEXT:    ldrh w8, [x0]
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    add x8, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x8]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4h, v0.4h, v0.4h
-; NONEON-NOSVE-NEXT:    str s0, [x1]
+; NONEON-NOSVE-NEXT:    str w8, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str w8, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <2 x i16>, ptr %a
   %cast = bitcast <2 x i16> %load to <2 x half>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
index 6b8077053b590f..b908dd61f24014 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
@@ -34,13 +34,39 @@ define <8 x i32> @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %r
 ;
 ; NONEON-NOSVE-LABEL: fixed_bitselect_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
 ; NONEON-NOSVE-NEXT:    ldp q5, q4, [x2]
-; NONEON-NOSVE-NEXT:    neg v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    neg v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    bsl v0.16b, v3.16b, v5.16b
 ; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v4.16b
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %pre_cond = load <8 x i32>, ptr %pre_cond_ptr
   %left = load <8 x i32>, ptr %left_ptr
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
index d2bfc7d4e80969..a845c3cbdc2b6d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
@@ -44,7 +44,27 @@ define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <4 x i8> %op1, <4 x i8> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i8> %res
@@ -62,9 +82,9 @@ define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <8 x i8> %op1, <8 x i8> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -152,7 +172,17 @@ define <4 x i16> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x i16> %op1, <2 x i16> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i16> %res
@@ -171,9 +201,9 @@ define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <4 x i16> %op1, <4 x i16> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %res
@@ -243,7 +273,14 @@ define <2 x i32> @concat_v2i32(<1 x i32> %op1, <1 x i32> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    zip1 v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <1 x i32> %op1, <1 x i32> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x i32> %res
@@ -262,9 +299,9 @@ define <4 x i32> @concat_v4i32(<2 x i32> %op1, <2 x i32> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x i32> %op1, <2 x i32> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %res
@@ -332,9 +369,9 @@ define <2 x i64> @concat_v2i64(<1 x i64> %op1, <1 x i64> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <1 x i64> %op1, <1 x i64> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x i64> %res
@@ -407,7 +444,14 @@ define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    zip1 v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x half> %op1, <2 x half> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x half> %res
@@ -425,9 +469,9 @@ define <8 x half> @concat_v8f16(<4 x half> %op1, <4 x half> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <4 x half> %op1, <4 x half> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x half> %res
@@ -497,7 +541,14 @@ define <2 x float> @concat_v2f32(<1 x float> %op1, <1 x float> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    zip1 v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <1 x float> %op1, <1 x float> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %res
@@ -516,9 +567,9 @@ define <4 x float> @concat_v4f32(<2 x float> %op1, <2 x float> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x float> %op1, <2 x float> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x float> %res
@@ -586,9 +637,9 @@ define <2 x double> @concat_v2f64(<1 x double> %op1, <1 x double> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <1 x double> %op1, <1 x double> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x double> %res
@@ -732,7 +783,11 @@ define void @concat_v32i8_4op(ptr %a, ptr %b)  {
 ; NONEON-NOSVE-LABEL: concat_v32i8_4op:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
   %shuffle = shufflevector <8 x i8> %op1, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -755,7 +810,11 @@ define void @concat_v16i16_4op(ptr %a, ptr %b)  {
 ; NONEON-NOSVE-LABEL: concat_v16i16_4op:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %shuffle = shufflevector <4 x i16> %op1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -775,7 +834,11 @@ define void @concat_v8i32_4op(ptr %a, ptr %b)  {
 ; NONEON-NOSVE-LABEL: concat_v8i32_4op:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i32>, ptr %a
   %shuffle = shufflevector <2 x i32> %op1, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -794,7 +857,11 @@ define void @concat_v4i64_4op(ptr %a, ptr %b)  {
 ; NONEON-NOSVE-LABEL: concat_v4i64_4op:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <1 x i64>, ptr %a
   %shuffle = shufflevector <1 x i64> %op1, <1 x i64> undef, <2 x i32> <i32 0, i32 1>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
index 728b85d39bb37f..2cdd4374a56c5c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -15,8 +15,28 @@ define <8 x i16> @load_zext_v8i8i16(ptr %ap)  {
 ;
 ; NONEON-NOSVE-LABEL: load_zext_v8i8i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i8>, ptr %ap
   %val = zext <8 x i8> %a to <8 x i16>
@@ -33,8 +53,18 @@ define <4 x i32> @load_zext_v4i16i32(ptr %ap)  {
 ;
 ; NONEON-NOSVE-LABEL: load_zext_v4i16i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i16>, ptr %ap
   %val = zext <4 x i16> %a to <4 x i32>
@@ -51,8 +81,15 @@ define <2 x i64> @load_zext_v2i32i64(ptr %ap) {
 ;
 ; NONEON-NOSVE-LABEL: load_zext_v2i32i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i32>, ptr %ap
   %val = zext <2 x i32> %a to <2 x i64>
@@ -77,13 +114,14 @@ define <2 x i256> @load_zext_v2i64i256(ptr %ap) {
 ; NONEON-NOSVE-LABEL: load_zext_v2i64i256:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp x0, x4, [sp], #16
 ; NONEON-NOSVE-NEXT:    mov x1, xzr
 ; NONEON-NOSVE-NEXT:    mov x2, xzr
 ; NONEON-NOSVE-NEXT:    mov x3, xzr
 ; NONEON-NOSVE-NEXT:    mov x5, xzr
 ; NONEON-NOSVE-NEXT:    mov x6, xzr
-; NONEON-NOSVE-NEXT:    mov x4, v0.d[1]
-; NONEON-NOSVE-NEXT:    fmov x0, d0
 ; NONEON-NOSVE-NEXT:    mov x7, xzr
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i64>, ptr %ap
@@ -110,20 +148,75 @@ define <16 x i32> @load_sext_v16i8i32(ptr %ap)  {
 ;
 ; NONEON-NOSVE-LABEL: load_sext_v16i8i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    sshll v1.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v2.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #16]
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v4.4h, #0
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i8>, ptr %ap
   %val = sext <16 x i8> %a to <16 x i32>
@@ -144,12 +237,24 @@ define <8 x i32> @load_sext_v8i16i32(ptr %ap)  {
 ; NONEON-NOSVE-LABEL: load_sext_v8i16i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i16>, ptr %ap
   %val = sext <8 x i16> %a to <8 x i32>
@@ -186,34 +291,31 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) {
 ; NONEON-NOSVE-LABEL: load_sext_v4i32i256:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    add x10, x8, #32
-; NONEON-NOSVE-NEXT:    add x11, x8, #96
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    mov x9, v0.d[1]
-; NONEON-NOSVE-NEXT:    st1 { v0.d }[1], [x10]
-; NONEON-NOSVE-NEXT:    fmov x10, d0
-; NONEON-NOSVE-NEXT:    st1 { v1.d }[1], [x11]
-; NONEON-NOSVE-NEXT:    mov x11, v1.d[1]
-; NONEON-NOSVE-NEXT:    asr x10, x10, #63
-; NONEON-NOSVE-NEXT:    str d0, [x8]
-; NONEON-NOSVE-NEXT:    asr x9, x9, #63
-; NONEON-NOSVE-NEXT:    str d1, [x8, #64]
-; NONEON-NOSVE-NEXT:    stp x10, x10, [x8, #16]
-; NONEON-NOSVE-NEXT:    stp x9, x9, [x8, #48]
-; NONEON-NOSVE-NEXT:    str x9, [x8, #40]
-; NONEON-NOSVE-NEXT:    fmov x9, d1
-; NONEON-NOSVE-NEXT:    str x10, [x8, #8]
-; NONEON-NOSVE-NEXT:    asr x10, x11, #63
-; NONEON-NOSVE-NEXT:    asr x9, x9, #63
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldpsw x9, x10, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldpsw x9, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp x11, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp x12, x13, [sp, #80]
+; NONEON-NOSVE-NEXT:    asr x10, x9, #63
+; NONEON-NOSVE-NEXT:    asr x14, x11, #63
 ; NONEON-NOSVE-NEXT:    stp x10, x10, [x8, #112]
-; NONEON-NOSVE-NEXT:    str x10, [x8, #104]
-; NONEON-NOSVE-NEXT:    stp x9, x9, [x8, #80]
-; NONEON-NOSVE-NEXT:    str x9, [x8, #72]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    stp x9, x10, [x8, #96]
+; NONEON-NOSVE-NEXT:    asr x9, x13, #63
+; NONEON-NOSVE-NEXT:    asr x10, x12, #63
+; NONEON-NOSVE-NEXT:    stp x14, x14, [x8, #80]
+; NONEON-NOSVE-NEXT:    stp x11, x14, [x8, #64]
+; NONEON-NOSVE-NEXT:    stp x9, x9, [x8, #48]
+; NONEON-NOSVE-NEXT:    stp x13, x9, [x8, #32]
+; NONEON-NOSVE-NEXT:    stp x10, x10, [x8, #16]
+; NONEON-NOSVE-NEXT:    stp x12, x10, [x8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i32>, ptr %ap
   %val = sext <4 x i32> %a to <4 x i256>
@@ -251,18 +353,26 @@ define <2 x i256> @load_sext_v2i64i256(ptr %ap) {
 ;
 ; NONEON-NOSVE-LABEL: load_sext_v2i64i256:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #144
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
-; NONEON-NOSVE-NEXT:    dup v1.2d, v0.d[1]
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    asr x1, x0, #63
-; NONEON-NOSVE-NEXT:    asr x5, x8, #63
-; NONEON-NOSVE-NEXT:    mov x2, x1
-; NONEON-NOSVE-NEXT:    mov x3, x1
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x5
-; NONEON-NOSVE-NEXT:    mov x6, x5
-; NONEON-NOSVE-NEXT:    mov x7, x5
-; NONEON-NOSVE-NEXT:    fmov x4, d1
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    asr x8, x10, #63
+; NONEON-NOSVE-NEXT:    stp x9, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x10, x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp x8, x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp x0, x1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp x2, x3, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp x4, x5, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp x6, x7, [sp, #112]
+; NONEON-NOSVE-NEXT:    add sp, sp, #144
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i64>, ptr %ap
   %val = sext <2 x i64> %a to <2 x i256>
@@ -300,30 +410,88 @@ define <16 x i64> @load_zext_v16i16i64(ptr %ap)  {
 ;
 ; NONEON-NOSVE-LABEL: load_zext_v16i16i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #336
+; NONEON-NOSVE-NEXT:    str x29, [sp, #320] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 336
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v4.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v5.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #32]
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #56]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #40]
-; NONEON-NOSVE-NEXT:    stp q5, q3, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d16, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d17, [sp, #72]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v6.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v5.2d, v16.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v7.2d, v17.2s, #0
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #316]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #320] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #300]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #292]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #284]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #276]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #268]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #260]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #248]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #176]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #232]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #216]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #200]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #192]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #296]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [sp, #288]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #264]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [sp, #256]
+; NONEON-NOSVE-NEXT:    add sp, sp, #336
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %ap
   %val = zext <16 x i16> %a to <16 x i64>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
index ec6341d6085a0a..b7b34cfa1517ce 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
@@ -31,7 +31,18 @@ define <4 x i1> @extract_subvector_v8i1(<8 x i1> %op) {
 ;
 ; NONEON-NOSVE-LABEL: extract_subvector_v8i1:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    zip2 v0.8b, v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %ret = call <4 x i1> @llvm.vector.extract.v4i1.v8i1(<8 x i1> %op, i64 4)
   ret <4 x i1> %ret
@@ -63,7 +74,18 @@ define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: extract_subvector_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    zip2 v0.8b, v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %ret = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> %op, i64 4)
   ret <4 x i8> %ret
@@ -178,8 +200,12 @@ define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: extract_subvector_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.2s, v0.s[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str w8, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %ret = call <1 x i32> @llvm.vector.extract.v1i32.v2i32(<2 x i32> %op, i64 1)
   ret <1 x i32> %ret
@@ -275,8 +301,12 @@ define <2 x half> @extract_subvector_v4f16(<4 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: extract_subvector_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.2s, v0.s[1]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %ret = call <2 x half> @llvm.vector.extract.v2f16.v4f16(<4 x half> %op, i64 2)
   ret <2 x half> %ret
@@ -331,8 +361,12 @@ define <1 x float> @extract_subvector_v2f32(<2 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: extract_subvector_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.2s, v0.s[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str w8, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %ret = call <1 x float> @llvm.vector.extract.v1f32.v2f32(<2 x float> %op, i64 1)
   ret <1 x float> %ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
index ac60a614d7ce6c..0a1831a94d8fec 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
@@ -19,8 +19,11 @@ define half @extractelement_v2f16(<2 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <2 x half> %op1, i64 1
   ret half %r
@@ -36,8 +39,11 @@ define half @extractelement_v4f16(<4 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <4 x half> %op1, i64 3
   ret half %r
@@ -53,7 +59,10 @@ define half @extractelement_v8f16(<8 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <8 x half> %op1, i64 7
   ret half %r
@@ -69,7 +78,11 @@ define half @extractelement_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr h0, [x0, #30]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %r = extractelement <16 x half> %op1, i64 15
@@ -86,8 +99,11 @@ define float @extractelement_v2f32(<2 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov s0, v0.s[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <2 x float> %op1, i64 1
   ret float %r
@@ -103,7 +119,10 @@ define float @extractelement_v4f32(<4 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov s0, v0.s[3]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <4 x float> %op1, i64 3
   ret float %r
@@ -119,7 +138,11 @@ define float @extractelement_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0, #28]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %r = extractelement <8 x float> %op1, i64 7
@@ -147,7 +170,10 @@ define double @extractelement_v2f64(<2 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov d0, v0.d[1]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <2 x double> %op1, i64 1
   ret double %r
@@ -163,7 +189,11 @@ define double @extractelement_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0, #24]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %r = extractelement <4 x double> %op1, i64 3
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
index c1d84f6a15ed8c..a8d01ec7ce0b4b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
@@ -32,12 +32,58 @@ define void @test_copysign_v4f16_v4f16(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d0, [x1]
 ; NONEON-NOSVE-NEXT:    ldr d1, [x0]
-; NONEON-NOSVE-NEXT:    ldr d2, [x1]
-; NONEON-NOSVE-NEXT:    dup v0.4h, w8
-; NONEON-NOSVE-NEXT:    bsl v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x half>, ptr %ap
   %b = load <4 x half>, ptr %bp
@@ -68,12 +114,102 @@ define void @test_copysign_v8f16_v8f16(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v8f16_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x half>, ptr %ap
   %b = load <8 x half>, ptr %bp
@@ -108,13 +244,191 @@ define void @test_copysign_v16f16_v16f16(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v16f16_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
-; NONEON-NOSVE-NEXT:    ldp q1, q4, [x1]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
 ; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    bit v1.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v3.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #126]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #122]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #120]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #118]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #116]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #114]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #112]
+; NONEON-NOSVE-NEXT:    str h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #110]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #108]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #106]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #104]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #102]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #100]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #98]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x half>, ptr %ap
   %b = load <16 x half>, ptr %bp
@@ -147,12 +461,26 @@ define void @test_copysign_v2f32_v2f32(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v2f32_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d0, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    ldr d1, [x0]
-; NONEON-NOSVE-NEXT:    ldr d2, [x1]
-; NONEON-NOSVE-NEXT:    fneg v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    bsl v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ldr d1, [x1]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x float>, ptr %ap
   %b = load <2 x float>, ptr %bp
@@ -183,12 +511,37 @@ define void @test_copysign_v4f32_v4f32(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v4f32_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1]
-; NONEON-NOSVE-NEXT:    fneg v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x float>, ptr %ap
   %b = load <4 x float>, ptr %bp
@@ -223,13 +576,63 @@ define void @test_copysign_v8f32_v8f32(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v8f32_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    ldp q1, q4, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fneg v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    bit v1.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v3.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x float>, ptr %ap
   %b = load <8 x float>, ptr %bp
@@ -262,12 +665,25 @@ define void @test_copysign_v2f64_v2f64(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v2f64_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1]
-; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    tst x9, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d2, d1, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    tst x8, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x double>, ptr %ap
   %b = load <2 x double>, ptr %bp
@@ -302,13 +718,39 @@ define void @test_copysign_v4f64_v4f64(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v4f64_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    ldp q1, q4, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bit v1.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v3.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    tst x9, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d2, d1, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    tst x8, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    tst x9, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d2, d1, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    tst x8, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x double>, ptr %ap
   %b = load <4 x double>, ptr %bp
@@ -347,13 +789,27 @@ define void @test_copysign_v2f32_v2f64(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v2f32_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d0, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    ldr d2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtn v1.2s, v1.2d
-; NONEON-NOSVE-NEXT:    fneg v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    bsl v0.8b, v2.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d1, [x0]
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    tst x9, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst x8, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x float>, ptr %ap
   %b = load <2 x double>, ptr %bp
@@ -402,14 +858,39 @@ define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v4f32_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    fcvtn v1.2s, v1.2d
-; NONEON-NOSVE-NEXT:    fneg v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.4s, v2.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
 ; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    tst x9, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst x8, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst x9, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    tst x8, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x float>, ptr %ap
   %b = load <4 x double>, ptr %bp
@@ -447,13 +928,27 @@ define void @test_copysign_v2f64_v2f32(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v2f64_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    ldr d1, [x1]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr d0, [x1]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d2, d1, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x double>, ptr %ap
   %b = load < 2 x float>, ptr %bp
@@ -502,19 +997,41 @@ define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v4f64_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
-; NONEON-NOSVE-NEXT:    str q1, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtl v4.2d, v4.2s
-; NONEON-NOSVE-NEXT:    bit v1.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v3.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    stp q0, q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp d2, d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d2, d1, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d2, d1, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x double>, ptr %ap
   %b = load <4 x float>, ptr %bp
@@ -554,13 +1071,49 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d1, [x0]
 ; NONEON-NOSVE-NEXT:    ldr q0, [x1]
-; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
-; NONEON-NOSVE-NEXT:    ldr d2, [x0]
-; NONEON-NOSVE-NEXT:    dup v1.4h, w8
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    bit v0.8b, v2.8b, v1.8b
+; NONEON-NOSVE-NEXT:    str d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x half>, ptr %ap
   %b = load <4 x float>, ptr %bp
@@ -620,21 +1173,49 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x1]
-; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
-; NONEON-NOSVE-NEXT:    mov d1, v0.d[1]
-; NONEON-NOSVE-NEXT:    fcvt h0, d0
-; NONEON-NOSVE-NEXT:    fcvt h1, d1
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v1.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, d2
-; NONEON-NOSVE-NEXT:    mov d2, v2.d[1]
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v1.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, d2
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
 ; NONEON-NOSVE-NEXT:    ldr d2, [x0]
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    dup v1.4h, w8
-; NONEON-NOSVE-NEXT:    bit v0.8b, v2.8b, v1.8b
+; NONEON-NOSVE-NEXT:    str d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    tst x9, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst x8, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst x9, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst x8, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x half>, ptr %ap
   %b = load <4 x double>, ptr %bp
@@ -682,14 +1263,83 @@ define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) {
 ;
 ; NONEON-NOSVE-LABEL: test_copysign_v8f16_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
 ; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
-; NONEON-NOSVE-NEXT:    dup v1.8h, w8
-; NONEON-NOSVE-NEXT:    bit v0.16b, v2.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x half>, ptr %ap
   %b = load <8 x float>, ptr %bp
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
index b51b89d08844d0..e84acfc8504a95 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
@@ -21,10 +21,39 @@ define <2 x half> @fadd_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fadd <2 x half> %op1, %op2
   ret <2 x half> %res
@@ -42,10 +71,39 @@ define <4 x half> @fadd_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fadd <4 x half> %op1, %op2
   ret <4 x half> %res
@@ -63,14 +121,66 @@ define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fadd v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fadd v1.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fadd <8 x half> %op1, %op2
   ret <8 x half> %res
@@ -90,25 +200,127 @@ define void @fadd_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fadd v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    fadd v5.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fadd v2.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -129,7 +341,17 @@ define <2 x float> @fadd_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fadd v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fadd <2 x float> %op1, %op2
   ret <2 x float> %res
@@ -147,7 +369,22 @@ define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fadd <4 x float> %op1, %op2
   ret <4 x float> %res
@@ -167,11 +404,39 @@ define void @fadd_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fadd v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -192,7 +457,16 @@ define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fadd v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fadd <2 x double> %op1, %op2
   ret <2 x double> %res
@@ -212,11 +486,27 @@ define void @fadd_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fadd v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fadd v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fadd d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
@@ -241,10 +531,39 @@ define <2 x half> @fdiv_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fdiv v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <2 x half> %op1, %op2
   ret <2 x half> %res
@@ -262,10 +581,39 @@ define <4 x half> @fdiv_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fdiv v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <4 x half> %op1, %op2
   ret <4 x half> %res
@@ -283,14 +631,66 @@ define <8 x half> @fdiv_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fdiv v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fdiv v1.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <8 x half> %op1, %op2
   ret <8 x half> %res
@@ -310,26 +710,127 @@ define void @fdiv_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q4, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v5.4s, v4.8h
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v4.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fdiv v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    ldr q3, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl2 v6.4s, v3.8h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fdiv v3.4s, v3.4s, v4.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fdiv v5.4s, v6.4s, v5.4s
-; NONEON-NOSVE-NEXT:    fdiv v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v5.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -350,7 +851,17 @@ define <2 x float> @fdiv_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fdiv v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fdiv s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <2 x float> %op1, %op2
   ret <2 x float> %res
@@ -368,7 +879,22 @@ define <4 x float> @fdiv_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fdiv v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fdiv s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fdiv s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <4 x float> %op1, %op2
   ret <4 x float> %res
@@ -388,11 +914,39 @@ define void @fdiv_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fdiv v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fdiv v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fdiv s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fdiv s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fdiv s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fdiv s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -413,7 +967,16 @@ define <2 x double> @fdiv_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fdiv v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fdiv d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fdiv d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <2 x double> %op1, %op2
   ret <2 x double> %res
@@ -433,11 +996,27 @@ define void @fdiv_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fdiv v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fdiv v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fdiv d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fdiv d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fdiv d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fdiv d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
@@ -463,42 +1042,48 @@ define <2 x half> @fma_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3)
 ;
 ; NONEON-NOSVE-LABEL: fma_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d2 killed $d2 def $q2
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    fcvt s16, h0
-; NONEON-NOSVE-NEXT:    mov h17, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h18, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fmadd s6, s16, s7, s6
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s7, h19
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmadd s3, s5, s4, s3
-; NONEON-NOSVE-NEXT:    fcvt s4, h17
-; NONEON-NOSVE-NEXT:    fcvt s5, h18
-; NONEON-NOSVE-NEXT:    fcvt h0, s6
-; NONEON-NOSVE-NEXT:    fmadd s4, s7, s5, s4
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h16
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    fmadd s1, s5, s1, s2
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3)
   ret <2 x half> %res
@@ -517,42 +1102,48 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
 ;
 ; NONEON-NOSVE-LABEL: fma_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d2 killed $d2 def $q2
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    fcvt s16, h0
-; NONEON-NOSVE-NEXT:    mov h17, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h18, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fmadd s6, s16, s7, s6
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s7, h19
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmadd s3, s5, s4, s3
-; NONEON-NOSVE-NEXT:    fcvt s4, h17
-; NONEON-NOSVE-NEXT:    fcvt s5, h18
-; NONEON-NOSVE-NEXT:    fcvt h0, s6
-; NONEON-NOSVE-NEXT:    fmadd s4, s7, s5, s4
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h16
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    fmadd s1, s5, s1, s2
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.fma.v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
   ret <4 x half> %res
@@ -571,75 +1162,84 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
 ;
 ; NONEON-NOSVE-LABEL: fma_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    fcvt s16, h0
-; NONEON-NOSVE-NEXT:    mov h17, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h18, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fmadd s6, s16, s7, s6
-; NONEON-NOSVE-NEXT:    fcvt s7, h17
-; NONEON-NOSVE-NEXT:    fcvt s16, h18
-; NONEON-NOSVE-NEXT:    fcvt s17, h19
-; NONEON-NOSVE-NEXT:    mov h18, v1.h[3]
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[3]
-; NONEON-NOSVE-NEXT:    fmadd s4, s5, s4, s3
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h3, s6
-; NONEON-NOSVE-NEXT:    fmadd s6, s17, s16, s7
-; NONEON-NOSVE-NEXT:    mov h17, v2.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s7, h18
-; NONEON-NOSVE-NEXT:    fcvt s16, h19
-; NONEON-NOSVE-NEXT:    mov h18, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    mov v3.h[1], v4.h[0]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    fmadd s5, s16, s7, s5
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s19, h19
-; NONEON-NOSVE-NEXT:    mov v3.h[2], v6.h[0]
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt h5, s5
-; NONEON-NOSVE-NEXT:    fmadd s17, s19, s18, s17
-; NONEON-NOSVE-NEXT:    mov h18, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fmadd s4, s16, s7, s4
-; NONEON-NOSVE-NEXT:    mov v3.h[3], v5.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h6
-; NONEON-NOSVE-NEXT:    fcvt s6, h18
-; NONEON-NOSVE-NEXT:    fcvt s7, h19
-; NONEON-NOSVE-NEXT:    fcvt h16, s17
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fmadd s5, s7, s6, s5
-; NONEON-NOSVE-NEXT:    mov v3.h[4], v16.h[0]
-; NONEON-NOSVE-NEXT:    fmadd s0, s0, s1, s2
-; NONEON-NOSVE-NEXT:    mov v3.h[5], v4.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h4, s5
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v3.h[6], v4.h[0]
-; NONEON-NOSVE-NEXT:    mov v3.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    mov v0.16b, v3.16b
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.fma.v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
   ret <8 x half> %res
@@ -660,146 +1260,161 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fma_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q3, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q4, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q5, q2, [x2]
-; NONEON-NOSVE-NEXT:    mov h25, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s19, h0
-; NONEON-NOSVE-NEXT:    mov h24, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s18, h1
-; NONEON-NOSVE-NEXT:    mov h22, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v2.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    mov h20, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h26, v5.h[1]
-; NONEON-NOSVE-NEXT:    mov h27, v4.h[1]
-; NONEON-NOSVE-NEXT:    mov h28, v3.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s25, h25
-; NONEON-NOSVE-NEXT:    mov h7, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h29, v4.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s23, h17
-; NONEON-NOSVE-NEXT:    mov h17, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h30, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s21, h16
-; NONEON-NOSVE-NEXT:    fmadd s6, s19, s18, s6
-; NONEON-NOSVE-NEXT:    fcvt s18, h20
-; NONEON-NOSVE-NEXT:    fcvt s19, h22
-; NONEON-NOSVE-NEXT:    fcvt s20, h24
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s22, h5
-; NONEON-NOSVE-NEXT:    fcvt s24, h4
-; NONEON-NOSVE-NEXT:    fcvt s26, h26
-; NONEON-NOSVE-NEXT:    fcvt s27, h27
-; NONEON-NOSVE-NEXT:    fcvt s28, h28
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fmadd s21, s25, s23, s21
-; NONEON-NOSVE-NEXT:    fcvt s23, h3
-; NONEON-NOSVE-NEXT:    mov h25, v5.h[2]
-; NONEON-NOSVE-NEXT:    fmadd s18, s20, s19, s18
-; NONEON-NOSVE-NEXT:    mov h19, v3.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    mov h31, v0.h[4]
-; NONEON-NOSVE-NEXT:    fmadd s26, s28, s27, s26
-; NONEON-NOSVE-NEXT:    mov h27, v4.h[3]
-; NONEON-NOSVE-NEXT:    mov h28, v3.h[3]
-; NONEON-NOSVE-NEXT:    fmadd s22, s23, s24, s22
-; NONEON-NOSVE-NEXT:    fcvt h20, s21
-; NONEON-NOSVE-NEXT:    mov h21, v2.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s23, h25
-; NONEON-NOSVE-NEXT:    fcvt s24, h29
-; NONEON-NOSVE-NEXT:    fcvt s19, h19
-; NONEON-NOSVE-NEXT:    fmadd s16, s17, s16, s7
-; NONEON-NOSVE-NEXT:    mov h25, v5.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h18, s18
-; NONEON-NOSVE-NEXT:    fcvt h26, s26
-; NONEON-NOSVE-NEXT:    mov h29, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov v6.h[1], v20.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s17, h21
-; NONEON-NOSVE-NEXT:    fcvt s20, h30
-; NONEON-NOSVE-NEXT:    fmadd s19, s19, s24, s23
-; NONEON-NOSVE-NEXT:    fcvt s21, h31
-; NONEON-NOSVE-NEXT:    fcvt h7, s22
-; NONEON-NOSVE-NEXT:    fcvt s22, h25
-; NONEON-NOSVE-NEXT:    fcvt s23, h27
-; NONEON-NOSVE-NEXT:    fcvt s24, h28
-; NONEON-NOSVE-NEXT:    mov h25, v5.h[4]
-; NONEON-NOSVE-NEXT:    mov h27, v4.h[4]
-; NONEON-NOSVE-NEXT:    mov h28, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov h30, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h31, v0.h[5]
-; NONEON-NOSVE-NEXT:    mov v6.h[2], v18.h[0]
-; NONEON-NOSVE-NEXT:    fmadd s17, s21, s20, s17
-; NONEON-NOSVE-NEXT:    mov v7.h[1], v26.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h18, s19
-; NONEON-NOSVE-NEXT:    fmadd s19, s24, s23, s22
-; NONEON-NOSVE-NEXT:    mov h26, v5.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt s20, h25
-; NONEON-NOSVE-NEXT:    fcvt s21, h27
-; NONEON-NOSVE-NEXT:    fcvt s22, h28
-; NONEON-NOSVE-NEXT:    mov h27, v4.h[5]
-; NONEON-NOSVE-NEXT:    mov h28, v3.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s23, h29
-; NONEON-NOSVE-NEXT:    fcvt s24, h30
-; NONEON-NOSVE-NEXT:    fcvt s25, h31
-; NONEON-NOSVE-NEXT:    mov h29, v2.h[6]
-; NONEON-NOSVE-NEXT:    mov h30, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h31, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov v7.h[2], v18.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h18, s19
-; NONEON-NOSVE-NEXT:    fmadd s19, s22, s21, s20
-; NONEON-NOSVE-NEXT:    mov h20, v5.h[6]
-; NONEON-NOSVE-NEXT:    mov h21, v4.h[6]
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s26, h26
-; NONEON-NOSVE-NEXT:    fmadd s23, s25, s24, s23
-; NONEON-NOSVE-NEXT:    fcvt s27, h27
-; NONEON-NOSVE-NEXT:    fcvt s28, h28
-; NONEON-NOSVE-NEXT:    mov v6.h[3], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s17
-; NONEON-NOSVE-NEXT:    fcvt s17, h29
-; NONEON-NOSVE-NEXT:    fcvt s24, h30
-; NONEON-NOSVE-NEXT:    fcvt s25, h31
-; NONEON-NOSVE-NEXT:    fcvt s20, h20
-; NONEON-NOSVE-NEXT:    fcvt s21, h21
-; NONEON-NOSVE-NEXT:    fcvt s22, h22
-; NONEON-NOSVE-NEXT:    mov v7.h[3], v18.h[0]
-; NONEON-NOSVE-NEXT:    fmadd s26, s28, s27, s26
-; NONEON-NOSVE-NEXT:    fcvt h18, s19
-; NONEON-NOSVE-NEXT:    mov h5, v5.h[7]
-; NONEON-NOSVE-NEXT:    mov h4, v4.h[7]
-; NONEON-NOSVE-NEXT:    mov h3, v3.h[7]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    fmadd s17, s25, s24, s17
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fmadd s19, s22, s21, s20
-; NONEON-NOSVE-NEXT:    mov v6.h[4], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s23
-; NONEON-NOSVE-NEXT:    mov v7.h[4], v18.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h18, s26
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q4, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    stp q1, q5, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #56]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #88]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v6.h[5], v16.h[0]
-; NONEON-NOSVE-NEXT:    mov v7.h[5], v18.h[0]
-; NONEON-NOSVE-NEXT:    fmadd s3, s3, s4, s5
-; NONEON-NOSVE-NEXT:    fcvt h4, s19
-; NONEON-NOSVE-NEXT:    fcvt h5, s17
-; NONEON-NOSVE-NEXT:    fmadd s0, s0, s1, s2
-; NONEON-NOSVE-NEXT:    mov v7.h[6], v4.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s3
-; NONEON-NOSVE-NEXT:    mov v6.h[6], v5.h[0]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], v1.h[0]
-; NONEON-NOSVE-NEXT:    mov v6.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    stp q7, q6, [x0]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #106]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #102]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #100]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #98]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -822,8 +1437,19 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o
 ;
 ; NONEON-NOSVE-LABEL: fma_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmla v2.2s, v1.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov d0, d2
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3)
   ret <2 x float> %res
@@ -842,8 +1468,26 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o
 ;
 ; NONEON-NOSVE-LABEL: fma_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmla v2.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3)
   ret <4 x float> %res
@@ -864,12 +1508,45 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fma_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q4, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q5, [x2]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fmla v1.4s, v0.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fmla v5.4s, v4.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q1, q5, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q4, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #92]
+; NONEON-NOSVE-NEXT:    stp q1, q5, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #88]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #84]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -892,8 +1569,19 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double
 ;
 ; NONEON-NOSVE-LABEL: fma_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmla v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d2, d4, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd d5, d4, d3, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d5, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3)
   ret <2 x double> %res
@@ -914,12 +1602,31 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fma_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q4, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q5, [x2]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fmla v1.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fmla v5.2d, v4.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q1, q5, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q4, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp q1, q5, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp d2, d4, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmadd d5, d4, d3, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d2, d4, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d5, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd d5, d4, d3, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d5, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
@@ -945,10 +1652,39 @@ define <2 x half> @fmul_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmul_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fmul <2 x half> %op1, %op2
   ret <2 x half> %res
@@ -966,10 +1702,39 @@ define <4 x half> @fmul_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmul_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fmul <4 x half> %op1, %op2
   ret <4 x half> %res
@@ -987,14 +1752,66 @@ define <8 x half> @fmul_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmul_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fmul v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fmul v1.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fmul <8 x half> %op1, %op2
   ret <8 x half> %res
@@ -1014,25 +1831,127 @@ define void @fmul_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmul_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fmul v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    fmul v5.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmul v2.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -1053,7 +1972,17 @@ define <2 x float> @fmul_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmul_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmul v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmul s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fmul <2 x float> %op1, %op2
   ret <2 x float> %res
@@ -1071,7 +2000,22 @@ define <4 x float> @fmul_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmul_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmul s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmul s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fmul <4 x float> %op1, %op2
   ret <4 x float> %res
@@ -1091,11 +2035,39 @@ define void @fmul_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmul_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmul v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fmul s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fmul s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmul s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmul s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -1116,7 +2088,16 @@ define <2 x double> @fmul_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmul_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmul v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmul d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmul d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fmul <2 x double> %op1, %op2
   ret <2 x double> %res
@@ -1136,11 +2117,27 @@ define void @fmul_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmul_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmul v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fmul v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmul d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmul d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmul d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmul d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
@@ -1164,8 +2161,30 @@ define <2 x half> @fneg_v2f16(<2 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fneg_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.4h, #128, lsl #8
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fneg <2 x half> %op
   ret <2 x half> %res
@@ -1182,8 +2201,30 @@ define <4 x half> @fneg_v4f16(<4 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fneg_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.4h, #128, lsl #8
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fneg <4 x half> %op
   ret <4 x half> %res
@@ -1200,8 +2241,50 @@ define <8 x half> @fneg_v8f16(<8 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fneg_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v1.8h, #128, lsl #8
-; NONEON-NOSVE-NEXT:    eor v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fneg <8 x half> %op
   ret <8 x half> %res
@@ -1219,11 +2302,92 @@ define void @fneg_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fneg_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.8h, #128, lsl #8
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    eor v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = fneg <16 x half> %op
@@ -1242,7 +2406,15 @@ define <2 x float> @fneg_v2f32(<2 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fneg_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fneg v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fneg s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fneg <2 x float> %op
   ret <2 x float> %res
@@ -1259,7 +2431,20 @@ define <4 x float> @fneg_v4f32(<4 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fneg_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fneg v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fneg s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fneg s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fneg <4 x float> %op
   ret <4 x float> %res
@@ -1277,10 +2462,32 @@ define void @fneg_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fneg_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fneg v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fneg v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fneg s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fneg s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fneg s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fneg s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = fneg <8 x float> %op
@@ -1299,7 +2506,15 @@ define <2 x double> @fneg_v2f64(<2 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fneg_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fneg d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fneg <2 x double> %op
   ret <2 x double> %res
@@ -1317,10 +2532,22 @@ define void @fneg_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fneg_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fneg v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fneg d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fneg d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = fneg <4 x double> %op
@@ -1343,26 +2570,30 @@ define <2 x half> @fsqrt_v2f16(<2 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fsqrt_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fsqrt s2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fsqrt s1, s1
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fsqrt s3, s3
-; NONEON-NOSVE-NEXT:    fsqrt s4, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s2
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v1.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s3
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v1.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s4
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %op)
   ret <2 x half> %res
@@ -1379,26 +2610,30 @@ define <4 x half> @fsqrt_v4f16(<4 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fsqrt_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fsqrt s2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fsqrt s1, s1
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fsqrt s3, s3
-; NONEON-NOSVE-NEXT:    fsqrt s4, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s2
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v1.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s3
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v1.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s4
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %op)
   ret <4 x half> %res
@@ -1415,44 +2650,50 @@ define <8 x half> @fsqrt_v8f16(<8 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fsqrt_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fsqrt s2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h0
-; NONEON-NOSVE-NEXT:    fcvt h0, s2
-; NONEON-NOSVE-NEXT:    fsqrt s1, s1
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v1.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s3, s3
-; NONEON-NOSVE-NEXT:    fcvt h1, s3
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v1.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s4, s4
-; NONEON-NOSVE-NEXT:    fcvt h1, s4
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s5, s5
-; NONEON-NOSVE-NEXT:    fcvt h1, s5
-; NONEON-NOSVE-NEXT:    mov v0.h[4], v1.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s6, s6
-; NONEON-NOSVE-NEXT:    fcvt h1, s6
-; NONEON-NOSVE-NEXT:    mov v0.h[5], v1.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s7, s7
-; NONEON-NOSVE-NEXT:    fcvt h1, s7
-; NONEON-NOSVE-NEXT:    mov v0.h[6], v1.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s2, s16
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
-; NONEON-NOSVE-NEXT:    mov v0.h[7], v1.h[0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %op)
   ret <8 x half> %res
@@ -1470,85 +2711,92 @@ define void @fsqrt_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fsqrt_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q16, [x0]
-; NONEON-NOSVE-NEXT:    mov h0, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h17, v16.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s18, h16
-; NONEON-NOSVE-NEXT:    mov h19, v16.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[3]
-; NONEON-NOSVE-NEXT:    mov h20, v16.h[3]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h21, v16.h[4]
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h22, v16.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fsqrt s2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s19, h19
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s20, h20
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s21, h21
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s22, h22
-; NONEON-NOSVE-NEXT:    mov h23, v16.h[6]
-; NONEON-NOSVE-NEXT:    mov h16, v16.h[7]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s23, h23
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fsqrt s0, s0
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v2.h[1], v0.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s17, s17
-; NONEON-NOSVE-NEXT:    fcvt h17, s17
-; NONEON-NOSVE-NEXT:    fsqrt s18, s18
-; NONEON-NOSVE-NEXT:    fcvt h18, s18
-; NONEON-NOSVE-NEXT:    mov v18.h[1], v17.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s3, s3
-; NONEON-NOSVE-NEXT:    fcvt h0, s3
-; NONEON-NOSVE-NEXT:    mov v2.h[2], v0.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s19, s19
-; NONEON-NOSVE-NEXT:    fcvt h17, s19
-; NONEON-NOSVE-NEXT:    mov v18.h[2], v17.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s4, s4
-; NONEON-NOSVE-NEXT:    fcvt h0, s4
-; NONEON-NOSVE-NEXT:    mov v2.h[3], v0.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s20, s20
-; NONEON-NOSVE-NEXT:    fcvt h3, s20
-; NONEON-NOSVE-NEXT:    mov v18.h[3], v3.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s5, s5
-; NONEON-NOSVE-NEXT:    fcvt h0, s5
-; NONEON-NOSVE-NEXT:    mov v2.h[4], v0.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s21, s21
-; NONEON-NOSVE-NEXT:    fcvt h3, s21
-; NONEON-NOSVE-NEXT:    mov v18.h[4], v3.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s6, s6
-; NONEON-NOSVE-NEXT:    fcvt h0, s6
-; NONEON-NOSVE-NEXT:    mov v2.h[5], v0.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s22, s22
-; NONEON-NOSVE-NEXT:    fcvt h3, s22
-; NONEON-NOSVE-NEXT:    mov v18.h[5], v3.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s7, s7
-; NONEON-NOSVE-NEXT:    fcvt h0, s7
-; NONEON-NOSVE-NEXT:    mov v2.h[6], v0.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s23, s23
-; NONEON-NOSVE-NEXT:    fcvt h3, s23
-; NONEON-NOSVE-NEXT:    mov v18.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s16, s16
-; NONEON-NOSVE-NEXT:    fcvt h3, s16
-; NONEON-NOSVE-NEXT:    mov v18.h[7], v3.h[0]
-; NONEON-NOSVE-NEXT:    fsqrt s1, s1
-; NONEON-NOSVE-NEXT:    fcvt h0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    stp q18, q2, [x0]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %op)
@@ -1567,7 +2815,15 @@ define <2 x float> @fsqrt_v2f32(<2 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fsqrt_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fsqrt v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fsqrt s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %op)
   ret <2 x float> %res
@@ -1584,7 +2840,20 @@ define <4 x float> @fsqrt_v4f32(<4 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fsqrt_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fsqrt v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fsqrt s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fsqrt s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %op)
   ret <4 x float> %res
@@ -1602,10 +2871,32 @@ define void @fsqrt_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fsqrt_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fsqrt v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fsqrt v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fsqrt s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fsqrt s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fsqrt s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fsqrt s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %op)
@@ -1624,7 +2915,15 @@ define <2 x double> @fsqrt_v2f64(<2 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fsqrt_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fsqrt v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fsqrt d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fsqrt d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %op)
   ret <2 x double> %res
@@ -1642,10 +2941,22 @@ define void @fsqrt_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fsqrt_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fsqrt v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fsqrt v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fsqrt d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fsqrt d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fsqrt d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fsqrt d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %op)
@@ -1669,10 +2980,39 @@ define <2 x half> @fsub_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fsub_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fsub v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fsub <2 x half> %op1, %op2
   ret <2 x half> %res
@@ -1690,10 +3030,39 @@ define <4 x half> @fsub_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fsub_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fsub v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fsub <4 x half> %op1, %op2
   ret <4 x half> %res
@@ -1711,14 +3080,66 @@ define <8 x half> @fsub_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fsub_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fsub v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fsub v1.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fsub <8 x half> %op1, %op2
   ret <8 x half> %res
@@ -1738,25 +3159,127 @@ define void @fsub_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fsub_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fsub v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    fsub v5.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    fsub v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fsub v2.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -1777,7 +3300,17 @@ define <2 x float> @fsub_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fsub_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fsub v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fsub s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fsub <2 x float> %op1, %op2
   ret <2 x float> %res
@@ -1795,7 +3328,22 @@ define <4 x float> @fsub_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fsub_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fsub v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fsub s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fsub s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fsub <4 x float> %op1, %op2
   ret <4 x float> %res
@@ -1815,11 +3363,39 @@ define void @fsub_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fsub_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fsub v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fsub v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fsub s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fsub s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fsub s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fsub s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -1840,7 +3416,16 @@ define <2 x double> @fsub_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fsub_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fsub v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fsub d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fsub d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = fsub <2 x double> %op1, %op2
   ret <2 x double> %res
@@ -1860,11 +3445,27 @@ define void @fsub_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fsub_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fsub v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fsub v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fsub d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fsub d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fsub d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fsub d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
@@ -1888,7 +3489,30 @@ define <2 x half> @fabs_v2f16(<2 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fabs_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    bic v0.4h, #128, lsl #8
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.fabs.v2f16(<2 x half> %op)
   ret <2 x half> %res
@@ -1905,7 +3529,30 @@ define <4 x half> @fabs_v4f16(<4 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fabs_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    bic v0.4h, #128, lsl #8
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.fabs.v4f16(<4 x half> %op)
   ret <4 x half> %res
@@ -1922,7 +3569,50 @@ define <8 x half> @fabs_v8f16(<8 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fabs_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    bic v0.8h, #128, lsl #8
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.fabs.v8f16(<8 x half> %op)
   ret <8 x half> %res
@@ -1940,10 +3630,92 @@ define void @fabs_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fabs_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    bic v0.8h, #128, lsl #8
-; NONEON-NOSVE-NEXT:    bic v1.8h, #128, lsl #8
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.fabs.v16f16(<16 x half> %op)
@@ -1962,7 +3734,15 @@ define <2 x float> @fabs_v2f32(<2 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fabs_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fabs v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fabs s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.fabs.v2f32(<2 x float> %op)
   ret <2 x float> %res
@@ -1979,7 +3759,20 @@ define <4 x float> @fabs_v4f32(<4 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fabs_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fabs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fabs s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fabs s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.fabs.v4f32(<4 x float> %op)
   ret <4 x float> %res
@@ -1997,10 +3790,32 @@ define void @fabs_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fabs_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fabs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fabs v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fabs s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fabs s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fabs s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fabs s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.fabs.v8f32(<8 x float> %op)
@@ -2019,7 +3834,15 @@ define <2 x double> @fabs_v2f64(<2 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: fabs_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fabs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fabs d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.fabs.v2f64(<2 x double> %op)
   ret <2 x double> %res
@@ -2037,10 +3860,22 @@ define void @fabs_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fabs_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fabs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fabs v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fabs d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fabs d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.fabs.v4f64(<4 x double> %op)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
index c5ed70c8a5f2f8..776b6918923ae9 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
@@ -23,10 +23,24 @@ define <2 x i16> @fcmp_oeq_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_oeq_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <2 x half> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i16>
@@ -46,10 +60,39 @@ define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_oeq_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <4 x half> %op1, %op2
   %sext = sext <4 x i1> %cmp to <4 x i16>
@@ -69,61 +112,66 @@ define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_oeq_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcmp s3, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h6
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[4]
-; NONEON-NOSVE-NEXT:    csetm w9, eq
-; NONEON-NOSVE-NEXT:    fcmp s2, s5
-; NONEON-NOSVE-NEXT:    fmov s2, w9
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    fcvt s3, h5
-; NONEON-NOSVE-NEXT:    fcvt s4, h6
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <8 x half> %op1, %op2
   %sext = sext <8 x i1> %cmp to <8 x i16>
@@ -145,119 +193,127 @@ define void @fcmp_oeq_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_oeq_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, eq
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, eq
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, eq
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, eq
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -280,7 +336,18 @@ define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_oeq_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcmeq v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <2 x float> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i32>
@@ -300,7 +367,24 @@ define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_oeq_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <4 x float> %op1, %op2
   %sext = sext <4 x i1> %cmp to <4 x i32>
@@ -322,11 +406,43 @@ define void @fcmp_oeq_v8f32(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_oeq_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcmeq v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -347,7 +463,13 @@ define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_oeq_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcmeq d0, d0, d1
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fcmp d0, d1
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <1 x double> %op1, %op2
   %sext = sext <1 x i1> %cmp to <1 x i64>
@@ -367,7 +489,17 @@ define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_oeq_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcmeq v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp]
+; NONEON-NOSVE-NEXT:    fcmp d3, d2
+; NONEON-NOSVE-NEXT:    csetm x9, eq
+; NONEON-NOSVE-NEXT:    fcmp d1, d0
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <2 x double> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i64>
@@ -389,11 +521,29 @@ define void @fcmp_oeq_v4f64(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_oeq_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcmeq v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcmeq v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcmp d3, d2
+; NONEON-NOSVE-NEXT:    csetm x9, eq
+; NONEON-NOSVE-NEXT:    fcmp d1, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp]
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp d3, d2
+; NONEON-NOSVE-NEXT:    csetm x9, eq
+; NONEON-NOSVE-NEXT:    fcmp d1, d0
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
@@ -426,135 +576,143 @@ define void @fcmp_ueq_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_ueq_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h2
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h1
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, eq
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s6, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w9, eq
-; NONEON-NOSVE-NEXT:    csinv w12, w9, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s7, s5
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w9, eq
-; NONEON-NOSVE-NEXT:    csinv w10, w9, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x1]
-; NONEON-NOSVE-NEXT:    csetm w9, eq
-; NONEON-NOSVE-NEXT:    csinv w11, w9, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s6, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s6, h16
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w9, eq
-; NONEON-NOSVE-NEXT:    csinv w9, w9, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s7, s5
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w13, eq
-; NONEON-NOSVE-NEXT:    csinv w13, w13, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s6, s3
-; NONEON-NOSVE-NEXT:    fcvt s3, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h7
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[2]
-; NONEON-NOSVE-NEXT:    csetm w14, eq
-; NONEON-NOSVE-NEXT:    csinv w14, w14, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s4, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w15, eq
-; NONEON-NOSVE-NEXT:    csinv w15, w15, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s5, s3
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w16, eq
-; NONEON-NOSVE-NEXT:    csinv w16, w16, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s4, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h3
-; NONEON-NOSVE-NEXT:    fmov s2, w12
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w17, eq
-; NONEON-NOSVE-NEXT:    csinv w17, w17, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[4]
-; NONEON-NOSVE-NEXT:    fmov s3, w17
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    mov v3.h[1], w16
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w10
-; NONEON-NOSVE-NEXT:    mov v3.h[2], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w11
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v3.h[3], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, eq
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    fcvt s4, h6
-; NONEON-NOSVE-NEXT:    fcvt s5, h7
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v3.h[4], w8
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, eq
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w13
-; NONEON-NOSVE-NEXT:    mov v3.h[5], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, eq
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
 ; NONEON-NOSVE-NEXT:    fcmp s1, s0
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w14
-; NONEON-NOSVE-NEXT:    mov v3.h[6], w8
 ; NONEON-NOSVE-NEXT:    csetm w8, eq
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w15
-; NONEON-NOSVE-NEXT:    mov v3.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -587,150 +745,158 @@ define void @fcmp_one_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_one_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h2
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h1
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w9, mi
-; NONEON-NOSVE-NEXT:    csinv w12, w9, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s5
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w9, mi
-; NONEON-NOSVE-NEXT:    csinv w10, w9, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x1]
-; NONEON-NOSVE-NEXT:    csetm w9, mi
-; NONEON-NOSVE-NEXT:    csinv w11, w9, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s6, h16
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w9, mi
-; NONEON-NOSVE-NEXT:    csinv w9, w9, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s5
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w13, mi
-; NONEON-NOSVE-NEXT:    csinv w13, w13, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s3
-; NONEON-NOSVE-NEXT:    fcvt s3, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h7
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[2]
-; NONEON-NOSVE-NEXT:    csetm w14, mi
-; NONEON-NOSVE-NEXT:    csinv w14, w14, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s4, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w15, mi
-; NONEON-NOSVE-NEXT:    csinv w15, w15, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s3
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w16, mi
-; NONEON-NOSVE-NEXT:    csinv w16, w16, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s4, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h3
-; NONEON-NOSVE-NEXT:    fmov s2, w12
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w17, mi
-; NONEON-NOSVE-NEXT:    csinv w17, w17, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[4]
-; NONEON-NOSVE-NEXT:    fmov s3, w17
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    mov v3.h[1], w16
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w10
-; NONEON-NOSVE-NEXT:    mov v3.h[2], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w11
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v3.h[3], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    fcvt s4, h6
-; NONEON-NOSVE-NEXT:    fcvt s5, h7
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v3.h[4], w8
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w13
-; NONEON-NOSVE-NEXT:    mov v3.h[5], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
 ; NONEON-NOSVE-NEXT:    fcmp s1, s0
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w14
-; NONEON-NOSVE-NEXT:    mov v3.h[6], w8
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
 ; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w15
-; NONEON-NOSVE-NEXT:    mov v3.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
-; NONEON-NOSVE-NEXT:    ret
-  %op1 = load <16 x half>, ptr %a
-  %op2 = load <16 x half>, ptr %b
-  %cmp = fcmp one <16 x half> %op1, %op2
-  %sext = sext <16 x i1> %cmp to <16 x i16>
-  store <16 x i16> %sext, ptr %c
-  ret void
-}
-
-;
-; FCMP UNE
-;
-
-define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) {
-; CHECK-LABEL: fcmp_une_v16f16:
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
+  %op1 = load <16 x half>, ptr %a
+  %op2 = load <16 x half>, ptr %b
+  %cmp = fcmp one <16 x half> %op1, %op2
+  %sext = sext <16 x i1> %cmp to <16 x i16>
+  store <16 x i16> %sext, ptr %c
+  ret void
+}
+
+;
+; FCMP UNE
+;
+
+define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) {
+; CHECK-LABEL: fcmp_une_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
@@ -744,119 +910,127 @@ define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_une_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, ne
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, ne
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, ne
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, ne
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, ne
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, ne
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, ne
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, ne
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, ne
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -885,119 +1059,127 @@ define void @fcmp_ogt_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_ogt_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, gt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, gt
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, gt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, gt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, gt
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, gt
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, gt
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, gt
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, gt
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -1029,119 +1211,127 @@ define void @fcmp_ugt_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_ugt_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, hi
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, hi
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, hi
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, hi
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, hi
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, hi
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, hi
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, hi
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, hi
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, hi
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, hi
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, hi
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, hi
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, hi
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, hi
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
 ; NONEON-NOSVE-NEXT:    csetm w8, hi
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -1170,123 +1360,131 @@ define void @fcmp_olt_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_olt_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, mi
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, mi
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, mi
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, mi
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, mi
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, mi
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, mi
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, mi
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, mi
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, mi
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
-; NONEON-NOSVE-NEXT:    ret
-  %op1 = load <16 x half>, ptr %a
-  %op2 = load <16 x half>, ptr %b
-  %cmp = fcmp olt <16 x half> %op1, %op2
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
+  %op1 = load <16 x half>, ptr %a
+  %op2 = load <16 x half>, ptr %b
+  %cmp = fcmp olt <16 x half> %op1, %op2
   %sext = sext <16 x i1> %cmp to <16 x i16>
   store <16 x i16> %sext, ptr %c
   ret void
@@ -1314,119 +1512,127 @@ define void @fcmp_ult_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_ult_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, lt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, lt
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, lt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, lt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, lt
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, lt
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, lt
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, lt
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, lt
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -1455,119 +1661,127 @@ define void @fcmp_oge_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_oge_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, ge
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, ge
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, ge
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, ge
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, ge
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, ge
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, ge
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, ge
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, ge
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -1599,119 +1813,127 @@ define void @fcmp_uge_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_uge_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, pl
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, pl
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, pl
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, pl
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, pl
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, pl
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, pl
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, pl
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, pl
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, pl
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, pl
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, pl
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, pl
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, pl
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, pl
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, pl
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -1740,263 +1962,279 @@ define void @fcmp_ole_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_ole_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ls
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, ls
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, ls
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, ls
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, ls
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, ls
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, ls
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, ls
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, ls
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, ls
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ls
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ls
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ls
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
+  %op1 = load <16 x half>, ptr %a
+  %op2 = load <16 x half>, ptr %b
+  %cmp = fcmp ole <16 x half> %op1, %op2
+  %sext = sext <16 x i1> %cmp to <16 x i16>
+  store <16 x i16> %sext, ptr %c
+  ret void
+}
+
+;
+; FCMP ULE
+;
+
+define void @fcmp_ule_v16f16(ptr %a, ptr %b, ptr %c) {
+; CHECK-LABEL: fcmp_ule_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fcmgt p1.h, p0/z, z1.h, z0.h
+; CHECK-NEXT:    fcmgt p0.h, p0/z, z2.h, z3.h
+; CHECK-NEXT:    mov z0.h, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z1.h, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z2.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    eor z1.d, z1.d, z0.d
+; CHECK-NEXT:    eor z0.d, z2.d, z0.d
+; CHECK-NEXT:    stp q1, q0, [x2]
+; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_ule_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, ls
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, ls
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, ls
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
-; NONEON-NOSVE-NEXT:    ret
-  %op1 = load <16 x half>, ptr %a
-  %op2 = load <16 x half>, ptr %b
-  %cmp = fcmp ole <16 x half> %op1, %op2
-  %sext = sext <16 x i1> %cmp to <16 x i16>
-  store <16 x i16> %sext, ptr %c
-  ret void
-}
-
-;
-; FCMP ULE
-;
-
-define void @fcmp_ule_v16f16(ptr %a, ptr %b, ptr %c) {
-; CHECK-LABEL: fcmp_ule_v16f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
-; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z1.h, z0.h
-; CHECK-NEXT:    fcmgt p0.h, p0/z, z2.h, z3.h
-; CHECK-NEXT:    mov z0.h, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    mov z1.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    mov z2.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    eor z1.d, z1.d, z0.d
-; CHECK-NEXT:    eor z0.d, z2.d, z0.d
-; CHECK-NEXT:    stp q1, q0, [x2]
-; CHECK-NEXT:    ret
-;
-; NONEON-NOSVE-LABEL: fcmp_ule_v16f16:
-; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, le
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, le
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, le
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -2025,119 +2263,127 @@ define void @fcmp_uno_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_uno_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vs
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, vs
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, vs
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, vs
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, vs
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, vs
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, vs
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, vs
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, vs
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, vs
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vs
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vs
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vs
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vs
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vs
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
 ; NONEON-NOSVE-NEXT:    csetm w8, vs
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -2169,119 +2415,127 @@ define void @fcmp_ord_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_ord_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vc
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, vc
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, vc
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, vc
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, vc
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, vc
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, vc
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, vc
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, vc
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, vc
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vc
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vc
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vc
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vc
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, vc
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
 ; NONEON-NOSVE-NEXT:    csetm w8, vc
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -2310,119 +2564,127 @@ define void @fcmp_eq_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_eq_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, eq
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, eq
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, eq
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, eq
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -2451,119 +2713,127 @@ define void @fcmp_ne_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_ne_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, ne
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, ne
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, ne
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, ne
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, ne
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, ne
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, ne
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, ne
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, ne
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
 ; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -2592,119 +2862,127 @@ define void @fcmp_gt_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_gt_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, gt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, gt
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, gt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, gt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, gt
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, gt
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, gt
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, gt
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, gt
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
 ; NONEON-NOSVE-NEXT:    csetm w8, gt
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -2733,119 +3011,127 @@ define void @fcmp_lt_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_lt_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, lt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, lt
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, lt
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, lt
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, lt
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, lt
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, lt
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, lt
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, lt
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
 ; NONEON-NOSVE-NEXT:    csetm w8, lt
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -2874,119 +3160,127 @@ define void @fcmp_ge_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_ge_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, ge
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, ge
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, ge
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, ge
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, ge
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, ge
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, ge
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, ge
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, ge
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
 ; NONEON-NOSVE-NEXT:    csetm w8, ge
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -3015,119 +3309,127 @@ define void @fcmp_le_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fcmp_le_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h2
-; NONEON-NOSVE-NEXT:    fcvt s7, h1
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w12, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w11, le
-; NONEON-NOSVE-NEXT:    fcmp s3, s0
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w9, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    csetm w10, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    csetm w13, le
-; NONEON-NOSVE-NEXT:    fcmp s7, s3
-; NONEON-NOSVE-NEXT:    fmov s7, w12
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    csetm w14, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s2
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
-; NONEON-NOSVE-NEXT:    csetm w15, le
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
-; NONEON-NOSVE-NEXT:    csetm w16, le
-; NONEON-NOSVE-NEXT:    fcmp s5, s2
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    csetm w17, le
-; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
-; NONEON-NOSVE-NEXT:    fmov s2, w17
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
-; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s6, s5
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    fcmp s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
 ; NONEON-NOSVE-NEXT:    csetm w8, le
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
index 055af194be211a..2c08977320e848 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
@@ -21,13 +21,28 @@ define void @fp_convert_combine_crash(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fp_convert_combine_crash:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov v0.4s, #8.00000000
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmul v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0, #3
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1, #3
+; NONEON-NOSVE-NEXT:    fcvtzs w10, s2, #3
+; NONEON-NOSVE-NEXT:    fcvtzs w11, s0, #3
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s0, s3, [sp]
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvtzs w12, s1, #3
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s2, #3
+; NONEON-NOSVE-NEXT:    stp w11, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s3, #3
+; NONEON-NOSVE-NEXT:    fcvtzs w10, s0, #3
+; NONEON-NOSVE-NEXT:    stp w8, w12, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w10, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %f = load <8 x float>, ptr %a
   %mul.i = fmul <8 x float> %f, <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00,
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
index ce8902cfa16c3d..9878910763a751 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
@@ -21,8 +21,16 @@ define void @fcvt_v2f16_to_v2f32(<2 x half> %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v2f16_to_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fpext <2 x half> %a to <2 x float>
   store <2 x float> %res, ptr %b
@@ -41,8 +49,22 @@ define void @fcvt_v4f16_to_v4f32(<4 x half> %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v4f16_to_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fpext <4 x half> %a to <4 x float>
   store <4 x float> %res, ptr %b
@@ -64,13 +86,33 @@ define void @fcvt_v8f16_to_v8f32(<8 x half> %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v8f16_to_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %res = fpext <8 x half> %a to <8 x float>
   store <8 x float> %res, ptr %b
@@ -99,17 +141,57 @@ define void @fcvt_v16f16_to_v16f32(<16 x half> %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v16f16_to_v16f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %res = fpext <16 x half> %a to <16 x float>
   store <16 x float> %res, ptr %b
@@ -132,9 +214,20 @@ define void @fcvt_v2f16_v2f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v2f16_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x half>, ptr %a
   %res = fpext <2 x half> %op1 to <2 x float>
@@ -153,9 +246,23 @@ define void @fcvt_v4f16_v4f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v4f16_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %res = fpext <4 x half> %op1 to <4 x float>
@@ -178,13 +285,33 @@ define void @fcvt_v8f16_v8f32(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvt_v8f16_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fpext <8 x half> %op1 to <8 x float>
@@ -214,17 +341,57 @@ define void @fcvt_v16f16_v16f32(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvt_v16f16_v16f32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fpext <16 x half> %op1 to <16 x float>
@@ -246,9 +413,14 @@ define void @fcvt_v1f16_v1f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v1f16_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    ldr h0, [x0]
 ; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
 ; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <1 x half>, ptr %a
   %res = fpext <1 x half> %op1 to <1 x double>
@@ -267,10 +439,26 @@ define void @fcvt_v2f16_v2f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v2f16_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x half>, ptr %a
   %res = fpext <2 x half> %op1 to <2 x double>
@@ -292,15 +480,35 @@ define void @fcvt_v4f16_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v4f16_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %res = fpext <4 x half> %op1 to <4 x double>
@@ -329,22 +537,61 @@ define void @fcvt_v8f16_v8f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v8f16_v8f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    fcvtl v2.2d, v2.2s
-; NONEON-NOSVE-NEXT:    fcvtl v3.2d, v3.2s
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fpext <8 x half> %op1 to <8 x double>
@@ -390,34 +637,115 @@ define void @fcvt_v16f16_v16f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v16f16_v16f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #336
+; NONEON-NOSVE-NEXT:    str x29, [sp, #320] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 336
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v2.2d, v2.2s
-; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #72]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #40]
-; NONEON-NOSVE-NEXT:    fcvtl v5.2d, v5.2s
-; NONEON-NOSVE-NEXT:    fcvtl v3.2d, v3.2s
-; NONEON-NOSVE-NEXT:    fcvtl v4.2d, v4.2s
-; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v7.2s
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v6.2s
-; NONEON-NOSVE-NEXT:    stp q2, q0, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #96]
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #320] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp, #120]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #104]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #164]
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #176]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #160]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #156]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #152]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #148]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #144]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #140]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #136]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #332]
+; NONEON-NOSVE-NEXT:    ldp q4, q3, [sp, #192]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #328]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #188]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #184]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [sp, #288]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #176]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #172]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #168]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp q2, q5, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #336
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fpext <16 x half> %op1 to <16 x double>
@@ -440,7 +768,7 @@ define void @fcvt_v1f32_v1f64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvt_v1f32_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
 ; NONEON-NOSVE-NEXT:    str d0, [x1]
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <1 x float>, ptr %a
@@ -460,9 +788,18 @@ define void @fcvt_v2f32_v2f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v2f32_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x float>, ptr %a
   %res = fpext <2 x float> %op1 to <2 x double>
@@ -485,13 +822,23 @@ define void @fcvt_v4f32_v4f64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvt_v4f32_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %res = fpext <4 x float> %op1 to <4 x double>
@@ -521,17 +868,37 @@ define void @fcvt_v8f32_v8f64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvt_v8f32_v8f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v2.2d, v2.2s
-; NONEON-NOSVE-NEXT:    fcvtl v3.2d, v3.2s
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fpext <8 x float> %op1 to <8 x double>
@@ -554,9 +921,21 @@ define void @fcvt_v2f32_v2f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v2f32_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    str s0, [x1]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str w8, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x float>, ptr %a
   %res = fptrunc <2 x float> %op1 to <2 x half>
@@ -576,8 +955,23 @@ define void @fcvt_v4f32_v4f16(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvt_v4f32_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %res = fptrunc <4 x float> %op1 to <4 x half>
@@ -599,10 +993,36 @@ define void @fcvt_v8f32_v8f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v8f32_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptrunc <8 x float> %op1 to <8 x half>
@@ -647,11 +1067,19 @@ define void @fcvt_v2f64_v2f16(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvt_v2f64_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    mov d1, v0.d[1]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt h0, d0
-; NONEON-NOSVE-NEXT:    fcvt h1, d1
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v1.h[0]
-; NONEON-NOSVE-NEXT:    str s0, [x1]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str w8, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x double>, ptr %a
   %res = fptrunc <2 x double> %op1 to <2 x half>
@@ -673,17 +1101,24 @@ define void @fcvt_v4f64_v4f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v4f64_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    mov d1, v0.d[1]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt h0, d0
-; NONEON-NOSVE-NEXT:    fcvt h1, d1
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v1.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, d2
-; NONEON-NOSVE-NEXT:    mov d2, v2.d[1]
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v1.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, d2
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
 ; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptrunc <4 x double> %op1 to <4 x half>
@@ -706,8 +1141,7 @@ define void @fcvt_v1f64_v1f32(<1 x double> %op1, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v1f64_v1f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    fcvt s0, d0
 ; NONEON-NOSVE-NEXT:    str s0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptrunc <1 x double> %op1 to <1 x float>
@@ -726,8 +1160,16 @@ define void @fcvt_v2f64_v2f32(<2 x double> %op1, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v2f64_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, d0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptrunc <2 x double> %op1 to <2 x float>
   store <2 x float> %res, ptr %b
@@ -748,10 +1190,22 @@ define void @fcvt_v4f64_v4f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v4f64_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, d0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, d0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptrunc <4 x double> %op1 to <4 x float>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
index 9d2b55903f3141..680cb4fb0a7910 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
@@ -21,14 +21,59 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
 ;
 ; NONEON-NOSVE-LABEL: fma_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h6, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h7, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    ldr h5, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    ldr h3, [sp]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fmul s1, s3, s1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fmul s2, s7, s6
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s2, s0
+; NONEON-NOSVE-NEXT:    fmul s2, s5, s4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s2, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <4 x half> %op1, %op2
   %res = fadd contract <4 x half> %mul, %op3
@@ -48,22 +93,107 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
 ;
 ; NONEON-NOSVE-LABEL: fma_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fmul v3.4s, v4.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fadd v1.4s, v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h22, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h23, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s3, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    ldr h20, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s22, h22
+; NONEON-NOSVE-NEXT:    fcvt s23, h23
+; NONEON-NOSVE-NEXT:    ldr h21, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s20, h20
+; NONEON-NOSVE-NEXT:    ldr h18, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h19, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h16, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h17, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmul s5, s1, s3
+; NONEON-NOSVE-NEXT:    fcvt s21, h21
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s19, h19
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    fcvt s17, h17
+; NONEON-NOSVE-NEXT:    ldr h6, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h7, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h5, s5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fmul s3, s4, s3
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    fadd s2, s5, s2
+; NONEON-NOSVE-NEXT:    fmul s5, s23, s22
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h5, s5
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    str h2, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s5, s2
+; NONEON-NOSVE-NEXT:    fmul s5, s21, s20
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h5, s5
+; NONEON-NOSVE-NEXT:    str h2, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s5, s2
+; NONEON-NOSVE-NEXT:    fmul s5, s19, s18
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h5, s5
+; NONEON-NOSVE-NEXT:    str h2, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s5, s2
+; NONEON-NOSVE-NEXT:    fmul s5, s17, s16
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h5, s5
+; NONEON-NOSVE-NEXT:    str h2, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s5, s2
+; NONEON-NOSVE-NEXT:    fmul s5, s7, s6
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h5, s5
+; NONEON-NOSVE-NEXT:    str h2, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s5, s2
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    str h2, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s3, s2
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    str h1, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <8 x half> %op1, %op2
   %res = fadd contract <8 x half> %mul, %op3
@@ -85,42 +215,228 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fma_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fmul v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    fmul v5.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fmul v2.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x2]
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
-; NONEON-NOSVE-NEXT:    fadd v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    fadd v5.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fadd v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #208
+; NONEON-NOSVE-NEXT:    stp d15, d14, [sp, #144] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp d13, d12, [sp, #160] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp d11, d10, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp d9, d8, [sp, #192] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 208
+; NONEON-NOSVE-NEXT:    .cfi_offset b8, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset b9, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset b10, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset b11, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset b12, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset b13, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset b14, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset b15, -64
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldp q18, q19, [x2]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h24, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h25, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #78]
+; NONEON-NOSVE-NEXT:    str q19, [sp, #96]
+; NONEON-NOSVE-NEXT:    str q18, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h18, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldr h15, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcvt s20, h0
+; NONEON-NOSVE-NEXT:    fcvt s21, h1
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    ldr h13, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h14, [sp, #74]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14] // 2-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h12, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h10, [sp, #70]
+; NONEON-NOSVE-NEXT:    fmul s30, s21, s20
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    ldr h31, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h28, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h29, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h26, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h27, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h22, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h23, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h20, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h21, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt h19, s30
+; NONEON-NOSVE-NEXT:    fcvt s30, h15
+; NONEON-NOSVE-NEXT:    ldr h16, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h17, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h6, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h7, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h5, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s17, h17
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcvt s19, h19
+; NONEON-NOSVE-NEXT:    fmul s0, s0, s30
+; NONEON-NOSVE-NEXT:    fcvt s30, h14
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmul s16, s17, s16
+; NONEON-NOSVE-NEXT:    fmul s6, s7, s6
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s18, s19, s18
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s19, h13
+; NONEON-NOSVE-NEXT:    fmul s4, s5, s4
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    ldp d15, d14, [sp, #144] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    fcvt h16, s16
+; NONEON-NOSVE-NEXT:    fcvt h6, s6
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt h4, s4
+; NONEON-NOSVE-NEXT:    fmul s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #14] // 2-byte Folded Reload
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    str h18, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldr h18, [sp, #108]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fmul s1, s1, s3
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s18
+; NONEON-NOSVE-NEXT:    fmul s18, s30, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h11
+; NONEON-NOSVE-NEXT:    fcvt s30, h12
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    ldp d13, d12, [sp, #160] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    str h0, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #106]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s30, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h9
+; NONEON-NOSVE-NEXT:    fcvt s30, h10
+; NONEON-NOSVE-NEXT:    ldp d11, d10, [sp, #176] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #104]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s30, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h31
+; NONEON-NOSVE-NEXT:    fcvt s30, h8
+; NONEON-NOSVE-NEXT:    ldp d9, d8, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #102]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s30, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h28
+; NONEON-NOSVE-NEXT:    fcvt s28, h29
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #100]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s28, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h26
+; NONEON-NOSVE-NEXT:    fcvt s26, h27
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #98]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s26, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h24
+; NONEON-NOSVE-NEXT:    fcvt s24, h25
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s24, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h22
+; NONEON-NOSVE-NEXT:    fcvt s22, h23
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s22, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h20
+; NONEON-NOSVE-NEXT:    fcvt s20, h21
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s20, s19
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s16, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s6, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s4, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s2, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #208
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -144,8 +460,19 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o
 ;
 ; NONEON-NOSVE-LABEL: fma_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmla v2.2s, v1.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov d0, d2
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <2 x float> %op1, %op2
   %res = fadd contract <2 x float> %mul, %op3
@@ -165,8 +492,26 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o
 ;
 ; NONEON-NOSVE-LABEL: fma_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmla v2.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <4 x float> %op1, %op2
   %res = fadd contract <4 x float> %mul, %op3
@@ -188,12 +533,45 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fma_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q4, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q5, [x2]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fmla v1.4s, v0.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fmla v5.4s, v4.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q1, q5, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q4, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #92]
+; NONEON-NOSVE-NEXT:    stp q1, q5, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #88]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #84]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -212,7 +590,12 @@ define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double
 ;
 ; NONEON-NOSVE-LABEL: fma_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fmadd d0, d0, d1, d2
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <1 x double> %op1, %op2
   %res = fadd contract <1 x double> %mul, %op3
@@ -232,8 +615,19 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double
 ;
 ; NONEON-NOSVE-LABEL: fma_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmla v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d2, d4, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd d5, d4, d3, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d5, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <2 x double> %op1, %op2
   %res = fadd contract <2 x double> %mul, %op3
@@ -255,12 +649,31 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: fma_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q4, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q5, [x2]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fmla v1.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fmla v5.2d, v4.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q1, q5, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q4, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp q1, q5, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp d2, d4, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmadd d5, d4, d3, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d2, d4, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d5, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd d5, d4, d3, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d5, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
index a96adfec2ad105..775cac272cde9d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
@@ -21,34 +21,39 @@ define <4 x half> @fmaxnm_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h1
-; NONEON-NOSVE-NEXT:    fcvt s7, h0
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fmaxnm s5, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    fmaxnm s3, s4, s3
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h0, s5
-; NONEON-NOSVE-NEXT:    fcvt s4, h6
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h2, s3
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s4, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
@@ -66,60 +71,66 @@ define <8 x half> @fmaxnm_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmaxnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fmaxnm s3, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s4
-; NONEON-NOSVE-NEXT:    fmaxnm s4, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fmaxnm s5, s5, s16
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    mov v2.h[1], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s3, h6
-; NONEON-NOSVE-NEXT:    fcvt s6, h7
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h5, s5
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    mov v2.h[2], v4.h[0]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fmaxnm s3, s6, s3
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], v5.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h6
-; NONEON-NOSVE-NEXT:    fmaxnm s6, s16, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v2.h[4], v3.h[0]
-; NONEON-NOSVE-NEXT:    fmaxnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    fcvt h3, s6
-; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[5], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v2.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v2.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.maxnum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
@@ -139,115 +150,127 @@ define void @fmaxnm_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h18, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h17, v3.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s19, h0
-; NONEON-NOSVE-NEXT:    fcvt s20, h3
-; NONEON-NOSVE-NEXT:    fcvt s21, h2
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[2]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fmaxnm s4, s19, s4
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h24, v3.h[3]
-; NONEON-NOSVE-NEXT:    fmaxnm s20, s21, s20
-; NONEON-NOSVE-NEXT:    fcvt s21, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h25, v2.h[6]
-; NONEON-NOSVE-NEXT:    fmaxnm s5, s7, s5
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmaxnm s6, s16, s6
-; NONEON-NOSVE-NEXT:    fmaxnm s16, s18, s17
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s18, h19
-; NONEON-NOSVE-NEXT:    fcvt s19, h24
-; NONEON-NOSVE-NEXT:    mov h24, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h17, s5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt h5, s20
-; NONEON-NOSVE-NEXT:    fmaxnm s20, s22, s21
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt s21, h23
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    mov h22, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov v4.h[1], v17.h[0]
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[4]
-; NONEON-NOSVE-NEXT:    fmaxnm s7, s18, s7
-; NONEON-NOSVE-NEXT:    mov h18, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov v5.h[1], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s20
-; NONEON-NOSVE-NEXT:    fmaxnm s19, s21, s19
-; NONEON-NOSVE-NEXT:    fcvt s20, h23
-; NONEON-NOSVE-NEXT:    mov h21, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    mov v4.h[2], v6.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s6, h17
-; NONEON-NOSVE-NEXT:    fcvt s17, h22
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[5]
-; NONEON-NOSVE-NEXT:    mov v5.h[2], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s19
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmaxnm s6, s17, s6
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fmaxnm s18, s20, s18
-; NONEON-NOSVE-NEXT:    mov h20, v3.h[6]
-; NONEON-NOSVE-NEXT:    mov v4.h[3], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s7, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov v5.h[3], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s16, h21
-; NONEON-NOSVE-NEXT:    fcvt s21, h24
-; NONEON-NOSVE-NEXT:    fcvt s19, h19
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fcvt s23, h25
-; NONEON-NOSVE-NEXT:    fcvt h18, s18
-; NONEON-NOSVE-NEXT:    fcvt s20, h20
-; NONEON-NOSVE-NEXT:    mov h3, v3.h[7]
-; NONEON-NOSVE-NEXT:    fmaxnm s7, s22, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmaxnm s16, s21, s16
-; NONEON-NOSVE-NEXT:    mov v4.h[4], v6.h[0]
-; NONEON-NOSVE-NEXT:    fmaxnm s6, s19, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[4], v18.h[0]
-; NONEON-NOSVE-NEXT:    fmaxnm s17, s23, s20
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[5], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v4.h[5], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
-; NONEON-NOSVE-NEXT:    mov v5.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[6], v6.h[0]
-; NONEON-NOSVE-NEXT:    mov v5.h[7], v1.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    stp q5, q4, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -268,7 +291,17 @@ define <2 x float> @fmaxnm_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxnm v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
@@ -286,7 +319,22 @@ define <4 x float> @fmaxnm_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxnm v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
@@ -306,11 +354,39 @@ define void @fmaxnm_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmaxnm v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmaxnm v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -327,7 +403,12 @@ define <1 x double> @fmaxnm_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fmaxnm d0, d0, d1
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.maxnum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
@@ -345,7 +426,16 @@ define <2 x double> @fmaxnm_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxnm v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmaxnm d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
@@ -365,11 +455,27 @@ define void @fmaxnm_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmaxnm v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fmaxnm v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmaxnm d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmaxnm d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
@@ -394,34 +500,39 @@ define <4 x half> @fminnm_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h1
-; NONEON-NOSVE-NEXT:    fcvt s7, h0
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s2, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fminnm s5, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    fminnm s3, s4, s3
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h0, s5
-; NONEON-NOSVE-NEXT:    fcvt s4, h6
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h2, s3
-; NONEON-NOSVE-NEXT:    fminnm s1, s4, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.minnum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
@@ -439,60 +550,66 @@ define <8 x half> @fminnm_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fminnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fminnm s3, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s4
-; NONEON-NOSVE-NEXT:    fminnm s4, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fminnm s5, s5, s16
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    mov v2.h[1], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s3, h6
-; NONEON-NOSVE-NEXT:    fcvt s6, h7
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h5, s5
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    mov v2.h[2], v4.h[0]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fminnm s3, s6, s3
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], v5.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h6
-; NONEON-NOSVE-NEXT:    fminnm s6, s16, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v2.h[4], v3.h[0]
-; NONEON-NOSVE-NEXT:    fminnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    fcvt h3, s6
-; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[5], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v2.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v2.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.minnum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
@@ -512,115 +629,127 @@ define void @fminnm_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h18, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h17, v3.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s19, h0
-; NONEON-NOSVE-NEXT:    fcvt s20, h3
-; NONEON-NOSVE-NEXT:    fcvt s21, h2
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[2]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fminnm s4, s19, s4
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h24, v3.h[3]
-; NONEON-NOSVE-NEXT:    fminnm s20, s21, s20
-; NONEON-NOSVE-NEXT:    fcvt s21, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h25, v2.h[6]
-; NONEON-NOSVE-NEXT:    fminnm s5, s7, s5
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[3]
-; NONEON-NOSVE-NEXT:    fminnm s6, s16, s6
-; NONEON-NOSVE-NEXT:    fminnm s16, s18, s17
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s18, h19
-; NONEON-NOSVE-NEXT:    fcvt s19, h24
-; NONEON-NOSVE-NEXT:    mov h24, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h17, s5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt h5, s20
-; NONEON-NOSVE-NEXT:    fminnm s20, s22, s21
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt s21, h23
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    mov h22, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov v4.h[1], v17.h[0]
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[4]
-; NONEON-NOSVE-NEXT:    fminnm s7, s18, s7
-; NONEON-NOSVE-NEXT:    mov h18, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov v5.h[1], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s20
-; NONEON-NOSVE-NEXT:    fminnm s19, s21, s19
-; NONEON-NOSVE-NEXT:    fcvt s20, h23
-; NONEON-NOSVE-NEXT:    mov h21, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    mov v4.h[2], v6.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s6, h17
-; NONEON-NOSVE-NEXT:    fcvt s17, h22
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[5]
-; NONEON-NOSVE-NEXT:    mov v5.h[2], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s19
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fminnm s6, s17, s6
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fminnm s18, s20, s18
-; NONEON-NOSVE-NEXT:    mov h20, v3.h[6]
-; NONEON-NOSVE-NEXT:    mov v4.h[3], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s7, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov v5.h[3], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s16, h21
-; NONEON-NOSVE-NEXT:    fcvt s21, h24
-; NONEON-NOSVE-NEXT:    fcvt s19, h19
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fcvt s23, h25
-; NONEON-NOSVE-NEXT:    fcvt h18, s18
-; NONEON-NOSVE-NEXT:    fcvt s20, h20
-; NONEON-NOSVE-NEXT:    mov h3, v3.h[7]
-; NONEON-NOSVE-NEXT:    fminnm s7, s22, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fminnm s16, s21, s16
-; NONEON-NOSVE-NEXT:    mov v4.h[4], v6.h[0]
-; NONEON-NOSVE-NEXT:    fminnm s6, s19, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[4], v18.h[0]
-; NONEON-NOSVE-NEXT:    fminnm s17, s23, s20
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fminnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[5], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v4.h[5], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
-; NONEON-NOSVE-NEXT:    mov v5.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[6], v6.h[0]
-; NONEON-NOSVE-NEXT:    mov v5.h[7], v1.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    stp q5, q4, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -641,7 +770,17 @@ define <2 x float> @fminnm_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminnm v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fminnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.minnum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
@@ -659,7 +798,22 @@ define <4 x float> @fminnm_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminnm v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fminnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fminnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.minnum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
@@ -679,11 +833,39 @@ define void @fminnm_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fminnm v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fminnm v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fminnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fminnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fminnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fminnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -700,7 +882,12 @@ define <1 x double> @fminnm_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fminnm d0, d0, d1
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.minnum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
@@ -718,7 +905,16 @@ define <2 x double> @fminnm_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminnm v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fminnm d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fminnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.minnum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
@@ -738,11 +934,27 @@ define void @fminnm_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fminnm v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fminnm v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fminnm d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fminnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fminnm d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fminnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
@@ -767,34 +979,39 @@ define <4 x half> @fmax_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmax_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h1
-; NONEON-NOSVE-NEXT:    fcvt s7, h0
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s2, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fmax s5, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    fmax s3, s4, s3
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h0, s5
-; NONEON-NOSVE-NEXT:    fcvt s4, h6
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h2, s3
-; NONEON-NOSVE-NEXT:    fmax s1, s4, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.maximum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
@@ -812,60 +1029,66 @@ define <8 x half> @fmax_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmax_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmax s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fmax s3, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s4
-; NONEON-NOSVE-NEXT:    fmax s4, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fmax s5, s5, s16
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    mov v2.h[1], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s3, h6
-; NONEON-NOSVE-NEXT:    fcvt s6, h7
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h5, s5
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    mov v2.h[2], v4.h[0]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fmax s3, s6, s3
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], v5.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h6
-; NONEON-NOSVE-NEXT:    fmax s6, s16, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v2.h[4], v3.h[0]
-; NONEON-NOSVE-NEXT:    fmax s4, s5, s4
-; NONEON-NOSVE-NEXT:    fcvt h3, s6
-; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[5], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v2.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v2.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.maximum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
@@ -885,115 +1108,127 @@ define void @fmax_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmax_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h18, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h17, v3.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s19, h0
-; NONEON-NOSVE-NEXT:    fcvt s20, h3
-; NONEON-NOSVE-NEXT:    fcvt s21, h2
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[2]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fmax s4, s19, s4
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h24, v3.h[3]
-; NONEON-NOSVE-NEXT:    fmax s20, s21, s20
-; NONEON-NOSVE-NEXT:    fcvt s21, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h25, v2.h[6]
-; NONEON-NOSVE-NEXT:    fmax s5, s7, s5
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmax s6, s16, s6
-; NONEON-NOSVE-NEXT:    fmax s16, s18, s17
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s18, h19
-; NONEON-NOSVE-NEXT:    fcvt s19, h24
-; NONEON-NOSVE-NEXT:    mov h24, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h17, s5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt h5, s20
-; NONEON-NOSVE-NEXT:    fmax s20, s22, s21
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt s21, h23
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    mov h22, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov v4.h[1], v17.h[0]
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[4]
-; NONEON-NOSVE-NEXT:    fmax s7, s18, s7
-; NONEON-NOSVE-NEXT:    mov h18, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov v5.h[1], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s20
-; NONEON-NOSVE-NEXT:    fmax s19, s21, s19
-; NONEON-NOSVE-NEXT:    fcvt s20, h23
-; NONEON-NOSVE-NEXT:    mov h21, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    mov v4.h[2], v6.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s6, h17
-; NONEON-NOSVE-NEXT:    fcvt s17, h22
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[5]
-; NONEON-NOSVE-NEXT:    mov v5.h[2], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s19
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmax s6, s17, s6
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fmax s18, s20, s18
-; NONEON-NOSVE-NEXT:    mov h20, v3.h[6]
-; NONEON-NOSVE-NEXT:    mov v4.h[3], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s7, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov v5.h[3], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s16, h21
-; NONEON-NOSVE-NEXT:    fcvt s21, h24
-; NONEON-NOSVE-NEXT:    fcvt s19, h19
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fcvt s23, h25
-; NONEON-NOSVE-NEXT:    fcvt h18, s18
-; NONEON-NOSVE-NEXT:    fcvt s20, h20
-; NONEON-NOSVE-NEXT:    mov h3, v3.h[7]
-; NONEON-NOSVE-NEXT:    fmax s7, s22, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmax s16, s21, s16
-; NONEON-NOSVE-NEXT:    mov v4.h[4], v6.h[0]
-; NONEON-NOSVE-NEXT:    fmax s6, s19, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[4], v18.h[0]
-; NONEON-NOSVE-NEXT:    fmax s17, s23, s20
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fmax s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[5], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v4.h[5], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
-; NONEON-NOSVE-NEXT:    mov v5.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[6], v6.h[0]
-; NONEON-NOSVE-NEXT:    mov v5.h[7], v1.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    stp q5, q4, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -1014,7 +1249,17 @@ define <2 x float> @fmax_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmax_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmax v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmax s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.maximum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
@@ -1032,7 +1277,22 @@ define <4 x float> @fmax_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmax_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmax v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmax s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmax s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.maximum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
@@ -1052,11 +1312,39 @@ define void @fmax_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmax_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmax v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmax v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fmax s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fmax s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmax s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmax s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -1073,7 +1361,12 @@ define <1 x double> @fmax_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmax_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fmax d0, d0, d1
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.maximum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
@@ -1091,7 +1384,16 @@ define <2 x double> @fmax_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmax_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmax v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmax d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmax d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.maximum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
@@ -1111,11 +1413,27 @@ define void @fmax_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmax_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmax v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fmax v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmax d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmax d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmax d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmax d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
@@ -1140,34 +1458,39 @@ define <4 x half> @fmin_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmin_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h1
-; NONEON-NOSVE-NEXT:    fcvt s7, h0
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s2, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h4
-; NONEON-NOSVE-NEXT:    fcvt s4, h5
-; NONEON-NOSVE-NEXT:    fmin s5, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
-; NONEON-NOSVE-NEXT:    fmin s3, s4, s3
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h0, s5
-; NONEON-NOSVE-NEXT:    fcvt s4, h6
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h2, s3
-; NONEON-NOSVE-NEXT:    fmin s1, s4, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[2], v2.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.minimum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
@@ -1185,60 +1508,66 @@ define <8 x half> @fmin_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmin_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmin s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fmin s3, s3, s2
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s4
-; NONEON-NOSVE-NEXT:    fmin s4, s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fmin s5, s5, s16
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    mov v2.h[1], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s3, h6
-; NONEON-NOSVE-NEXT:    fcvt s6, h7
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h5, s5
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    mov v2.h[2], v4.h[0]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fmin s3, s6, s3
-; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], v5.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h6
-; NONEON-NOSVE-NEXT:    fmin s6, s16, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov v2.h[4], v3.h[0]
-; NONEON-NOSVE-NEXT:    fmin s4, s5, s4
-; NONEON-NOSVE-NEXT:    fcvt h3, s6
-; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
-; NONEON-NOSVE-NEXT:    mov v2.h[5], v3.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v2.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v2.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.minimum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
@@ -1258,115 +1587,127 @@ define void @fmin_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmin_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h18, v2.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h17, v3.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s19, h0
-; NONEON-NOSVE-NEXT:    fcvt s20, h3
-; NONEON-NOSVE-NEXT:    fcvt s21, h2
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[2]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fmin s4, s19, s4
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h24, v3.h[3]
-; NONEON-NOSVE-NEXT:    fmin s20, s21, s20
-; NONEON-NOSVE-NEXT:    fcvt s21, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h25, v2.h[6]
-; NONEON-NOSVE-NEXT:    fmin s5, s7, s5
-; NONEON-NOSVE-NEXT:    mov h7, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmin s6, s16, s6
-; NONEON-NOSVE-NEXT:    fmin s16, s18, s17
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s18, h19
-; NONEON-NOSVE-NEXT:    fcvt s19, h24
-; NONEON-NOSVE-NEXT:    mov h24, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h17, s5
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcvt h5, s20
-; NONEON-NOSVE-NEXT:    fmin s20, s22, s21
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt s21, h23
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    mov h22, v0.h[4]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov v4.h[1], v17.h[0]
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[4]
-; NONEON-NOSVE-NEXT:    fmin s7, s18, s7
-; NONEON-NOSVE-NEXT:    mov h18, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov v5.h[1], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s20
-; NONEON-NOSVE-NEXT:    fmin s19, s21, s19
-; NONEON-NOSVE-NEXT:    fcvt s20, h23
-; NONEON-NOSVE-NEXT:    mov h21, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h23, v2.h[5]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
-; NONEON-NOSVE-NEXT:    mov v4.h[2], v6.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s6, h17
-; NONEON-NOSVE-NEXT:    fcvt s17, h22
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fcvt s18, h18
-; NONEON-NOSVE-NEXT:    mov h22, v3.h[5]
-; NONEON-NOSVE-NEXT:    mov v5.h[2], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h16, s19
-; NONEON-NOSVE-NEXT:    mov h19, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmin s6, s17, s6
-; NONEON-NOSVE-NEXT:    mov h17, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fmin s18, s20, s18
-; NONEON-NOSVE-NEXT:    mov h20, v3.h[6]
-; NONEON-NOSVE-NEXT:    mov v4.h[3], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s7, h22
-; NONEON-NOSVE-NEXT:    fcvt s22, h23
-; NONEON-NOSVE-NEXT:    mov v5.h[3], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt s16, h21
-; NONEON-NOSVE-NEXT:    fcvt s21, h24
-; NONEON-NOSVE-NEXT:    fcvt s19, h19
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fcvt s23, h25
-; NONEON-NOSVE-NEXT:    fcvt h18, s18
-; NONEON-NOSVE-NEXT:    fcvt s20, h20
-; NONEON-NOSVE-NEXT:    mov h3, v3.h[7]
-; NONEON-NOSVE-NEXT:    fmin s7, s22, s7
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmin s16, s21, s16
-; NONEON-NOSVE-NEXT:    mov v4.h[4], v6.h[0]
-; NONEON-NOSVE-NEXT:    fmin s6, s19, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[4], v18.h[0]
-; NONEON-NOSVE-NEXT:    fmin s17, s23, s20
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt h7, s7
-; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
-; NONEON-NOSVE-NEXT:    fcvt h16, s16
-; NONEON-NOSVE-NEXT:    fcvt h6, s6
-; NONEON-NOSVE-NEXT:    fmin s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s17
-; NONEON-NOSVE-NEXT:    mov v5.h[5], v7.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    mov v4.h[5], v16.h[0]
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
-; NONEON-NOSVE-NEXT:    mov v5.h[6], v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[6], v6.h[0]
-; NONEON-NOSVE-NEXT:    mov v5.h[7], v1.h[0]
-; NONEON-NOSVE-NEXT:    mov v4.h[7], v0.h[0]
-; NONEON-NOSVE-NEXT:    stp q5, q4, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -1387,7 +1728,17 @@ define <2 x float> @fmin_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmin_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmin v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmin s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.minimum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
@@ -1405,7 +1756,22 @@ define <4 x float> @fmin_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmin_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmin v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmin s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmin s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.minimum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
@@ -1425,11 +1791,39 @@ define void @fmin_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmin_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmin v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmin v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fmin s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fmin s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmin s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmin s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -1446,7 +1840,12 @@ define <1 x double> @fmin_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmin_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fmin d0, d0, d1
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.minimum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
@@ -1464,7 +1863,16 @@ define <2 x double> @fmin_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: fmin_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmin v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmin d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmin d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.minimum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
@@ -1484,11 +1892,27 @@ define void @fmin_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fmin_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmin v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fmin v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmin d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmin d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmin d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmin d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll
index f1561011e21812..f081d4ac65b279 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll
@@ -30,26 +30,30 @@ define half @fadda_v4f16(half %start, <4 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fadda_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
   ret half %res
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
index a0a7dad835662e..4eaaee7ce5055d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
@@ -23,26 +23,30 @@ define half @fadda_v4f16(half %start, <4 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fadda_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
   ret half %res
@@ -71,45 +75,49 @@ define half @fadda_v8f16(half %start, <8 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fadda_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[3]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[4]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[5]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
   ret half %res
@@ -154,86 +162,93 @@ define half @fadda_v16f16(half %start, ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fadda_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[3]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[4]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[5]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[3]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[4]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[5]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #28]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
@@ -251,10 +266,13 @@ define float @fadda_v2f32(float %start, <2 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fadda_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov s2, v1.s[1]
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
   ret float %res
@@ -275,13 +293,15 @@ define float @fadda_v4f32(float %start, <4 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fadda_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov s2, v1.s[1]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    mov s3, v1.s[2]
-; NONEON-NOSVE-NEXT:    mov s1, v1.s[3]
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s3
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
   ret float %res
@@ -310,22 +330,25 @@ define float @fadda_v8f32(float %start, ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fadda_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    mov s2, v1.s[1]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    mov s3, v1.s[2]
-; NONEON-NOSVE-NEXT:    mov s1, v1.s[3]
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s3
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    mov s2, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov s3, v1.s[2]
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
-; NONEON-NOSVE-NEXT:    mov s1, v1.s[3]
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #24]
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
-; NONEON-NOSVE-NEXT:    fadd s0, s0, s3
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
@@ -357,9 +380,11 @@ define double @fadda_v2f64(double %start, <2 x double> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fadda_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov d2, v1.d[1]
-; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
+; NONEON-NOSVE-NEXT:    str q1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp], #16
 ; NONEON-NOSVE-NEXT:    fadd d0, d0, d2
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
 ; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
   ret double %res
@@ -380,13 +405,19 @@ define double @fadda_v4f64(double %start, ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fadda_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x0]
-; NONEON-NOSVE-NEXT:    mov d2, v3.d[1]
-; NONEON-NOSVE-NEXT:    fadd d0, d0, d3
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp]
 ; NONEON-NOSVE-NEXT:    fadd d0, d0, d2
-; NONEON-NOSVE-NEXT:    mov d2, v1.d[1]
 ; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp, #16]
 ; NONEON-NOSVE-NEXT:    fadd d0, d0, d2
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
@@ -408,26 +439,30 @@ define half @faddv_v4f16(half %start, <4 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: faddv_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h1
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s3, s2
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #14]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s1, s2, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
 ; NONEON-NOSVE-NEXT:    fcvt h1, s1
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call fast half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
   ret half %res
@@ -444,45 +479,49 @@ define half @faddv_v8f16(half %start, <8 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: faddv_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h1
+; NONEON-NOSVE-NEXT:    str q1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s3, s2
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #6]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #14]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fadd s1, s2, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
 ; NONEON-NOSVE-NEXT:    fcvt h1, s1
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call fast half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
   ret half %res
@@ -500,54 +539,90 @@ define half @faddv_v16f16(half %start, ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: faddv_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fadd v3.4s, v4.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fadd v1.4s, v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v1.4s
-; NONEON-NOSVE-NEXT:    mov h1, v2.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s3, h2
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h4, [sp]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fadd s1, s3, s1
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fadd s1, s1, s3
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fadd s1, s2, s1
+; NONEON-NOSVE-NEXT:    fadd s2, s4, s3
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fadd s1, s1, s3
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
 ; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fadd s3, s4, s3
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #6]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fadd s1, s1, s3
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fadd s1, s2, s1
+; NONEON-NOSVE-NEXT:    fcvt h2, s3
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
 ; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s3, s4, s3
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    fcvt h2, s3
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #24]
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s3, s4, s3
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fadd s1, s1, s3
-; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
-; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    fcvt h2, s3
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
 ; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s3, s4, s3
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    fcvt h2, s3
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #28]
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fadd s1, s1, s3
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    fadd s2, s4, s3
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    fadd s2, s4, s3
 ; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
 ; NONEON-NOSVE-NEXT:    fcvt h1, s1
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call fast half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
@@ -565,8 +640,13 @@ define float @faddv_v2f32(float %start, <2 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: faddv_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    faddp s1, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fadd s1, s2, s1
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call fast float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
   ret float %res
@@ -583,8 +663,13 @@ define float @faddv_v4f32(float %start, <4 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: faddv_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    faddp v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    faddp s1, v1.2s
+; NONEON-NOSVE-NEXT:    str q1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s4, s3, [sp], #16
+; NONEON-NOSVE-NEXT:    fadd s3, s4, s3
+; NONEON-NOSVE-NEXT:    fadd s1, s2, s1
+; NONEON-NOSVE-NEXT:    fadd s1, s3, s1
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
 ; NONEON-NOSVE-NEXT:    ret
   %res = call fast float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
@@ -604,10 +689,21 @@ define float @faddv_v8f32(float %start, ptr %a) {
 ; NONEON-NOSVE-LABEL: faddv_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
-; NONEON-NOSVE-NEXT:    fadd v1.4s, v2.4s, v1.4s
-; NONEON-NOSVE-NEXT:    faddp v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    faddp s1, v1.2s
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s4, s3, [sp]
+; NONEON-NOSVE-NEXT:    ldp s5, s6, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s7, s16, [sp, #8]
+; NONEON-NOSVE-NEXT:    fadd s1, s3, s1
+; NONEON-NOSVE-NEXT:    fadd s2, s4, s2
+; NONEON-NOSVE-NEXT:    fadd s3, s7, s5
+; NONEON-NOSVE-NEXT:    fadd s4, s16, s6
+; NONEON-NOSVE-NEXT:    fadd s1, s2, s1
+; NONEON-NOSVE-NEXT:    fadd s2, s3, s4
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
 ; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call fast float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
@@ -639,7 +735,10 @@ define double @faddv_v2f64(double %start, <2 x double> %a) {
 ;
 ; NONEON-NOSVE-LABEL: faddv_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    faddp d1, v1.2d
+; NONEON-NOSVE-NEXT:    str q1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp], #16
+; NONEON-NOSVE-NEXT:    fadd d1, d2, d1
 ; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
 ; NONEON-NOSVE-NEXT:    ret
   %res = call fast double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
@@ -659,8 +758,13 @@ define double @faddv_v4f64(double %start, ptr %a) {
 ; NONEON-NOSVE-LABEL: faddv_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
-; NONEON-NOSVE-NEXT:    fadd v1.2d, v2.2d, v1.2d
-; NONEON-NOSVE-NEXT:    faddp d1, v1.2d
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d4, d3, [sp], #32
+; NONEON-NOSVE-NEXT:    fadd d1, d3, d1
+; NONEON-NOSVE-NEXT:    fadd d2, d4, d2
+; NONEON-NOSVE-NEXT:    fadd d1, d2, d1
 ; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
@@ -683,22 +787,26 @@ define half @fmaxv_v4f16(<4 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxv_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
   ret half %res
@@ -715,41 +823,45 @@ define half @fmaxv_v8f16(<8 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxv_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %a)
   ret half %res
@@ -767,81 +879,86 @@ define half @fmaxv_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxv_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmaxnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s3, s2
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmaxnm s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s4, s2
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmaxnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[4]
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #22]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmaxnm s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[5]
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #24]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmaxnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #26]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
 ; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fmaxnm s3, s5, s4
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #14]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s3, s2
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %op)
@@ -859,7 +976,12 @@ define float @fmaxv_v2f32(<2 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxv_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxnmp s0, v0.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a)
   ret float %res
@@ -876,7 +998,14 @@ define float @fmaxv_v4f32(<4 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxv_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxnmv s0, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s2
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
   ret float %res
@@ -895,8 +1024,20 @@ define float @fmaxv_v8f32(ptr %a) {
 ; NONEON-NOSVE-LABEL: fmaxv_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fmaxnm v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmaxnmv s0, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s3, s2, [sp]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s2, s0
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s3, s1
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s3, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s3
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s4, s1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s2
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %op)
@@ -926,7 +1067,10 @@ define double @fmaxv_v2f64(<2 x double> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaxv_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxnmp d0, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp], #16
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d1, d0
 ; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a)
   ret double %res
@@ -945,8 +1089,13 @@ define double @fmaxv_v4f64(ptr %a) {
 ; NONEON-NOSVE-LABEL: fmaxv_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fmaxnm v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fmaxnmp d0, v0.2d
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d3, d2, [sp], #32
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d2, d0
+; NONEON-NOSVE-NEXT:    fmaxnm d1, d3, d1
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d1, d0
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %op)
@@ -968,22 +1117,26 @@ define half @fminv_v4f16(<4 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminv_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a)
   ret half %res
@@ -1000,41 +1153,45 @@ define half @fminv_v8f16(<8 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminv_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %a)
   ret half %res
@@ -1052,81 +1209,86 @@ define half @fminv_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminv_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fminnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
 ; NONEON-NOSVE-NEXT:    fminnm s2, s3, s2
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #22]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fminnm s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fminnm s2, s4, s2
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #24]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fminnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[4]
-; NONEON-NOSVE-NEXT:    fminnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #26]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fminnm s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[5]
-; NONEON-NOSVE-NEXT:    fminnm s2, s2, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #28]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fminnm s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fminnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fminnm s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
-; NONEON-NOSVE-NEXT:    fminnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fminnm s3, s5, s4
+; NONEON-NOSVE-NEXT:    fminnm s1, s3, s2
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fminnm s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %op)
@@ -1144,7 +1306,12 @@ define float @fminv_v2f32(<2 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminv_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminnmp s0, v0.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a)
   ret float %res
@@ -1161,7 +1328,14 @@ define float @fminv_v4f32(<4 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminv_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminnmv s0, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s2
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
   ret float %res
@@ -1180,8 +1354,20 @@ define float @fminv_v8f32(ptr %a) {
 ; NONEON-NOSVE-LABEL: fminv_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fminnm v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fminnmv s0, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s3, s2, [sp]
+; NONEON-NOSVE-NEXT:    fminnm s0, s2, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s3, s1
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s3, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fminnm s2, s2, s3
+; NONEON-NOSVE-NEXT:    fminnm s1, s4, s1
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s2
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %op)
@@ -1211,7 +1397,10 @@ define double @fminv_v2f64(<2 x double> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminv_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminnmp d0, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp], #16
+; NONEON-NOSVE-NEXT:    fminnm d0, d1, d0
 ; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a)
   ret double %res
@@ -1230,8 +1419,13 @@ define double @fminv_v4f64(ptr %a) {
 ; NONEON-NOSVE-LABEL: fminv_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fminnm v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fminnmp d0, v0.2d
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d3, d2, [sp], #32
+; NONEON-NOSVE-NEXT:    fminnm d0, d2, d0
+; NONEON-NOSVE-NEXT:    fminnm d1, d3, d1
+; NONEON-NOSVE-NEXT:    fminnm d0, d1, d0
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %op)
@@ -1253,22 +1447,26 @@ define half @fmaximumv_v4f16(<4 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaximumv_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> %a)
   ret half %res
@@ -1285,41 +1483,45 @@ define half @fmaximumv_v8f16(<8 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaximumv_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmaximum.v8f16(<8 x half> %a)
   ret half %res
@@ -1337,81 +1539,86 @@ define half @fmaximumv_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaximumv_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    fmax s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmax s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
 ; NONEON-NOSVE-NEXT:    fmax s2, s3, s2
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #22]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmax s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fmax s2, s4, s2
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #24]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmax s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[4]
-; NONEON-NOSVE-NEXT:    fmax s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #26]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmax s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[5]
-; NONEON-NOSVE-NEXT:    fmax s2, s2, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #28]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmax s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fmax s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    fmax s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
-; NONEON-NOSVE-NEXT:    fmax s2, s2, s3
-; NONEON-NOSVE-NEXT:    fmax s3, s5, s4
+; NONEON-NOSVE-NEXT:    fmax s1, s3, s2
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmax s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fmaximum.v16f16(<16 x half> %op)
@@ -1429,7 +1636,12 @@ define float @fmaximumv_v2f32(<2 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaximumv_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxp s0, v0.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> %a)
   ret float %res
@@ -1446,7 +1658,14 @@ define float @fmaximumv_v4f32(<4 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaximumv_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxv s0, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s2
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %a)
   ret float %res
@@ -1465,8 +1684,20 @@ define float @fmaximumv_v8f32(ptr %a) {
 ; NONEON-NOSVE-LABEL: fmaximumv_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fmax v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fmaxv s0, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s3, s2, [sp]
+; NONEON-NOSVE-NEXT:    fmax s0, s2, s0
+; NONEON-NOSVE-NEXT:    fmax s1, s3, s1
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s3, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmax s2, s2, s3
+; NONEON-NOSVE-NEXT:    fmax s1, s4, s1
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s2
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> %op)
@@ -1496,7 +1727,10 @@ define double @fmaximumv_v2f64(<2 x double> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fmaximumv_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmaxp d0, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp], #16
+; NONEON-NOSVE-NEXT:    fmax d0, d1, d0
 ; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %a)
   ret double %res
@@ -1515,8 +1749,13 @@ define double @fmaximumv_v4f64(ptr %a) {
 ; NONEON-NOSVE-LABEL: fmaximumv_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fmax v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fmaxp d0, v0.2d
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d3, d2, [sp], #32
+; NONEON-NOSVE-NEXT:    fmax d0, d2, d0
+; NONEON-NOSVE-NEXT:    fmax d1, d3, d1
+; NONEON-NOSVE-NEXT:    fmax d0, d1, d0
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %op)
@@ -1538,22 +1777,26 @@ define half @fminimumv_v4f16(<4 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminimumv_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fminimum.v4f16(<4 x half> %a)
   ret half %res
@@ -1570,41 +1813,45 @@ define half @fminimumv_v8f16(<8 x half> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminimumv_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s2, s1
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
-; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fminimum.v8f16(<8 x half> %a)
   ret half %res
@@ -1622,81 +1869,86 @@ define half @fminimumv_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminimumv_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s4, h1
-; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    fmin s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #4]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmin s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
 ; NONEON-NOSVE-NEXT:    fmin s2, s3, s2
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #22]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmin s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
-; NONEON-NOSVE-NEXT:    fmin s2, s4, s2
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[3]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #24]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmin s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[4]
-; NONEON-NOSVE-NEXT:    fmin s2, s2, s3
-; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
-; NONEON-NOSVE-NEXT:    fcvt h4, s4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #26]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmin s3, s5, s3
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[5]
-; NONEON-NOSVE-NEXT:    fmin s2, s2, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[5]
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #28]
 ; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmin s4, s5, s4
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[6]
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
-; NONEON-NOSVE-NEXT:    fmin s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h3, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    fmin s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
 ; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
-; NONEON-NOSVE-NEXT:    fmin s2, s2, s3
-; NONEON-NOSVE-NEXT:    fmin s3, s5, s4
+; NONEON-NOSVE-NEXT:    fmin s1, s3, s2
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
-; NONEON-NOSVE-NEXT:    fcvt h2, s2
-; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fmin s2, s2, s3
-; NONEON-NOSVE-NEXT:    fcvt h1, s2
 ; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
 ; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fminimum.v16f16(<16 x half> %op)
@@ -1714,7 +1966,12 @@ define float @fminimumv_v2f32(<2 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminimumv_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminp s0, v0.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fminimum.v2f32(<2 x float> %a)
   ret float %res
@@ -1731,7 +1988,14 @@ define float @fminimumv_v4f32(<4 x float> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminimumv_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminv s0, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s2
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %a)
   ret float %res
@@ -1750,8 +2014,20 @@ define float @fminimumv_v8f32(ptr %a) {
 ; NONEON-NOSVE-LABEL: fminimumv_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fmin v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fminv s0, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s3, s2, [sp]
+; NONEON-NOSVE-NEXT:    fmin s0, s2, s0
+; NONEON-NOSVE-NEXT:    fmin s1, s3, s1
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s3, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmin s2, s2, s3
+; NONEON-NOSVE-NEXT:    fmin s1, s4, s1
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s2
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> %op)
@@ -1781,7 +2057,10 @@ define double @fminimumv_v2f64(<2 x double> %a) {
 ;
 ; NONEON-NOSVE-LABEL: fminimumv_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fminp d0, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp], #16
+; NONEON-NOSVE-NEXT:    fmin d0, d1, d0
 ; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %a)
   ret double %res
@@ -1800,8 +2079,13 @@ define double @fminimumv_v4f64(ptr %a) {
 ; NONEON-NOSVE-LABEL: fminimumv_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    fmin v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fminp d0, v0.2d
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d3, d2, [sp], #32
+; NONEON-NOSVE-NEXT:    fmin d0, d2, d0
+; NONEON-NOSVE-NEXT:    fmin d1, d3, d1
+; NONEON-NOSVE-NEXT:    fmin d0, d1, d0
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %op)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
index 6af2b885ace08f..344aac5b198384 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
@@ -20,9 +20,30 @@ define <2 x half> @frintp_v2f16(<2 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintp_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintp v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.ceil.v2f16(<2 x half> %op)
   ret <2 x half> %res
@@ -39,9 +60,30 @@ define <4 x half> @frintp_v4f16(<4 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintp_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintp v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.ceil.v4f16(<4 x half> %op)
   ret <4 x half> %res
@@ -58,12 +100,50 @@ define <8 x half> @frintp_v8f16(<8 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintp_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
-; NONEON-NOSVE-NEXT:    frintp v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
-; NONEON-NOSVE-NEXT:    frintp v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.ceil.v8f16(<8 x half> %op)
   ret <8 x half> %res
@@ -81,20 +161,92 @@ define void @frintp_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintp_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    frintp v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    frintp v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    frintp v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintp v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.ceil.v16f16(<16 x half> %op)
@@ -113,7 +265,15 @@ define <2 x float> @frintp_v2f32(<2 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintp_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintp v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintp s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.ceil.v2f32(<2 x float> %op)
   ret <2 x float> %res
@@ -130,7 +290,20 @@ define <4 x float> @frintp_v4f32(<4 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintp_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintp v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintp s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintp s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.ceil.v4f32(<4 x float> %op)
   ret <4 x float> %res
@@ -148,10 +321,32 @@ define void @frintp_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintp_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintp v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintp v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    frintp s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    frintp s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintp s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintp s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.ceil.v8f32(<8 x float> %op)
@@ -167,7 +362,12 @@ define <1 x double> @frintp_v1f64(<1 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintp_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    frintp d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.ceil.v1f64(<1 x double> %op)
   ret <1 x double> %res
@@ -184,7 +384,15 @@ define <2 x double> @frintp_v2f64(<2 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintp_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintp v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintp d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintp d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.ceil.v2f64(<2 x double> %op)
   ret <2 x double> %res
@@ -202,10 +410,22 @@ define void @frintp_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintp_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintp v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    frintp v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintp d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintp d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintp d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintp d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.ceil.v4f64(<4 x double> %op)
@@ -228,9 +448,30 @@ define <2 x half> @frintm_v2f16(<2 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintm_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintm v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.floor.v2f16(<2 x half> %op)
   ret <2 x half> %res
@@ -247,9 +488,30 @@ define <4 x half> @frintm_v4f16(<4 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintm_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintm v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.floor.v4f16(<4 x half> %op)
   ret <4 x half> %res
@@ -266,12 +528,50 @@ define <8 x half> @frintm_v8f16(<8 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintm_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
-; NONEON-NOSVE-NEXT:    frintm v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
-; NONEON-NOSVE-NEXT:    frintm v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.floor.v8f16(<8 x half> %op)
   ret <8 x half> %res
@@ -289,20 +589,92 @@ define void @frintm_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintm_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    frintm v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    frintm v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    frintm v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintm v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.floor.v16f16(<16 x half> %op)
@@ -321,7 +693,15 @@ define <2 x float> @frintm_v2f32(<2 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintm_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintm v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintm s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.floor.v2f32(<2 x float> %op)
   ret <2 x float> %res
@@ -338,7 +718,20 @@ define <4 x float> @frintm_v4f32(<4 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintm_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintm v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintm s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintm s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.floor.v4f32(<4 x float> %op)
   ret <4 x float> %res
@@ -356,10 +749,32 @@ define void @frintm_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintm_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintm v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintm v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    frintm s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    frintm s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintm s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintm s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.floor.v8f32(<8 x float> %op)
@@ -375,7 +790,12 @@ define <1 x double> @frintm_v1f64(<1 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintm_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    frintm d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.floor.v1f64(<1 x double> %op)
   ret <1 x double> %res
@@ -392,7 +812,15 @@ define <2 x double> @frintm_v2f64(<2 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintm_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintm v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintm d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintm d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.floor.v2f64(<2 x double> %op)
   ret <2 x double> %res
@@ -410,10 +838,22 @@ define void @frintm_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintm_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintm v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    frintm v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintm d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintm d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintm d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintm d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.floor.v4f64(<4 x double> %op)
@@ -436,9 +876,30 @@ define <2 x half> @frinti_v2f16(<2 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinti_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frinti v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.nearbyint.v2f16(<2 x half> %op)
   ret <2 x half> %res
@@ -455,9 +916,30 @@ define <4 x half> @frinti_v4f16(<4 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinti_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frinti v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.nearbyint.v4f16(<4 x half> %op)
   ret <4 x half> %res
@@ -474,12 +956,50 @@ define <8 x half> @frinti_v8f16(<8 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinti_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
-; NONEON-NOSVE-NEXT:    frinti v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
-; NONEON-NOSVE-NEXT:    frinti v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> %op)
   ret <8 x half> %res
@@ -497,20 +1017,92 @@ define void @frinti_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frinti_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    frinti v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    frinti v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    frinti v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frinti v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.nearbyint.v16f16(<16 x half> %op)
@@ -529,7 +1121,15 @@ define <2 x float> @frinti_v2f32(<2 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinti_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frinti v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frinti s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %op)
   ret <2 x float> %res
@@ -546,7 +1146,20 @@ define <4 x float> @frinti_v4f32(<4 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinti_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frinti v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frinti s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frinti s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %op)
   ret <4 x float> %res
@@ -564,10 +1177,32 @@ define void @frinti_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frinti_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frinti v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frinti v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    frinti s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    frinti s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frinti s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frinti s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %op)
@@ -583,7 +1218,12 @@ define <1 x double> @frinti_v1f64(<1 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinti_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    frinti d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %op)
   ret <1 x double> %res
@@ -600,7 +1240,15 @@ define <2 x double> @frinti_v2f64(<2 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinti_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frinti v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinti d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frinti d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %op)
   ret <2 x double> %res
@@ -618,10 +1266,22 @@ define void @frinti_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frinti_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frinti v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    frinti v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frinti d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frinti d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinti d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frinti d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %op)
@@ -644,9 +1304,30 @@ define <2 x half> @frintx_v2f16(<2 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintx_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintx v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.rint.v2f16(<2 x half> %op)
   ret <2 x half> %res
@@ -663,9 +1344,30 @@ define <4 x half> @frintx_v4f16(<4 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintx_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintx v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.rint.v4f16(<4 x half> %op)
   ret <4 x half> %res
@@ -682,12 +1384,50 @@ define <8 x half> @frintx_v8f16(<8 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintx_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
-; NONEON-NOSVE-NEXT:    frintx v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
-; NONEON-NOSVE-NEXT:    frintx v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.rint.v8f16(<8 x half> %op)
   ret <8 x half> %res
@@ -705,20 +1445,92 @@ define void @frintx_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintx_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    frintx v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    frintx v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    frintx v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintx v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.rint.v16f16(<16 x half> %op)
@@ -737,7 +1549,15 @@ define <2 x float> @frintx_v2f32(<2 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintx_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintx v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintx s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.rint.v2f32(<2 x float> %op)
   ret <2 x float> %res
@@ -754,7 +1574,20 @@ define <4 x float> @frintx_v4f32(<4 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintx_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintx v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintx s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintx s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.rint.v4f32(<4 x float> %op)
   ret <4 x float> %res
@@ -772,10 +1605,32 @@ define void @frintx_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintx_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintx v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintx v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    frintx s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    frintx s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintx s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintx s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.rint.v8f32(<8 x float> %op)
@@ -791,7 +1646,12 @@ define <1 x double> @frintx_v1f64(<1 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintx_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    frintx d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.rint.v1f64(<1 x double> %op)
   ret <1 x double> %res
@@ -808,7 +1668,15 @@ define <2 x double> @frintx_v2f64(<2 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintx_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintx v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintx d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintx d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.rint.v2f64(<2 x double> %op)
   ret <2 x double> %res
@@ -826,10 +1694,22 @@ define void @frintx_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintx_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintx v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    frintx v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintx d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintx d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintx d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintx d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.rint.v4f64(<4 x double> %op)
@@ -852,9 +1732,30 @@ define <2 x half> @frinta_v2f16(<2 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinta_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frinta v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.round.v2f16(<2 x half> %op)
   ret <2 x half> %res
@@ -871,9 +1772,30 @@ define <4 x half> @frinta_v4f16(<4 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinta_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frinta v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.round.v4f16(<4 x half> %op)
   ret <4 x half> %res
@@ -890,12 +1812,50 @@ define <8 x half> @frinta_v8f16(<8 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinta_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
-; NONEON-NOSVE-NEXT:    frinta v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
-; NONEON-NOSVE-NEXT:    frinta v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.round.v8f16(<8 x half> %op)
   ret <8 x half> %res
@@ -913,20 +1873,92 @@ define void @frinta_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frinta_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    frinta v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    frinta v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    frinta v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frinta v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.round.v16f16(<16 x half> %op)
@@ -945,7 +1977,15 @@ define <2 x float> @frinta_v2f32(<2 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinta_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frinta v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frinta s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.round.v2f32(<2 x float> %op)
   ret <2 x float> %res
@@ -962,7 +2002,20 @@ define <4 x float> @frinta_v4f32(<4 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinta_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frinta v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frinta s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frinta s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.round.v4f32(<4 x float> %op)
   ret <4 x float> %res
@@ -980,10 +2033,32 @@ define void @frinta_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frinta_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frinta v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frinta v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    frinta s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    frinta s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frinta s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frinta s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.round.v8f32(<8 x float> %op)
@@ -999,7 +2074,12 @@ define <1 x double> @frinta_v1f64(<1 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinta_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    frinta d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.round.v1f64(<1 x double> %op)
   ret <1 x double> %res
@@ -1016,7 +2096,15 @@ define <2 x double> @frinta_v2f64(<2 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frinta_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frinta v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinta d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frinta d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.round.v2f64(<2 x double> %op)
   ret <2 x double> %res
@@ -1034,10 +2122,22 @@ define void @frinta_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frinta_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frinta v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    frinta v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frinta d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frinta d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinta d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frinta d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.round.v4f64(<4 x double> %op)
@@ -1060,9 +2160,30 @@ define <2 x half> @frintn_v2f16(<2 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintn_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintn v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %op)
   ret <2 x half> %res
@@ -1079,9 +2200,30 @@ define <4 x half> @frintn_v4f16(<4 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintn_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintn v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %op)
   ret <4 x half> %res
@@ -1098,12 +2240,50 @@ define <8 x half> @frintn_v8f16(<8 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintn_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
-; NONEON-NOSVE-NEXT:    frintn v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
-; NONEON-NOSVE-NEXT:    frintn v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.roundeven.v8f16(<8 x half> %op)
   ret <8 x half> %res
@@ -1121,20 +2301,92 @@ define void @frintn_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintn_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    frintn v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    frintn v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    frintn v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintn v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.roundeven.v16f16(<16 x half> %op)
@@ -1153,7 +2405,15 @@ define <2 x float> @frintn_v2f32(<2 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintn_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintn v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintn s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.roundeven.v2f32(<2 x float> %op)
   ret <2 x float> %res
@@ -1170,7 +2430,20 @@ define <4 x float> @frintn_v4f32(<4 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintn_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintn v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintn s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintn s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %op)
   ret <4 x float> %res
@@ -1188,10 +2461,32 @@ define void @frintn_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintn_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintn v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintn v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    frintn s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    frintn s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintn s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintn s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %op)
@@ -1207,7 +2502,12 @@ define <1 x double> @frintn_v1f64(<1 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintn_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    frintn d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.roundeven.v1f64(<1 x double> %op)
   ret <1 x double> %res
@@ -1224,7 +2524,15 @@ define <2 x double> @frintn_v2f64(<2 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintn_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintn v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintn d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintn d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %op)
   ret <2 x double> %res
@@ -1242,10 +2550,22 @@ define void @frintn_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintn_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintn v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    frintn v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintn d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintn d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintn d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintn d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %op)
@@ -1268,9 +2588,30 @@ define <2 x half> @frintz_v2f16(<2 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintz_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintz v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.trunc.v2f16(<2 x half> %op)
   ret <2 x half> %res
@@ -1287,9 +2628,30 @@ define <4 x half> @frintz_v4f16(<4 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintz_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    frintz v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.trunc.v4f16(<4 x half> %op)
   ret <4 x half> %res
@@ -1306,12 +2668,50 @@ define <8 x half> @frintz_v8f16(<8 x half> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintz_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
-; NONEON-NOSVE-NEXT:    frintz v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
-; NONEON-NOSVE-NEXT:    frintz v1.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.trunc.v8f16(<8 x half> %op)
   ret <8 x half> %res
@@ -1329,20 +2729,92 @@ define void @frintz_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintz_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    frintz v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    frintz v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    frintz v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintz v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.trunc.v16f16(<16 x half> %op)
@@ -1361,7 +2833,15 @@ define <2 x float> @frintz_v2f32(<2 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintz_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintz v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintz s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.trunc.v2f32(<2 x float> %op)
   ret <2 x float> %res
@@ -1378,7 +2858,20 @@ define <4 x float> @frintz_v4f32(<4 x float> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintz_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintz v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintz s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintz s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.trunc.v4f32(<4 x float> %op)
   ret <4 x float> %res
@@ -1396,10 +2889,32 @@ define void @frintz_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintz_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintz v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    frintz v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    frintz s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    frintz s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintz s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintz s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.trunc.v8f32(<8 x float> %op)
@@ -1415,7 +2930,12 @@ define <1 x double> @frintz_v1f64(<1 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintz_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    frintz d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.trunc.v1f64(<1 x double> %op)
   ret <1 x double> %res
@@ -1432,7 +2952,15 @@ define <2 x double> @frintz_v2f64(<2 x double> %op) {
 ;
 ; NONEON-NOSVE-LABEL: frintz_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    frintz v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintz d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintz d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.trunc.v2f64(<2 x double> %op)
   ret <2 x double> %res
@@ -1450,10 +2978,22 @@ define void @frintz_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: frintz_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    frintz v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    frintz v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintz d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintz d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintz d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintz d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.trunc.v4f64(<4 x double> %op)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
index 824419b31a5a83..daa9b51cc827b3 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
@@ -20,10 +20,28 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.4h, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x half> %op1, <2 x half> %op2
   ret <2 x half> %sel
@@ -44,10 +62,28 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.4h, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x half> %op1, <4 x half> %op2
   ret <4 x half> %sel
@@ -68,10 +104,43 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.8h, w8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <8 x half> %op1, <8 x half> %op2
   ret <8 x half> %sel
@@ -95,16 +164,83 @@ define void @select_v16f16(ptr %a, ptr %b, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
 ; NONEON-NOSVE-NEXT:    tst w2, #0x1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #62]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #60]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #58]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #56]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #54]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #52]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #50]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #48]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <16 x half>, ptr %a
   %op2 = load volatile <16 x half>, ptr %b
@@ -128,10 +264,18 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.2s, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s3, s2, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x float> %op1, <2 x float> %op2
   ret <2 x float> %sel
@@ -152,10 +296,23 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.4s, w8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcsel s3, s2, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s3, s2, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x float> %op1, <4 x float> %op2
   ret <4 x float> %sel
@@ -179,16 +336,43 @@ define void @select_v8f32(ptr %a, ptr %b, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
 ; NONEON-NOSVE-NEXT:    tst w2, #0x1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcsel s3, s0, s2, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s3, s0, s2, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcsel s3, s0, s2, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcsel s3, s0, s2, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <8 x float>, ptr %a
   %op2 = load volatile <8 x float>, ptr %b
@@ -206,10 +390,13 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask
 ;
 ; NONEON-NOSVE-LABEL: select_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    fmov d2, x8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    fcsel d0, d0, d1, ne
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <1 x double> %op1, <1 x double> %op2
   ret <1 x double> %sel
@@ -231,10 +418,17 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask
 ;
 ; NONEON-NOSVE-LABEL: select_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    dup v2.2d, x8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcsel d3, d2, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x double> %op1, <2 x double> %op2
   ret <2 x double> %sel
@@ -259,16 +453,31 @@ define void @select_v4f64(ptr %a, ptr %b, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
 ; NONEON-NOSVE-NEXT:    tst w2, #0x1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcsel d3, d0, d2, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel d0, d0, d1, ne
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcsel d3, d0, d2, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fcsel d0, d0, d1, ne
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <4 x double>, ptr %a
   %op2 = load volatile <4 x double>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
index c853bdc5af8db0..0d92a6fa0fa28d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
@@ -19,9 +19,26 @@ define <4 x i16> @fcvtzu_v4f16_v4i16(<4 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <4 x half> %op1 to <4 x i16>
   ret <4 x i16> %res
@@ -39,16 +56,43 @@ define void @fcvtzu_v8f16_v8i16(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptoui <8 x half> %op1 to <8 x i16>
@@ -69,22 +113,75 @@ define void @fcvtzu_v16f16_v16i16(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v1.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptoui <16 x half> %op1 to <16 x i16>
@@ -108,9 +205,17 @@ define <2 x i32> @fcvtzu_v2f16_v2i32(<2 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x half> %op1 to <2 x i32>
   ret <2 x i32> %res
@@ -128,8 +233,25 @@ define <4 x i32> @fcvtzu_v4f16_v4i32(<4 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <4 x half> %op1 to <4 x i32>
   ret <4 x i32> %res
@@ -151,15 +273,41 @@ define void @fcvtzu_v8f16_v8i32(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptoui <8 x half> %op1 to <8 x i32>
@@ -189,21 +337,73 @@ define void @fcvtzu_v16f16_v16i32(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptoui <16 x half> %op1 to <16 x i32>
@@ -224,9 +424,13 @@ define <1 x i64> @fcvtzu_v1f16_v1i64(<1 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v1f16_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x half> %op1 to <1 x i64>
   ret <1 x i64> %res
@@ -246,14 +450,18 @@ define <2 x i64> @fcvtzu_v2f16_v2i64(<2 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
-; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x half> %op1 to <2 x i64>
   ret <2 x i64> %res
@@ -280,23 +488,27 @@ define void @fcvtzu_v4f16_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
 ; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
-; NONEON-NOSVE-NEXT:    fcvtzu x8, s1
-; NONEON-NOSVE-NEXT:    fcvtzu x10, s2
-; NONEON-NOSVE-NEXT:    fcvtzu x11, s3
-; NONEON-NOSVE-NEXT:    fmov d1, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x10
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %res = fptoui <4 x half> %op1 to <4 x i64>
@@ -339,42 +551,43 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h7, v2.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvtzu x13, s2
-; NONEON-NOSVE-NEXT:    fcvtzu x8, s1
-; NONEON-NOSVE-NEXT:    fcvt s1, h7
-; NONEON-NOSVE-NEXT:    fcvtzu x10, s3
-; NONEON-NOSVE-NEXT:    fcvtzu x11, s4
-; NONEON-NOSVE-NEXT:    fcvtzu x12, s5
-; NONEON-NOSVE-NEXT:    fcvtzu x14, s6
-; NONEON-NOSVE-NEXT:    fmov d3, x13
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    fcvtzu x8, s1
-; NONEON-NOSVE-NEXT:    fmov d1, x9
-; NONEON-NOSVE-NEXT:    fmov d2, x12
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x10
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
-; NONEON-NOSVE-NEXT:    mov v3.d[1], x8
-; NONEON-NOSVE-NEXT:    mov v2.d[1], x14
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptoui <8 x half> %op1 to <8 x i64>
@@ -439,76 +652,79 @@ define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s3, h1
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #24]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s6, h0
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s17, h4
-; NONEON-NOSVE-NEXT:    mov h18, v4.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvtzu x8, s3
-; NONEON-NOSVE-NEXT:    fcvt s3, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h7
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    mov h16, v4.h[3]
-; NONEON-NOSVE-NEXT:    fcvtzu x9, s6
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    mov h4, v4.h[1]
-; NONEON-NOSVE-NEXT:    fcvtzu x11, s2
-; NONEON-NOSVE-NEXT:    mov h2, v6.h[2]
-; NONEON-NOSVE-NEXT:    fcvtzu x10, s17
-; NONEON-NOSVE-NEXT:    fcvtzu x13, s5
-; NONEON-NOSVE-NEXT:    fcvtzu x12, s3
-; NONEON-NOSVE-NEXT:    mov h3, v6.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    mov h5, v6.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s17, h18
-; NONEON-NOSVE-NEXT:    fcvtzu x14, s7
-; NONEON-NOSVE-NEXT:    fmov d7, x8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #192
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 192
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fmov d0, x11
-; NONEON-NOSVE-NEXT:    fcvtzu x11, s1
-; NONEON-NOSVE-NEXT:    fmov d1, x13
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvtzu x13, s16
-; NONEON-NOSVE-NEXT:    fmov d16, x9
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvtzu x15, s17
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x12
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x14
-; NONEON-NOSVE-NEXT:    fcvtzu x9, s2
-; NONEON-NOSVE-NEXT:    mov v16.d[1], x8
-; NONEON-NOSVE-NEXT:    fcvtzu x8, s6
-; NONEON-NOSVE-NEXT:    fcvtzu x14, s4
-; NONEON-NOSVE-NEXT:    fcvtzu x12, s3
-; NONEON-NOSVE-NEXT:    mov v7.d[1], x11
-; NONEON-NOSVE-NEXT:    fmov d3, x10
-; NONEON-NOSVE-NEXT:    fcvtzu x11, s5
-; NONEON-NOSVE-NEXT:    fmov d2, x15
-; NONEON-NOSVE-NEXT:    stp q16, q1, [x1, #64]
-; NONEON-NOSVE-NEXT:    fmov d1, x9
-; NONEON-NOSVE-NEXT:    fmov d4, x8
-; NONEON-NOSVE-NEXT:    stp q7, q0, [x1]
-; NONEON-NOSVE-NEXT:    mov v2.d[1], x13
-; NONEON-NOSVE-NEXT:    mov v3.d[1], x14
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x12
-; NONEON-NOSVE-NEXT:    mov v4.d[1], x11
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x1, #96]
-; NONEON-NOSVE-NEXT:    stp q4, q1, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #112]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #160]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #160]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #128]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #192
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptoui <16 x half> %op1 to <16 x i64>
@@ -531,7 +747,14 @@ define <2 x i16> @fcvtzu_v2f32_v2i16(<2 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x float> %op1 to <2 x i16>
   ret <2 x i16> %res
@@ -549,8 +772,20 @@ define <4 x i16> @fcvtzu_v4f32_v4i16(<4 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <4 x float> %op1 to <4 x i16>
   ret <4 x i16> %res
@@ -572,10 +807,31 @@ define <8 x i16> @fcvtzu_v8f32_v8i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptoui <8 x float> %op1 to <8 x i16>
@@ -604,15 +860,56 @@ define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v16f32_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x float>, ptr %a
   %res = fptoui <16 x float> %op1 to <16 x i16>
@@ -635,7 +932,14 @@ define <2 x i32> @fcvtzu_v2f32_v2i32(<2 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x float> %op1 to <2 x i32>
   ret <2 x i32> %res
@@ -652,7 +956,18 @@ define <4 x i32> @fcvtzu_v4f32_v4i32(<4 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <4 x float> %op1 to <4 x i32>
   ret <4 x i32> %res
@@ -670,10 +985,28 @@ define void @fcvtzu_v8f32_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptoui <8 x float> %op1 to <8 x i32>
@@ -697,9 +1030,13 @@ define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v1f32_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x float> %op1 to <1 x i64>
   ret <1 x i64> %res
@@ -717,8 +1054,15 @@ define <2 x i64> @fcvtzu_v2f32_v2i64(<2 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x float> %op1 to <2 x i64>
   ret <2 x i64> %res
@@ -740,15 +1084,21 @@ define void @fcvtzu_v4f32_v4i64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %res = fptoui <4 x float> %op1 to <4 x i64>
@@ -778,21 +1128,33 @@ define void @fcvtzu_v8f32_v8i64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v2.2d, v2.2s
-; NONEON-NOSVE-NEXT:    fcvtl v3.2d, v3.2s
-; NONEON-NOSVE-NEXT:    fcvtzu v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #112]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptoui <8 x float> %op1 to <8 x i64>
@@ -814,8 +1176,12 @@ define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
-; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x double> %op1 to <1 x i16>
   ret <1 x i16> %res
@@ -833,8 +1199,14 @@ define <2 x i16> @fcvtzu_v2f64_v2i16(<2 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x double> %op1 to <2 x i16>
   ret <2 x i16> %res
@@ -867,11 +1239,27 @@ define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptoui <4 x double> %op1 to <4 x i16>
@@ -919,19 +1307,49 @@ define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #144
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI26_0
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    xtn v7.2s, v0.2d
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI26_0]
-; NONEON-NOSVE-NEXT:    xtn v6.2s, v1.2d
-; NONEON-NOSVE-NEXT:    xtn v5.2s, v2.2d
-; NONEON-NOSVE-NEXT:    xtn v4.2s, v3.2d
-; NONEON-NOSVE-NEXT:    tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #142]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #138]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #134]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #130]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    add sp, sp, #144
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %res = fptoui <8 x double> %op1 to <8 x i16>
@@ -1012,31 +1430,90 @@ define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v16f64_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #96]
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI27_0
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v5.2d, v5.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v4.2d, v4.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v6.2d, v6.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v7.2d, v7.2d
-; NONEON-NOSVE-NEXT:    xtn v19.2s, v0.2d
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI27_0]
-; NONEON-NOSVE-NEXT:    xtn v23.2s, v3.2d
-; NONEON-NOSVE-NEXT:    xtn v18.2s, v1.2d
-; NONEON-NOSVE-NEXT:    xtn v22.2s, v2.2d
-; NONEON-NOSVE-NEXT:    xtn v17.2s, v5.2d
-; NONEON-NOSVE-NEXT:    xtn v21.2s, v6.2d
-; NONEON-NOSVE-NEXT:    xtn v16.2s, v4.2d
-; NONEON-NOSVE-NEXT:    xtn v20.2s, v7.2d
-; NONEON-NOSVE-NEXT:    tbl v1.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b
-; NONEON-NOSVE-NEXT:    tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #304
+; NONEON-NOSVE-NEXT:    str x29, [sp, #288] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 304
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #288] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp q1, q7, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q2, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q6, q4, [sp]
+; NONEON-NOSVE-NEXT:    stp q5, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #232]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #144]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #270]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #248]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #266]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    stp d2, d0, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #262]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #200]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #296]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #258]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #286]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #208]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #282]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #278]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #274]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #304
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x double>, ptr %a
   %res = fptoui <16 x double> %op1 to <16 x i16>
@@ -1060,9 +1537,12 @@ define <1 x i32> @fcvtzu_v1f64_v1i32(<1 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    str w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x double> %op1 to <1 x i32>
   ret <1 x i32> %res
@@ -1080,8 +1560,14 @@ define <2 x i32> @fcvtzu_v2f64_v2i32(<2 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x double> %op1 to <2 x i32>
   ret <2 x i32> %res
@@ -1103,10 +1589,19 @@ define <4 x i32> @fcvtzu_v4f64_v4i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzu v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptoui <4 x double> %op1 to <4 x i32>
@@ -1135,15 +1630,32 @@ define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzu v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %res = fptoui <8 x double> %op1 to <8 x i32>
@@ -1166,8 +1678,12 @@ define <1 x i64> @fcvtzu_v1f64_v1i64(<1 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fcvtzu x8, d0
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x double> %op1 to <1 x i64>
   ret <1 x i64> %res
@@ -1184,7 +1700,14 @@ define <2 x i64> @fcvtzu_v2f64_v2i64(<2 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, d0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x double> %op1 to <2 x i64>
   ret <2 x i64> %res
@@ -1202,10 +1725,20 @@ define void @fcvtzu_v4f64_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzu v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, d0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptoui <4 x double> %op1 to <4 x i64>
@@ -1228,9 +1761,26 @@ define <4 x i16> @fcvtzs_v4f16_v4i16(<4 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <4 x half> %op1 to <4 x i16>
   ret <4 x i16> %res
@@ -1248,16 +1798,43 @@ define void @fcvtzs_v8f16_v8i16(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptosi <8 x half> %op1 to <8 x i16>
@@ -1278,22 +1855,75 @@ define void @fcvtzs_v16f16_v16i16(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v1.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptosi <16 x half> %op1 to <16 x i16>
@@ -1317,9 +1947,17 @@ define <2 x i32> @fcvtzs_v2f16_v2i32(<2 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x half> %op1 to <2 x i32>
   ret <2 x i32> %res
@@ -1337,8 +1975,25 @@ define <4 x i32> @fcvtzs_v4f16_v4i32(<4 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <4 x half> %op1 to <4 x i32>
   ret <4 x i32> %res
@@ -1360,15 +2015,41 @@ define void @fcvtzs_v8f16_v8i32(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptosi <8 x half> %op1 to <8 x i32>
@@ -1398,21 +2079,73 @@ define void @fcvtzs_v16f16_v16i32(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptosi <16 x half> %op1 to <16 x i32>
@@ -1433,9 +2166,13 @@ define <1 x i64> @fcvtzs_v1f16_v1i64(<1 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v1f16_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x half> %op1 to <1 x i64>
   ret <1 x i64> %res
@@ -1456,14 +2193,18 @@ define <2 x i64> @fcvtzs_v2f16_v2i64(<2 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
 ; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
-; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x half> %op1 to <2 x i64>
   ret <2 x i64> %res
@@ -1490,23 +2231,27 @@ define void @fcvtzs_v4f16_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
 ; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
 ; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
-; NONEON-NOSVE-NEXT:    fcvtzs x8, s1
-; NONEON-NOSVE-NEXT:    fcvtzs x10, s2
-; NONEON-NOSVE-NEXT:    fcvtzs x11, s3
-; NONEON-NOSVE-NEXT:    fmov d1, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x10
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %res = fptosi <4 x half> %op1 to <4 x i64>
@@ -1549,42 +2294,43 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    mov h1, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[3]
-; NONEON-NOSVE-NEXT:    mov h4, v0.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[2]
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[3]
-; NONEON-NOSVE-NEXT:    mov h7, v2.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvtzs x13, s2
-; NONEON-NOSVE-NEXT:    fcvtzs x8, s1
-; NONEON-NOSVE-NEXT:    fcvt s1, h7
-; NONEON-NOSVE-NEXT:    fcvtzs x10, s3
-; NONEON-NOSVE-NEXT:    fcvtzs x11, s4
-; NONEON-NOSVE-NEXT:    fcvtzs x12, s5
-; NONEON-NOSVE-NEXT:    fcvtzs x14, s6
-; NONEON-NOSVE-NEXT:    fmov d3, x13
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    fcvtzs x8, s1
-; NONEON-NOSVE-NEXT:    fmov d1, x9
-; NONEON-NOSVE-NEXT:    fmov d2, x12
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x10
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
-; NONEON-NOSVE-NEXT:    mov v3.d[1], x8
-; NONEON-NOSVE-NEXT:    mov v2.d[1], x14
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptosi <8 x half> %op1 to <8 x i64>
@@ -1649,76 +2395,79 @@ define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s3, h1
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #24]
-; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
-; NONEON-NOSVE-NEXT:    mov h16, v0.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s6, h0
-; NONEON-NOSVE-NEXT:    mov h0, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h1, v1.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s17, h4
-; NONEON-NOSVE-NEXT:    mov h18, v4.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvtzs x8, s3
-; NONEON-NOSVE-NEXT:    fcvt s3, h5
-; NONEON-NOSVE-NEXT:    fcvt s5, h7
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    mov h16, v4.h[3]
-; NONEON-NOSVE-NEXT:    fcvtzs x9, s6
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    mov h4, v4.h[1]
-; NONEON-NOSVE-NEXT:    fcvtzs x11, s2
-; NONEON-NOSVE-NEXT:    mov h2, v6.h[2]
-; NONEON-NOSVE-NEXT:    fcvtzs x10, s17
-; NONEON-NOSVE-NEXT:    fcvtzs x13, s5
-; NONEON-NOSVE-NEXT:    fcvtzs x12, s3
-; NONEON-NOSVE-NEXT:    mov h3, v6.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    mov h5, v6.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s17, h18
-; NONEON-NOSVE-NEXT:    fcvtzs x14, s7
-; NONEON-NOSVE-NEXT:    fmov d7, x8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #192
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 192
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
 ; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fmov d0, x11
-; NONEON-NOSVE-NEXT:    fcvtzs x11, s1
-; NONEON-NOSVE-NEXT:    fmov d1, x13
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvtzs x13, s16
-; NONEON-NOSVE-NEXT:    fmov d16, x9
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvtzs x15, s17
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x12
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x14
-; NONEON-NOSVE-NEXT:    fcvtzs x9, s2
-; NONEON-NOSVE-NEXT:    mov v16.d[1], x8
-; NONEON-NOSVE-NEXT:    fcvtzs x8, s6
-; NONEON-NOSVE-NEXT:    fcvtzs x14, s4
-; NONEON-NOSVE-NEXT:    fcvtzs x12, s3
-; NONEON-NOSVE-NEXT:    mov v7.d[1], x11
-; NONEON-NOSVE-NEXT:    fmov d3, x10
-; NONEON-NOSVE-NEXT:    fcvtzs x11, s5
-; NONEON-NOSVE-NEXT:    fmov d2, x15
-; NONEON-NOSVE-NEXT:    stp q16, q1, [x1, #64]
-; NONEON-NOSVE-NEXT:    fmov d1, x9
-; NONEON-NOSVE-NEXT:    fmov d4, x8
-; NONEON-NOSVE-NEXT:    stp q7, q0, [x1]
-; NONEON-NOSVE-NEXT:    mov v2.d[1], x13
-; NONEON-NOSVE-NEXT:    mov v3.d[1], x14
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x12
-; NONEON-NOSVE-NEXT:    mov v4.d[1], x11
-; NONEON-NOSVE-NEXT:    stp q3, q2, [x1, #96]
-; NONEON-NOSVE-NEXT:    stp q4, q1, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #112]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #160]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #160]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #128]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #192
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptosi <16 x half> %op1 to <16 x i64>
@@ -1741,7 +2490,14 @@ define <2 x i16> @fcvtzs_v2f32_v2i16(<2 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x float> %op1 to <2 x i16>
   ret <2 x i16> %res
@@ -1759,8 +2515,20 @@ define <4 x i16> @fcvtzs_v4f32_v4i16(<4 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <4 x float> %op1 to <4 x i16>
   ret <4 x i16> %res
@@ -1782,10 +2550,31 @@ define <8 x i16> @fcvtzs_v8f32_v8i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptosi <8 x float> %op1 to <8 x i16>
@@ -1814,15 +2603,56 @@ define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v16f32_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x float>, ptr %a
   %res = fptosi <16 x float> %op1 to <16 x i16>
@@ -1845,7 +2675,14 @@ define <2 x i32> @fcvtzs_v2f32_v2i32(<2 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x float> %op1 to <2 x i32>
   ret <2 x i32> %res
@@ -1862,7 +2699,18 @@ define <4 x i32> @fcvtzs_v4f32_v4i32(<4 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <4 x float> %op1 to <4 x i32>
   ret <4 x i32> %res
@@ -1880,10 +2728,28 @@ define void @fcvtzs_v8f32_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptosi <8 x float> %op1 to <8 x i32>
@@ -1907,9 +2773,13 @@ define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v1f32_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x float> %op1 to <1 x i64>
   ret <1 x i64> %res
@@ -1927,8 +2797,15 @@ define <2 x i64> @fcvtzs_v2f32_v2i64(<2 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x float> %op1 to <2 x i64>
   ret <2 x i64> %res
@@ -1950,15 +2827,21 @@ define void @fcvtzs_v4f32_v4i64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %res = fptosi <4 x float> %op1 to <4 x i64>
@@ -1988,21 +2871,33 @@ define void @fcvtzs_v8f32_v8i64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
-; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
-; NONEON-NOSVE-NEXT:    fcvtl v2.2d, v2.2s
-; NONEON-NOSVE-NEXT:    fcvtl v3.2d, v3.2s
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #112]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptosi <8 x float> %op1 to <8 x i64>
@@ -2026,8 +2921,12 @@ define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
-; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x double> %op1 to <1 x i16>
   ret <1 x i16> %res
@@ -2045,8 +2944,14 @@ define <2 x i16> @fcvtzs_v2f64_v2i16(<2 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x double> %op1 to <2 x i16>
   ret <2 x i16> %res
@@ -2079,11 +2984,27 @@ define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptosi <4 x double> %op1 to <4 x i16>
@@ -2131,19 +3052,49 @@ define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #144
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI61_0
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    xtn v7.2s, v0.2d
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI61_0]
-; NONEON-NOSVE-NEXT:    xtn v6.2s, v1.2d
-; NONEON-NOSVE-NEXT:    xtn v5.2s, v2.2d
-; NONEON-NOSVE-NEXT:    xtn v4.2s, v3.2d
-; NONEON-NOSVE-NEXT:    tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #142]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #138]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #134]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #130]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    add sp, sp, #144
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %res = fptosi <8 x double> %op1 to <8 x i16>
@@ -2224,31 +3175,90 @@ define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v16f64_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #96]
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI62_0
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v5.2d, v5.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v4.2d, v4.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v6.2d, v6.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v7.2d, v7.2d
-; NONEON-NOSVE-NEXT:    xtn v19.2s, v0.2d
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI62_0]
-; NONEON-NOSVE-NEXT:    xtn v23.2s, v3.2d
-; NONEON-NOSVE-NEXT:    xtn v18.2s, v1.2d
-; NONEON-NOSVE-NEXT:    xtn v22.2s, v2.2d
-; NONEON-NOSVE-NEXT:    xtn v17.2s, v5.2d
-; NONEON-NOSVE-NEXT:    xtn v21.2s, v6.2d
-; NONEON-NOSVE-NEXT:    xtn v16.2s, v4.2d
-; NONEON-NOSVE-NEXT:    xtn v20.2s, v7.2d
-; NONEON-NOSVE-NEXT:    tbl v1.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b
-; NONEON-NOSVE-NEXT:    tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #304
+; NONEON-NOSVE-NEXT:    str x29, [sp, #288] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 304
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #288] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp q1, q7, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q2, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q6, q4, [sp]
+; NONEON-NOSVE-NEXT:    stp q5, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #232]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #144]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #270]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #248]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #266]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    stp d2, d0, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #262]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #200]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #296]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #258]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #286]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #208]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #282]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #278]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #274]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #304
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x double>, ptr %a
   %res = fptosi <16 x double> %op1 to <16 x i16>
@@ -2272,9 +3282,12 @@ define <1 x i32> @fcvtzs_v1f64_v1i32(<1 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    str w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x double> %op1 to <1 x i32>
   ret <1 x i32> %res
@@ -2292,8 +3305,14 @@ define <2 x i32> @fcvtzs_v2f64_v2i32(<2 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x double> %op1 to <2 x i32>
   ret <2 x i32> %res
@@ -2315,10 +3334,19 @@ define <4 x i32> @fcvtzs_v4f64_v4i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptosi <4 x double> %op1 to <4 x i32>
@@ -2347,15 +3375,32 @@ define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %res = fptosi <8 x double> %op1 to <8 x i32>
@@ -2378,8 +3423,12 @@ define <1 x i64> @fcvtzs_v1f64_v1i64(<1 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fcvtzs x8, d0
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x double> %op1 to <1 x i64>
   ret <1 x i64> %res
@@ -2396,7 +3445,14 @@ define <2 x i64> @fcvtzs_v2f64_v2i64(<2 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, d0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x double> %op1 to <2 x i64>
   ret <2 x i64> %res
@@ -2414,10 +3470,20 @@ define void @fcvtzs_v4f64_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, d0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptosi <4 x double> %op1 to <4 x i64>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
index d3b09374676556..69661049bcb6f3 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
@@ -31,10 +31,27 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x i1> %mask
 ;
 ; NONEON-NOSVE-LABEL: select_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uzp1 v2.4h, v2.4h, v0.4h
-; NONEON-NOSVE-NEXT:    shl v2.4h, v2.4h, #15
-; NONEON-NOSVE-NEXT:    cmlt v2.4h, v2.4h, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    str w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x half> %op1, <2 x half> %op2
   ret <2 x half> %sel
@@ -57,9 +74,40 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask
 ;
 ; NONEON-NOSVE-LABEL: select_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.4h, v2.4h, #15
-; NONEON-NOSVE-NEXT:    cmlt v2.4h, v2.4h, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w9, w11, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x half> %op1, <4 x half> %op2
   ret <4 x half> %sel
@@ -83,10 +131,68 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask
 ;
 ; NONEON-NOSVE-LABEL: select_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v2.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    shl v2.8h, v2.8h, #15
-; NONEON-NOSVE-NEXT:    cmlt v2.8h, v2.8h, #0
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #47]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #45]
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    tst w13, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w13, w15, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    tst w13, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w13, w14, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    tst w13, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w12, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    tst w11, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w10, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <8 x i1> %mask, <8 x half> %op1, <8 x half> %op2
   ret <8 x half> %sel
@@ -107,122 +213,126 @@ define void @select_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: select_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[2]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s6, h1
-; NONEON-NOSVE-NEXT:    fcvt s7, h0
-; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov h17, v0.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h5, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldr h16, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h1
+; NONEON-NOSVE-NEXT:    ldr h17, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s6, h4
+; NONEON-NOSVE-NEXT:    fcvt s7, h5
+; NONEON-NOSVE-NEXT:    ldr h19, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s18, h17
+; NONEON-NOSVE-NEXT:    ldr h21, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h22, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s20, h19
+; NONEON-NOSVE-NEXT:    ldr h24, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h25, [sp, #34]
 ; NONEON-NOSVE-NEXT:    fcmp s3, s2
-; NONEON-NOSVE-NEXT:    mov h2, v1.h[3]
-; NONEON-NOSVE-NEXT:    mov h3, v0.h[3]
-; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcvt s2, h16
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h26, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h27, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h28, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h29, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, eq
 ; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[4]
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt s3, h3
-; NONEON-NOSVE-NEXT:    csetm w14, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov h5, v0.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w12, eq
-; NONEON-NOSVE-NEXT:    fcmp s3, s2
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w11, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov h7, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov h18, v3.h[3]
-; NONEON-NOSVE-NEXT:    csetm w13, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    mov h4, v3.h[1]
-; NONEON-NOSVE-NEXT:    mov h5, v2.h[1]
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcvt s7, h3
+; NONEON-NOSVE-NEXT:    ldr h6, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcsel s1, s5, s4, eq
+; NONEON-NOSVE-NEXT:    fcmp s18, s2
+; NONEON-NOSVE-NEXT:    fcvt s4, h6
+; NONEON-NOSVE-NEXT:    fcvt s18, h21
+; NONEON-NOSVE-NEXT:    ldr h5, [sp, #28]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcsel s2, s17, s16, eq
+; NONEON-NOSVE-NEXT:    fcmp s20, s7
+; NONEON-NOSVE-NEXT:    fcvt s16, h5
+; NONEON-NOSVE-NEXT:    fcvt s17, h22
+; NONEON-NOSVE-NEXT:    ldr h7, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h20, [sp, #14]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcsel s3, s19, s3, eq
+; NONEON-NOSVE-NEXT:    fcmp s18, s4
+; NONEON-NOSVE-NEXT:    fcvt s19, h7
+; NONEON-NOSVE-NEXT:    fcvt s23, h20
+; NONEON-NOSVE-NEXT:    ldr h18, [sp, #48]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcsel s4, s21, s6, eq
 ; NONEON-NOSVE-NEXT:    fcmp s17, s16
-; NONEON-NOSVE-NEXT:    mov h16, v3.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s4, h4
-; NONEON-NOSVE-NEXT:    mov h17, v2.h[2]
-; NONEON-NOSVE-NEXT:    fcvt s5, h5
-; NONEON-NOSVE-NEXT:    csetm w10, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    fcvt s6, h3
-; NONEON-NOSVE-NEXT:    fcvt s7, h2
-; NONEON-NOSVE-NEXT:    csetm w15, eq
-; NONEON-NOSVE-NEXT:    fcmp s5, s4
-; NONEON-NOSVE-NEXT:    fmov s4, w14
-; NONEON-NOSVE-NEXT:    csetm w16, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v2.h[3]
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    fcvt s16, h17
-; NONEON-NOSVE-NEXT:    mov v4.h[1], w8
 ; NONEON-NOSVE-NEXT:    fcvt s17, h18
-; NONEON-NOSVE-NEXT:    csetm w14, eq
-; NONEON-NOSVE-NEXT:    fmov s5, w14
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcmp s16, s7
-; NONEON-NOSVE-NEXT:    mov h7, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov h16, v2.h[4]
-; NONEON-NOSVE-NEXT:    mov v4.h[2], w12
-; NONEON-NOSVE-NEXT:    mov v5.h[1], w16
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s6, s17
-; NONEON-NOSVE-NEXT:    mov h17, v2.h[5]
-; NONEON-NOSVE-NEXT:    fcvt s6, h7
-; NONEON-NOSVE-NEXT:    fcvt s7, h16
-; NONEON-NOSVE-NEXT:    mov h16, v3.h[5]
-; NONEON-NOSVE-NEXT:    mov v4.h[3], w11
-; NONEON-NOSVE-NEXT:    mov v5.h[2], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcvt s17, h17
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov h6, v3.h[6]
-; NONEON-NOSVE-NEXT:    mov h7, v2.h[6]
-; NONEON-NOSVE-NEXT:    fcvt s16, h16
-; NONEON-NOSVE-NEXT:    mov v4.h[4], w13
-; NONEON-NOSVE-NEXT:    mov v5.h[3], w8
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcvt s6, h6
-; NONEON-NOSVE-NEXT:    fcvt s7, h7
-; NONEON-NOSVE-NEXT:    fcmp s17, s16
-; NONEON-NOSVE-NEXT:    mov h16, v3.h[7]
-; NONEON-NOSVE-NEXT:    mov h17, v2.h[7]
-; NONEON-NOSVE-NEXT:    mov v5.h[4], w8
-; NONEON-NOSVE-NEXT:    mov v4.h[5], w9
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    fcvt s6, h16
-; NONEON-NOSVE-NEXT:    fcvt s7, h17
-; NONEON-NOSVE-NEXT:    mov v5.h[5], w8
-; NONEON-NOSVE-NEXT:    mov v4.h[6], w10
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    fcmp s7, s6
-; NONEON-NOSVE-NEXT:    mov v5.h[6], w8
-; NONEON-NOSVE-NEXT:    mov v4.h[7], w15
-; NONEON-NOSVE-NEXT:    csetm w8, eq
-; NONEON-NOSVE-NEXT:    mov v5.h[7], w8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    fcvt s21, h24
+; NONEON-NOSVE-NEXT:    ldr h16, [sp, #50]
+; NONEON-NOSVE-NEXT:    str h3, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcsel s5, s22, s5, eq
+; NONEON-NOSVE-NEXT:    fcmp s23, s19
+; NONEON-NOSVE-NEXT:    fcvt s22, h16
+; NONEON-NOSVE-NEXT:    fcvt s23, h25
+; NONEON-NOSVE-NEXT:    ldr h19, [sp, #52]
+; NONEON-NOSVE-NEXT:    str h4, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcsel s6, s20, s7, eq
+; NONEON-NOSVE-NEXT:    fcmp s21, s17
+; NONEON-NOSVE-NEXT:    fcvt s20, h19
+; NONEON-NOSVE-NEXT:    fcvt s21, h26
+; NONEON-NOSVE-NEXT:    ldr h17, [sp, #54]
+; NONEON-NOSVE-NEXT:    str h5, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcsel s7, s24, s18, eq
+; NONEON-NOSVE-NEXT:    fcmp s23, s22
+; NONEON-NOSVE-NEXT:    fcvt s22, h17
+; NONEON-NOSVE-NEXT:    fcvt s23, h27
+; NONEON-NOSVE-NEXT:    ldr h18, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h24, [sp, #40]
+; NONEON-NOSVE-NEXT:    str h6, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcsel s16, s25, s16, eq
+; NONEON-NOSVE-NEXT:    fcmp s21, s20
+; NONEON-NOSVE-NEXT:    fcvt s21, h18
+; NONEON-NOSVE-NEXT:    fcvt s25, h24
+; NONEON-NOSVE-NEXT:    ldr h20, [sp, #58]
+; NONEON-NOSVE-NEXT:    str h7, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcsel s19, s26, s19, eq
+; NONEON-NOSVE-NEXT:    fcmp s23, s22
+; NONEON-NOSVE-NEXT:    fcvt s23, h20
+; NONEON-NOSVE-NEXT:    fcvt s26, h28
+; NONEON-NOSVE-NEXT:    ldr h22, [sp, #60]
+; NONEON-NOSVE-NEXT:    str h16, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcsel s17, s27, s17, eq
+; NONEON-NOSVE-NEXT:    fcmp s25, s21
+; NONEON-NOSVE-NEXT:    fcvt s25, h22
+; NONEON-NOSVE-NEXT:    fcvt s27, h29
+; NONEON-NOSVE-NEXT:    ldr h21, [sp, #62]
+; NONEON-NOSVE-NEXT:    str h19, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcsel s18, s24, s18, eq
+; NONEON-NOSVE-NEXT:    ldr h24, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcmp s26, s23
+; NONEON-NOSVE-NEXT:    fcvt s23, h21
+; NONEON-NOSVE-NEXT:    str h17, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcvt s26, h24
+; NONEON-NOSVE-NEXT:    fcsel s20, s28, s20, eq
+; NONEON-NOSVE-NEXT:    fcmp s27, s25
+; NONEON-NOSVE-NEXT:    ldr h25, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h27, [sp]
+; NONEON-NOSVE-NEXT:    str h18, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvt s17, h25
+; NONEON-NOSVE-NEXT:    fcvt s18, h27
+; NONEON-NOSVE-NEXT:    fcsel s7, s29, s22, eq
+; NONEON-NOSVE-NEXT:    fcmp s26, s23
+; NONEON-NOSVE-NEXT:    str h20, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcsel s16, s24, s21, eq
+; NONEON-NOSVE-NEXT:    str h7, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s18, s17
+; NONEON-NOSVE-NEXT:    str h16, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcsel s2, s27, s25, eq
+; NONEON-NOSVE-NEXT:    str h2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -249,9 +359,22 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %m
 ;
 ; NONEON-NOSVE-LABEL: select_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.2s, v2.2s, #31
-; NONEON-NOSVE-NEXT:    cmlt v2.2s, v2.2s, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    stp d2, d0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    sbfx w8, w9, #0, #1
+; NONEON-NOSVE-NEXT:    fcsel s3, s2, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x float> %op1, <2 x float> %op2
   ret <2 x float> %sel
@@ -275,10 +398,36 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %m
 ;
 ; NONEON-NOSVE-LABEL: select_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    shl v2.4s, v2.4s, #31
-; NONEON-NOSVE-NEXT:    cmlt v2.4s, v2.4s, #0
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    sbfx w9, w11, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    fcsel s3, s2, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcsel s3, s2, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x float> %op1, <4 x float> %op2
   ret <4 x float> %sel
@@ -299,14 +448,45 @@ define void @select_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: select_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    fcmeq v4.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcmeq v5.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldr s4, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr s17, [sp]
+; NONEON-NOSVE-NEXT:    ldp s6, s7, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, eq
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    ldp s1, s5, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcsel s2, s3, s2, eq
+; NONEON-NOSVE-NEXT:    ldp s16, s3, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcmp s4, s1
+; NONEON-NOSVE-NEXT:    fcsel s1, s4, s1, eq
+; NONEON-NOSVE-NEXT:    fcmp s5, s3
+; NONEON-NOSVE-NEXT:    ldr s4, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcsel s3, s5, s3, eq
+; NONEON-NOSVE-NEXT:    fcmp s6, s4
+; NONEON-NOSVE-NEXT:    ldr s5, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp s2, s1, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcsel s4, s6, s4, eq
+; NONEON-NOSVE-NEXT:    fcmp s7, s5
+; NONEON-NOSVE-NEXT:    ldr s6, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcsel s5, s7, s5, eq
+; NONEON-NOSVE-NEXT:    fcmp s16, s6
+; NONEON-NOSVE-NEXT:    ldr s7, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s3, s4, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcsel s6, s16, s6, eq
+; NONEON-NOSVE-NEXT:    fcmp s17, s7
+; NONEON-NOSVE-NEXT:    fcsel s3, s17, s7, eq
+; NONEON-NOSVE-NEXT:    stp s5, s6, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp s3, s0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -325,10 +505,13 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1>
 ;
 ; NONEON-NOSVE-LABEL: select_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    fmov d2, x8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    fcsel d0, d0, d1, ne
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <1 x i1> %mask, <1 x double> %op1, <1 x double> %op2
   ret <1 x double> %sel
@@ -352,10 +535,23 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1>
 ;
 ; NONEON-NOSVE-LABEL: select_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    shl v2.2d, v2.2d, #63
-; NONEON-NOSVE-NEXT:    cmlt v2.2d, v2.2d, #0
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    sbfx x8, x8, #0, #1
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    sbfx x8, x9, #0, #1
+; NONEON-NOSVE-NEXT:    fcsel d3, d2, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x double> %op1, <2 x double> %op2
   ret <2 x double> %sel
@@ -376,14 +572,29 @@ define void @select_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    fcmeq v4.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcmeq v5.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d5, d1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d0, d3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp d4, d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcmp d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, eq
+; NONEON-NOSVE-NEXT:    fcmp d3, d2
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcsel d2, d3, d2, eq
+; NONEON-NOSVE-NEXT:    fcmp d4, d1
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel d1, d4, d1, eq
+; NONEON-NOSVE-NEXT:    fcmp d5, d3
+; NONEON-NOSVE-NEXT:    fcsel d3, d5, d3, eq
+; NONEON-NOSVE-NEXT:    stp d2, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d3, d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
index ae97a266c6ff0d..3ba61c3335a64c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
@@ -25,10 +25,21 @@ define <4 x i8> @insertelement_v4i8(<4 x i8> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x i8> %op1, i8 5, i64 3
     ret <4 x i8> %r
@@ -50,10 +61,23 @@ define <8 x i8> @insertelement_v8i8(<8 x i8> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.b[7], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <8 x i8> %op1, i8 5, i64 7
     ret <8 x i8> %r
@@ -75,8 +99,25 @@ define <16 x i8> @insertelement_v16i8(<16 x i8> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.b[15], w8
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <16 x i8> %op1, i8 5, i64 15
     ret <16 x i8> %r
@@ -98,8 +139,25 @@ define <32 x i8> @insertelement_v32i8(<32 x i8> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v1.b[15], w8
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <32 x i8> %op1, i8 5, i64 31
     ret <32 x i8> %r
@@ -122,10 +180,18 @@ define <2 x i16> @insertelement_v2i16(<2 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x i16> %op1, i16 5, i64 1
     ret <2 x i16> %r
@@ -147,10 +213,21 @@ define <4 x i16> @insertelement_v4i16(<4 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x i16> %op1, i16 5, i64 3
     ret <4 x i16> %r
@@ -172,8 +249,23 @@ define <8 x i16> @insertelement_v8i16(<8 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.h[7], w8
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <8 x i16> %op1, i16 5, i64 7
     ret <8 x i16> %r
@@ -195,8 +287,23 @@ define <16 x i16> @insertelement_v16i16(<16 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v1.h[7], w8
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <16 x i16> %op1, i16 5, i64 15
     ret <16 x i16> %r
@@ -219,10 +326,18 @@ define <2 x i32> @insertelement_v2i32(<2 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x i32> %op1, i32 5, i64 1
     ret <2 x i32> %r
@@ -244,8 +359,20 @@ define <4 x i32> @insertelement_v4i32(<4 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w8
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x i32> %op1, i32 5, i64 3
     ret <4 x i32> %r
@@ -267,9 +394,20 @@ define <8 x i32> @insertelement_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v1.s[3], w8
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %op1 = load <8 x i32>, ptr %a
     %r = insertelement <8 x i32> %op1, i32 5, i64 7
@@ -286,8 +424,12 @@ define <1 x i64> @insertelement_v1i64(<1 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <1 x i64> %op1, i64 5, i64 0
     ret <1 x i64> %r
@@ -309,8 +451,18 @@ define <2 x i64> @insertelement_v2i64(<2 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x8
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x i64> %op1, i64 5, i64 1
     ret <2 x i64> %r
@@ -332,9 +484,18 @@ define <4 x i64> @insertelement_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x8
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %op1 = load <4 x i64>, ptr %a
     %r = insertelement <4 x i64> %op1, i64 5, i64 3
@@ -358,11 +519,14 @@ define <2 x half> @insertelement_v2f16(<2 x half> %op1) {
 ; NONEON-NOSVE-LABEL: insertelement_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    adrp x8, .LCPI14_0
-; NONEON-NOSVE-NEXT:    add x8, x8, :lo12:.LCPI14_0
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    ld1r { v1.4h }, [x8]
-; NONEON-NOSVE-NEXT:    mov v1.h[0], v0.h[0]
-; NONEON-NOSVE-NEXT:    fmov d0, d1
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [x8, :lo12:.LCPI14_0]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x half> %op1, half 5.0, i64 1
     ret <2 x half> %r
@@ -384,11 +548,22 @@ define <4 x half> @insertelement_v4f16(<4 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    adrp x8, .LCPI15_0
-; NONEON-NOSVE-NEXT:    add x8, x8, :lo12:.LCPI15_0
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[3], [x8]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [x8, :lo12:.LCPI15_0]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x half> %op1, half 5.0, i64 3
     ret <4 x half> %r
@@ -410,9 +585,24 @@ define <8 x half> @insertelement_v8f16(<8 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    adrp x8, .LCPI16_0
-; NONEON-NOSVE-NEXT:    add x8, x8, :lo12:.LCPI16_0
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[7], [x8]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [x8, :lo12:.LCPI16_0]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <8 x half> %op1, half 5.0, i64 7
     ret <8 x half> %r
@@ -434,10 +624,24 @@ define <16 x half> @insertelement_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    adrp x8, .LCPI17_0
-; NONEON-NOSVE-NEXT:    add x8, x8, :lo12:.LCPI17_0
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[7], [x8]
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldr h1, [x8, :lo12:.LCPI17_0]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #16]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %op1 = load <16 x half>, ptr %a
     %r = insertelement <16 x half> %op1, half 5.0, i64 15
@@ -461,10 +665,18 @@ define <2 x float> @insertelement_v2f32(<2 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov s1, #5.00000000
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    mov v0.s[1], v1.s[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    mov w8, #1084227584 // =0x40a00000
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x float> %op1, float 5.0, i64 1
     ret <2 x float> %r
@@ -486,8 +698,20 @@ define <4 x float> @insertelement_v4f32(<4 x float> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov s1, #5.00000000
-; NONEON-NOSVE-NEXT:    mov v0.s[3], v1.s[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    mov w8, #1084227584 // =0x40a00000
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s1, s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x float> %op1, float 5.0, i64 3
     ret <4 x float> %r
@@ -509,9 +733,21 @@ define <8 x float> @insertelement_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov s2, #5.00000000
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    mov v1.s[3], v2.s[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    mov w8, #1084227584 // =0x40a00000
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s1, s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %op1 = load <8 x float>, ptr %a
     %r = insertelement <8 x float> %op1, float 5.0, i64 7
@@ -527,8 +763,12 @@ define <1 x double> @insertelement_v1f64(<1 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    mov x8, #4617315517961601024 // =0x4014000000000000
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <1 x double> %op1, double 5.0, i64 0
     ret <1 x double> %r
@@ -550,8 +790,18 @@ define <2 x double> @insertelement_v2f64(<2 x double> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov d1, #5.00000000
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    mov x8, #4617315517961601024 // =0x4014000000000000
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x double> %op1, double 5.0, i64 1
     ret <2 x double> %r
@@ -573,10 +823,19 @@ define <4 x double> @insertelement_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov d0, #5.00000000
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    mov x8, #4617315517961601024 // =0x4014000000000000
 ; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    mov v1.d[1], v0.d[0]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
     %op1 = load <4 x double>, ptr %a
     %r = insertelement <4 x double> %op1, double 5.0, i64 3
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
index 1b438559e05380..a2875ffef2e88a 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
@@ -20,7 +20,27 @@ define <4 x i8> @add_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: add_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = add <4 x i8> %op1, %op2
   ret <4 x i8> %res
@@ -37,7 +57,43 @@ define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: add_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = add <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -54,7 +110,74 @@ define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: add_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = add <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -72,11 +195,143 @@ define void @add_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: add_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -96,7 +351,18 @@ define <2 x i16> @add_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: add_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = add <2 x i16> %op1, %op2
   ret <2 x i16> %res
@@ -113,7 +379,27 @@ define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: add_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = add <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -130,7 +416,42 @@ define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: add_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = add <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -148,11 +469,79 @@ define void @add_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: add_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    add v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -172,7 +561,18 @@ define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: add_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = add <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -189,7 +589,24 @@ define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: add_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = add <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -207,11 +624,43 @@ define void @add_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: add_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    add v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -231,7 +680,14 @@ define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: add_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add d0, d0, d1
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    add x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = add <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -248,7 +704,17 @@ define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: add_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    add v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = add <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -266,11 +732,29 @@ define void @add_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: add_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    add v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    add v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -303,7 +787,27 @@ define <4 x i8> @mul_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = mul <4 x i8> %op1, %op2
   ret <4 x i8> %res
@@ -329,7 +833,43 @@ define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = mul <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -355,7 +895,74 @@ define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = mul <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -384,11 +991,143 @@ define void @mul_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    mul v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    mul v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -417,7 +1156,17 @@ define <2 x i16> @mul_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = mul <2 x i16> %op1, %op2
   ret <2 x i16> %res
@@ -443,7 +1192,27 @@ define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = mul <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -469,7 +1238,42 @@ define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = mul <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -498,11 +1302,79 @@ define void @mul_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    mul v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    mul v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -531,7 +1403,17 @@ define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = mul <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -557,7 +1439,22 @@ define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mul v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = mul <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -586,11 +1483,39 @@ define void @mul_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    mul v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    mul v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -619,12 +1544,14 @@ define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fmov x8, d1
 ; NONEON-NOSVE-NEXT:    fmov x9, d0
 ; NONEON-NOSVE-NEXT:    mul x8, x9, x8
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = mul <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -650,14 +1577,16 @@ define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x10, d1
-; NONEON-NOSVE-NEXT:    fmov x11, d0
-; NONEON-NOSVE-NEXT:    mov x8, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x9, v0.d[1]
-; NONEON-NOSVE-NEXT:    mul x10, x11, x10
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    mul x8, x9, x8
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = mul <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -686,25 +1615,27 @@ define void @mul_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    fmov x12, d2
-; NONEON-NOSVE-NEXT:    mov x11, v2.d[1]
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    mov x10, v3.d[1]
-; NONEON-NOSVE-NEXT:    mov x13, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x14, v0.d[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
 ; NONEON-NOSVE-NEXT:    mul x8, x9, x8
-; NONEON-NOSVE-NEXT:    fmov x9, d3
-; NONEON-NOSVE-NEXT:    mul x10, x11, x10
-; NONEON-NOSVE-NEXT:    mul x9, x12, x9
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    mul x11, x14, x13
-; NONEON-NOSVE-NEXT:    fmov d0, x9
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x10
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -728,7 +1659,27 @@ define <4 x i8> @sub_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sub <4 x i8> %op1, %op2
   ret <4 x i8> %res
@@ -745,7 +1696,43 @@ define <8 x i8> @sub_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sub <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -762,7 +1749,74 @@ define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = sub <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -780,11 +1834,143 @@ define void @sub_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    sub v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    sub v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -804,7 +1990,18 @@ define <2 x i16> @sub_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sub <2 x i16> %op1, %op2
   ret <2 x i16> %res
@@ -821,7 +2018,27 @@ define <4 x i16> @sub_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sub <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -838,7 +2055,42 @@ define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = sub <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -856,11 +2108,79 @@ define void @sub_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    sub v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    sub v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -880,7 +2200,18 @@ define <2 x i32> @sub_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sub <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -897,7 +2228,24 @@ define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = sub <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -915,11 +2263,43 @@ define void @sub_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    sub v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sub v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -939,7 +2319,14 @@ define <1 x i64> @sub_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub d0, d0, d1
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    sub x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = sub <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -956,7 +2343,17 @@ define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = sub <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -974,11 +2371,29 @@ define void @sub_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    sub v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    sub v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sub x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sub x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -1003,9 +2418,26 @@ define <4 x i8> @abs_v4i8(<4 x i8> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    abs v0.4h, v0.4h
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    cneg w8, w9, mi
+; NONEON-NOSVE-NEXT:    cmp w10, #0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cneg w8, w10, mi
+; NONEON-NOSVE-NEXT:    cmp w11, #0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    cneg w8, w11, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.abs.v4i8(<4 x i8> %op1, i1 false)
   ret <4 x i8> %res
@@ -1022,7 +2454,42 @@ define <8 x i8> @abs_v8i8(<8 x i8> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %op1, i1 false)
   ret <8 x i8> %res
@@ -1039,7 +2506,74 @@ define <16 x i8> @abs_v16i8(<16 x i8> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %op1, i1 false)
   ret <16 x i8> %res
@@ -1057,10 +2591,140 @@ define void @abs_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    abs v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %op1, i1 false)
@@ -1080,9 +2744,17 @@ define <2 x i16> @abs_v2i16(<2 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    abs v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w9, mi
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %op1, i1 false)
   ret <2 x i16> %res
@@ -1099,7 +2771,26 @@ define <4 x i16> @abs_v4i16(<4 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs v0.4h, v0.4h
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %op1, i1 false)
   ret <4 x i16> %res
@@ -1116,7 +2807,42 @@ define <8 x i16> @abs_v8i16(<8 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %op1, i1 false)
   ret <8 x i16> %res
@@ -1134,10 +2860,76 @@ define void @abs_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    abs v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %op1, i1 false)
@@ -1156,7 +2948,17 @@ define <2 x i32> @abs_v2i32(<2 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false)
   ret <2 x i32> %res
@@ -1173,7 +2975,24 @@ define <4 x i32> @abs_v4i32(<4 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false)
   ret <4 x i32> %res
@@ -1191,10 +3010,40 @@ define void @abs_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    abs v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false)
@@ -1213,7 +3062,14 @@ define <1 x i64> @abs_v1i64(<1 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs d0, d0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x8, x8, mi
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.abs.v1i64(<1 x i64> %op1, i1 false)
   ret <1 x i64> %res
@@ -1230,7 +3086,17 @@ define <2 x i64> @abs_v2i64(<2 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    abs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x9, x8, mi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x8, x8, mi
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false)
   ret <2 x i64> %res
@@ -1248,10 +3114,26 @@ define void @abs_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    abs v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x9, x8, mi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x8, x8, mi
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x9, x8, mi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x8, x8, mi
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
index ee0ca0e60b5e51..0b4316686fff64 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
@@ -22,7 +22,51 @@ define <8 x i8> @icmp_eq_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <8 x i8> %op1, %op2
   %sext = sext <8 x i1> %cmp to <8 x i8>
@@ -42,7 +86,90 @@ define <16 x i8> @icmp_eq_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <16 x i8> %op1, %op2
   %sext = sext <16 x i1> %cmp to <16 x i8>
@@ -64,11 +191,175 @@ define void @icmp_eq_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmeq v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cmeq v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -91,7 +382,31 @@ define <4 x i16> @icmp_eq_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <4 x i16> %op1, %op2
   %sext = sext <4 x i1> %cmp to <4 x i16>
@@ -111,7 +426,50 @@ define <8 x i16> @icmp_eq_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <8 x i16> %op1, %op2
   %sext = sext <8 x i1> %cmp to <8 x i16>
@@ -133,11 +491,95 @@ define void @icmp_eq_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmeq v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    cmeq v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -160,7 +602,19 @@ define <2 x i32> @icmp_eq_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <2 x i32> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i32>
@@ -180,7 +634,26 @@ define <4 x i32> @icmp_eq_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <4 x i32> %op1, %op2
   %sext = sext <4 x i1> %cmp to <4 x i32>
@@ -202,11 +675,47 @@ define void @icmp_eq_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmeq v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    cmeq v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -229,7 +738,15 @@ define <1 x i64> @icmp_eq_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq d0, d0, d1
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <1 x i64> %op1, %op2
   %sext = sext <1 x i1> %cmp to <1 x i64>
@@ -249,7 +766,18 @@ define <2 x i64> @icmp_eq_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmeq v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm x10, eq
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <2 x i64> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i64>
@@ -271,11 +799,31 @@ define void @icmp_eq_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmeq v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    cmeq v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    csetm x10, eq
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm x10, eq
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -304,13 +852,175 @@ define void @icmp_ne_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_ne_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmeq v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cmeq v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    mvn v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -337,10 +1047,53 @@ define void @icmp_sge_v8i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_sge_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    cmge v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %op2 = load <8 x i16>, ptr %b
@@ -369,11 +1122,95 @@ define void @icmp_sgt_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_sgt_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmgt v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    cmgt v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -400,10 +1237,29 @@ define void @icmp_sle_v4i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_sle_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    cmge v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csetm w10, le
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm w10, le
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %op2 = load <4 x i32>, ptr %b
@@ -432,11 +1288,47 @@ define void @icmp_slt_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_slt_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmgt v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    cmgt v1.4s, v3.4s, v2.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    csetm w10, lt
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    csetm w10, lt
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csetm w10, lt
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm w10, lt
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -463,10 +1355,21 @@ define void @icmp_uge_v2i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_uge_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    cmhs v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm x10, hs
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, hs
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
@@ -493,10 +1396,21 @@ define void @icmp_ugt_v2i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_ugt_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    cmhi v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm x10, hi
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, hi
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
@@ -523,10 +1437,21 @@ define void @icmp_ule_v2i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_ule_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    cmhs v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm x10, ls
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, ls
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
@@ -553,10 +1478,21 @@ define void @icmp_ult_v2i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_ult_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    cmhi v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm x10, lo
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, lo
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
index d79d6c18ed5a6e..e09b1613a54afb 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
@@ -28,27 +28,27 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    shl v1.4h, v1.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v1.4h, v1.4h, #8
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v1.h[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #8]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #20]
 ; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.h[2]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
 ; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    smov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w8, w12, w11
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w10
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i8> %op1, %op2
   ret <4 x i8> %res
@@ -80,41 +80,43 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    smov w10, v0.b[0]
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[2]
-; NONEON-NOSVE-NEXT:    smov w12, v0.b[3]
-; NONEON-NOSVE-NEXT:    smov w13, v0.b[4]
-; NONEON-NOSVE-NEXT:    smov w14, v0.b[5]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v1.b[0]
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[2]
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    smov w11, v1.b[3]
-; NONEON-NOSVE-NEXT:    fmov s2, w9
-; NONEON-NOSVE-NEXT:    smov w9, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    smov w12, v1.b[4]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w10
-; NONEON-NOSVE-NEXT:    smov w10, v0.b[6]
-; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    smov w13, v1.b[5]
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[7]
-; NONEON-NOSVE-NEXT:    sdiv w8, w14, w13
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w12
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[7]
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
-; NONEON-NOSVE-NEXT:    sdiv w8, w11, w10
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w9
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
-; NONEON-NOSVE-NEXT:    fmov d0, d2
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -166,71 +168,74 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    smov w10, v0.b[0]
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[2]
-; NONEON-NOSVE-NEXT:    smov w12, v0.b[3]
-; NONEON-NOSVE-NEXT:    smov w13, v0.b[4]
-; NONEON-NOSVE-NEXT:    smov w14, v0.b[5]
-; NONEON-NOSVE-NEXT:    smov w15, v0.b[6]
-; NONEON-NOSVE-NEXT:    smov w16, v0.b[7]
-; NONEON-NOSVE-NEXT:    smov w17, v0.b[8]
-; NONEON-NOSVE-NEXT:    smov w18, v0.b[9]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v1.b[0]
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[2]
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    smov w11, v1.b[3]
-; NONEON-NOSVE-NEXT:    fmov s2, w9
-; NONEON-NOSVE-NEXT:    smov w9, v1.b[10]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    smov w12, v1.b[4]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w10
-; NONEON-NOSVE-NEXT:    smov w10, v0.b[10]
-; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    smov w13, v1.b[5]
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[11]
-; NONEON-NOSVE-NEXT:    sdiv w13, w14, w13
-; NONEON-NOSVE-NEXT:    smov w14, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w12
-; NONEON-NOSVE-NEXT:    smov w12, v0.b[12]
-; NONEON-NOSVE-NEXT:    sdiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    smov w15, v1.b[7]
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w13
-; NONEON-NOSVE-NEXT:    smov w13, v0.b[13]
-; NONEON-NOSVE-NEXT:    sdiv w15, w16, w15
-; NONEON-NOSVE-NEXT:    smov w16, v1.b[8]
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w14
-; NONEON-NOSVE-NEXT:    sdiv w16, w17, w16
-; NONEON-NOSVE-NEXT:    smov w17, v1.b[9]
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w15
-; NONEON-NOSVE-NEXT:    sdiv w8, w18, w17
-; NONEON-NOSVE-NEXT:    mov v2.b[8], w16
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[11]
-; NONEON-NOSVE-NEXT:    mov v2.b[9], w8
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    smov w11, v1.b[12]
-; NONEON-NOSVE-NEXT:    mov v2.b[10], w9
-; NONEON-NOSVE-NEXT:    smov w9, v1.b[14]
-; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    smov w12, v1.b[13]
-; NONEON-NOSVE-NEXT:    mov v2.b[11], w10
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[15]
-; NONEON-NOSVE-NEXT:    sdiv w8, w13, w12
-; NONEON-NOSVE-NEXT:    smov w12, v0.b[14]
-; NONEON-NOSVE-NEXT:    mov v2.b[12], w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[15]
-; NONEON-NOSVE-NEXT:    sdiv w9, w12, w9
-; NONEON-NOSVE-NEXT:    mov v2.b[13], w8
-; NONEON-NOSVE-NEXT:    sdiv w8, w11, w10
-; NONEON-NOSVE-NEXT:    mov v2.b[14], w9
-; NONEON-NOSVE-NEXT:    mov v2.b[15], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -315,159 +320,143 @@ define void @sdiv_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str x27, [sp, #-80]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -80
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    smov w10, v0.b[0]
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[2]
-; NONEON-NOSVE-NEXT:    smov w12, v0.b[3]
-; NONEON-NOSVE-NEXT:    smov w13, v0.b[4]
-; NONEON-NOSVE-NEXT:    smov w14, v0.b[5]
-; NONEON-NOSVE-NEXT:    smov w15, v0.b[6]
-; NONEON-NOSVE-NEXT:    smov w17, v0.b[8]
-; NONEON-NOSVE-NEXT:    smov w2, v0.b[10]
-; NONEON-NOSVE-NEXT:    smov w3, v0.b[11]
-; NONEON-NOSVE-NEXT:    smov w4, v0.b[12]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v1.b[0]
-; NONEON-NOSVE-NEXT:    smov w5, v0.b[13]
-; NONEON-NOSVE-NEXT:    smov w6, v0.b[14]
-; NONEON-NOSVE-NEXT:    smov w1, v3.b[1]
-; NONEON-NOSVE-NEXT:    smov w7, v2.b[0]
-; NONEON-NOSVE-NEXT:    smov w19, v2.b[2]
-; NONEON-NOSVE-NEXT:    smov w20, v2.b[3]
-; NONEON-NOSVE-NEXT:    smov w21, v2.b[4]
-; NONEON-NOSVE-NEXT:    smov w22, v2.b[5]
-; NONEON-NOSVE-NEXT:    smov w23, v2.b[6]
-; NONEON-NOSVE-NEXT:    smov w24, v2.b[7]
-; NONEON-NOSVE-NEXT:    smov w25, v2.b[8]
-; NONEON-NOSVE-NEXT:    smov w26, v2.b[9]
-; NONEON-NOSVE-NEXT:    smov w27, v2.b[10]
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[2]
-; NONEON-NOSVE-NEXT:    sdiv w11, w11, w10
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[3]
-; NONEON-NOSVE-NEXT:    fmov s5, w9
-; NONEON-NOSVE-NEXT:    smov w9, v3.b[11]
-; NONEON-NOSVE-NEXT:    mov v5.b[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w10, w12, w10
-; NONEON-NOSVE-NEXT:    smov w12, v1.b[4]
-; NONEON-NOSVE-NEXT:    mov v5.b[2], w11
-; NONEON-NOSVE-NEXT:    smov w11, v2.b[11]
-; NONEON-NOSVE-NEXT:    sdiv w13, w13, w12
-; NONEON-NOSVE-NEXT:    smov w12, v1.b[5]
-; NONEON-NOSVE-NEXT:    mov v5.b[3], w10
-; NONEON-NOSVE-NEXT:    smov w10, v3.b[12]
-; NONEON-NOSVE-NEXT:    sdiv w12, w14, w12
-; NONEON-NOSVE-NEXT:    smov w14, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v5.b[4], w13
-; NONEON-NOSVE-NEXT:    smov w13, v2.b[14]
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    smov w14, v1.b[7]
-; NONEON-NOSVE-NEXT:    smov w15, v0.b[7]
-; NONEON-NOSVE-NEXT:    mov v5.b[5], w12
-; NONEON-NOSVE-NEXT:    smov w12, v2.b[13]
-; NONEON-NOSVE-NEXT:    sdiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    smov w15, v1.b[8]
-; NONEON-NOSVE-NEXT:    mov v5.b[6], w16
-; NONEON-NOSVE-NEXT:    sdiv w18, w17, w15
-; NONEON-NOSVE-NEXT:    smov w15, v1.b[9]
-; NONEON-NOSVE-NEXT:    smov w17, v0.b[9]
-; NONEON-NOSVE-NEXT:    mov v5.b[7], w14
-; NONEON-NOSVE-NEXT:    sdiv w17, w17, w15
-; NONEON-NOSVE-NEXT:    smov w15, v1.b[10]
-; NONEON-NOSVE-NEXT:    mov v5.b[8], w18
-; NONEON-NOSVE-NEXT:    sdiv w15, w2, w15
-; NONEON-NOSVE-NEXT:    smov w2, v1.b[11]
-; NONEON-NOSVE-NEXT:    mov v5.b[9], w17
-; NONEON-NOSVE-NEXT:    sdiv w2, w3, w2
-; NONEON-NOSVE-NEXT:    smov w3, v1.b[12]
-; NONEON-NOSVE-NEXT:    mov v5.b[10], w15
-; NONEON-NOSVE-NEXT:    sdiv w3, w4, w3
-; NONEON-NOSVE-NEXT:    smov w4, v1.b[13]
-; NONEON-NOSVE-NEXT:    mov v5.b[11], w2
-; NONEON-NOSVE-NEXT:    sdiv w4, w5, w4
-; NONEON-NOSVE-NEXT:    smov w5, v1.b[14]
-; NONEON-NOSVE-NEXT:    mov v5.b[12], w3
-; NONEON-NOSVE-NEXT:    sdiv w5, w6, w5
-; NONEON-NOSVE-NEXT:    smov w6, v2.b[1]
-; NONEON-NOSVE-NEXT:    mov v5.b[13], w4
-; NONEON-NOSVE-NEXT:    sdiv w1, w6, w1
-; NONEON-NOSVE-NEXT:    smov w6, v3.b[0]
-; NONEON-NOSVE-NEXT:    mov v5.b[14], w5
-; NONEON-NOSVE-NEXT:    sdiv w6, w7, w6
-; NONEON-NOSVE-NEXT:    smov w7, v3.b[2]
-; NONEON-NOSVE-NEXT:    sdiv w7, w19, w7
-; NONEON-NOSVE-NEXT:    smov w19, v3.b[3]
-; NONEON-NOSVE-NEXT:    fmov s4, w6
-; NONEON-NOSVE-NEXT:    mov v4.b[1], w1
-; NONEON-NOSVE-NEXT:    sdiv w19, w20, w19
-; NONEON-NOSVE-NEXT:    smov w20, v3.b[4]
-; NONEON-NOSVE-NEXT:    mov v4.b[2], w7
-; NONEON-NOSVE-NEXT:    sdiv w20, w21, w20
-; NONEON-NOSVE-NEXT:    smov w21, v3.b[5]
-; NONEON-NOSVE-NEXT:    mov v4.b[3], w19
-; NONEON-NOSVE-NEXT:    sdiv w21, w22, w21
-; NONEON-NOSVE-NEXT:    smov w22, v3.b[6]
-; NONEON-NOSVE-NEXT:    mov v4.b[4], w20
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w22, w23, w22
-; NONEON-NOSVE-NEXT:    smov w23, v3.b[7]
-; NONEON-NOSVE-NEXT:    mov v4.b[5], w21
-; NONEON-NOSVE-NEXT:    sdiv w23, w24, w23
-; NONEON-NOSVE-NEXT:    smov w24, v3.b[8]
-; NONEON-NOSVE-NEXT:    mov v4.b[6], w22
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w24, w25, w24
-; NONEON-NOSVE-NEXT:    smov w25, v3.b[9]
-; NONEON-NOSVE-NEXT:    mov v4.b[7], w23
-; NONEON-NOSVE-NEXT:    sdiv w25, w26, w25
-; NONEON-NOSVE-NEXT:    smov w26, v3.b[10]
-; NONEON-NOSVE-NEXT:    mov v4.b[8], w24
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w8, w27, w26
-; NONEON-NOSVE-NEXT:    mov v4.b[9], w25
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w9, w11, w9
-; NONEON-NOSVE-NEXT:    smov w11, v2.b[12]
-; NONEON-NOSVE-NEXT:    mov v4.b[10], w8
-; NONEON-NOSVE-NEXT:    smov w8, v3.b[15]
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    smov w11, v3.b[13]
-; NONEON-NOSVE-NEXT:    mov v4.b[11], w9
-; NONEON-NOSVE-NEXT:    smov w9, v1.b[15]
-; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    smov w12, v3.b[14]
-; NONEON-NOSVE-NEXT:    mov v4.b[12], w10
-; NONEON-NOSVE-NEXT:    smov w10, v0.b[15]
-; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    smov w13, v2.b[15]
-; NONEON-NOSVE-NEXT:    mov v4.b[13], w11
-; NONEON-NOSVE-NEXT:    sdiv w8, w13, w8
-; NONEON-NOSVE-NEXT:    mov v4.b[14], w12
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    mov v4.b[15], w8
-; NONEON-NOSVE-NEXT:    mov v5.b[15], w9
-; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
-; NONEON-NOSVE-NEXT:    ldr x27, [sp], #80 // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -490,19 +479,18 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    shl v1.2s, v1.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v1.2s, v1.2s, #16
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    mov w10, v0.s[1]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    mov w9, v1.s[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #16]
 ; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i16> %op1, %op2
   ret <2 x i16> %res
@@ -523,25 +511,27 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v1.h[0]
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.h[2]
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    smov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w8, w12, w11
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w10
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -572,39 +562,42 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    smov w13, v0.h[4]
-; NONEON-NOSVE-NEXT:    smov w14, v0.h[5]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v1.h[0]
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.h[2]
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    smov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s2, w9
-; NONEON-NOSVE-NEXT:    smov w9, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    smov w12, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w10
-; NONEON-NOSVE-NEXT:    smov w10, v0.h[6]
-; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    smov w13, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.h[7]
-; NONEON-NOSVE-NEXT:    sdiv w8, w14, w13
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w12
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    sdiv w8, w11, w10
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w9
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -649,75 +642,79 @@ define void @sdiv_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    smov w13, v0.h[4]
-; NONEON-NOSVE-NEXT:    smov w14, v0.h[5]
-; NONEON-NOSVE-NEXT:    smov w15, v0.h[6]
-; NONEON-NOSVE-NEXT:    smov w16, v2.h[1]
-; NONEON-NOSVE-NEXT:    smov w17, v2.h[0]
-; NONEON-NOSVE-NEXT:    smov w18, v2.h[2]
-; NONEON-NOSVE-NEXT:    smov w1, v2.h[3]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v1.h[0]
-; NONEON-NOSVE-NEXT:    smov w2, v2.h[4]
-; NONEON-NOSVE-NEXT:    smov w3, v2.h[5]
-; NONEON-NOSVE-NEXT:    smov w4, v2.h[6]
-; NONEON-NOSVE-NEXT:    sdiv w10, w10, w9
-; NONEON-NOSVE-NEXT:    smov w9, v1.h[2]
-; NONEON-NOSVE-NEXT:    sdiv w9, w11, w9
-; NONEON-NOSVE-NEXT:    smov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s5, w10
-; NONEON-NOSVE-NEXT:    smov w10, v3.h[7]
-; NONEON-NOSVE-NEXT:    mov v5.h[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    smov w12, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov v5.h[2], w9
-; NONEON-NOSVE-NEXT:    smov w9, v2.h[7]
-; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    smov w13, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov v5.h[3], w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.h[7]
-; NONEON-NOSVE-NEXT:    sdiv w13, w14, w13
-; NONEON-NOSVE-NEXT:    smov w14, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v5.h[4], w12
-; NONEON-NOSVE-NEXT:    sdiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    smov w15, v3.h[1]
-; NONEON-NOSVE-NEXT:    mov v5.h[5], w13
-; NONEON-NOSVE-NEXT:    sdiv w15, w16, w15
-; NONEON-NOSVE-NEXT:    smov w16, v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v5.h[6], w14
-; NONEON-NOSVE-NEXT:    sdiv w16, w17, w16
-; NONEON-NOSVE-NEXT:    smov w17, v3.h[2]
-; NONEON-NOSVE-NEXT:    sdiv w17, w18, w17
-; NONEON-NOSVE-NEXT:    smov w18, v3.h[3]
-; NONEON-NOSVE-NEXT:    fmov s4, w16
-; NONEON-NOSVE-NEXT:    mov v4.h[1], w15
-; NONEON-NOSVE-NEXT:    sdiv w18, w1, w18
-; NONEON-NOSVE-NEXT:    smov w1, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov v4.h[2], w17
-; NONEON-NOSVE-NEXT:    sdiv w1, w2, w1
-; NONEON-NOSVE-NEXT:    smov w2, v3.h[5]
-; NONEON-NOSVE-NEXT:    mov v4.h[3], w18
-; NONEON-NOSVE-NEXT:    sdiv w2, w3, w2
-; NONEON-NOSVE-NEXT:    smov w3, v3.h[6]
-; NONEON-NOSVE-NEXT:    mov v4.h[4], w1
-; NONEON-NOSVE-NEXT:    sdiv w8, w4, w3
-; NONEON-NOSVE-NEXT:    mov v4.h[5], w2
-; NONEON-NOSVE-NEXT:    sdiv w9, w9, w10
-; NONEON-NOSVE-NEXT:    smov w10, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v4.h[6], w8
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    mov v4.h[7], w9
-; NONEON-NOSVE-NEXT:    mov v5.h[7], w10
-; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -738,17 +735,17 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    mov w10, v0.s[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    mov w9, v1.s[1]
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -766,22 +763,22 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    fmov w10, s0
-; NONEON-NOSVE-NEXT:    mov w11, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w12, v0.s[3]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
 ; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    fmov w9, s1
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    mov w10, v1.s[2]
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    mov w11, v1.s[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w9
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w8, w12, w11
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w10
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -801,41 +798,39 @@ define void @sdiv_v8i32(ptr %a, ptr %b)  {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    fmov w10, s0
-; NONEON-NOSVE-NEXT:    mov w11, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w12, v2.s[1]
-; NONEON-NOSVE-NEXT:    fmov w13, s2
-; NONEON-NOSVE-NEXT:    mov w14, v2.s[2]
-; NONEON-NOSVE-NEXT:    mov w15, v2.s[3]
-; NONEON-NOSVE-NEXT:    mov w16, v0.s[3]
-; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    fmov w9, s1
-; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    mov w10, v1.s[2]
-; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    mov w11, v3.s[1]
-; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    fmov w12, s3
-; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    mov w13, v3.s[2]
-; NONEON-NOSVE-NEXT:    sdiv w13, w14, w13
-; NONEON-NOSVE-NEXT:    mov w14, v3.s[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w12
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w11
-; NONEON-NOSVE-NEXT:    sdiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    mov w15, v1.s[3]
-; NONEON-NOSVE-NEXT:    fmov s1, w9
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w13
-; NONEON-NOSVE-NEXT:    mov v1.s[1], w8
-; NONEON-NOSVE-NEXT:    mov v1.s[2], w10
-; NONEON-NOSVE-NEXT:    sdiv w8, w16, w15
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w14
-; NONEON-NOSVE-NEXT:    mov v1.s[3], w8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sdiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sdiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -856,12 +851,14 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fmov x8, d1
 ; NONEON-NOSVE-NEXT:    fmov x9, d0
 ; NONEON-NOSVE-NEXT:    sdiv x8, x9, x8
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -879,14 +876,16 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x10, v0.d[1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    sdiv x8, x9, x8
-; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
-; NONEON-NOSVE-NEXT:    sdiv x9, x10, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -906,25 +905,27 @@ define void @sdiv_v4i64(ptr %a, ptr %b)  {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x10, v2.d[1]
-; NONEON-NOSVE-NEXT:    fmov x11, d2
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    mov x12, v0.d[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
 ; NONEON-NOSVE-NEXT:    sdiv x8, x9, x8
-; NONEON-NOSVE-NEXT:    mov x9, v3.d[1]
-; NONEON-NOSVE-NEXT:    sdiv x9, x10, x9
-; NONEON-NOSVE-NEXT:    fmov x10, d3
-; NONEON-NOSVE-NEXT:    sdiv x10, x11, x10
-; NONEON-NOSVE-NEXT:    mov x11, v1.d[1]
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    sdiv x11, x12, x11
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -954,33 +955,27 @@ define <4 x i8> @udiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    and w8, w8, #0xff
-; NONEON-NOSVE-NEXT:    and w9, w9, #0xff
-; NONEON-NOSVE-NEXT:    and w10, w10, #0xff
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v1.h[0]
-; NONEON-NOSVE-NEXT:    and w11, w11, #0xff
-; NONEON-NOSVE-NEXT:    and w9, w9, #0xff
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #8]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #20]
 ; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.h[2]
-; NONEON-NOSVE-NEXT:    and w10, w10, #0xff
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
 ; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    umov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    and w9, w11, #0xff
-; NONEON-NOSVE-NEXT:    and w11, w12, #0xff
-; NONEON-NOSVE-NEXT:    udiv w8, w11, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w10
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = udiv <4 x i8> %op1, %op2
   ret <4 x i8> %res
@@ -1012,41 +1007,43 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    umov w10, v0.b[0]
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[2]
-; NONEON-NOSVE-NEXT:    umov w12, v0.b[3]
-; NONEON-NOSVE-NEXT:    umov w13, v0.b[4]
-; NONEON-NOSVE-NEXT:    umov w14, v0.b[5]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v1.b[0]
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[2]
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    umov w11, v1.b[3]
-; NONEON-NOSVE-NEXT:    fmov s2, w9
-; NONEON-NOSVE-NEXT:    umov w9, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    umov w12, v1.b[4]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w10
-; NONEON-NOSVE-NEXT:    umov w10, v0.b[6]
-; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    umov w13, v1.b[5]
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[7]
-; NONEON-NOSVE-NEXT:    udiv w8, w14, w13
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w12
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[7]
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
-; NONEON-NOSVE-NEXT:    udiv w8, w11, w10
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w9
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
-; NONEON-NOSVE-NEXT:    fmov d0, d2
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = udiv <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -1098,71 +1095,74 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    umov w10, v0.b[0]
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[2]
-; NONEON-NOSVE-NEXT:    umov w12, v0.b[3]
-; NONEON-NOSVE-NEXT:    umov w13, v0.b[4]
-; NONEON-NOSVE-NEXT:    umov w14, v0.b[5]
-; NONEON-NOSVE-NEXT:    umov w15, v0.b[6]
-; NONEON-NOSVE-NEXT:    umov w16, v0.b[7]
-; NONEON-NOSVE-NEXT:    umov w17, v0.b[8]
-; NONEON-NOSVE-NEXT:    umov w18, v0.b[9]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v1.b[0]
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[2]
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    umov w11, v1.b[3]
-; NONEON-NOSVE-NEXT:    fmov s2, w9
-; NONEON-NOSVE-NEXT:    umov w9, v1.b[10]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    umov w12, v1.b[4]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w10
-; NONEON-NOSVE-NEXT:    umov w10, v0.b[10]
-; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    umov w13, v1.b[5]
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[11]
-; NONEON-NOSVE-NEXT:    udiv w13, w14, w13
-; NONEON-NOSVE-NEXT:    umov w14, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w12
-; NONEON-NOSVE-NEXT:    umov w12, v0.b[12]
-; NONEON-NOSVE-NEXT:    udiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    umov w15, v1.b[7]
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w13
-; NONEON-NOSVE-NEXT:    umov w13, v0.b[13]
-; NONEON-NOSVE-NEXT:    udiv w15, w16, w15
-; NONEON-NOSVE-NEXT:    umov w16, v1.b[8]
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w14
-; NONEON-NOSVE-NEXT:    udiv w16, w17, w16
-; NONEON-NOSVE-NEXT:    umov w17, v1.b[9]
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w15
-; NONEON-NOSVE-NEXT:    udiv w8, w18, w17
-; NONEON-NOSVE-NEXT:    mov v2.b[8], w16
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[11]
-; NONEON-NOSVE-NEXT:    mov v2.b[9], w8
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    umov w11, v1.b[12]
-; NONEON-NOSVE-NEXT:    mov v2.b[10], w9
-; NONEON-NOSVE-NEXT:    umov w9, v1.b[14]
-; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    umov w12, v1.b[13]
-; NONEON-NOSVE-NEXT:    mov v2.b[11], w10
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[15]
-; NONEON-NOSVE-NEXT:    udiv w8, w13, w12
-; NONEON-NOSVE-NEXT:    umov w12, v0.b[14]
-; NONEON-NOSVE-NEXT:    mov v2.b[12], w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[15]
-; NONEON-NOSVE-NEXT:    udiv w9, w12, w9
-; NONEON-NOSVE-NEXT:    mov v2.b[13], w8
-; NONEON-NOSVE-NEXT:    udiv w8, w11, w10
-; NONEON-NOSVE-NEXT:    mov v2.b[14], w9
-; NONEON-NOSVE-NEXT:    mov v2.b[15], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = udiv <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -1247,159 +1247,143 @@ define void @udiv_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str x27, [sp, #-80]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -80
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    umov w10, v0.b[0]
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[2]
-; NONEON-NOSVE-NEXT:    umov w12, v0.b[3]
-; NONEON-NOSVE-NEXT:    umov w13, v0.b[4]
-; NONEON-NOSVE-NEXT:    umov w14, v0.b[5]
-; NONEON-NOSVE-NEXT:    umov w15, v0.b[6]
-; NONEON-NOSVE-NEXT:    umov w17, v0.b[8]
-; NONEON-NOSVE-NEXT:    umov w2, v0.b[10]
-; NONEON-NOSVE-NEXT:    umov w3, v0.b[11]
-; NONEON-NOSVE-NEXT:    umov w4, v0.b[12]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v1.b[0]
-; NONEON-NOSVE-NEXT:    umov w5, v0.b[13]
-; NONEON-NOSVE-NEXT:    umov w6, v0.b[14]
-; NONEON-NOSVE-NEXT:    umov w1, v3.b[1]
-; NONEON-NOSVE-NEXT:    umov w7, v2.b[0]
-; NONEON-NOSVE-NEXT:    umov w19, v2.b[2]
-; NONEON-NOSVE-NEXT:    umov w20, v2.b[3]
-; NONEON-NOSVE-NEXT:    umov w21, v2.b[4]
-; NONEON-NOSVE-NEXT:    umov w22, v2.b[5]
-; NONEON-NOSVE-NEXT:    umov w23, v2.b[6]
-; NONEON-NOSVE-NEXT:    umov w24, v2.b[7]
-; NONEON-NOSVE-NEXT:    umov w25, v2.b[8]
-; NONEON-NOSVE-NEXT:    umov w26, v2.b[9]
-; NONEON-NOSVE-NEXT:    umov w27, v2.b[10]
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[2]
-; NONEON-NOSVE-NEXT:    udiv w11, w11, w10
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[3]
-; NONEON-NOSVE-NEXT:    fmov s5, w9
-; NONEON-NOSVE-NEXT:    umov w9, v3.b[11]
-; NONEON-NOSVE-NEXT:    mov v5.b[1], w8
-; NONEON-NOSVE-NEXT:    udiv w10, w12, w10
-; NONEON-NOSVE-NEXT:    umov w12, v1.b[4]
-; NONEON-NOSVE-NEXT:    mov v5.b[2], w11
-; NONEON-NOSVE-NEXT:    umov w11, v2.b[11]
-; NONEON-NOSVE-NEXT:    udiv w13, w13, w12
-; NONEON-NOSVE-NEXT:    umov w12, v1.b[5]
-; NONEON-NOSVE-NEXT:    mov v5.b[3], w10
-; NONEON-NOSVE-NEXT:    umov w10, v3.b[12]
-; NONEON-NOSVE-NEXT:    udiv w12, w14, w12
-; NONEON-NOSVE-NEXT:    umov w14, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v5.b[4], w13
-; NONEON-NOSVE-NEXT:    umov w13, v2.b[14]
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    umov w14, v1.b[7]
-; NONEON-NOSVE-NEXT:    umov w15, v0.b[7]
-; NONEON-NOSVE-NEXT:    mov v5.b[5], w12
-; NONEON-NOSVE-NEXT:    umov w12, v2.b[13]
-; NONEON-NOSVE-NEXT:    udiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    umov w15, v1.b[8]
-; NONEON-NOSVE-NEXT:    mov v5.b[6], w16
-; NONEON-NOSVE-NEXT:    udiv w18, w17, w15
-; NONEON-NOSVE-NEXT:    umov w15, v1.b[9]
-; NONEON-NOSVE-NEXT:    umov w17, v0.b[9]
-; NONEON-NOSVE-NEXT:    mov v5.b[7], w14
-; NONEON-NOSVE-NEXT:    udiv w17, w17, w15
-; NONEON-NOSVE-NEXT:    umov w15, v1.b[10]
-; NONEON-NOSVE-NEXT:    mov v5.b[8], w18
-; NONEON-NOSVE-NEXT:    udiv w15, w2, w15
-; NONEON-NOSVE-NEXT:    umov w2, v1.b[11]
-; NONEON-NOSVE-NEXT:    mov v5.b[9], w17
-; NONEON-NOSVE-NEXT:    udiv w2, w3, w2
-; NONEON-NOSVE-NEXT:    umov w3, v1.b[12]
-; NONEON-NOSVE-NEXT:    mov v5.b[10], w15
-; NONEON-NOSVE-NEXT:    udiv w3, w4, w3
-; NONEON-NOSVE-NEXT:    umov w4, v1.b[13]
-; NONEON-NOSVE-NEXT:    mov v5.b[11], w2
-; NONEON-NOSVE-NEXT:    udiv w4, w5, w4
-; NONEON-NOSVE-NEXT:    umov w5, v1.b[14]
-; NONEON-NOSVE-NEXT:    mov v5.b[12], w3
-; NONEON-NOSVE-NEXT:    udiv w5, w6, w5
-; NONEON-NOSVE-NEXT:    umov w6, v2.b[1]
-; NONEON-NOSVE-NEXT:    mov v5.b[13], w4
-; NONEON-NOSVE-NEXT:    udiv w1, w6, w1
-; NONEON-NOSVE-NEXT:    umov w6, v3.b[0]
-; NONEON-NOSVE-NEXT:    mov v5.b[14], w5
-; NONEON-NOSVE-NEXT:    udiv w6, w7, w6
-; NONEON-NOSVE-NEXT:    umov w7, v3.b[2]
-; NONEON-NOSVE-NEXT:    udiv w7, w19, w7
-; NONEON-NOSVE-NEXT:    umov w19, v3.b[3]
-; NONEON-NOSVE-NEXT:    fmov s4, w6
-; NONEON-NOSVE-NEXT:    mov v4.b[1], w1
-; NONEON-NOSVE-NEXT:    udiv w19, w20, w19
-; NONEON-NOSVE-NEXT:    umov w20, v3.b[4]
-; NONEON-NOSVE-NEXT:    mov v4.b[2], w7
-; NONEON-NOSVE-NEXT:    udiv w20, w21, w20
-; NONEON-NOSVE-NEXT:    umov w21, v3.b[5]
-; NONEON-NOSVE-NEXT:    mov v4.b[3], w19
-; NONEON-NOSVE-NEXT:    udiv w21, w22, w21
-; NONEON-NOSVE-NEXT:    umov w22, v3.b[6]
-; NONEON-NOSVE-NEXT:    mov v4.b[4], w20
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w22, w23, w22
-; NONEON-NOSVE-NEXT:    umov w23, v3.b[7]
-; NONEON-NOSVE-NEXT:    mov v4.b[5], w21
-; NONEON-NOSVE-NEXT:    udiv w23, w24, w23
-; NONEON-NOSVE-NEXT:    umov w24, v3.b[8]
-; NONEON-NOSVE-NEXT:    mov v4.b[6], w22
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w24, w25, w24
-; NONEON-NOSVE-NEXT:    umov w25, v3.b[9]
-; NONEON-NOSVE-NEXT:    mov v4.b[7], w23
-; NONEON-NOSVE-NEXT:    udiv w25, w26, w25
-; NONEON-NOSVE-NEXT:    umov w26, v3.b[10]
-; NONEON-NOSVE-NEXT:    mov v4.b[8], w24
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w8, w27, w26
-; NONEON-NOSVE-NEXT:    mov v4.b[9], w25
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w9, w11, w9
-; NONEON-NOSVE-NEXT:    umov w11, v2.b[12]
-; NONEON-NOSVE-NEXT:    mov v4.b[10], w8
-; NONEON-NOSVE-NEXT:    umov w8, v3.b[15]
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    umov w11, v3.b[13]
-; NONEON-NOSVE-NEXT:    mov v4.b[11], w9
-; NONEON-NOSVE-NEXT:    umov w9, v1.b[15]
-; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    umov w12, v3.b[14]
-; NONEON-NOSVE-NEXT:    mov v4.b[12], w10
-; NONEON-NOSVE-NEXT:    umov w10, v0.b[15]
-; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    umov w13, v2.b[15]
-; NONEON-NOSVE-NEXT:    mov v4.b[13], w11
-; NONEON-NOSVE-NEXT:    udiv w8, w13, w8
-; NONEON-NOSVE-NEXT:    mov v4.b[14], w12
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    mov v4.b[15], w8
-; NONEON-NOSVE-NEXT:    mov v5.b[15], w9
-; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
-; NONEON-NOSVE-NEXT:    ldr x27, [sp], #80 // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -1422,18 +1406,18 @@ define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v2.8b
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    mov w10, v0.s[1]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    mov w9, v1.s[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
 ; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = udiv <2 x i16> %op1, %op2
   ret <2 x i16> %res
@@ -1454,25 +1438,27 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v1.h[0]
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.h[2]
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    umov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    udiv w8, w12, w11
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w10
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = udiv <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -1503,39 +1489,42 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    umov w13, v0.h[4]
-; NONEON-NOSVE-NEXT:    umov w14, v0.h[5]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v1.h[0]
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.h[2]
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    umov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s2, w9
-; NONEON-NOSVE-NEXT:    umov w9, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
-; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    umov w12, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w10
-; NONEON-NOSVE-NEXT:    umov w10, v0.h[6]
-; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    umov w13, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.h[7]
-; NONEON-NOSVE-NEXT:    udiv w8, w14, w13
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w12
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    udiv w8, w11, w10
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w9
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = udiv <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -1580,75 +1569,79 @@ define void @udiv_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w10, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w11, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w12, v0.h[3]
-; NONEON-NOSVE-NEXT:    umov w13, v0.h[4]
-; NONEON-NOSVE-NEXT:    umov w14, v0.h[5]
-; NONEON-NOSVE-NEXT:    umov w15, v0.h[6]
-; NONEON-NOSVE-NEXT:    umov w16, v2.h[1]
-; NONEON-NOSVE-NEXT:    umov w17, v2.h[0]
-; NONEON-NOSVE-NEXT:    umov w18, v2.h[2]
-; NONEON-NOSVE-NEXT:    umov w1, v2.h[3]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v1.h[0]
-; NONEON-NOSVE-NEXT:    umov w2, v2.h[4]
-; NONEON-NOSVE-NEXT:    umov w3, v2.h[5]
-; NONEON-NOSVE-NEXT:    umov w4, v2.h[6]
-; NONEON-NOSVE-NEXT:    udiv w10, w10, w9
-; NONEON-NOSVE-NEXT:    umov w9, v1.h[2]
-; NONEON-NOSVE-NEXT:    udiv w9, w11, w9
-; NONEON-NOSVE-NEXT:    umov w11, v1.h[3]
-; NONEON-NOSVE-NEXT:    fmov s5, w10
-; NONEON-NOSVE-NEXT:    umov w10, v3.h[7]
-; NONEON-NOSVE-NEXT:    mov v5.h[1], w8
-; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    umov w12, v1.h[4]
-; NONEON-NOSVE-NEXT:    mov v5.h[2], w9
-; NONEON-NOSVE-NEXT:    umov w9, v2.h[7]
-; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    umov w13, v1.h[5]
-; NONEON-NOSVE-NEXT:    mov v5.h[3], w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.h[7]
-; NONEON-NOSVE-NEXT:    udiv w13, w14, w13
-; NONEON-NOSVE-NEXT:    umov w14, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v5.h[4], w12
-; NONEON-NOSVE-NEXT:    udiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    umov w15, v3.h[1]
-; NONEON-NOSVE-NEXT:    mov v5.h[5], w13
-; NONEON-NOSVE-NEXT:    udiv w15, w16, w15
-; NONEON-NOSVE-NEXT:    umov w16, v3.h[0]
-; NONEON-NOSVE-NEXT:    mov v5.h[6], w14
-; NONEON-NOSVE-NEXT:    udiv w16, w17, w16
-; NONEON-NOSVE-NEXT:    umov w17, v3.h[2]
-; NONEON-NOSVE-NEXT:    udiv w17, w18, w17
-; NONEON-NOSVE-NEXT:    umov w18, v3.h[3]
-; NONEON-NOSVE-NEXT:    fmov s4, w16
-; NONEON-NOSVE-NEXT:    mov v4.h[1], w15
-; NONEON-NOSVE-NEXT:    udiv w18, w1, w18
-; NONEON-NOSVE-NEXT:    umov w1, v3.h[4]
-; NONEON-NOSVE-NEXT:    mov v4.h[2], w17
-; NONEON-NOSVE-NEXT:    udiv w1, w2, w1
-; NONEON-NOSVE-NEXT:    umov w2, v3.h[5]
-; NONEON-NOSVE-NEXT:    mov v4.h[3], w18
-; NONEON-NOSVE-NEXT:    udiv w2, w3, w2
-; NONEON-NOSVE-NEXT:    umov w3, v3.h[6]
-; NONEON-NOSVE-NEXT:    mov v4.h[4], w1
-; NONEON-NOSVE-NEXT:    udiv w8, w4, w3
-; NONEON-NOSVE-NEXT:    mov v4.h[5], w2
-; NONEON-NOSVE-NEXT:    udiv w9, w9, w10
-; NONEON-NOSVE-NEXT:    umov w10, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v4.h[6], w8
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    mov v4.h[7], w9
-; NONEON-NOSVE-NEXT:    mov v5.h[7], w10
-; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -1669,17 +1662,17 @@ define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    mov w10, v0.s[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    mov w9, v1.s[1]
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = udiv <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -1697,22 +1690,22 @@ define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    fmov w10, s0
-; NONEON-NOSVE-NEXT:    mov w11, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w12, v0.s[3]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
 ; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    fmov w9, s1
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    mov w10, v1.s[2]
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    mov w11, v1.s[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w9
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
-; NONEON-NOSVE-NEXT:    udiv w8, w12, w11
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w10
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = udiv <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -1732,41 +1725,39 @@ define void @udiv_v8i32(ptr %a, ptr %b)  {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    fmov w10, s0
-; NONEON-NOSVE-NEXT:    mov w11, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w12, v2.s[1]
-; NONEON-NOSVE-NEXT:    fmov w13, s2
-; NONEON-NOSVE-NEXT:    mov w14, v2.s[2]
-; NONEON-NOSVE-NEXT:    mov w15, v2.s[3]
-; NONEON-NOSVE-NEXT:    mov w16, v0.s[3]
-; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
-; NONEON-NOSVE-NEXT:    fmov w9, s1
-; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
-; NONEON-NOSVE-NEXT:    mov w10, v1.s[2]
-; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
-; NONEON-NOSVE-NEXT:    mov w11, v3.s[1]
-; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
-; NONEON-NOSVE-NEXT:    fmov w12, s3
-; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
-; NONEON-NOSVE-NEXT:    mov w13, v3.s[2]
-; NONEON-NOSVE-NEXT:    udiv w13, w14, w13
-; NONEON-NOSVE-NEXT:    mov w14, v3.s[3]
-; NONEON-NOSVE-NEXT:    fmov s0, w12
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w11
-; NONEON-NOSVE-NEXT:    udiv w14, w15, w14
-; NONEON-NOSVE-NEXT:    mov w15, v1.s[3]
-; NONEON-NOSVE-NEXT:    fmov s1, w9
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w13
-; NONEON-NOSVE-NEXT:    mov v1.s[1], w8
-; NONEON-NOSVE-NEXT:    mov v1.s[2], w10
-; NONEON-NOSVE-NEXT:    udiv w8, w16, w15
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w14
-; NONEON-NOSVE-NEXT:    mov v1.s[3], w8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    udiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    udiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -1787,12 +1778,14 @@ define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fmov x8, d1
 ; NONEON-NOSVE-NEXT:    fmov x9, d0
 ; NONEON-NOSVE-NEXT:    udiv x8, x9, x8
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = udiv <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -1810,14 +1803,16 @@ define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x10, v0.d[1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    udiv x8, x9, x8
-; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
-; NONEON-NOSVE-NEXT:    udiv x9, x10, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = udiv <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -1837,25 +1832,27 @@ define void @udiv_v4i64(ptr %a, ptr %b)  {
 ;
 ; NONEON-NOSVE-LABEL: udiv_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x10, v2.d[1]
-; NONEON-NOSVE-NEXT:    fmov x11, d2
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    mov x12, v0.d[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    udiv x8, x9, x8
-; NONEON-NOSVE-NEXT:    mov x9, v3.d[1]
-; NONEON-NOSVE-NEXT:    udiv x9, x10, x9
-; NONEON-NOSVE-NEXT:    fmov x10, d3
-; NONEON-NOSVE-NEXT:    udiv x10, x11, x10
-; NONEON-NOSVE-NEXT:    mov x11, v1.d[1]
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    udiv x11, x12, x11
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -1905,23 +1902,66 @@ define void @udiv_constantsplat_v8i32(ptr %a)  {
 ;
 ; NONEON-NOSVE-LABEL: udiv_constantsplat_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #8969 // =0x2309
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
 ; NONEON-NOSVE-NEXT:    movk w8, #22765, lsl #16
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    umull2 v3.2d, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umull v4.2d, v1.2s, v0.2s
-; NONEON-NOSVE-NEXT:    umull2 v5.2d, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umull v0.2d, v2.2s, v0.2s
-; NONEON-NOSVE-NEXT:    uzp2 v3.4s, v4.4s, v3.4s
-; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v0.4s, v5.4s
-; NONEON-NOSVE-NEXT:    sub v1.4s, v1.4s, v3.4s
-; NONEON-NOSVE-NEXT:    sub v2.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    usra v3.4s, v1.4s, #1
-; NONEON-NOSVE-NEXT:    usra v0.4s, v2.4s, #1
-; NONEON-NOSVE-NEXT:    ushr v1.4s, v3.4s, #6
-; NONEON-NOSVE-NEXT:    ushr v0.4s, v0.4s, #6
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    umull x10, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w11, w9, #6
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    umull x10, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #6
+; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    umull x10, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w11, w9, #6
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    umull x10, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #6
+; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    umull x10, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w11, w9, #6
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    umull x10, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #6
+; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    umull x10, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w11, w9, #6
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    umull x8, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w8
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #6
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = udiv <8 x i32> %op1, <i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
index 9f8511b00c6ed1..2c2b79121ef820 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
@@ -30,18 +30,50 @@ define void @sext_v8i1_v8i32(<8 x i1> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v8i1_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    shl v0.4s, v0.4s, #31
-; NONEON-NOSVE-NEXT:    shl v1.4s, v1.4s, #31
-; NONEON-NOSVE-NEXT:    cmlt v0.4s, v0.4s, #0
-; NONEON-NOSVE-NEXT:    cmlt v1.4s, v1.4s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #34]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #38]
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    stp w10, w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    sbfx w8, w14, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w15, #0, #1
+; NONEON-NOSVE-NEXT:    stp w8, w12, [sp, #64]
+; NONEON-NOSVE-NEXT:    sbfx w12, w13, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    stp w12, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i1> %a to <8 x i32>
   store <8 x i32> %b, ptr %out
@@ -73,17 +105,21 @@ define void @sext_v4i3_v4i64(<4 x i3> %a, ptr %out) {
 ; NONEON-NOSVE-LABEL: sext_v4i3_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    shl v0.2d, v0.2d, #61
-; NONEON-NOSVE-NEXT:    shl v1.2d, v1.2d, #61
-; NONEON-NOSVE-NEXT:    sshr v0.2d, v0.2d, #61
-; NONEON-NOSVE-NEXT:    sshr v1.2d, v1.2d, #61
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    sbfx x8, x8, #0, #3
+; NONEON-NOSVE-NEXT:    sbfx x9, x9, #0, #3
+; NONEON-NOSVE-NEXT:    sbfx x10, x10, #0, #3
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sbfx x8, x11, #0, #3
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <4 x i3> %a to <4 x i64>
   store <4 x i64> %b, ptr %out
@@ -106,13 +142,45 @@ define void @sext_v16i8_v16i16(<16 x i8> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v16i8_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <16 x i8> %a to <16 x i16>
   store <16 x i16>%b, ptr %out
@@ -138,20 +206,206 @@ define void @sext_v32i8_v32i16(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v32i8_v32i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v2.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v3.8h, v3.8b, #0
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    sub sp, sp, #272
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #192] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #208] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #224] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #240] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #256] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 272
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #31]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w9, w28, w28
+; NONEON-NOSVE-NEXT:    add w18, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w9, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w9, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w17, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w9, w22, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #63]
+; NONEON-NOSVE-NEXT:    add w17, w30, w30
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w9, w20, w20
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w9, w7, w7
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #77]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #75]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #73]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #71]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #69]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #256] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #240] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #224] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #208] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #176] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #111]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #110]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #109]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #107]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #106]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #105]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #103]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #102]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #101]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #99]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #97]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #144]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #272
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
@@ -177,14 +431,42 @@ define void @sext_v8i8_v8i32(<8 x i8> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v8i8_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i8> %a to <8 x i32>
   store <8 x i32>%b, ptr %out
@@ -210,21 +492,75 @@ define void @sext_v16i8_v16i32(<16 x i8> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v16i8_v16i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-160]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <16 x i8> %a to <16 x i32>
   store <16 x i32> %b, ptr %out
@@ -263,36 +599,280 @@ define void @sext_v32i8_v32i32(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v32i8_v32i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v2.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v3.8h, v3.8b, #0
-; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #72]
-; NONEON-NOSVE-NEXT:    sshll v5.4s, v5.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v4.4s, v4.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v6.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v7.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #96]
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    sub sp, sp, #464
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #368] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #384] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #400] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #416] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #432] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #448] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 464
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #31]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w9, w28, w28
+; NONEON-NOSVE-NEXT:    add w18, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w9, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w9, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w17, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w9, w22, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #63]
+; NONEON-NOSVE-NEXT:    add w17, w30, w30
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w9, w20, w20
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w9, w7, w7
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #77]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #75]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #73]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #71]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #69]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #448] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #432] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #416] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #400] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #384] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #368] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #107]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #106]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #105]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #182]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #111]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #110]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #109]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #99]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #97]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #103]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #102]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #101]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #198]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #196]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #194]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #206]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #204]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #202]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #200]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #272]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #178]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #190]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #188]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #186]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #230]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #240]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #348]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #228]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #226]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #340]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #238]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #236]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #234]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #232]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #336]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #212]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #210]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #222]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #220]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #218]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #324]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #304]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #464
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
@@ -325,17 +905,19 @@ define void @sext_v4i8_v4i64(<4 x i8> %a, ptr %out) {
 ; NONEON-NOSVE-LABEL: sext_v4i8_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    shl v0.2d, v0.2d, #56
-; NONEON-NOSVE-NEXT:    shl v1.2d, v1.2d, #56
-; NONEON-NOSVE-NEXT:    sshr v0.2d, v0.2d, #56
-; NONEON-NOSVE-NEXT:    sshr v1.2d, v1.2d, #56
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb x8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb x11, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp x10, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <4 x i8> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
@@ -362,22 +944,57 @@ define void @sext_v8i8_v8i64(<8 x i8> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v8i8_v8i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    sub sp, sp, #176
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 176
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add x8, sp, #144
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldpsw x9, x10, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldpsw x9, x10, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldpsw x9, x10, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldpsw x9, x10, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x8]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #176
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i8> %a to <8 x i64>
   store <8 x i64>%b, ptr %out
@@ -419,37 +1036,109 @@ define void @sext_v16i8_v16i64(<16 x i8> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v16i8_v16i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-112]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 112
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #40]
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #48]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #80]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #72]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #104]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #56]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #88]
-; NONEON-NOSVE-NEXT:    sshll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    stp q1, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x0]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x0, #96]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #112
+; NONEON-NOSVE-NEXT:    sub sp, sp, #368
+; NONEON-NOSVE-NEXT:    str x29, [sp, #352] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 368
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #352] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #102]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #216]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldrsw x9, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #360]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #320]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #208]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #288]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #224]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #368
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <16 x i8> %a to <16 x i64>
   store <16 x i64> %b, ptr %out
@@ -522,69 +1211,367 @@ define void @sext_v32i8_v32i64(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v32i8_v32i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #224
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 224
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
-; NONEON-NOSVE-NEXT:    sshll v5.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    sshll v6.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v3.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v4.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    stp q3, q5, [sp, #32]
-; NONEON-NOSVE-NEXT:    sshll v5.4s, v5.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #56]
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
-; NONEON-NOSVE-NEXT:    stp q4, q6, [sp, #64]
-; NONEON-NOSVE-NEXT:    sshll v6.4s, v6.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v4.4s, v4.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #88]
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #72]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v7.4s, v7.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q5, [sp, #128]
-; NONEON-NOSVE-NEXT:    sshll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d19, [sp, #152]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #96]
-; NONEON-NOSVE-NEXT:    ldr d20, [sp, #136]
-; NONEON-NOSVE-NEXT:    stp q1, q4, [sp, #160]
-; NONEON-NOSVE-NEXT:    ldr d17, [sp, #104]
-; NONEON-NOSVE-NEXT:    ldr d21, [sp, #120]
-; NONEON-NOSVE-NEXT:    stp q7, q6, [sp, #192]
-; NONEON-NOSVE-NEXT:    sshll v6.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v19.2d, v19.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d16, [sp, #216]
-; NONEON-NOSVE-NEXT:    ldr d22, [sp, #200]
-; NONEON-NOSVE-NEXT:    ldr d23, [sp, #184]
-; NONEON-NOSVE-NEXT:    ldr d18, [sp, #168]
-; NONEON-NOSVE-NEXT:    sshll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v16.2d, v16.2s, #0
-; NONEON-NOSVE-NEXT:    stp q5, q19, [x1]
-; NONEON-NOSVE-NEXT:    sshll v5.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v7.2d, v22.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    stp q6, q16, [x1, #128]
-; NONEON-NOSVE-NEXT:    sshll v6.2d, v23.2s, #0
-; NONEON-NOSVE-NEXT:    stp q5, q7, [x1, #160]
-; NONEON-NOSVE-NEXT:    sshll v5.2d, v20.2s, #0
-; NONEON-NOSVE-NEXT:    stp q4, q6, [x1, #192]
-; NONEON-NOSVE-NEXT:    sshll v4.2d, v21.2s, #0
-; NONEON-NOSVE-NEXT:    stp q2, q5, [x1, #32]
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v17.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v18.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #96]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #224]
-; NONEON-NOSVE-NEXT:    add sp, sp, #224
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #752
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 848
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #31]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w9, w28, w28
+; NONEON-NOSVE-NEXT:    add w18, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w9, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w9, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w17, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w9, w22, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #63]
+; NONEON-NOSVE-NEXT:    add w17, w30, w30
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w9, w20, w20
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w9, w7, w7
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #77]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #75]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #73]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #71]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #69]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #107]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #106]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #105]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #178]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #111]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #110]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #109]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #99]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #97]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #103]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #102]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #101]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #194]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #198]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #196]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #202]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #272]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #200]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #206]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #400]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #204]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #288]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #182]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #180]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #186]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #240]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #190]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #368]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #188]
+; NONEON-NOSVE-NEXT:    ldrsw x9, [sp, #372]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #226]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #256]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #340]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #230]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #384]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #348]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #228]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #234]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #336]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #232]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #238]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #464]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #236]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #210]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #352]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #214]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #480]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #212]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #218]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #304]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #324]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #222]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #432]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #220]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #404]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #320]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #568]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #400]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #560]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #412]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #448]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #584]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #408]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #576]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #420]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #560]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #600]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #416]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #592]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #428]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #616]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #424]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #608]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [sp, #592]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #496]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #380]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #520]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #376]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #512]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #388]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [sp, #496]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #536]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #384]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #528]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #396]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #552]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #392]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #544]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #468]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #528]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #696]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #464]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #688]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #476]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #712]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #472]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #704]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #484]
+; NONEON-NOSVE-NEXT:    ldp q16, q17, [sp, #688]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #728]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #480]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #720]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #492]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #744]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #488]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #736]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #436]
+; NONEON-NOSVE-NEXT:    ldp q19, q20, [sp, #720]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #632]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #432]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #624]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #444]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #648]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #440]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #640]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #452]
+; NONEON-NOSVE-NEXT:    ldp q22, q23, [sp, #624]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #664]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #448]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #656]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #460]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #680]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #456]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #672]
+; NONEON-NOSVE-NEXT:    ldp q21, q18, [sp, #656]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q4, q5, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #96]
+; NONEON-NOSVE-NEXT:    stp q16, q17, [x1, #128]
+; NONEON-NOSVE-NEXT:    stp q19, q20, [x1, #160]
+; NONEON-NOSVE-NEXT:    stp q22, q23, [x1, #192]
+; NONEON-NOSVE-NEXT:    stp q21, q18, [x1, #224]
+; NONEON-NOSVE-NEXT:    add sp, sp, #752
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
@@ -609,13 +1596,25 @@ define void @sext_v8i16_v8i32(<8 x i16> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v8i16_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i16> %a to <8 x i32>
   store <8 x i32>%b, ptr %out
@@ -640,20 +1639,91 @@ define void @sext_v16i16_v16i32(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v16i16_v16i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w13, w13, w13
+; NONEON-NOSVE-NEXT:    add w14, w14, w14
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w14, w3, w3
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w13, w5, w5
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #42]
+; NONEON-NOSVE-NEXT:    add w14, w4, w4
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w13, w2, w2
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w14, w0, w0
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #36]
+; NONEON-NOSVE-NEXT:    add w13, w18, w18
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w14, w17, w17
+; NONEON-NOSVE-NEXT:    add w12, w12, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w13, w16, w16
+; NONEON-NOSVE-NEXT:    add w11, w11, w11
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w10, w10, w10
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w14, w15, w15
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #60]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #58]
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #54]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
@@ -679,14 +1749,24 @@ define void @sext_v4i16_v4i64(<4 x i16> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v4i16_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <4 x i16> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
@@ -712,21 +1792,39 @@ define void @sext_v8i16_v8i64(<8 x i16> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v8i16_v8i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-160]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i16> %a to <8 x i64>
   store <8 x i64>%b, ptr %out
@@ -765,36 +1863,124 @@ define void @sext_v16i16_v16i64(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v16i16_v16i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #72]
-; NONEON-NOSVE-NEXT:    sshll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #96]
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    sub sp, sp, #368
+; NONEON-NOSVE-NEXT:    str x29, [sp, #352] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 368
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #352] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w13, w13, w13
+; NONEON-NOSVE-NEXT:    add w14, w14, w14
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w14, w3, w3
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w13, w5, w5
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w14, w4, w4
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w13, w2, w2
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w14, w0, w0
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w13, w18, w18
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w14, w17, w17
+; NONEON-NOSVE-NEXT:    add w12, w12, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w13, w16, w16
+; NONEON-NOSVE-NEXT:    add w11, w11, w11
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w10, w10, w10
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w14, w15, w15
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #66]
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #62]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #98]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #102]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp, #136]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #200]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #184]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #224]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldrsw x9, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #360]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #320]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #208]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #288]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #368
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
@@ -819,13 +2005,17 @@ define void @sext_v4i32_v4i64(<4 x i32> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v4i32_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %b = sext <4 x i32> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
@@ -850,20 +2040,43 @@ define void @sext_v8i32_v8i64(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: sext_v8i32_v8i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   %b = add <8 x i32> %a, %a
@@ -888,13 +2101,45 @@ define void @zext_v16i8_v16i16(<16 x i8> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v16i8_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %b = zext <16 x i8> %a to <16 x i16>
   store <16 x i16>%b, ptr %out
@@ -920,20 +2165,206 @@ define void @zext_v32i8_v32i16(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v32i8_v32i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v2.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v3.8h, v3.8b, #0
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    sub sp, sp, #272
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #192] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #208] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #224] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #240] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #256] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 272
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #31]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w9, w28, w28
+; NONEON-NOSVE-NEXT:    add w18, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w9, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w9, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w17, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w9, w22, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #63]
+; NONEON-NOSVE-NEXT:    add w17, w30, w30
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w9, w20, w20
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w9, w7, w7
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #77]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #75]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #73]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #71]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #69]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #256] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #240] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #224] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #208] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #176] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #111]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #110]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #109]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #107]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #106]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #105]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #103]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #102]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #101]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #99]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #97]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #144]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #272
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
@@ -959,14 +2390,42 @@ define void @zext_v8i8_v8i32(<8 x i8> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v8i8_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %b = zext <8 x i8> %a to <8 x i32>
   store <8 x i32>%b, ptr %out
@@ -992,21 +2451,75 @@ define void @zext_v16i8_v16i32(<16 x i8> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v16i8_v16i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-160]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %b = zext <16 x i8> %a to <16 x i32>
   store <16 x i32> %b, ptr %out
@@ -1045,36 +2558,280 @@ define void @zext_v32i8_v32i32(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v32i8_v32i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v2.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v3.8h, v3.8b, #0
-; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #72]
-; NONEON-NOSVE-NEXT:    ushll v5.4s, v5.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v4.4s, v4.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v6.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v7.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #96]
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    sub sp, sp, #464
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #368] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #384] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #400] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #416] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #432] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #448] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 464
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #31]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w9, w28, w28
+; NONEON-NOSVE-NEXT:    add w18, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w9, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w9, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w17, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w9, w22, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #63]
+; NONEON-NOSVE-NEXT:    add w17, w30, w30
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w9, w20, w20
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w9, w7, w7
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #77]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #75]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #73]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #71]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #69]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #448] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #432] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #416] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #400] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #384] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #368] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #107]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #106]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #105]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #182]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #111]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #110]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #109]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #99]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #97]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #103]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #102]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #101]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #198]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #196]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #194]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #206]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #204]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #202]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #200]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #272]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #178]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #190]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #188]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #186]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #230]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #240]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #348]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #228]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #226]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #340]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #238]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #236]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #234]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #232]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #336]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #212]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #210]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #222]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #220]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #218]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #324]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #304]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #464
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
@@ -1104,16 +2861,26 @@ define void @zext_v4i8_v4i64(<4 x i8> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v4i8_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %b = zext <4 x i8> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
@@ -1140,22 +2907,61 @@ define void @zext_v8i8_v8i64(<8 x i8> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v8i8_v8i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    sub sp, sp, #176
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 176
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add x8, sp, #144
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w10, wzr, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w10, wzr, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w10, wzr, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w10, wzr, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x8]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #176
 ; NONEON-NOSVE-NEXT:    ret
   %b = zext <8 x i8> %a to <8 x i64>
   store <8 x i64>%b, ptr %out
@@ -1197,37 +3003,129 @@ define void @zext_v16i8_v16i64(<16 x i8> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v16i8_v16i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-112]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 112
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v1.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #40]
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #48]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #80]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #72]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #104]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #56]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #88]
-; NONEON-NOSVE-NEXT:    ushll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    stp q1, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x0]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x0, #96]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #112
+; NONEON-NOSVE-NEXT:    sub sp, sp, #368
+; NONEON-NOSVE-NEXT:    str x29, [sp, #352] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 368
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #352] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #332]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #324]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #348]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #340]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #300]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #292]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #316]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #308]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #268]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #260]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #284]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #276]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #102]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #364]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #328]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #360]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #176]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #200]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #320]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #296]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #312]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #288]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #264]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #252]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #232]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp wzr, w9, [sp, #244]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #224]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #368
 ; NONEON-NOSVE-NEXT:    ret
   %b = zext <16 x i8> %a to <16 x i64>
   store <16 x i64> %b, ptr %out
@@ -1300,69 +3198,400 @@ define void @zext_v32i8_v32i64(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v32i8_v32i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #224
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 224
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
-; NONEON-NOSVE-NEXT:    ushll v5.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    ushll v6.8h, v1.8b, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v3.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v4.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    stp q3, q5, [sp, #32]
-; NONEON-NOSVE-NEXT:    ushll v5.4s, v5.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #56]
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
-; NONEON-NOSVE-NEXT:    stp q4, q6, [sp, #64]
-; NONEON-NOSVE-NEXT:    ushll v6.4s, v6.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v4.4s, v4.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #88]
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #72]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v7.4s, v7.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q5, [sp, #128]
-; NONEON-NOSVE-NEXT:    ushll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d19, [sp, #152]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #96]
-; NONEON-NOSVE-NEXT:    ldr d20, [sp, #136]
-; NONEON-NOSVE-NEXT:    stp q1, q4, [sp, #160]
-; NONEON-NOSVE-NEXT:    ldr d17, [sp, #104]
-; NONEON-NOSVE-NEXT:    ldr d21, [sp, #120]
-; NONEON-NOSVE-NEXT:    stp q7, q6, [sp, #192]
-; NONEON-NOSVE-NEXT:    ushll v6.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v19.2d, v19.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d16, [sp, #216]
-; NONEON-NOSVE-NEXT:    ldr d22, [sp, #200]
-; NONEON-NOSVE-NEXT:    ldr d23, [sp, #184]
-; NONEON-NOSVE-NEXT:    ldr d18, [sp, #168]
-; NONEON-NOSVE-NEXT:    ushll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v16.2d, v16.2s, #0
-; NONEON-NOSVE-NEXT:    stp q5, q19, [x1]
-; NONEON-NOSVE-NEXT:    ushll v5.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v7.2d, v22.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    stp q6, q16, [x1, #128]
-; NONEON-NOSVE-NEXT:    ushll v6.2d, v23.2s, #0
-; NONEON-NOSVE-NEXT:    stp q5, q7, [x1, #160]
-; NONEON-NOSVE-NEXT:    ushll v5.2d, v20.2s, #0
-; NONEON-NOSVE-NEXT:    stp q4, q6, [x1, #192]
-; NONEON-NOSVE-NEXT:    ushll v4.2d, v21.2s, #0
-; NONEON-NOSVE-NEXT:    stp q2, q5, [x1, #32]
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v17.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v18.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #96]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #224]
-; NONEON-NOSVE-NEXT:    add sp, sp, #224
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #752
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 848
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #572]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #564]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #588]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    add w18, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #31]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w9, w28, w28
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w9, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w17, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w9, w24, w24
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w9, w22, w22
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #63]
+; NONEON-NOSVE-NEXT:    add w17, w30, w30
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w9, w20, w20
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #61]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w9, w7, w7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #77]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #75]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #73]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #71]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #69]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #580]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #604]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #596]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #620]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #612]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #508]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #500]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #524]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #107]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #112]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #516]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #106]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #540]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #105]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #178]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #532]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #111]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #556]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #110]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #548]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #109]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #700]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #692]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #99]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #716]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #708]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #97]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #732]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #103]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #724]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #102]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #748]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #101]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #740]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #636]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #194]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #628]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #652]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #198]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #196]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #644]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #202]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #272]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #668]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #200]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #660]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #206]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #400]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #204]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #684]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #288]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #676]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #182]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #180]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #186]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #240]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #190]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #368]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #188]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #226]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #256]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #340]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #230]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #384]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #348]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #228]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #234]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #336]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #232]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #238]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #464]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #236]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #210]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #352]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #214]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #480]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #212]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #218]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #304]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #324]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #222]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #432]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #220]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #404]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #320]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #568]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #400]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #560]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #412]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #448]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #584]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #408]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #576]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #420]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #560]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #600]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #416]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #592]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #428]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #616]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #424]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #608]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #372]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [sp, #592]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #504]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #368]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #496]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #380]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #520]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #376]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #512]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #388]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [sp, #496]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #536]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #384]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #528]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #396]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #552]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #392]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #544]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #468]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #528]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #696]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #464]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #688]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #476]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #712]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #472]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #704]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #484]
+; NONEON-NOSVE-NEXT:    ldp q16, q17, [sp, #688]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #728]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #480]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #720]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #492]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #744]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #488]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #736]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #436]
+; NONEON-NOSVE-NEXT:    ldp q19, q20, [sp, #720]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #632]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #432]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #624]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #444]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #648]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #440]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #640]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #452]
+; NONEON-NOSVE-NEXT:    ldp q22, q23, [sp, #624]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #664]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #448]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #656]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #460]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #680]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #456]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #672]
+; NONEON-NOSVE-NEXT:    ldp q21, q18, [sp, #656]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q4, q5, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #96]
+; NONEON-NOSVE-NEXT:    stp q16, q17, [x1, #128]
+; NONEON-NOSVE-NEXT:    stp q19, q20, [x1, #160]
+; NONEON-NOSVE-NEXT:    stp q22, q23, [x1, #192]
+; NONEON-NOSVE-NEXT:    stp q21, q18, [x1, #224]
+; NONEON-NOSVE-NEXT:    add sp, sp, #752
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
@@ -1387,13 +3616,25 @@ define void @zext_v8i16_v8i32(<8 x i16> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v8i16_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %b = zext <8 x i16> %a to <8 x i32>
   store <8 x i32>%b, ptr %out
@@ -1418,20 +3659,91 @@ define void @zext_v16i16_v16i32(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v16i16_v16i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w13, w13, w13
+; NONEON-NOSVE-NEXT:    add w14, w14, w14
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w14, w3, w3
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w13, w5, w5
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #42]
+; NONEON-NOSVE-NEXT:    add w14, w4, w4
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w13, w2, w2
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w14, w0, w0
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #36]
+; NONEON-NOSVE-NEXT:    add w13, w18, w18
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w14, w17, w17
+; NONEON-NOSVE-NEXT:    add w12, w12, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w13, w16, w16
+; NONEON-NOSVE-NEXT:    add w11, w11, w11
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w10, w10, w10
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w14, w15, w15
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #60]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #58]
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #54]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
@@ -1457,14 +3769,26 @@ define void @zext_v4i16_v4i64(<4 x i16> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v4i16_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %b = zext <4 x i16> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
@@ -1490,21 +3814,43 @@ define void @zext_v8i16_v8i64(<8 x i16> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v8i16_v8i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-160]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %b = zext <8 x i16> %a to <8 x i64>
   store <8 x i64>%b, ptr %out
@@ -1543,36 +3889,144 @@ define void @zext_v16i16_v16i64(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v16i16_v16i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #72]
-; NONEON-NOSVE-NEXT:    ushll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #96]
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    sub sp, sp, #368
+; NONEON-NOSVE-NEXT:    str x29, [sp, #352] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 368
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #268]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #260]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #352] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #284]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w13, w13, w13
+; NONEON-NOSVE-NEXT:    add w14, w14, w14
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w14, w3, w3
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w13, w5, w5
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w14, w4, w4
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w13, w2, w2
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w14, w0, w0
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w13, w18, w18
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w14, w17, w17
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w12, w12, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w11, w11, w11
+; NONEON-NOSVE-NEXT:    add w10, w10, w10
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w13, w16, w16
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w14, w15, w15
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #66]
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #62]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #276]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #332]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #98]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #324]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #102]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #348]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp, #136]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #340]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #264]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #252]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #200]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #256]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #292]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #232]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #316]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #308]
+; NONEON-NOSVE-NEXT:    stp wzr, w9, [sp, #244]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #224]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #364]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #328]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #360]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #320]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #296]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #312]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #288]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #368
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
@@ -1597,13 +4051,19 @@ define void @zext_v4i32_v4i64(<4 x i32> %a, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v4i32_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %b = zext <4 x i32> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
@@ -1628,20 +4088,47 @@ define void @zext_v8i32_v8i64(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: zext_v8i32_v8i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   %b = add <8 x i32> %a, %a
@@ -1672,17 +4159,17 @@ define void @extend_and_mul(i32 %0, <2 x i64> %1, ptr %2) {
 ;
 ; NONEON-NOSVE-LABEL: extend_and_mul:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v1.2s, w0
-; NONEON-NOSVE-NEXT:    fmov x10, d0
-; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    fmov x11, d1
-; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
-; NONEON-NOSVE-NEXT:    mul x10, x11, x10
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    mov w9, w0
+; NONEON-NOSVE-NEXT:    mul x10, x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
 ; NONEON-NOSVE-NEXT:    mul x8, x9, x8
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x8
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %broadcast.splatinsert2 = insertelement <2 x i32> poison, i32 %0, i64 0
   %broadcast.splat3 = shufflevector <2 x i32> %broadcast.splatinsert2, <2 x i32> poison, <2 x i32> zeroinitializer
@@ -1702,9 +4189,12 @@ define void @extend_no_mul(i32 %0, <2 x i64> %1, ptr %2) {
 ;
 ; NONEON-NOSVE-LABEL: extend_no_mul:
 ; NONEON-NOSVE:       // %bb.0: // %entry
-; NONEON-NOSVE-NEXT:    dup v0.2s, w0
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    mov w8, w0
+; NONEON-NOSVE-NEXT:    stp x8, x8, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 entry:
   %broadcast.splatinsert2 = insertelement <2 x i32> poison, i32 %0, i64 0
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
index ade60b07150ce2..1f5bb5f5486af3 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
@@ -26,11 +26,108 @@ define void @add_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: add_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i32 0
@@ -51,12 +148,60 @@ define void @add_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: add_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -77,12 +222,32 @@ define void @add_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: add_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    add w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    add w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    add w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -103,12 +268,22 @@ define void @add_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: add_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    add v1.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    add v0.2d, v2.2d, v0.2d
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    add x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -133,11 +308,108 @@ define void @and_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: and_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i32 0
@@ -158,12 +430,60 @@ define void @and_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: and_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -184,12 +504,32 @@ define void @and_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: and_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    and w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    and w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -210,12 +550,22 @@ define void @and_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: and_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and x9, x8, #0x3f
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and x8, x8, #0x3f
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and x9, x8, #0x3f
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    and x8, x8, #0x3f
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -240,10 +590,108 @@ define void @ashr_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    cmlt v1.16b, v1.16b, #0
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i32 0
@@ -264,10 +712,60 @@ define void @ashr_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v0.8h, v0.8h, #0
-; NONEON-NOSVE-NEXT:    cmlt v1.8h, v1.8h, #0
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -288,10 +786,32 @@ define void @ashr_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4s, v0.4s, #0
-; NONEON-NOSVE-NEXT:    cmlt v1.4s, v1.4s, #0
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    asr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -312,10 +832,22 @@ define void @ashr_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v0.2d, v0.2d, #0
-; NONEON-NOSVE-NEXT:    cmlt v1.2d, v1.2d, #0
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    asr x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -343,11 +875,140 @@ define void @icmp_eq_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_eq_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmeq v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    cmeq v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
@@ -372,12 +1033,76 @@ define void @icmp_sge_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_sge_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    cmge v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    cmge v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -402,12 +1127,40 @@ define void @icmp_sgt_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_sgt_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #-8 // =0xfffffff8
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    cmgt v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    cmgt v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csetm w9, gt
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm w9, gt
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w9, gt
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    csetm w9, gt
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 -8, i64 0
@@ -432,12 +1185,26 @@ define void @icmp_ult_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: icmp_ult_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    cmhi v1.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    cmhi v0.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm x9, lo
+; NONEON-NOSVE-NEXT:    cmp x8, #63
+; NONEON-NOSVE-NEXT:    csetm x8, lo
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    csetm x9, lo
+; NONEON-NOSVE-NEXT:    cmp x8, #63
+; NONEON-NOSVE-NEXT:    csetm x8, lo
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -463,10 +1230,108 @@ define void @lshr_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ushr v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    ushr v1.16b, v1.16b, #7
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
@@ -487,10 +1352,60 @@ define void @lshr_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ushr v0.8h, v0.8h, #15
-; NONEON-NOSVE-NEXT:    ushr v1.8h, v1.8h, #15
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -511,10 +1426,32 @@ define void @lshr_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ushr v0.4s, v0.4s, #31
-; NONEON-NOSVE-NEXT:    ushr v1.4s, v1.4s, #31
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -535,10 +1472,22 @@ define void @lshr_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ushr v0.2d, v0.2d, #63
-; NONEON-NOSVE-NEXT:    ushr v1.2d, v1.2d, #63
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -563,11 +1512,140 @@ define void @mul_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    mul v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    mul v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
@@ -588,12 +1666,76 @@ define void @mul_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    mul v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    mul v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -614,12 +1756,44 @@ define void @mul_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    mul v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    mul v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -640,24 +1814,28 @@ define void @mul_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: mul_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    fmov x10, d0
-; NONEON-NOSVE-NEXT:    fmov x11, d1
-; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
-; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
-; NONEON-NOSVE-NEXT:    lsl x12, x10, #6
-; NONEON-NOSVE-NEXT:    lsl x13, x11, #6
-; NONEON-NOSVE-NEXT:    lsl x14, x8, #6
-; NONEON-NOSVE-NEXT:    sub x10, x12, x10
-; NONEON-NOSVE-NEXT:    sub x11, x13, x11
-; NONEON-NOSVE-NEXT:    lsl x12, x9, #6
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    fmov d1, x11
-; NONEON-NOSVE-NEXT:    sub x8, x14, x8
-; NONEON-NOSVE-NEXT:    sub x9, x12, x9
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x8
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x9
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl x9, x8, #6
+; NONEON-NOSVE-NEXT:    sub x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl x9, x8, #6
+; NONEON-NOSVE-NEXT:    sub x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl x9, x8, #6
+; NONEON-NOSVE-NEXT:    sub x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    lsl x9, x8, #6
+; NONEON-NOSVE-NEXT:    sub x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -682,11 +1860,108 @@ define void @or_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: or_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
@@ -707,12 +1982,60 @@ define void @or_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: or_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -733,12 +2056,32 @@ define void @or_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: or_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -759,12 +2102,22 @@ define void @or_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: or_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr x9, x8, #0x3f
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr x8, x8, #0x3f
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr x9, x8, #0x3f
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    orr x8, x8, #0x3f
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -789,10 +2142,108 @@ define void @shl_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    shl v1.16b, v1.16b, #7
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
@@ -813,10 +2264,60 @@ define void @shl_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    shl v0.8h, v0.8h, #15
-; NONEON-NOSVE-NEXT:    shl v1.8h, v1.8h, #15
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -837,10 +2338,32 @@ define void @shl_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    shl v0.4s, v0.4s, #31
-; NONEON-NOSVE-NEXT:    shl v1.4s, v1.4s, #31
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -861,10 +2384,22 @@ define void @shl_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    shl v0.2d, v0.2d, #63
-; NONEON-NOSVE-NEXT:    shl v1.2d, v1.2d, #63
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    lsl x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -889,11 +2424,141 @@ define void @smax_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smax v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    smax v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #7 // =0x7
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
@@ -914,12 +2579,77 @@ define void @smax_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    smax v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    smax v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -940,12 +2670,41 @@ define void @smax_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    smax v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    smax v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -966,14 +2725,27 @@ define void @smax_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    cmgt v3.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    cmgt v4.2d, v2.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bif v1.16b, v0.16b, v3.16b
-; NONEON-NOSVE-NEXT:    bit v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, gt
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x9, x9, x8, gt
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, gt
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, gt
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -998,11 +2770,141 @@ define void @smin_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smin v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    smin v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #7 // =0x7
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
@@ -1023,12 +2925,77 @@ define void @smin_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    smin v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    smin v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -1049,12 +3016,41 @@ define void @smin_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    smin v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    smin v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -1075,14 +3071,27 @@ define void @smin_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    cmgt v3.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    cmgt v4.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    bif v1.16b, v0.16b, v3.16b
-; NONEON-NOSVE-NEXT:    bit v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, lt
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x9, x9, x8, lt
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, lt
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lt
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -1107,11 +3116,108 @@ define void @sub_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    sub v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    sub v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
@@ -1132,12 +3238,60 @@ define void @sub_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    sub v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    sub v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -1158,12 +3312,32 @@ define void @sub_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    sub v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sub v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    sub w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    sub w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -1184,12 +3358,22 @@ define void @sub_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: sub_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    sub v1.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    sub v0.2d, v2.2d, v0.2d
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    sub x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    sub x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -1214,11 +3398,141 @@ define void @umax_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umax v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    umax v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #7 // =0x7
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #31]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #29]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #27]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #25]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #23]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #21]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #17]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
@@ -1239,12 +3553,77 @@ define void @umax_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    umax v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    umax v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -1265,12 +3644,41 @@ define void @umax_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    umax v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umax v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, hi
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, hi
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, hi
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -1291,14 +3699,27 @@ define void @umax_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    cmhi v3.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    cmhi v4.2d, v2.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bif v1.16b, v0.16b, v3.16b
-; NONEON-NOSVE-NEXT:    bit v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, hi
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x9, x9, x8, hi
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, hi
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, hi
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -1323,11 +3744,141 @@ define void @umin_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umin v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    umin v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #7 // =0x7
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
@@ -1348,12 +3899,77 @@ define void @umin_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    umin v1.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    umin v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -1374,12 +3990,41 @@ define void @umin_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    umin v1.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umin v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -1400,14 +4045,27 @@ define void @umin_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    cmhi v3.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    cmhi v4.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    bif v1.16b, v0.16b, v3.16b
-; NONEON-NOSVE-NEXT:    bit v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, lo
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x9, x9, x8, lo
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, lo
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lo
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
@@ -1432,11 +4090,108 @@ define void @xor_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #7
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    eor v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
@@ -1457,12 +4212,60 @@ define void @xor_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    eor v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
@@ -1483,12 +4286,32 @@ define void @xor_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    eor v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    eor w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    eor w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
@@ -1509,12 +4332,22 @@ define void @xor_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    eor v1.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor x9, x8, #0x3f
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor x8, x8, #0x3f
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor x9, x8, #0x3f
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    eor x8, x8, #0x3f
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
index 4fc7ec3a8439df..3137a7bc7ad270 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
@@ -20,7 +20,43 @@ define <8 x i8> @and_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: and_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = and <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -37,7 +73,74 @@ define <16 x i8> @and_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: and_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = and <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -55,11 +158,143 @@ define void @and_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: and_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -79,7 +314,27 @@ define <4 x i16> @and_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: and_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = and <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -96,7 +351,42 @@ define <8 x i16> @and_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: and_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = and <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -114,11 +404,79 @@ define void @and_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: and_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -138,7 +496,18 @@ define <2 x i32> @and_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: and_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = and <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -155,7 +524,24 @@ define <4 x i32> @and_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: and_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = and <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -173,11 +559,43 @@ define void @and_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: and_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -197,7 +615,14 @@ define <1 x i64> @and_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: and_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    and x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = and <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -214,7 +639,17 @@ define <2 x i64> @and_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: and_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = and <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -232,11 +667,29 @@ define void @and_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: and_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    and v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    and x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    and x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -260,7 +713,43 @@ define <8 x i8> @or_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: or_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = or <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -277,7 +766,74 @@ define <16 x i8> @or_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: or_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = or <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -295,11 +851,143 @@ define void @or_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: or_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -319,7 +1007,27 @@ define <4 x i16> @or_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: or_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = or <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -336,7 +1044,42 @@ define <8 x i16> @or_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: or_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = or <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -354,11 +1097,79 @@ define void @or_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: or_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -378,7 +1189,18 @@ define <2 x i32> @or_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: or_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = or <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -395,7 +1217,24 @@ define <4 x i32> @or_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: or_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = or <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -413,11 +1252,43 @@ define void @or_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: or_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -437,7 +1308,14 @@ define <1 x i64> @or_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: or_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    orr x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = or <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -454,7 +1332,17 @@ define <2 x i64> @or_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: or_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = or <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -472,11 +1360,29 @@ define void @or_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: or_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orr v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    orr x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    orr x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -500,7 +1406,43 @@ define <8 x i8> @xor_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = xor <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -517,7 +1459,74 @@ define <16 x i8> @xor_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = xor <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -535,11 +1544,143 @@ define void @xor_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -559,7 +1700,27 @@ define <4 x i16> @xor_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = xor <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -576,7 +1737,42 @@ define <8 x i16> @xor_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = xor <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -594,11 +1790,79 @@ define void @xor_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -618,7 +1882,18 @@ define <2 x i32> @xor_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = xor <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -635,7 +1910,24 @@ define <4 x i32> @xor_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = xor <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -653,11 +1945,43 @@ define void @xor_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -677,7 +2001,14 @@ define <1 x i64> @xor_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    eor x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = xor <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -694,7 +2025,17 @@ define <2 x i64> @xor_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    eor v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = xor <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -712,11 +2053,29 @@ define void @xor_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: xor_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    eor v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    eor x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    eor x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
index b9c859a58611e8..4775a965b70d77 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
@@ -21,7 +21,51 @@ define <8 x i8> @smax_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smax v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.smax.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
@@ -39,7 +83,90 @@ define <16 x i8> @smax_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smax v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
@@ -59,11 +186,175 @@ define void @smax_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smax v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    smax v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -84,7 +375,31 @@ define <4 x i16> @smax_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smax v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.smax.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
@@ -102,7 +417,50 @@ define <8 x i16> @smax_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smax v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
@@ -122,11 +480,95 @@ define void @smax_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smax v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    smax v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -147,7 +589,19 @@ define <2 x i32> @smax_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smax v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.smax.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
@@ -165,7 +619,26 @@ define <4 x i32> @smax_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smax v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
@@ -185,11 +658,47 @@ define void @smax_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smax v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    smax v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -211,8 +720,15 @@ define <1 x i64> @smax_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmgt d2, d0, d1
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, gt
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.smax.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
@@ -231,8 +747,18 @@ define <2 x i64> @smax_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmgt v2.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, gt
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, gt
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
@@ -252,14 +778,31 @@ define void @smax_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smax_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmgt v4.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    cmgt v5.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, gt
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, gt
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, gt
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, gt
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -284,7 +827,51 @@ define <8 x i8> @smin_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smin v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.smin.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
@@ -302,7 +889,90 @@ define <16 x i8> @smin_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smin v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
@@ -322,11 +992,175 @@ define void @smin_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smin v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    smin v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -347,7 +1181,31 @@ define <4 x i16> @smin_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smin v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.smin.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
@@ -365,7 +1223,50 @@ define <8 x i16> @smin_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smin v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
@@ -385,11 +1286,95 @@ define void @smin_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smin v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    smin v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -410,7 +1395,19 @@ define <2 x i32> @smin_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smin v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
@@ -428,7 +1425,26 @@ define <4 x i32> @smin_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smin v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
@@ -448,11 +1464,47 @@ define void @smin_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smin v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    smin v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -474,8 +1526,15 @@ define <1 x i64> @smin_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmgt d2, d1, d0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lt
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.smin.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
@@ -494,8 +1553,18 @@ define <2 x i64> @smin_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmgt v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, lt
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lt
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
@@ -515,14 +1584,31 @@ define void @smin_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smin_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmgt v4.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    cmgt v5.2d, v3.2d, v2.2d
-; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, lt
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lt
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, lt
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lt
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -547,7 +1633,51 @@ define <8 x i8> @umax_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umax v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.umax.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
@@ -565,7 +1695,90 @@ define <16 x i8> @umax_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umax v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
@@ -585,11 +1798,175 @@ define void @umax_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umax v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    umax v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -610,7 +1987,31 @@ define <4 x i16> @umax_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umax v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.umax.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
@@ -628,7 +2029,50 @@ define <8 x i16> @umax_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umax v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
@@ -648,11 +2092,95 @@ define void @umax_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umax v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    umax v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -673,7 +2201,19 @@ define <2 x i32> @umax_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umax v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.umax.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
@@ -691,7 +2231,26 @@ define <4 x i32> @umax_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umax v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
@@ -711,11 +2270,47 @@ define void @umax_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umax v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umax v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -737,8 +2332,15 @@ define <1 x i64> @umax_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmhi d2, d0, d1
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, hi
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.umax.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
@@ -757,8 +2359,18 @@ define <2 x i64> @umax_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmhi v2.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, hi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, hi
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
@@ -778,14 +2390,31 @@ define void @umax_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umax_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmhi v4.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    cmhi v5.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, hi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, hi
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, hi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, hi
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -810,7 +2439,51 @@ define <8 x i8> @umin_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umin v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.umin.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
@@ -828,7 +2501,90 @@ define <16 x i8> @umin_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umin v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
@@ -848,11 +2604,175 @@ define void @umin_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umin v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    umin v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -873,7 +2793,31 @@ define <4 x i16> @umin_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umin v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.umin.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
@@ -891,7 +2835,50 @@ define <8 x i16> @umin_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umin v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
@@ -911,11 +2898,95 @@ define void @umin_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umin v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    umin v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -936,7 +3007,19 @@ define <2 x i32> @umin_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umin v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.umin.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
@@ -954,7 +3037,26 @@ define <4 x i32> @umin_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umin v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
@@ -974,11 +3076,47 @@ define void @umin_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umin v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umin v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -1000,8 +3138,15 @@ define <1 x i64> @umin_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmhi d2, d1, d0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lo
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.umin.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
@@ -1020,8 +3165,18 @@ define <2 x i64> @umin_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmhi v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, lo
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lo
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
@@ -1041,14 +3196,31 @@ define void @umin_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umin_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    cmhi v4.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    cmhi v5.2d, v3.2d, v2.2d
-; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, lo
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lo
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, lo
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lo
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll
index 3a03de3442d581..94d5bb1543b0e0 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll
@@ -24,8 +24,51 @@ define <8 x i8> @mla8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) {
 ;
 ; NONEON-NOSVE-LABEL: mla8xi8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mla v2.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov d0, d2
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #7]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #6]
+; NONEON-NOSVE-NEXT:    madd w1, w2, w1, w5
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w1, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #9]
+; NONEON-NOSVE-NEXT:    madd w1, w4, w3, w1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w1, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #21]
+; NONEON-NOSVE-NEXT:    madd w18, w0, w18, w1
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #20]
+; NONEON-NOSVE-NEXT:    madd w16, w17, w16, w18
+; NONEON-NOSVE-NEXT:    strb w16, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #19]
+; NONEON-NOSVE-NEXT:    madd w14, w15, w14, w16
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #18]
+; NONEON-NOSVE-NEXT:    madd w12, w13, w12, w14
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #17]
+; NONEON-NOSVE-NEXT:    madd w10, w11, w10, w12
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    madd w8, w9, w8, w10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = mul <8 x i8> %A, %B;
   %tmp2 = add <8 x i8> %C, %tmp1;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
index 1ed3d8fa39d8da..6198926c0b4381 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
@@ -40,12 +40,31 @@ define <4 x i8> @smulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    shl v1.4h, v1.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v1.4h, v1.4h, #8
-; NONEON-NOSVE-NEXT:    mul v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ushr v0.4h, v0.4h, #4
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w12
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w13
+; NONEON-NOSVE-NEXT:    mul w10, w10, w14
+; NONEON-NOSVE-NEXT:    mul w11, w11, w12
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #4, #12
+; NONEON-NOSVE-NEXT:    ubfx w9, w9, #4, #12
+; NONEON-NOSVE-NEXT:    ubfx w10, w10, #4, #12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ubfx w8, w11, #4, #12
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i16> undef, i16 4, i64 0
   %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -77,8 +96,51 @@ define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smull v0.8h, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    shrn v0.8b, v0.8h, #8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w15, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsb w17, [sp, #22]
+; NONEON-NOSVE-NEXT:    mul w15, w15, w16
+; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrsb w18, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    mul w14, w14, w17
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w16
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrsb w17, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w18
+; NONEON-NOSVE-NEXT:    lsr w15, w15, #8
+; NONEON-NOSVE-NEXT:    ldrsb w0, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w14, w14, #8
+; NONEON-NOSVE-NEXT:    ldrsb w18, [sp, #17]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w17
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #8
+; NONEON-NOSVE-NEXT:    mul w11, w11, w0
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #8
+; NONEON-NOSVE-NEXT:    strb w15, [sp, #31]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w16
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #30]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w18
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #8
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i16> undef, i16 8, i64 0
   %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
@@ -110,9 +172,116 @@ define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smull2 v2.8h, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    smull v0.8h, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    uzp2 v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    str x27, [sp, #80] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #96] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #112] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #128] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #144] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -80
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w6, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w7, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w19, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w20, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrsb w21, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w23, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w25, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrsb w26, [sp, #60]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #52]
+; NONEON-NOSVE-NEXT:    mul w20, w20, w21
+; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrsb w15, [sp, #54]
+; NONEON-NOSVE-NEXT:    mul w19, w19, w23
+; NONEON-NOSVE-NEXT:    ldrsb w17, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrsb w0, [sp, #40]
+; NONEON-NOSVE-NEXT:    mul w7, w7, w25
+; NONEON-NOSVE-NEXT:    ldrsb w2, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w3, [sp, #42]
+; NONEON-NOSVE-NEXT:    mul w6, w6, w26
+; NONEON-NOSVE-NEXT:    lsr w20, w20, #8
+; NONEON-NOSVE-NEXT:    ldrsb w4, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #88]
+; NONEON-NOSVE-NEXT:    lsr w19, w19, #8
+; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrsb w18, [sp, #90]
+; NONEON-NOSVE-NEXT:    lsr w7, w7, #8
+; NONEON-NOSVE-NEXT:    ldrsb w1, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrsb w5, [sp, #92]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w16
+; NONEON-NOSVE-NEXT:    lsr w6, w6, #8
+; NONEON-NOSVE-NEXT:    ldrsb w22, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrsb w24, [sp, #94]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w1
+; NONEON-NOSVE-NEXT:    ldrsb w21, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrsb w23, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w5
+; NONEON-NOSVE-NEXT:    ldrsb w27, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrsb w25, [sp, #58]
+; NONEON-NOSVE-NEXT:    mul w15, w15, w24
+; NONEON-NOSVE-NEXT:    ldrsb w26, [sp, #57]
+; NONEON-NOSVE-NEXT:    mul w0, w0, w23
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #8
+; NONEON-NOSVE-NEXT:    mul w4, w4, w27
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
+; NONEON-NOSVE-NEXT:    mul w3, w3, w25
+; NONEON-NOSVE-NEXT:    lsr w15, w15, #8
+; NONEON-NOSVE-NEXT:    strb w20, [sp, #79]
+; NONEON-NOSVE-NEXT:    mul w2, w2, w26
+; NONEON-NOSVE-NEXT:    lsr w0, w0, #8
+; NONEON-NOSVE-NEXT:    strb w19, [sp, #78]
+; NONEON-NOSVE-NEXT:    mul w17, w17, w21
+; NONEON-NOSVE-NEXT:    lsr w4, w4, #8
+; NONEON-NOSVE-NEXT:    strb w7, [sp, #77]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w22
+; NONEON-NOSVE-NEXT:    lsr w3, w3, #8
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #76]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w18
+; NONEON-NOSVE-NEXT:    lsr w2, w2, #8
+; NONEON-NOSVE-NEXT:    strb w4, [sp, #75]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w14
+; NONEON-NOSVE-NEXT:    lsr w17, w17, #8
+; NONEON-NOSVE-NEXT:    strb w3, [sp, #74]
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #8
+; NONEON-NOSVE-NEXT:    strb w2, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldr x27, [sp, #80] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #8
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #72]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #71]
+; NONEON-NOSVE-NEXT:    strb w15, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #144] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #128] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #112] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #96] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #66]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %1 = sext <16 x i8> %op1 to <16 x i16>
   %2 = sext <16 x i8> %op2 to <16 x i16>
@@ -145,15 +314,251 @@ define void @smulh_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smull2 v4.8h, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    smull v0.8h, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    smull2 v1.8h, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    smull v2.8h, v2.8b, v3.8b
-; NONEON-NOSVE-NEXT:    uzp2 v0.16b, v0.16b, v4.16b
-; NONEON-NOSVE-NEXT:    uzp2 v1.16b, v2.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #384
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #288] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #304] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #320] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #336] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #352] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #368] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 384
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    mov x29, x0
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #144]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #185]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #186]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #187]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #188]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #189]
+; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #229]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #227]
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #228]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #190]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #191]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #177]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #226]
+; NONEON-NOSVE-NEXT:    ldrsb w2, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldrsb w1, [sp, #215]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #178]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #179]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrsb w4, [sp, #212]
+; NONEON-NOSVE-NEXT:    ldrsb w3, [sp, #213]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #181]
+; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #247]
+; NONEON-NOSVE-NEXT:    ldrsb w15, [sp, #246]
+; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #244]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #182]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #183]
+; NONEON-NOSVE-NEXT:    mul w26, w12, w16
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #242]
+; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #250]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #233]
+; NONEON-NOSVE-NEXT:    mul w30, w10, w12
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #255]
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #253]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #234]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #235]
+; NONEON-NOSVE-NEXT:    ldrsb w0, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldrsb w18, [sp, #249]
+; NONEON-NOSVE-NEXT:    ldrsb w6, [sp, #210]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #236]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #237]
+; NONEON-NOSVE-NEXT:    ldrsb w5, [sp, #211]
+; NONEON-NOSVE-NEXT:    ldrsb w19, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldrsb w7, [sp, #209]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #238]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #239]
+; NONEON-NOSVE-NEXT:    ldrsb w21, [sp, #222]
+; NONEON-NOSVE-NEXT:    ldrsb w20, [sp, #223]
+; NONEON-NOSVE-NEXT:    ldrsb w23, [sp, #220]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #225]
+; NONEON-NOSVE-NEXT:    ldrsb w22, [sp, #221]
+; NONEON-NOSVE-NEXT:    ldrsb w24, [sp, #219]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #230]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #231]
+; NONEON-NOSVE-NEXT:    mul w27, w8, w14
+; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #245]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #217]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w15
+; NONEON-NOSVE-NEXT:    ldrsb w15, [sp, #251]
+; NONEON-NOSVE-NEXT:    mul w25, w13, w14
+; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #243]
+; NONEON-NOSVE-NEXT:    lsr w14, w27, #8
+; NONEON-NOSVE-NEXT:    ldrsb w27, [sp, #218]
+; NONEON-NOSVE-NEXT:    lsr w17, w9, #8
+; NONEON-NOSVE-NEXT:    mul w28, w11, w13
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #287]
+; NONEON-NOSVE-NEXT:    lsr w14, w25, #8
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #24] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #241]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #285]
+; NONEON-NOSVE-NEXT:    lsr w14, w28, #8
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #254]
+; NONEON-NOSVE-NEXT:    mul w8, w25, w8
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #28] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #252]
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #283]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #40] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w9, w25, w9
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #32] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #286]
+; NONEON-NOSVE-NEXT:    mul w12, w14, w12
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
+; NONEON-NOSVE-NEXT:    lsr w17, w26, #8
+; NONEON-NOSVE-NEXT:    mul w10, w25, w10
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #36] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #44] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #281]
+; NONEON-NOSVE-NEXT:    mul w11, w25, w11
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #284]
+; NONEON-NOSVE-NEXT:    lsr w17, w30, #8
+; NONEON-NOSVE-NEXT:    mul w13, w14, w13
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #48] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #320] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    mul w10, w10, w15
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #52] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #279]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #56] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w11, w11, w16
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #278]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    mul w12, w12, w18
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #60] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #277]
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #64] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #276]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w0
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #68] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w10, w10, w1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #275]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    mul w11, w11, w2
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #72] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #274]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #76] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #273]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w3
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #80] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w13, w13, w4
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #272]
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    mul w10, w10, w5
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #84] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #271]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #88] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #270]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w6
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #92] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w12, w12, w7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #269]
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    mul w13, w13, w19
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #96] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #268]
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #100] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #267]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w20
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #104] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w11, w11, w21
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #266]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #108] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w12, w12, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #265]
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #112] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #264]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w23
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #116] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp w15, w14, [sp, #16] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w10, w10, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #263]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    mul w11, w11, w27
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #120] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #262]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #124] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #261]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w15
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #282]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w14
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #260]
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #259]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #368] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #258]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #352] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #257]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #336] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #304] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x29]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #288] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #384
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -193,12 +598,20 @@ define <2 x i16> @smulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    shl v1.2s, v1.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v1.2s, v1.2s, #16
-; NONEON-NOSVE-NEXT:    mul v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ushr v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w10
+; NONEON-NOSVE-NEXT:    mul w9, w9, w11
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %1 = sext <2 x i16> %op1 to <2 x i32>
   %2 = sext <2 x i16> %op2 to <2 x i32>
@@ -228,8 +641,31 @@ define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smull v0.4s, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    shrn v0.4h, v0.4s, #16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsh w12, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsh w13, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w14, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w12
+; NONEON-NOSVE-NEXT:    ldrsh w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w13
+; NONEON-NOSVE-NEXT:    mul w9, w9, w14
+; NONEON-NOSVE-NEXT:    mul w8, w8, w12
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %1 = sext <4 x i16> %op1 to <4 x i32>
   %2 = sext <4 x i16> %op2 to <4 x i32>
@@ -259,9 +695,54 @@ define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smull2 v2.4s, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    smull v0.4s, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w15, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w12, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w13, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w14, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w16, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w17, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w18, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w0, [sp, #62]
+; NONEON-NOSVE-NEXT:    mul w15, w15, w16
+; NONEON-NOSVE-NEXT:    ldrsh w16, [sp, #48]
+; NONEON-NOSVE-NEXT:    mul w14, w14, w17
+; NONEON-NOSVE-NEXT:    ldrsh w17, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w18
+; NONEON-NOSVE-NEXT:    ldrsh w18, [sp, #60]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w16
+; NONEON-NOSVE-NEXT:    ldrsh w16, [sp, #58]
+; NONEON-NOSVE-NEXT:    lsr w15, w15, #16
+; NONEON-NOSVE-NEXT:    mul w11, w11, w0
+; NONEON-NOSVE-NEXT:    lsr w14, w14, #16
+; NONEON-NOSVE-NEXT:    mul w10, w10, w18
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #16
+; NONEON-NOSVE-NEXT:    strh w15, [sp, #78]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w16
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #16
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #76]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w17
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #74]
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #72]
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #70]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #68]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %1 = sext <8 x i16> %op1 to <8 x i32>
   %2 = sext <8 x i16> %op2 to <8 x i32>
@@ -294,15 +775,125 @@ define void @smulh_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smull2 v4.4s, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    smull v0.4s, v1.4h, v0.4h
-; NONEON-NOSVE-NEXT:    smull2 v1.4s, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    smull v2.4s, v2.4h, v3.4h
-; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v0.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp2 v1.8h, v2.8h, v1.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #240
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #160] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #192] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #208] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #224] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 240
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w12, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w13, [sp, #50]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w7, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrsh w19, [sp, #98]
+; NONEON-NOSVE-NEXT:    ldrsh w20, [sp, #100]
+; NONEON-NOSVE-NEXT:    ldrsh w21, [sp, #102]
+; NONEON-NOSVE-NEXT:    ldrsh w14, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w16, [sp, #54]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w18, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w2, [sp, #106]
+; NONEON-NOSVE-NEXT:    ldrsh w4, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldrsh w5, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldrsh w15, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsh w17, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w1, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsh w3, [sp, #94]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w15
+; NONEON-NOSVE-NEXT:    ldrsh w6, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w23, [sp, #82]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w3
+; NONEON-NOSVE-NEXT:    ldrsh w25, [sp, #84]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w23
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    mul w14, w14, w25
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    mul w12, w12, w6
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w1
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #16
+; NONEON-NOSVE-NEXT:    ldrsh w22, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrsh w24, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrsh w26, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrsh w27, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsh w28, [sp, #126]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w17
+; NONEON-NOSVE-NEXT:    mul w21, w21, w22
+; NONEON-NOSVE-NEXT:    ldrsh w22, [sp, #86]
+; NONEON-NOSVE-NEXT:    lsr w14, w14, #16
+; NONEON-NOSVE-NEXT:    mul w20, w20, w24
+; NONEON-NOSVE-NEXT:    ldrsh w24, [sp, #120]
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #16
+; NONEON-NOSVE-NEXT:    mul w19, w19, w26
+; NONEON-NOSVE-NEXT:    ldrsh w26, [sp, #124]
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
+; NONEON-NOSVE-NEXT:    mul w7, w7, w27
+; NONEON-NOSVE-NEXT:    ldrsh w27, [sp, #122]
+; NONEON-NOSVE-NEXT:    lsr w21, w21, #16
+; NONEON-NOSVE-NEXT:    mul w5, w5, w28
+; NONEON-NOSVE-NEXT:    lsr w20, w20, #16
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    mul w4, w4, w26
+; NONEON-NOSVE-NEXT:    lsr w19, w19, #16
+; NONEON-NOSVE-NEXT:    strh w21, [sp, #158]
+; NONEON-NOSVE-NEXT:    mul w2, w2, w27
+; NONEON-NOSVE-NEXT:    lsr w7, w7, #16
+; NONEON-NOSVE-NEXT:    strh w20, [sp, #156]
+; NONEON-NOSVE-NEXT:    mul w18, w18, w24
+; NONEON-NOSVE-NEXT:    lsr w5, w5, #16
+; NONEON-NOSVE-NEXT:    strh w19, [sp, #154]
+; NONEON-NOSVE-NEXT:    mul w16, w16, w22
+; NONEON-NOSVE-NEXT:    lsr w4, w4, #16
+; NONEON-NOSVE-NEXT:    strh w7, [sp, #152]
+; NONEON-NOSVE-NEXT:    lsr w2, w2, #16
+; NONEON-NOSVE-NEXT:    strh w5, [sp, #150]
+; NONEON-NOSVE-NEXT:    lsr w18, w18, #16
+; NONEON-NOSVE-NEXT:    strh w4, [sp, #148]
+; NONEON-NOSVE-NEXT:    lsr w16, w16, #16
+; NONEON-NOSVE-NEXT:    strh w2, [sp, #146]
+; NONEON-NOSVE-NEXT:    strh w18, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #224] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w16, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #208] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #176] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #160] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #134]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #132]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #130]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #240
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -335,8 +926,18 @@ define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smull v0.2d, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    shrn v0.2s, v0.2d, #32
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldpsw x11, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    smull x9, w9, w10
+; NONEON-NOSVE-NEXT:    smull x8, w8, w11
+; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %1 = sext <2 x i32> %op1 to <2 x i64>
   %2 = sext <2 x i32> %op2 to <2 x i64>
@@ -366,9 +967,28 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smull2 v2.2d, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    smull v0.2d, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v0.4s, v2.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldpsw x10, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldpsw x13, x12, [sp, #48]
+; NONEON-NOSVE-NEXT:    smull x11, w11, w12
+; NONEON-NOSVE-NEXT:    ldpsw x12, x14, [sp, #56]
+; NONEON-NOSVE-NEXT:    smull x10, w10, w13
+; NONEON-NOSVE-NEXT:    lsr x11, x11, #32
+; NONEON-NOSVE-NEXT:    smull x9, w9, w14
+; NONEON-NOSVE-NEXT:    smull x8, w8, w12
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
+; NONEON-NOSVE-NEXT:    stp w10, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %1 = sext <4 x i32> %op1 to <4 x i64>
   %2 = sext <4 x i32> %op2 to <4 x i64>
@@ -401,15 +1021,52 @@ define void @smulh_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    smull2 v4.2d, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    smull v0.2d, v1.2s, v0.2s
-; NONEON-NOSVE-NEXT:    smull2 v1.2d, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    smull v2.2d, v2.2s, v3.2s
-; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v0.4s, v4.4s
-; NONEON-NOSVE-NEXT:    uzp2 v1.4s, v2.4s, v1.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldpsw x10, x11, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldpsw x12, x13, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldpsw x14, x15, [sp, #96]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldpsw x17, x16, [sp, #112]
+; NONEON-NOSVE-NEXT:    smull x15, w15, w16
+; NONEON-NOSVE-NEXT:    ldpsw x16, x18, [sp, #120]
+; NONEON-NOSVE-NEXT:    smull x14, w14, w17
+; NONEON-NOSVE-NEXT:    ldpsw x17, x1, [sp, #80]
+; NONEON-NOSVE-NEXT:    smull x13, w13, w18
+; NONEON-NOSVE-NEXT:    lsr x15, x15, #32
+; NONEON-NOSVE-NEXT:    smull x12, w12, w16
+; NONEON-NOSVE-NEXT:    lsr x14, x14, #32
+; NONEON-NOSVE-NEXT:    ldpsw x16, x18, [sp, #88]
+; NONEON-NOSVE-NEXT:    smull x11, w11, w1
+; NONEON-NOSVE-NEXT:    lsr x13, x13, #32
+; NONEON-NOSVE-NEXT:    stp w14, w15, [sp, #152]
+; NONEON-NOSVE-NEXT:    smull x10, w10, w17
+; NONEON-NOSVE-NEXT:    lsr x12, x12, #32
+; NONEON-NOSVE-NEXT:    smull x9, w9, w18
+; NONEON-NOSVE-NEXT:    smull x8, w8, w16
+; NONEON-NOSVE-NEXT:    lsr x11, x11, #32
+; NONEON-NOSVE-NEXT:    stp w12, w13, [sp, #144]
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
+; NONEON-NOSVE-NEXT:    stp w10, w11, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -442,12 +1099,14 @@ define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fmov x8, d0
 ; NONEON-NOSVE-NEXT:    fmov x9, d1
 ; NONEON-NOSVE-NEXT:    smulh x8, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <1 x i128> undef, i128 64, i128 0
   %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer
@@ -479,15 +1138,17 @@ define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
-; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
-; NONEON-NOSVE-NEXT:    fmov x10, d0
-; NONEON-NOSVE-NEXT:    fmov x11, d1
-; NONEON-NOSVE-NEXT:    smulh x10, x10, x11
-; NONEON-NOSVE-NEXT:    smulh x8, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    smulh x8, x8, x10
+; NONEON-NOSVE-NEXT:    smulh x9, x9, x11
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %1 = sext <2 x i64> %op1 to <2 x i128>
   %2 = sext <2 x i64> %op2 to <2 x i128>
@@ -520,27 +1181,29 @@ define void @smulh_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x11, v0.d[1]
-; NONEON-NOSVE-NEXT:    mov x14, v3.d[1]
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    mov x10, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x13, v2.d[1]
-; NONEON-NOSVE-NEXT:    fmov x12, d3
-; NONEON-NOSVE-NEXT:    smulh x8, x8, x9
-; NONEON-NOSVE-NEXT:    fmov x9, d2
-; NONEON-NOSVE-NEXT:    smulh x10, x10, x11
-; NONEON-NOSVE-NEXT:    smulh x9, x9, x12
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    smulh x11, x13, x14
-; NONEON-NOSVE-NEXT:    fmov d1, x10
-; NONEON-NOSVE-NEXT:    fmov d2, x9
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
-; NONEON-NOSVE-NEXT:    fmov d3, x11
-; NONEON-NOSVE-NEXT:    mov v2.d[1], v3.d[0]
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x13, x12, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    smulh x10, x10, x12
+; NONEON-NOSVE-NEXT:    ldp x14, x12, [sp, #48]
+; NONEON-NOSVE-NEXT:    smulh x11, x11, x13
+; NONEON-NOSVE-NEXT:    smulh x8, x8, x12
+; NONEON-NOSVE-NEXT:    smulh x9, x9, x14
+; NONEON-NOSVE-NEXT:    stp x11, x10, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -583,11 +1246,31 @@ define <4 x i8> @umulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v2.8b
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    mul v0.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ushr v0.4h, v0.4h, #4
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w13
+; NONEON-NOSVE-NEXT:    mul w10, w10, w14
+; NONEON-NOSVE-NEXT:    mul w11, w11, w12
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #4
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #4
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w11, #4
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %1 = zext <4 x i8> %op1 to <4 x i16>
   %2 = zext <4 x i8> %op2 to <4 x i16>
@@ -617,8 +1300,51 @@ define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umull v0.8h, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    shrn v0.8b, v0.8h, #8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #22]
+; NONEON-NOSVE-NEXT:    mul w15, w15, w16
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    mul w14, w14, w17
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w16
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w18
+; NONEON-NOSVE-NEXT:    lsr w15, w15, #8
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w14, w14, #8
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #17]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w17
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #8
+; NONEON-NOSVE-NEXT:    mul w11, w11, w0
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #8
+; NONEON-NOSVE-NEXT:    strb w15, [sp, #31]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w16
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #30]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w18
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #8
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %1 = zext <8 x i8> %op1 to <8 x i16>
   %2 = zext <8 x i8> %op2 to <8 x i16>
@@ -648,9 +1374,116 @@ define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umull2 v2.8h, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    umull v0.8h, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    uzp2 v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    str x27, [sp, #80] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #96] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #112] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #128] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #144] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -80
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #60]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #52]
+; NONEON-NOSVE-NEXT:    mul w20, w20, w21
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #54]
+; NONEON-NOSVE-NEXT:    mul w19, w19, w23
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #40]
+; NONEON-NOSVE-NEXT:    mul w7, w7, w25
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #42]
+; NONEON-NOSVE-NEXT:    mul w6, w6, w26
+; NONEON-NOSVE-NEXT:    lsr w20, w20, #8
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #88]
+; NONEON-NOSVE-NEXT:    lsr w19, w19, #8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #90]
+; NONEON-NOSVE-NEXT:    lsr w7, w7, #8
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #92]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w16
+; NONEON-NOSVE-NEXT:    lsr w6, w6, #8
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #94]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w1
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w5
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #58]
+; NONEON-NOSVE-NEXT:    mul w15, w15, w24
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #57]
+; NONEON-NOSVE-NEXT:    mul w0, w0, w23
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #8
+; NONEON-NOSVE-NEXT:    mul w4, w4, w27
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
+; NONEON-NOSVE-NEXT:    mul w3, w3, w25
+; NONEON-NOSVE-NEXT:    lsr w15, w15, #8
+; NONEON-NOSVE-NEXT:    strb w20, [sp, #79]
+; NONEON-NOSVE-NEXT:    mul w2, w2, w26
+; NONEON-NOSVE-NEXT:    lsr w0, w0, #8
+; NONEON-NOSVE-NEXT:    strb w19, [sp, #78]
+; NONEON-NOSVE-NEXT:    mul w17, w17, w21
+; NONEON-NOSVE-NEXT:    lsr w4, w4, #8
+; NONEON-NOSVE-NEXT:    strb w7, [sp, #77]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w22
+; NONEON-NOSVE-NEXT:    lsr w3, w3, #8
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #76]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w18
+; NONEON-NOSVE-NEXT:    lsr w2, w2, #8
+; NONEON-NOSVE-NEXT:    strb w4, [sp, #75]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w14
+; NONEON-NOSVE-NEXT:    lsr w17, w17, #8
+; NONEON-NOSVE-NEXT:    strb w3, [sp, #74]
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #8
+; NONEON-NOSVE-NEXT:    strb w2, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldr x27, [sp, #80] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #8
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #72]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #71]
+; NONEON-NOSVE-NEXT:    strb w15, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #144] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #128] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #112] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #96] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #66]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %1 = zext <16 x i8> %op1 to <16 x i16>
   %2 = zext <16 x i8> %op2 to <16 x i16>
@@ -683,15 +1516,251 @@ define void @umulh_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umull2 v4.8h, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    umull v0.8h, v1.8b, v0.8b
-; NONEON-NOSVE-NEXT:    umull2 v1.8h, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    umull v2.8h, v2.8b, v3.8b
-; NONEON-NOSVE-NEXT:    uzp2 v0.16b, v0.16b, v4.16b
-; NONEON-NOSVE-NEXT:    uzp2 v1.16b, v2.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #384
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #288] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #304] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #320] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #336] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #352] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #368] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 384
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    mov x29, x0
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #144]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #185]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #186]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #187]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #188]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #189]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #229]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #227]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #228]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #190]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #191]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #177]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #226]
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #215]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #178]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #179]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #212]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #213]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #181]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #247]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #246]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #244]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #182]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #183]
+; NONEON-NOSVE-NEXT:    mul w26, w12, w16
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #242]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #250]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #233]
+; NONEON-NOSVE-NEXT:    mul w30, w10, w12
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #255]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #253]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #234]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #235]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #249]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #210]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #236]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #237]
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #211]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #209]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #238]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #239]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #222]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #223]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #220]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #225]
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #221]
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #219]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #230]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #231]
+; NONEON-NOSVE-NEXT:    mul w27, w8, w14
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #245]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #217]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w15
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #251]
+; NONEON-NOSVE-NEXT:    mul w25, w13, w14
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #243]
+; NONEON-NOSVE-NEXT:    lsr w14, w27, #8
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #218]
+; NONEON-NOSVE-NEXT:    lsr w17, w9, #8
+; NONEON-NOSVE-NEXT:    mul w28, w11, w13
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #287]
+; NONEON-NOSVE-NEXT:    lsr w14, w25, #8
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #24] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #241]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #285]
+; NONEON-NOSVE-NEXT:    lsr w14, w28, #8
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #254]
+; NONEON-NOSVE-NEXT:    mul w8, w25, w8
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #28] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #252]
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #283]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #40] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w9, w25, w9
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #32] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #286]
+; NONEON-NOSVE-NEXT:    mul w12, w14, w12
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
+; NONEON-NOSVE-NEXT:    lsr w17, w26, #8
+; NONEON-NOSVE-NEXT:    mul w10, w25, w10
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #36] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #44] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #281]
+; NONEON-NOSVE-NEXT:    mul w11, w25, w11
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #284]
+; NONEON-NOSVE-NEXT:    lsr w17, w30, #8
+; NONEON-NOSVE-NEXT:    mul w13, w14, w13
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #48] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #320] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    mul w10, w10, w15
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #52] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #279]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #56] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w11, w11, w16
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #278]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    mul w12, w12, w18
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #60] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #277]
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #64] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #276]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w0
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #68] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w10, w10, w1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #275]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    mul w11, w11, w2
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #72] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #274]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #76] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #273]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w3
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #80] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w13, w13, w4
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #272]
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    mul w10, w10, w5
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #84] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #271]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #88] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #270]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w6
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #92] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w12, w12, w7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #269]
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    mul w13, w13, w19
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #96] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #268]
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #100] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #267]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w20
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #104] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w11, w11, w21
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #266]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #108] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w12, w12, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #265]
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #112] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #264]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w23
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #116] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp w15, w14, [sp, #16] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w10, w10, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #263]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    mul w11, w11, w27
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #120] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #262]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #124] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #261]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w15
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #282]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w14
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #260]
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #259]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #368] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #258]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #352] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #257]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #336] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #304] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x29]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #288] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #384
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -730,11 +1799,20 @@ define <2 x i16> @umulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v2.8b
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    mul v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ushr v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w10
+; NONEON-NOSVE-NEXT:    mul w9, w9, w11
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %1 = zext <2 x i16> %op1 to <2 x i32>
   %2 = zext <2 x i16> %op2 to <2 x i32>
@@ -764,8 +1842,31 @@ define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umull v0.4s, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    shrn v0.4h, v0.4s, #16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w12
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w13
+; NONEON-NOSVE-NEXT:    mul w9, w9, w14
+; NONEON-NOSVE-NEXT:    mul w8, w8, w12
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %1 = zext <4 x i16> %op1 to <4 x i32>
   %2 = zext <4 x i16> %op2 to <4 x i32>
@@ -795,9 +1896,54 @@ define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umull2 v2.4s, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    umull v0.4s, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #62]
+; NONEON-NOSVE-NEXT:    mul w15, w15, w16
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #48]
+; NONEON-NOSVE-NEXT:    mul w14, w14, w17
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w18
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #60]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w16
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #58]
+; NONEON-NOSVE-NEXT:    lsr w15, w15, #16
+; NONEON-NOSVE-NEXT:    mul w11, w11, w0
+; NONEON-NOSVE-NEXT:    lsr w14, w14, #16
+; NONEON-NOSVE-NEXT:    mul w10, w10, w18
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #16
+; NONEON-NOSVE-NEXT:    strh w15, [sp, #78]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w16
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #16
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #76]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w17
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #74]
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #72]
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #70]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #68]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %1 = zext <8 x i16> %op1 to <8 x i32>
   %2 = zext <8 x i16> %op2 to <8 x i32>
@@ -830,15 +1976,125 @@ define void @umulh_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umull2 v4.4s, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    umull v0.4s, v1.4h, v0.4h
-; NONEON-NOSVE-NEXT:    umull2 v1.4s, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    umull v2.4s, v2.4h, v3.4h
-; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v0.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp2 v1.8h, v2.8h, v1.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #240
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #160] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #192] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #208] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #224] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 240
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #50]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w7, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrh w19, [sp, #98]
+; NONEON-NOSVE-NEXT:    ldrh w20, [sp, #100]
+; NONEON-NOSVE-NEXT:    ldrh w21, [sp, #102]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #54]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #106]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w1, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #94]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w15
+; NONEON-NOSVE-NEXT:    ldrh w6, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w23, [sp, #82]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w3
+; NONEON-NOSVE-NEXT:    ldrh w25, [sp, #84]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w23
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    mul w14, w14, w25
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    mul w12, w12, w6
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w1
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #16
+; NONEON-NOSVE-NEXT:    ldrh w22, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrh w24, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrh w26, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrh w27, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w28, [sp, #126]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w17
+; NONEON-NOSVE-NEXT:    mul w21, w21, w22
+; NONEON-NOSVE-NEXT:    ldrh w22, [sp, #86]
+; NONEON-NOSVE-NEXT:    lsr w14, w14, #16
+; NONEON-NOSVE-NEXT:    mul w20, w20, w24
+; NONEON-NOSVE-NEXT:    ldrh w24, [sp, #120]
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #16
+; NONEON-NOSVE-NEXT:    mul w19, w19, w26
+; NONEON-NOSVE-NEXT:    ldrh w26, [sp, #124]
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
+; NONEON-NOSVE-NEXT:    mul w7, w7, w27
+; NONEON-NOSVE-NEXT:    ldrh w27, [sp, #122]
+; NONEON-NOSVE-NEXT:    lsr w21, w21, #16
+; NONEON-NOSVE-NEXT:    mul w5, w5, w28
+; NONEON-NOSVE-NEXT:    lsr w20, w20, #16
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    mul w4, w4, w26
+; NONEON-NOSVE-NEXT:    lsr w19, w19, #16
+; NONEON-NOSVE-NEXT:    strh w21, [sp, #158]
+; NONEON-NOSVE-NEXT:    mul w2, w2, w27
+; NONEON-NOSVE-NEXT:    lsr w7, w7, #16
+; NONEON-NOSVE-NEXT:    strh w20, [sp, #156]
+; NONEON-NOSVE-NEXT:    mul w18, w18, w24
+; NONEON-NOSVE-NEXT:    lsr w5, w5, #16
+; NONEON-NOSVE-NEXT:    strh w19, [sp, #154]
+; NONEON-NOSVE-NEXT:    mul w16, w16, w22
+; NONEON-NOSVE-NEXT:    lsr w4, w4, #16
+; NONEON-NOSVE-NEXT:    strh w7, [sp, #152]
+; NONEON-NOSVE-NEXT:    lsr w2, w2, #16
+; NONEON-NOSVE-NEXT:    strh w5, [sp, #150]
+; NONEON-NOSVE-NEXT:    lsr w18, w18, #16
+; NONEON-NOSVE-NEXT:    strh w4, [sp, #148]
+; NONEON-NOSVE-NEXT:    lsr w16, w16, #16
+; NONEON-NOSVE-NEXT:    strh w2, [sp, #146]
+; NONEON-NOSVE-NEXT:    strh w18, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #224] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w16, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #208] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #176] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #160] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #134]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #132]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #130]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #240
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -871,8 +2127,18 @@ define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umull v0.2d, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    shrn v0.2s, v0.2d, #32
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    umull x9, w9, w10
+; NONEON-NOSVE-NEXT:    umull x8, w8, w11
+; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %1 = zext <2 x i32> %op1 to <2 x i64>
   %2 = zext <2 x i32> %op2 to <2 x i64>
@@ -902,9 +2168,28 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umull2 v2.2d, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    umull v0.2d, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v0.4s, v2.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp w13, w12, [sp, #48]
+; NONEON-NOSVE-NEXT:    umull x11, w11, w12
+; NONEON-NOSVE-NEXT:    ldp w12, w14, [sp, #56]
+; NONEON-NOSVE-NEXT:    umull x10, w10, w13
+; NONEON-NOSVE-NEXT:    lsr x11, x11, #32
+; NONEON-NOSVE-NEXT:    umull x9, w9, w14
+; NONEON-NOSVE-NEXT:    umull x8, w8, w12
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
+; NONEON-NOSVE-NEXT:    stp w10, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %1 = zext <4 x i32> %op1 to <4 x i64>
   %2 = zext <4 x i32> %op2 to <4 x i64>
@@ -937,15 +2222,52 @@ define void @umulh_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    umull2 v4.2d, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umull v0.2d, v1.2s, v0.2s
-; NONEON-NOSVE-NEXT:    umull2 v1.2d, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    umull v2.2d, v2.2s, v3.2s
-; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v0.4s, v4.4s
-; NONEON-NOSVE-NEXT:    uzp2 v1.4s, v2.4s, v1.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp w17, w16, [sp, #112]
+; NONEON-NOSVE-NEXT:    umull x15, w15, w16
+; NONEON-NOSVE-NEXT:    ldp w16, w18, [sp, #120]
+; NONEON-NOSVE-NEXT:    umull x14, w14, w17
+; NONEON-NOSVE-NEXT:    ldp w17, w1, [sp, #80]
+; NONEON-NOSVE-NEXT:    umull x13, w13, w18
+; NONEON-NOSVE-NEXT:    lsr x15, x15, #32
+; NONEON-NOSVE-NEXT:    umull x12, w12, w16
+; NONEON-NOSVE-NEXT:    lsr x14, x14, #32
+; NONEON-NOSVE-NEXT:    ldp w16, w18, [sp, #88]
+; NONEON-NOSVE-NEXT:    umull x11, w11, w1
+; NONEON-NOSVE-NEXT:    lsr x13, x13, #32
+; NONEON-NOSVE-NEXT:    stp w14, w15, [sp, #152]
+; NONEON-NOSVE-NEXT:    umull x10, w10, w17
+; NONEON-NOSVE-NEXT:    lsr x12, x12, #32
+; NONEON-NOSVE-NEXT:    umull x9, w9, w18
+; NONEON-NOSVE-NEXT:    umull x8, w8, w16
+; NONEON-NOSVE-NEXT:    lsr x11, x11, #32
+; NONEON-NOSVE-NEXT:    stp w12, w13, [sp, #144]
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
+; NONEON-NOSVE-NEXT:    stp w10, w11, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -980,12 +2302,14 @@ define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fmov x8, d0
 ; NONEON-NOSVE-NEXT:    fmov x9, d1
 ; NONEON-NOSVE-NEXT:    umulh x8, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %1 = zext <1 x i64> %op1 to <1 x i128>
   %2 = zext <1 x i64> %op2 to <1 x i128>
@@ -1015,15 +2339,17 @@ define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
-; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
-; NONEON-NOSVE-NEXT:    fmov x10, d0
-; NONEON-NOSVE-NEXT:    fmov x11, d1
-; NONEON-NOSVE-NEXT:    umulh x10, x10, x11
-; NONEON-NOSVE-NEXT:    umulh x8, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    umulh x8, x8, x10
+; NONEON-NOSVE-NEXT:    umulh x9, x9, x11
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %1 = zext <2 x i64> %op1 to <2 x i128>
   %2 = zext <2 x i64> %op2 to <2 x i128>
@@ -1056,27 +2382,29 @@ define void @umulh_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x11, v0.d[1]
-; NONEON-NOSVE-NEXT:    mov x14, v3.d[1]
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    mov x10, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x13, v2.d[1]
-; NONEON-NOSVE-NEXT:    fmov x12, d3
-; NONEON-NOSVE-NEXT:    umulh x8, x8, x9
-; NONEON-NOSVE-NEXT:    fmov x9, d2
-; NONEON-NOSVE-NEXT:    umulh x10, x10, x11
-; NONEON-NOSVE-NEXT:    umulh x9, x9, x12
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    umulh x11, x13, x14
-; NONEON-NOSVE-NEXT:    fmov d1, x10
-; NONEON-NOSVE-NEXT:    fmov d2, x9
-; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
-; NONEON-NOSVE-NEXT:    fmov d3, x11
-; NONEON-NOSVE-NEXT:    mov v2.d[1], v3.d[0]
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x13, x12, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    umulh x10, x10, x12
+; NONEON-NOSVE-NEXT:    ldp x14, x12, [sp, #48]
+; NONEON-NOSVE-NEXT:    umulh x11, x11, x13
+; NONEON-NOSVE-NEXT:    umulh x8, x8, x12
+; NONEON-NOSVE-NEXT:    umulh x9, x9, x14
+; NONEON-NOSVE-NEXT:    stp x11, x10, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
index ad75ba62e17cf8..7bdb4599707b0c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
@@ -21,8 +21,25 @@ define i8 @uaddv_v8i8(<8 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uaddv_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    addv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w12, w13, w12
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    add w10, w12, w10
+; NONEON-NOSVE-NEXT:    add w8, w8, w14
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    add w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
   ret i8 %res
@@ -40,8 +57,40 @@ define i8 @uaddv_v16i8(<16 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uaddv_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    addv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w11, w14, w13
+; NONEON-NOSVE-NEXT:    add w9, w12, w9
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #7]
+; NONEON-NOSVE-NEXT:    add w10, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    add w9, w9, w16
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #9]
+; NONEON-NOSVE-NEXT:    add w12, w12, w15
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w13, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w12, w12, w14
+; NONEON-NOSVE-NEXT:    add w8, w8, w11
+; NONEON-NOSVE-NEXT:    add w9, w10, w9
+; NONEON-NOSVE-NEXT:    add w10, w12, w16
+; NONEON-NOSVE-NEXT:    add w8, w8, w15
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w8, w8, w13
+; NONEON-NOSVE-NEXT:    add w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a)
   ret i8 %res
@@ -61,9 +110,72 @@ define i8 @uaddv_v32i8(ptr %a) {
 ; NONEON-NOSVE-LABEL: uaddv_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    addv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #3]
+; NONEON-NOSVE-NEXT:    add w9, w11, w10
+; NONEON-NOSVE-NEXT:    add w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    add w11, w15, w14
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #4]
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w10, w14, w10
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #7]
+; NONEON-NOSVE-NEXT:    add w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w10, w14, w13
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #9]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w11, w15, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #10]
+; NONEON-NOSVE-NEXT:    add w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #12]
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w10, w13, w10
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #13]
+; NONEON-NOSVE-NEXT:    add w14, w15, w14
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w9, w9, w14
+; NONEON-NOSVE-NEXT:    add w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #31]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w10, w10, w12
+; NONEON-NOSVE-NEXT:    add w11, w16, w11
+; NONEON-NOSVE-NEXT:    add w10, w10, w11
+; NONEON-NOSVE-NEXT:    add w11, w17, w13
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %op)
@@ -82,8 +194,17 @@ define i16 @uaddv_v4i16(<4 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uaddv_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    addv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    add w0, w10, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
   ret i16 %res
@@ -101,8 +222,24 @@ define i16 @uaddv_v8i16(<8 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uaddv_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    addv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w12, w13, w12
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    add w10, w12, w10
+; NONEON-NOSVE-NEXT:    add w8, w8, w14
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    add w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a)
   ret i16 %res
@@ -122,9 +259,40 @@ define i16 @uaddv_v16i16(ptr %a) {
 ; NONEON-NOSVE-LABEL: uaddv_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    add v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    addv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w9, w11, w10
+; NONEON-NOSVE-NEXT:    add w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    add w13, w15, w14
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w9, w12, w13
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #12]
+; NONEON-NOSVE-NEXT:    add w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w10, w14, w10
+; NONEON-NOSVE-NEXT:    add w11, w15, w11
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w13, w12
+; NONEON-NOSVE-NEXT:    add w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %op)
@@ -143,8 +311,12 @@ define i32 @uaddv_v2i32(<2 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uaddv_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    addp v0.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
   ret i32 %res
@@ -162,8 +334,13 @@ define i32 @uaddv_v4i32(<4 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uaddv_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    addv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp], #16
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    add w0, w10, w8
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
   ret i32 %res
@@ -183,9 +360,20 @@ define i32 @uaddv_v8i32(ptr %a) {
 ; NONEON-NOSVE-LABEL: uaddv_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    add v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    addv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp]
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    add w9, w11, w9
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    add w10, w14, w12
+; NONEON-NOSVE-NEXT:    add w11, w15, w13
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %op)
@@ -203,8 +391,10 @@ define i64 @uaddv_v2i64(<2 x i64> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uaddv_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    addp d0, v0.2d
-; NONEON-NOSVE-NEXT:    fmov x0, d0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    add x0, x9, x8
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a)
   ret i64 %res
@@ -223,9 +413,13 @@ define i64 @uaddv_v4i64(ptr %a) {
 ; NONEON-NOSVE-LABEL: uaddv_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    add v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    addp d0, v0.2d
-; NONEON-NOSVE-NEXT:    fmov x0, d0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp], #32
+; NONEON-NOSVE-NEXT:    add x8, x10, x8
+; NONEON-NOSVE-NEXT:    add x9, x11, x9
+; NONEON-NOSVE-NEXT:    add x0, x9, x8
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %op)
@@ -247,8 +441,32 @@ define i8 @smaxv_v8i8(<8 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: smaxv_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smaxv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a)
   ret i8 %res
@@ -265,8 +483,55 @@ define i8 @smaxv_v16i8(<16 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: smaxv_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smaxv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a)
   ret i8 %res
@@ -285,9 +550,103 @@ define i8 @smaxv_v32i8(ptr %a) {
 ; NONEON-NOSVE-LABEL: smaxv_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    smax v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    smaxv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %op)
@@ -305,8 +664,20 @@ define i16 @smaxv_v4i16(<4 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: smaxv_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smaxv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a)
   ret i16 %res
@@ -323,8 +694,31 @@ define i16 @smaxv_v8i16(<8 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: smaxv_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smaxv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a)
   ret i16 %res
@@ -343,9 +737,55 @@ define i16 @smaxv_v16i16(ptr %a) {
 ; NONEON-NOSVE-LABEL: smaxv_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    smax v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    smaxv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %op)
@@ -363,8 +803,13 @@ define i32 @smaxv_v2i32(<2 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: smaxv_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smaxp v0.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w0, w9, w8, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a)
   ret i32 %res
@@ -381,8 +826,17 @@ define i32 @smaxv_v4i32(<4 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: smaxv_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smaxv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
   ret i32 %res
@@ -401,9 +855,27 @@ define i32 @smaxv_v8i32(ptr %a) {
 ; NONEON-NOSVE-LABEL: smaxv_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    smax v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    smaxv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w11, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldp w10, w12, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldp w11, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w10, w11
+; NONEON-NOSVE-NEXT:    csel w10, w10, w11, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w12, w9
+; NONEON-NOSVE-NEXT:    csel w9, w12, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %op)
@@ -424,11 +896,9 @@ define i64 @smaxv_v2i64(<2 x i64> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmgt d2, d0, d1
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, gt
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a)
   ret i64 %res
@@ -447,15 +917,17 @@ define i64 @smaxv_v4i64(ptr %a) {
 ; NONEON-NOSVE-LABEL: smaxv_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    cmgt v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmgt d2, d0, d1
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x11, [sp], #32
+; NONEON-NOSVE-NEXT:    cmp x8, x9
+; NONEON-NOSVE-NEXT:    csel x8, x8, x9, gt
+; NONEON-NOSVE-NEXT:    cmp x11, x10
+; NONEON-NOSVE-NEXT:    csel x9, x11, x10, gt
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, gt
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %op)
@@ -477,8 +949,32 @@ define i8 @sminv_v8i8(<8 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: sminv_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sminv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a)
   ret i8 %res
@@ -495,8 +991,55 @@ define i8 @sminv_v16i8(<16 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: sminv_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sminv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a)
   ret i8 %res
@@ -515,9 +1058,103 @@ define i8 @sminv_v32i8(ptr %a) {
 ; NONEON-NOSVE-LABEL: sminv_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    smin v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    sminv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %op)
@@ -535,8 +1172,20 @@ define i16 @sminv_v4i16(<4 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: sminv_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sminv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a)
   ret i16 %res
@@ -553,8 +1202,31 @@ define i16 @sminv_v8i16(<8 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: sminv_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sminv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a)
   ret i16 %res
@@ -573,9 +1245,55 @@ define i16 @sminv_v16i16(ptr %a) {
 ; NONEON-NOSVE-LABEL: sminv_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    smin v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    sminv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %op)
@@ -593,8 +1311,13 @@ define i32 @sminv_v2i32(<2 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: sminv_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sminp v0.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w0, w9, w8, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a)
   ret i32 %res
@@ -611,8 +1334,17 @@ define i32 @sminv_v4i32(<4 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: sminv_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sminv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a)
   ret i32 %res
@@ -631,9 +1363,27 @@ define i32 @sminv_v8i32(ptr %a) {
 ; NONEON-NOSVE-LABEL: sminv_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    smin v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sminv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w11, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldp w10, w12, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldp w11, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w10, w11
+; NONEON-NOSVE-NEXT:    csel w10, w10, w11, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w12, w9
+; NONEON-NOSVE-NEXT:    csel w9, w12, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %op)
@@ -654,11 +1404,9 @@ define i64 @sminv_v2i64(<2 x i64> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmgt d2, d1, d0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, lt
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a)
   ret i64 %res
@@ -676,16 +1424,18 @@ define i64 @sminv_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: sminv_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmgt v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmgt d2, d1, d0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x11, [sp], #32
+; NONEON-NOSVE-NEXT:    cmp x8, x9
+; NONEON-NOSVE-NEXT:    csel x8, x8, x9, lt
+; NONEON-NOSVE-NEXT:    cmp x11, x10
+; NONEON-NOSVE-NEXT:    csel x9, x11, x10, lt
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, lt
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %op)
@@ -707,8 +1457,32 @@ define i8 @umaxv_v8i8(<8 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: umaxv_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umaxv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a)
   ret i8 %res
@@ -725,8 +1499,55 @@ define i8 @umaxv_v16i8(<16 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: umaxv_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umaxv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a)
   ret i8 %res
@@ -745,9 +1566,103 @@ define i8 @umaxv_v32i8(ptr %a) {
 ; NONEON-NOSVE-LABEL: umaxv_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    umax v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    umaxv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %op)
@@ -765,8 +1680,20 @@ define i16 @umaxv_v4i16(<4 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: umaxv_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umaxv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a)
   ret i16 %res
@@ -783,8 +1710,31 @@ define i16 @umaxv_v8i16(<8 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: umaxv_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umaxv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a)
   ret i16 %res
@@ -803,9 +1753,55 @@ define i16 @umaxv_v16i16(ptr %a) {
 ; NONEON-NOSVE-LABEL: umaxv_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    umax v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    umaxv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %op)
@@ -823,8 +1819,13 @@ define i32 @umaxv_v2i32(<2 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: umaxv_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umaxp v0.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w0, w9, w8, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a)
   ret i32 %res
@@ -841,8 +1842,17 @@ define i32 @umaxv_v4i32(<4 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: umaxv_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umaxv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
   ret i32 %res
@@ -861,9 +1871,27 @@ define i32 @umaxv_v8i32(ptr %a) {
 ; NONEON-NOSVE-LABEL: umaxv_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    umax v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    umaxv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w11, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldp w10, w12, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldp w11, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w10, w11
+; NONEON-NOSVE-NEXT:    csel w10, w10, w11, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w12, w9
+; NONEON-NOSVE-NEXT:    csel w9, w12, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %op)
@@ -884,11 +1912,9 @@ define i64 @umaxv_v2i64(<2 x i64> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmhi d2, d0, d1
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, hi
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a)
   ret i64 %res
@@ -907,15 +1933,17 @@ define i64 @umaxv_v4i64(ptr %a) {
 ; NONEON-NOSVE-LABEL: umaxv_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    cmhi v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmhi d2, d0, d1
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x11, [sp], #32
+; NONEON-NOSVE-NEXT:    cmp x8, x9
+; NONEON-NOSVE-NEXT:    csel x8, x8, x9, hi
+; NONEON-NOSVE-NEXT:    cmp x11, x10
+; NONEON-NOSVE-NEXT:    csel x9, x11, x10, hi
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, hi
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %op)
@@ -937,8 +1965,32 @@ define i8 @uminv_v8i8(<8 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uminv_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uminv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a)
   ret i8 %res
@@ -955,8 +2007,55 @@ define i8 @uminv_v16i8(<16 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uminv_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uminv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a)
   ret i8 %res
@@ -975,9 +2074,103 @@ define i8 @uminv_v32i8(ptr %a) {
 ; NONEON-NOSVE-LABEL: uminv_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    umin v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uminv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %op)
@@ -995,8 +2188,20 @@ define i16 @uminv_v4i16(<4 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uminv_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uminv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a)
   ret i16 %res
@@ -1013,8 +2218,31 @@ define i16 @uminv_v8i16(<8 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uminv_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uminv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a)
   ret i16 %res
@@ -1033,9 +2261,55 @@ define i16 @uminv_v16i16(ptr %a) {
 ; NONEON-NOSVE-LABEL: uminv_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    umin v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uminv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %op)
@@ -1053,8 +2327,13 @@ define i32 @uminv_v2i32(<2 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uminv_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uminp v0.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w0, w9, w8, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a)
   ret i32 %res
@@ -1071,8 +2350,17 @@ define i32 @uminv_v4i32(<4 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: uminv_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    uminv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
   ret i32 %res
@@ -1091,9 +2379,27 @@ define i32 @uminv_v8i32(ptr %a) {
 ; NONEON-NOSVE-LABEL: uminv_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    umin v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uminv s0, v0.4s
-; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w11, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldp w10, w12, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldp w11, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w10, w11
+; NONEON-NOSVE-NEXT:    csel w10, w10, w11, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w12, w9
+; NONEON-NOSVE-NEXT:    csel w9, w12, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %op)
@@ -1114,11 +2420,9 @@ define i64 @uminv_v2i64(<2 x i64> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmhi d2, d1, d0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, lo
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a)
   ret i64 %res
@@ -1136,16 +2440,18 @@ define i64 @uminv_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: uminv_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmhi v2.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    cmhi d2, d1, d0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x11, [sp], #32
+; NONEON-NOSVE-NEXT:    cmp x8, x9
+; NONEON-NOSVE-NEXT:    csel x8, x8, x9, lo
+; NONEON-NOSVE-NEXT:    cmp x11, x10
+; NONEON-NOSVE-NEXT:    csel x9, x11, x10, lo
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, lo
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %op)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
index 99f8aef9f2b22d..cb1fb20ec9d8d7 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
@@ -28,31 +28,31 @@ define <4 x i8> @srem_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    shl v1.4h, v1.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v1.4h, v1.4h, #8
-; NONEON-NOSVE-NEXT:    smov w11, v1.h[0]
-; NONEON-NOSVE-NEXT:    smov w12, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w14, v1.h[2]
-; NONEON-NOSVE-NEXT:    smov w15, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w17, v1.h[3]
-; NONEON-NOSVE-NEXT:    smov w18, v0.h[3]
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w15, [sp, #10]
 ; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    fmov s0, w11
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    ldrsb w17, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w18, [sp, #8]
+; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w17, w18
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    msub w9, w13, w11, w12
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w0, w18, w17
+; NONEON-NOSVE-NEXT:    msub w10, w16, w14, w15
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = srem <4 x i8> %op1, %op2
   ret <4 x i8> %res
@@ -86,49 +86,51 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    smov w11, v1.b[0]
-; NONEON-NOSVE-NEXT:    smov w12, v0.b[0]
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    smov w14, v1.b[2]
-; NONEON-NOSVE-NEXT:    smov w15, v0.b[2]
-; NONEON-NOSVE-NEXT:    smov w17, v1.b[3]
-; NONEON-NOSVE-NEXT:    smov w18, v0.b[3]
-; NONEON-NOSVE-NEXT:    smov w1, v1.b[4]
-; NONEON-NOSVE-NEXT:    smov w2, v0.b[4]
-; NONEON-NOSVE-NEXT:    smov w4, v1.b[5]
-; NONEON-NOSVE-NEXT:    smov w5, v0.b[5]
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
 ; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    smov w13, v1.b[7]
-; NONEON-NOSVE-NEXT:    fmov s2, w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[6]
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w0, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    smov w14, v0.b[7]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w8
-; NONEON-NOSVE-NEXT:    sdiv w3, w2, w1
-; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w5, w4
-; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w8
-; NONEON-NOSVE-NEXT:    sdiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w4, w5
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
-; NONEON-NOSVE-NEXT:    fmov d0, d2
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = srem <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -182,108 +184,90 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #-80]! // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
-; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
-; NONEON-NOSVE-NEXT:    smov w11, v1.b[0]
-; NONEON-NOSVE-NEXT:    smov w12, v0.b[0]
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    smov w14, v1.b[2]
-; NONEON-NOSVE-NEXT:    smov w15, v0.b[2]
-; NONEON-NOSVE-NEXT:    smov w17, v1.b[3]
-; NONEON-NOSVE-NEXT:    smov w18, v0.b[3]
-; NONEON-NOSVE-NEXT:    smov w1, v1.b[4]
-; NONEON-NOSVE-NEXT:    smov w2, v0.b[4]
-; NONEON-NOSVE-NEXT:    smov w4, v1.b[5]
-; NONEON-NOSVE-NEXT:    smov w5, v0.b[5]
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    smov w7, v1.b[6]
-; NONEON-NOSVE-NEXT:    smov w19, v0.b[6]
-; NONEON-NOSVE-NEXT:    smov w21, v1.b[7]
-; NONEON-NOSVE-NEXT:    smov w22, v0.b[7]
-; NONEON-NOSVE-NEXT:    smov w24, v1.b[8]
-; NONEON-NOSVE-NEXT:    smov w25, v0.b[8]
-; NONEON-NOSVE-NEXT:    smov w27, v1.b[9]
-; NONEON-NOSVE-NEXT:    smov w28, v0.b[9]
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    smov w13, v1.b[11]
-; NONEON-NOSVE-NEXT:    fmov s2, w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[10]
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[10]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w0, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    smov w14, v0.b[11]
-; NONEON-NOSVE-NEXT:    smov w16, v1.b[12]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w8
-; NONEON-NOSVE-NEXT:    sdiv w3, w2, w1
-; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
-; NONEON-NOSVE-NEXT:    smov w17, v0.b[12]
-; NONEON-NOSVE-NEXT:    smov w0, v1.b[13]
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w8
-; NONEON-NOSVE-NEXT:    sdiv w6, w5, w4
-; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
-; NONEON-NOSVE-NEXT:    smov w1, v0.b[13]
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w8
-; NONEON-NOSVE-NEXT:    sdiv w20, w19, w7
-; NONEON-NOSVE-NEXT:    msub w8, w6, w4, w5
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
-; NONEON-NOSVE-NEXT:    sdiv w23, w22, w21
-; NONEON-NOSVE-NEXT:    msub w8, w20, w7, w19
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w8
-; NONEON-NOSVE-NEXT:    sdiv w26, w25, w24
-; NONEON-NOSVE-NEXT:    msub w8, w23, w21, w22
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w28, w27
-; NONEON-NOSVE-NEXT:    msub w8, w26, w24, w25
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v2.b[8], w8
-; NONEON-NOSVE-NEXT:    sdiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w27, w28
-; NONEON-NOSVE-NEXT:    mov v2.b[9], w8
-; NONEON-NOSVE-NEXT:    sdiv w15, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    smov w10, v1.b[14]
-; NONEON-NOSVE-NEXT:    smov w11, v0.b[14]
-; NONEON-NOSVE-NEXT:    mov v2.b[10], w8
-; NONEON-NOSVE-NEXT:    sdiv w18, w17, w16
-; NONEON-NOSVE-NEXT:    msub w8, w15, w13, w14
-; NONEON-NOSVE-NEXT:    smov w13, v1.b[15]
-; NONEON-NOSVE-NEXT:    smov w14, v0.b[15]
-; NONEON-NOSVE-NEXT:    mov v2.b[11], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w1, w0
-; NONEON-NOSVE-NEXT:    msub w8, w18, w16, w17
-; NONEON-NOSVE-NEXT:    mov v2.b[12], w8
-; NONEON-NOSVE-NEXT:    sdiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w0, w1
-; NONEON-NOSVE-NEXT:    mov v2.b[13], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    mov v2.b[14], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
-; NONEON-NOSVE-NEXT:    mov v2.b[15], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp], #80 // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = srem <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -375,275 +359,175 @@ define void @srem_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #320
-; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #224] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #240] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #256] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #272] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #288] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #304] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 320
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
-; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
-; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
-; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    str x0, [sp, #216] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    smov w4, v3.b[1]
-; NONEON-NOSVE-NEXT:    smov w1, v2.b[1]
-; NONEON-NOSVE-NEXT:    smov w7, v3.b[7]
-; NONEON-NOSVE-NEXT:    smov w5, v2.b[7]
-; NONEON-NOSVE-NEXT:    smov w6, v3.b[8]
-; NONEON-NOSVE-NEXT:    smov w3, v2.b[8]
-; NONEON-NOSVE-NEXT:    smov w22, v3.b[9]
-; NONEON-NOSVE-NEXT:    smov w20, v2.b[9]
-; NONEON-NOSVE-NEXT:    smov w13, v3.b[0]
-; NONEON-NOSVE-NEXT:    smov w17, v3.b[3]
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    str w8, [sp, #100] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[0]
-; NONEON-NOSVE-NEXT:    str w9, [sp, #108] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[0]
-; NONEON-NOSVE-NEXT:    smov w14, v2.b[3]
-; NONEON-NOSVE-NEXT:    smov w15, v3.b[4]
-; NONEON-NOSVE-NEXT:    smov w12, v2.b[4]
-; NONEON-NOSVE-NEXT:    smov w2, v3.b[5]
-; NONEON-NOSVE-NEXT:    smov w18, v2.b[5]
-; NONEON-NOSVE-NEXT:    smov w0, v3.b[6]
-; NONEON-NOSVE-NEXT:    smov w16, v2.b[6]
-; NONEON-NOSVE-NEXT:    smov w21, v3.b[10]
-; NONEON-NOSVE-NEXT:    smov w19, v2.b[10]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #36] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    ldr w30, [sp, #36] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    str w10, [sp, #116] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[2]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[2]
-; NONEON-NOSVE-NEXT:    stp w10, w8, [sp, #44] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[3]
-; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #52] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[3]
-; NONEON-NOSVE-NEXT:    sdiv w26, w14, w17
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w11, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[4]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[4]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #60] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[5]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[5]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #96] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #104] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #68] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[6]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[6]
-; NONEON-NOSVE-NEXT:    stp w11, w8, [sp, #80] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #112] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[7]
-; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #88] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[7]
-; NONEON-NOSVE-NEXT:    sdiv w25, w12, w15
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #132] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[8]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[8]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #140] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[9]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[9]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #148] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #156] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w11, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[10]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[10]
-; NONEON-NOSVE-NEXT:    str w10, [sp, #128] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #204] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[11]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[11]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #192] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #212] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[12]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[12]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #172] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #180] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #200] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[13]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[13]
-; NONEON-NOSVE-NEXT:    stp w11, w8, [sp, #164] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w11, v3.b[2]
-; NONEON-NOSVE-NEXT:    str w9, [sp, #176] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #188] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.b[14]
-; NONEON-NOSVE-NEXT:    smov w9, v0.b[14]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #144] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #152] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #184] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w9, v2.b[2]
-; NONEON-NOSVE-NEXT:    sdiv w8, w1, w4
-; NONEON-NOSVE-NEXT:    str w10, [sp, #160] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w10, v2.b[0]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #24] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w8, w5, w7
-; NONEON-NOSVE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w8, w3, w6
-; NONEON-NOSVE-NEXT:    str w8, [sp, #20] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w8, w20, w22
-; NONEON-NOSVE-NEXT:    sdiv w24, w10, w13
-; NONEON-NOSVE-NEXT:    str w8, [sp, #32] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    ldp w29, w8, [sp, #40] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w8, w30, w29
-; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #224] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    fmov s4, w8
-; NONEON-NOSVE-NEXT:    sdiv w23, w9, w11
-; NONEON-NOSVE-NEXT:    msub w10, w24, w13, w10
-; NONEON-NOSVE-NEXT:    ldr w13, [sp, #24] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w24, [sp, #100] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w13, w13, w4, w1
-; NONEON-NOSVE-NEXT:    ldr w1, [sp, #116] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w4, [sp, #108] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    fmov s5, w10
-; NONEON-NOSVE-NEXT:    msub w1, w1, w24, w4
-; NONEON-NOSVE-NEXT:    mov v5.b[1], w13
-; NONEON-NOSVE-NEXT:    mov v4.b[1], w1
-; NONEON-NOSVE-NEXT:    ldr w1, [sp, #120] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w23, w11, w9
-; NONEON-NOSVE-NEXT:    ldr w11, [sp, #48] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w28, w18, w2
-; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #52] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #272] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w26, w17, w14
-; NONEON-NOSVE-NEXT:    ldr w14, [sp, #72] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w11, w10
-; NONEON-NOSVE-NEXT:    ldr w17, [sp, #96] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    smov w10, v3.b[11]
-; NONEON-NOSVE-NEXT:    smov w11, v2.b[11]
-; NONEON-NOSVE-NEXT:    mov v4.b[2], w9
-; NONEON-NOSVE-NEXT:    mov v5.b[3], w8
-; NONEON-NOSVE-NEXT:    msub w8, w25, w15, w12
-; NONEON-NOSVE-NEXT:    ldp w13, w9, [sp, #76] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w27, w16, w0
-; NONEON-NOSVE-NEXT:    ldr w15, [sp, #104] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #256] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w14, w13
-; NONEON-NOSVE-NEXT:    ldr w14, [sp, #60] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[4], w8
-; NONEON-NOSVE-NEXT:    msub w8, w28, w2, w18
-; NONEON-NOSVE-NEXT:    ldr w2, [sp, #156] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[3], w9
-; NONEON-NOSVE-NEXT:    ldp w12, w9, [sp, #64] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[5], w8
-; NONEON-NOSVE-NEXT:    msub w8, w27, w0, w16
-; NONEON-NOSVE-NEXT:    ldr w0, [sp, #132] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w4, w19, w21
-; NONEON-NOSVE-NEXT:    msub w9, w9, w14, w12
-; NONEON-NOSVE-NEXT:    smov w12, v3.b[12]
-; NONEON-NOSVE-NEXT:    smov w14, v2.b[12]
-; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #240] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[6], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[4], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #112] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w8, w7, w5
-; NONEON-NOSVE-NEXT:    ldr w5, [sp, #204] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w17, w15
-; NONEON-NOSVE-NEXT:    ldr w17, [sp, #84] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[7], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w13, w11, w10
-; NONEON-NOSVE-NEXT:    mov v4.b[5], w9
-; NONEON-NOSVE-NEXT:    ldp w16, w9, [sp, #88] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w8, w6, w3
-; NONEON-NOSVE-NEXT:    ldr w3, [sp, #148] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w17, w16
-; NONEON-NOSVE-NEXT:    smov w16, v3.b[13]
-; NONEON-NOSVE-NEXT:    smov w17, v2.b[13]
-; NONEON-NOSVE-NEXT:    mov v5.b[8], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[6], w9
-; NONEON-NOSVE-NEXT:    msub w8, w8, w22, w20
-; NONEON-NOSVE-NEXT:    sdiv w15, w14, w12
-; NONEON-NOSVE-NEXT:    ldp w18, w9, [sp, #136] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[9], w8
-; NONEON-NOSVE-NEXT:    msub w8, w4, w21, w19
-; NONEON-NOSVE-NEXT:    msub w9, w9, w0, w18
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #304] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #288] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[7], w9
-; NONEON-NOSVE-NEXT:    mov v5.b[10], w8
-; NONEON-NOSVE-NEXT:    msub w8, w13, w10, w11
-; NONEON-NOSVE-NEXT:    ldp w0, w9, [sp, #124] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp, #196] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w13, [sp, #192] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w18, w17, w16
-; NONEON-NOSVE-NEXT:    msub w9, w9, w1, w0
-; NONEON-NOSVE-NEXT:    mov v5.b[11], w8
-; NONEON-NOSVE-NEXT:    smov w0, v3.b[14]
-; NONEON-NOSVE-NEXT:    msub w10, w10, w13, w11
-; NONEON-NOSVE-NEXT:    smov w1, v2.b[14]
-; NONEON-NOSVE-NEXT:    msub w8, w15, w12, w14
-; NONEON-NOSVE-NEXT:    mov v4.b[8], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #164] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp w15, w13, [sp, #168] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w3, w2
-; NONEON-NOSVE-NEXT:    mov v5.b[12], w8
-; NONEON-NOSVE-NEXT:    ldp w4, w3, [sp, #208] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp w14, w12, [sp, #176] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[9], w9
-; NONEON-NOSVE-NEXT:    sdiv w2, w1, w0
-; NONEON-NOSVE-NEXT:    smov w9, v3.b[15]
-; NONEON-NOSVE-NEXT:    msub w3, w3, w5, w4
-; NONEON-NOSVE-NEXT:    smov w4, v2.b[15]
-; NONEON-NOSVE-NEXT:    msub w8, w18, w16, w17
-; NONEON-NOSVE-NEXT:    ldr w16, [sp, #144] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[10], w3
-; NONEON-NOSVE-NEXT:    mov v5.b[13], w8
-; NONEON-NOSVE-NEXT:    mov v4.b[11], w10
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #188] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w11, w4, w9
-; NONEON-NOSVE-NEXT:    msub w8, w2, w0, w1
-; NONEON-NOSVE-NEXT:    msub w10, w10, w13, w12
-; NONEON-NOSVE-NEXT:    smov w12, v1.b[15]
-; NONEON-NOSVE-NEXT:    smov w13, v0.b[15]
-; NONEON-NOSVE-NEXT:    mov v5.b[14], w8
-; NONEON-NOSVE-NEXT:    mov v4.b[12], w10
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #184] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w10, w10, w15, w14
-; NONEON-NOSVE-NEXT:    ldr w15, [sp, #152] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w14, w13, w12
-; NONEON-NOSVE-NEXT:    msub w8, w11, w9, w4
-; NONEON-NOSVE-NEXT:    mov v4.b[13], w10
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #160] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[15], w8
-; NONEON-NOSVE-NEXT:    ldr x8, [sp, #216] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w10, w10, w16, w15
-; NONEON-NOSVE-NEXT:    mov v4.b[14], w10
-; NONEON-NOSVE-NEXT:    msub w9, w14, w12, w13
-; NONEON-NOSVE-NEXT:    mov v4.b[15], w9
-; NONEON-NOSVE-NEXT:    stp q5, q4, [x8]
-; NONEON-NOSVE-NEXT:    add sp, sp, #320
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -669,29 +553,31 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    smov w11, v1.h[0]
-; NONEON-NOSVE-NEXT:    smov w12, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w14, v1.h[2]
-; NONEON-NOSVE-NEXT:    smov w15, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w17, v1.h[3]
-; NONEON-NOSVE-NEXT:    smov w18, v0.h[3]
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    fmov s0, w11
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w17, w18
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = srem <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -724,47 +610,50 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    smov w11, v1.h[0]
-; NONEON-NOSVE-NEXT:    smov w12, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w14, v1.h[2]
-; NONEON-NOSVE-NEXT:    smov w15, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w17, v1.h[3]
-; NONEON-NOSVE-NEXT:    smov w18, v0.h[3]
-; NONEON-NOSVE-NEXT:    smov w1, v1.h[4]
-; NONEON-NOSVE-NEXT:    smov w2, v0.h[4]
-; NONEON-NOSVE-NEXT:    smov w4, v1.h[5]
-; NONEON-NOSVE-NEXT:    smov w5, v0.h[5]
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
 ; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    smov w13, v1.h[7]
-; NONEON-NOSVE-NEXT:    fmov s2, w11
-; NONEON-NOSVE-NEXT:    smov w11, v0.h[6]
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    smov w10, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w0, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    smov w14, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    sdiv w3, w2, w1
-; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w5, w4
-; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    sdiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w4, w5
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = srem <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -813,135 +702,95 @@ define void @srem_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #144
-; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #48] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #64] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #80] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #96] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #112] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #128] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
-; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
-; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
-; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    smov w20, v1.h[0]
-; NONEON-NOSVE-NEXT:    smov w21, v0.h[0]
-; NONEON-NOSVE-NEXT:    smov w19, v0.h[3]
-; NONEON-NOSVE-NEXT:    smov w5, v1.h[4]
-; NONEON-NOSVE-NEXT:    smov w2, v0.h[4]
-; NONEON-NOSVE-NEXT:    smov w1, v3.h[1]
-; NONEON-NOSVE-NEXT:    smov w23, v2.h[1]
-; NONEON-NOSVE-NEXT:    smov w25, v3.h[0]
-; NONEON-NOSVE-NEXT:    smov w26, v2.h[0]
-; NONEON-NOSVE-NEXT:    smov w6, v1.h[5]
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #36] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[2]
-; NONEON-NOSVE-NEXT:    smov w9, v0.h[2]
-; NONEON-NOSVE-NEXT:    smov w3, v0.h[5]
-; NONEON-NOSVE-NEXT:    smov w4, v1.h[6]
-; NONEON-NOSVE-NEXT:    smov w7, v0.h[6]
-; NONEON-NOSVE-NEXT:    smov w28, v3.h[2]
-; NONEON-NOSVE-NEXT:    smov w29, v2.h[2]
-; NONEON-NOSVE-NEXT:    smov w15, v3.h[3]
-; NONEON-NOSVE-NEXT:    smov w13, v2.h[3]
-; NONEON-NOSVE-NEXT:    smov w12, v3.h[4]
-; NONEON-NOSVE-NEXT:    smov w14, v3.h[5]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w11, w21, w20
-; NONEON-NOSVE-NEXT:    str w10, [sp, #44] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    smov w8, v1.h[3]
-; NONEON-NOSVE-NEXT:    stp w8, w11, [sp] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w11, v2.h[4]
-; NONEON-NOSVE-NEXT:    ldr w22, [sp, #4] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w20, w22, w20, w21
-; NONEON-NOSVE-NEXT:    sdiv w9, w19, w8
-; NONEON-NOSVE-NEXT:    str w10, [sp, #32] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w10, v3.h[6]
-; NONEON-NOSVE-NEXT:    fmov s5, w20
-; NONEON-NOSVE-NEXT:    smov w20, v3.h[7]
-; NONEON-NOSVE-NEXT:    sdiv w8, w2, w5
-; NONEON-NOSVE-NEXT:    sdiv w24, w23, w1
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    sdiv w27, w26, w25
-; NONEON-NOSVE-NEXT:    msub w1, w24, w1, w23
-; NONEON-NOSVE-NEXT:    ldp w24, w23, [sp, #40] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w9, w3, w6
-; NONEON-NOSVE-NEXT:    msub w21, w27, w25, w26
-; NONEON-NOSVE-NEXT:    ldr w25, [sp, #36] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w23, w23, w25, w24
-; NONEON-NOSVE-NEXT:    ldr w25, [sp, #24] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    fmov s4, w21
-; NONEON-NOSVE-NEXT:    mov v5.h[1], w23
-; NONEON-NOSVE-NEXT:    ldp w23, w21, [sp, #28] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.h[1], w1
-; NONEON-NOSVE-NEXT:    sdiv w8, w7, w4
-; NONEON-NOSVE-NEXT:    msub w21, w21, w25, w23
-; NONEON-NOSVE-NEXT:    smov w23, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #80] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.h[2], w21
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #112] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    sdiv w30, w29, w28
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    smov w9, v2.h[5]
-; NONEON-NOSVE-NEXT:    smov w8, v2.h[6]
-; NONEON-NOSVE-NEXT:    sdiv w18, w13, w15
-; NONEON-NOSVE-NEXT:    msub w1, w30, w28, w29
-; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #64] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #48] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.h[2], w1
-; NONEON-NOSVE-NEXT:    sdiv w16, w11, w12
-; NONEON-NOSVE-NEXT:    msub w13, w18, w15, w13
-; NONEON-NOSVE-NEXT:    ldr w15, [sp, #20] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w18, [sp] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w15, w15, w18, w19
-; NONEON-NOSVE-NEXT:    mov v4.h[3], w13
-; NONEON-NOSVE-NEXT:    smov w13, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v5.h[3], w15
-; NONEON-NOSVE-NEXT:    smov w15, v0.h[7]
-; NONEON-NOSVE-NEXT:    sdiv w17, w9, w14
-; NONEON-NOSVE-NEXT:    msub w11, w16, w12, w11
-; NONEON-NOSVE-NEXT:    ldr w12, [sp, #16] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w12, w12, w5, w2
-; NONEON-NOSVE-NEXT:    mov v4.h[4], w11
-; NONEON-NOSVE-NEXT:    ldr w11, [sp, #12] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.h[4], w12
-; NONEON-NOSVE-NEXT:    msub w11, w11, w6, w3
-; NONEON-NOSVE-NEXT:    sdiv w24, w8, w10
-; NONEON-NOSVE-NEXT:    msub w9, w17, w14, w9
-; NONEON-NOSVE-NEXT:    mov v5.h[5], w11
-; NONEON-NOSVE-NEXT:    mov v4.h[5], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w4, w7
-; NONEON-NOSVE-NEXT:    sdiv w18, w23, w20
-; NONEON-NOSVE-NEXT:    msub w8, w24, w10, w8
-; NONEON-NOSVE-NEXT:    mov v5.h[6], w9
-; NONEON-NOSVE-NEXT:    mov v4.h[6], w8
-; NONEON-NOSVE-NEXT:    sdiv w12, w15, w13
-; NONEON-NOSVE-NEXT:    msub w8, w18, w20, w23
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #128] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #96] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.h[7], w8
-; NONEON-NOSVE-NEXT:    msub w9, w12, w13, w15
-; NONEON-NOSVE-NEXT:    mov v5.h[7], w9
-; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #144
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -964,19 +813,20 @@ define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    mov w11, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w12, v0.s[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    msub w9, w13, w11, w12
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = srem <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -996,26 +846,28 @@ define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov w11, s1
-; NONEON-NOSVE-NEXT:    fmov w12, s0
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    mov w14, v1.s[2]
-; NONEON-NOSVE-NEXT:    mov w15, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w17, v1.s[3]
-; NONEON-NOSVE-NEXT:    mov w18, v0.s[3]
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
 ; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    fmov s0, w11
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w9, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w17, w18
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w8
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = srem <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -1039,61 +891,50 @@ define void @srem_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str x23, [sp, #-48]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -48
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    fmov w12, s0
-; NONEON-NOSVE-NEXT:    fmov w3, s2
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    fmov w11, s1
-; NONEON-NOSVE-NEXT:    fmov w2, s3
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w17, v3.s[1]
-; NONEON-NOSVE-NEXT:    mov w18, v2.s[1]
-; NONEON-NOSVE-NEXT:    mov w14, v1.s[2]
-; NONEON-NOSVE-NEXT:    mov w15, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w5, v3.s[2]
-; NONEON-NOSVE-NEXT:    mov w6, v2.s[2]
-; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    mov w19, v3.s[3]
-; NONEON-NOSVE-NEXT:    mov w20, v2.s[3]
-; NONEON-NOSVE-NEXT:    mov w22, v1.s[3]
-; NONEON-NOSVE-NEXT:    mov w23, v0.s[3]
-; NONEON-NOSVE-NEXT:    sdiv w4, w3, w2
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    fmov s1, w11
-; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w12, w4, w2, w3
-; NONEON-NOSVE-NEXT:    fmov s0, w12
-; NONEON-NOSVE-NEXT:    sdiv w1, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v1.s[1], w8
-; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    msub w13, w1, w17, w18
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w13
-; NONEON-NOSVE-NEXT:    sdiv w7, w6, w5
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v1.s[2], w8
-; NONEON-NOSVE-NEXT:    sdiv w21, w20, w19
-; NONEON-NOSVE-NEXT:    msub w10, w7, w5, w6
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w10
-; NONEON-NOSVE-NEXT:    sdiv w9, w23, w22
-; NONEON-NOSVE-NEXT:    msub w10, w21, w19, w20
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w22, w23
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v1.s[3], w8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sdiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    sdiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr x23, [sp], #48 // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -1116,13 +957,15 @@ define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fmov x8, d1
 ; NONEON-NOSVE-NEXT:    fmov x9, d0
 ; NONEON-NOSVE-NEXT:    sdiv x10, x9, x8
 ; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = srem <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -1142,16 +985,19 @@ define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x11, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x12, v0.d[1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x11, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv x10, x11, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x11
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    sdiv x10, x9, x8
-; NONEON-NOSVE-NEXT:    sdiv x13, x12, x11
 ; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    msub x9, x13, x11, x12
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = srem <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -1175,29 +1021,33 @@ define void @srem_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: srem_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    fmov x15, d2
-; NONEON-NOSVE-NEXT:    mov x12, v2.d[1]
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    fmov x14, d3
-; NONEON-NOSVE-NEXT:    mov x11, v3.d[1]
-; NONEON-NOSVE-NEXT:    mov x17, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x18, v0.d[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv x10, x11, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x11
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sdiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    msub x11, x10, x8, x9
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    sdiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    sdiv x10, x9, x8
-; NONEON-NOSVE-NEXT:    sdiv x16, x15, x14
 ; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    sdiv x13, x12, x11
-; NONEON-NOSVE-NEXT:    msub x10, x16, x14, x15
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    sdiv x1, x18, x17
-; NONEON-NOSVE-NEXT:    msub x9, x13, x11, x12
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
-; NONEON-NOSVE-NEXT:    msub x11, x1, x17, x18
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -1229,37 +1079,31 @@ define <4 x i8> @urem_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    umov w11, v1.h[0]
-; NONEON-NOSVE-NEXT:    umov w12, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w14, v1.h[2]
-; NONEON-NOSVE-NEXT:    umov w15, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w17, v1.h[3]
-; NONEON-NOSVE-NEXT:    umov w18, v0.h[3]
-; NONEON-NOSVE-NEXT:    and w11, w11, #0xff
-; NONEON-NOSVE-NEXT:    and w12, w12, #0xff
-; NONEON-NOSVE-NEXT:    and w8, w8, #0xff
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #10]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #8]
 ; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    and w9, w9, #0xff
-; NONEON-NOSVE-NEXT:    and w14, w14, #0xff
-; NONEON-NOSVE-NEXT:    and w15, w15, #0xff
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    and w12, w17, #0xff
-; NONEON-NOSVE-NEXT:    and w13, w18, #0xff
-; NONEON-NOSVE-NEXT:    fmov s0, w11
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w13, w12
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w12, w13
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    msub w9, w13, w11, w12
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w0, w18, w17
+; NONEON-NOSVE-NEXT:    msub w10, w16, w14, w15
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = urem <4 x i8> %op1, %op2
   ret <4 x i8> %res
@@ -1293,49 +1137,51 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    umov w11, v1.b[0]
-; NONEON-NOSVE-NEXT:    umov w12, v0.b[0]
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    umov w14, v1.b[2]
-; NONEON-NOSVE-NEXT:    umov w15, v0.b[2]
-; NONEON-NOSVE-NEXT:    umov w17, v1.b[3]
-; NONEON-NOSVE-NEXT:    umov w18, v0.b[3]
-; NONEON-NOSVE-NEXT:    umov w1, v1.b[4]
-; NONEON-NOSVE-NEXT:    umov w2, v0.b[4]
-; NONEON-NOSVE-NEXT:    umov w4, v1.b[5]
-; NONEON-NOSVE-NEXT:    umov w5, v0.b[5]
-; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
 ; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    umov w13, v1.b[7]
-; NONEON-NOSVE-NEXT:    fmov s2, w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[6]
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[6]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    udiv w0, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    umov w14, v0.b[7]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w8
-; NONEON-NOSVE-NEXT:    udiv w3, w2, w1
-; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w5, w4
-; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w8
-; NONEON-NOSVE-NEXT:    udiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w4, w5
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
-; NONEON-NOSVE-NEXT:    fmov d0, d2
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = urem <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -1389,108 +1235,90 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #-80]! // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
-; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
-; NONEON-NOSVE-NEXT:    umov w11, v1.b[0]
-; NONEON-NOSVE-NEXT:    umov w12, v0.b[0]
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    umov w14, v1.b[2]
-; NONEON-NOSVE-NEXT:    umov w15, v0.b[2]
-; NONEON-NOSVE-NEXT:    umov w17, v1.b[3]
-; NONEON-NOSVE-NEXT:    umov w18, v0.b[3]
-; NONEON-NOSVE-NEXT:    umov w1, v1.b[4]
-; NONEON-NOSVE-NEXT:    umov w2, v0.b[4]
-; NONEON-NOSVE-NEXT:    umov w4, v1.b[5]
-; NONEON-NOSVE-NEXT:    umov w5, v0.b[5]
-; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    umov w7, v1.b[6]
-; NONEON-NOSVE-NEXT:    umov w19, v0.b[6]
-; NONEON-NOSVE-NEXT:    umov w21, v1.b[7]
-; NONEON-NOSVE-NEXT:    umov w22, v0.b[7]
-; NONEON-NOSVE-NEXT:    umov w24, v1.b[8]
-; NONEON-NOSVE-NEXT:    umov w25, v0.b[8]
-; NONEON-NOSVE-NEXT:    umov w27, v1.b[9]
-; NONEON-NOSVE-NEXT:    umov w28, v0.b[9]
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    umov w13, v1.b[11]
-; NONEON-NOSVE-NEXT:    fmov s2, w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[10]
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[10]
-; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
-; NONEON-NOSVE-NEXT:    udiv w0, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    umov w14, v0.b[11]
-; NONEON-NOSVE-NEXT:    umov w16, v1.b[12]
-; NONEON-NOSVE-NEXT:    mov v2.b[2], w8
-; NONEON-NOSVE-NEXT:    udiv w3, w2, w1
-; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
-; NONEON-NOSVE-NEXT:    umov w17, v0.b[12]
-; NONEON-NOSVE-NEXT:    umov w0, v1.b[13]
-; NONEON-NOSVE-NEXT:    mov v2.b[3], w8
-; NONEON-NOSVE-NEXT:    udiv w6, w5, w4
-; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
-; NONEON-NOSVE-NEXT:    umov w1, v0.b[13]
-; NONEON-NOSVE-NEXT:    mov v2.b[4], w8
-; NONEON-NOSVE-NEXT:    udiv w20, w19, w7
-; NONEON-NOSVE-NEXT:    msub w8, w6, w4, w5
-; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
-; NONEON-NOSVE-NEXT:    udiv w23, w22, w21
-; NONEON-NOSVE-NEXT:    msub w8, w20, w7, w19
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v2.b[6], w8
-; NONEON-NOSVE-NEXT:    udiv w26, w25, w24
-; NONEON-NOSVE-NEXT:    msub w8, w23, w21, w22
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w28, w27
-; NONEON-NOSVE-NEXT:    msub w8, w26, w24, w25
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v2.b[8], w8
-; NONEON-NOSVE-NEXT:    udiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w27, w28
-; NONEON-NOSVE-NEXT:    mov v2.b[9], w8
-; NONEON-NOSVE-NEXT:    udiv w15, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    umov w10, v1.b[14]
-; NONEON-NOSVE-NEXT:    umov w11, v0.b[14]
-; NONEON-NOSVE-NEXT:    mov v2.b[10], w8
-; NONEON-NOSVE-NEXT:    udiv w18, w17, w16
-; NONEON-NOSVE-NEXT:    msub w8, w15, w13, w14
-; NONEON-NOSVE-NEXT:    umov w13, v1.b[15]
-; NONEON-NOSVE-NEXT:    umov w14, v0.b[15]
-; NONEON-NOSVE-NEXT:    mov v2.b[11], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w1, w0
-; NONEON-NOSVE-NEXT:    msub w8, w18, w16, w17
-; NONEON-NOSVE-NEXT:    mov v2.b[12], w8
-; NONEON-NOSVE-NEXT:    udiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w0, w1
-; NONEON-NOSVE-NEXT:    mov v2.b[13], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    mov v2.b[14], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
-; NONEON-NOSVE-NEXT:    mov v2.b[15], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp], #80 // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = urem <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -1582,275 +1410,175 @@ define void @urem_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #320
-; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #224] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #240] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #256] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #272] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #288] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #304] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 320
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
-; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
-; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
-; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    str x0, [sp, #216] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
-; NONEON-NOSVE-NEXT:    umov w4, v3.b[1]
-; NONEON-NOSVE-NEXT:    umov w1, v2.b[1]
-; NONEON-NOSVE-NEXT:    umov w7, v3.b[7]
-; NONEON-NOSVE-NEXT:    umov w5, v2.b[7]
-; NONEON-NOSVE-NEXT:    umov w6, v3.b[8]
-; NONEON-NOSVE-NEXT:    umov w3, v2.b[8]
-; NONEON-NOSVE-NEXT:    umov w22, v3.b[9]
-; NONEON-NOSVE-NEXT:    umov w20, v2.b[9]
-; NONEON-NOSVE-NEXT:    umov w13, v3.b[0]
-; NONEON-NOSVE-NEXT:    umov w17, v3.b[3]
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    str w8, [sp, #100] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[0]
-; NONEON-NOSVE-NEXT:    str w9, [sp, #108] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[0]
-; NONEON-NOSVE-NEXT:    umov w14, v2.b[3]
-; NONEON-NOSVE-NEXT:    umov w15, v3.b[4]
-; NONEON-NOSVE-NEXT:    umov w12, v2.b[4]
-; NONEON-NOSVE-NEXT:    umov w2, v3.b[5]
-; NONEON-NOSVE-NEXT:    umov w18, v2.b[5]
-; NONEON-NOSVE-NEXT:    umov w0, v3.b[6]
-; NONEON-NOSVE-NEXT:    umov w16, v2.b[6]
-; NONEON-NOSVE-NEXT:    umov w21, v3.b[10]
-; NONEON-NOSVE-NEXT:    umov w19, v2.b[10]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #36] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    ldr w30, [sp, #36] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    str w10, [sp, #116] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[2]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[2]
-; NONEON-NOSVE-NEXT:    stp w10, w8, [sp, #44] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[3]
-; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #52] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[3]
-; NONEON-NOSVE-NEXT:    udiv w26, w14, w17
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w11, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[4]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[4]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #60] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[5]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[5]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #96] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #104] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #68] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[6]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[6]
-; NONEON-NOSVE-NEXT:    stp w11, w8, [sp, #80] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #112] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[7]
-; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #88] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[7]
-; NONEON-NOSVE-NEXT:    udiv w25, w12, w15
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #132] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[8]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[8]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #140] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[9]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[9]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #148] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #156] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w11, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[10]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[10]
-; NONEON-NOSVE-NEXT:    str w10, [sp, #128] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #204] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[11]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[11]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #192] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #212] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[12]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[12]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #172] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #180] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #200] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[13]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[13]
-; NONEON-NOSVE-NEXT:    stp w11, w8, [sp, #164] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w11, v3.b[2]
-; NONEON-NOSVE-NEXT:    str w9, [sp, #176] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #188] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.b[14]
-; NONEON-NOSVE-NEXT:    umov w9, v0.b[14]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #144] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w9, [sp, #152] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    str w10, [sp, #184] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w9, v2.b[2]
-; NONEON-NOSVE-NEXT:    udiv w8, w1, w4
-; NONEON-NOSVE-NEXT:    str w10, [sp, #160] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w10, v2.b[0]
-; NONEON-NOSVE-NEXT:    str w8, [sp, #24] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w8, w5, w7
-; NONEON-NOSVE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w8, w3, w6
-; NONEON-NOSVE-NEXT:    str w8, [sp, #20] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w8, w20, w22
-; NONEON-NOSVE-NEXT:    udiv w24, w10, w13
-; NONEON-NOSVE-NEXT:    str w8, [sp, #32] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    ldp w29, w8, [sp, #40] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w8, w30, w29
-; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #224] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    fmov s4, w8
-; NONEON-NOSVE-NEXT:    udiv w23, w9, w11
-; NONEON-NOSVE-NEXT:    msub w10, w24, w13, w10
-; NONEON-NOSVE-NEXT:    ldr w13, [sp, #24] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w24, [sp, #100] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w13, w13, w4, w1
-; NONEON-NOSVE-NEXT:    ldr w1, [sp, #116] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w4, [sp, #108] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    fmov s5, w10
-; NONEON-NOSVE-NEXT:    msub w1, w1, w24, w4
-; NONEON-NOSVE-NEXT:    mov v5.b[1], w13
-; NONEON-NOSVE-NEXT:    mov v4.b[1], w1
-; NONEON-NOSVE-NEXT:    ldr w1, [sp, #120] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w23, w11, w9
-; NONEON-NOSVE-NEXT:    ldr w11, [sp, #48] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w28, w18, w2
-; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #52] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #272] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w26, w17, w14
-; NONEON-NOSVE-NEXT:    ldr w14, [sp, #72] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w11, w10
-; NONEON-NOSVE-NEXT:    ldr w17, [sp, #96] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    umov w10, v3.b[11]
-; NONEON-NOSVE-NEXT:    umov w11, v2.b[11]
-; NONEON-NOSVE-NEXT:    mov v4.b[2], w9
-; NONEON-NOSVE-NEXT:    mov v5.b[3], w8
-; NONEON-NOSVE-NEXT:    msub w8, w25, w15, w12
-; NONEON-NOSVE-NEXT:    ldp w13, w9, [sp, #76] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w27, w16, w0
-; NONEON-NOSVE-NEXT:    ldr w15, [sp, #104] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #256] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w14, w13
-; NONEON-NOSVE-NEXT:    ldr w14, [sp, #60] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[4], w8
-; NONEON-NOSVE-NEXT:    msub w8, w28, w2, w18
-; NONEON-NOSVE-NEXT:    ldr w2, [sp, #156] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[3], w9
-; NONEON-NOSVE-NEXT:    ldp w12, w9, [sp, #64] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[5], w8
-; NONEON-NOSVE-NEXT:    msub w8, w27, w0, w16
-; NONEON-NOSVE-NEXT:    ldr w0, [sp, #132] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w4, w19, w21
-; NONEON-NOSVE-NEXT:    msub w9, w9, w14, w12
-; NONEON-NOSVE-NEXT:    umov w12, v3.b[12]
-; NONEON-NOSVE-NEXT:    umov w14, v2.b[12]
-; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #240] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[6], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[4], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #112] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w8, w7, w5
-; NONEON-NOSVE-NEXT:    ldr w5, [sp, #204] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w17, w15
-; NONEON-NOSVE-NEXT:    ldr w17, [sp, #84] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[7], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w13, w11, w10
-; NONEON-NOSVE-NEXT:    mov v4.b[5], w9
-; NONEON-NOSVE-NEXT:    ldp w16, w9, [sp, #88] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w8, w8, w6, w3
-; NONEON-NOSVE-NEXT:    ldr w3, [sp, #148] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w17, w16
-; NONEON-NOSVE-NEXT:    umov w16, v3.b[13]
-; NONEON-NOSVE-NEXT:    umov w17, v2.b[13]
-; NONEON-NOSVE-NEXT:    mov v5.b[8], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[6], w9
-; NONEON-NOSVE-NEXT:    msub w8, w8, w22, w20
-; NONEON-NOSVE-NEXT:    udiv w15, w14, w12
-; NONEON-NOSVE-NEXT:    ldp w18, w9, [sp, #136] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[9], w8
-; NONEON-NOSVE-NEXT:    msub w8, w4, w21, w19
-; NONEON-NOSVE-NEXT:    msub w9, w9, w0, w18
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #304] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #288] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[7], w9
-; NONEON-NOSVE-NEXT:    mov v5.b[10], w8
-; NONEON-NOSVE-NEXT:    msub w8, w13, w10, w11
-; NONEON-NOSVE-NEXT:    ldp w0, w9, [sp, #124] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp, #196] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w13, [sp, #192] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w18, w17, w16
-; NONEON-NOSVE-NEXT:    msub w9, w9, w1, w0
-; NONEON-NOSVE-NEXT:    mov v5.b[11], w8
-; NONEON-NOSVE-NEXT:    umov w0, v3.b[14]
-; NONEON-NOSVE-NEXT:    msub w10, w10, w13, w11
-; NONEON-NOSVE-NEXT:    umov w1, v2.b[14]
-; NONEON-NOSVE-NEXT:    msub w8, w15, w12, w14
-; NONEON-NOSVE-NEXT:    mov v4.b[8], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #164] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp w15, w13, [sp, #168] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w3, w2
-; NONEON-NOSVE-NEXT:    mov v5.b[12], w8
-; NONEON-NOSVE-NEXT:    ldp w4, w3, [sp, #208] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp w14, w12, [sp, #176] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[9], w9
-; NONEON-NOSVE-NEXT:    udiv w2, w1, w0
-; NONEON-NOSVE-NEXT:    umov w9, v3.b[15]
-; NONEON-NOSVE-NEXT:    msub w3, w3, w5, w4
-; NONEON-NOSVE-NEXT:    umov w4, v2.b[15]
-; NONEON-NOSVE-NEXT:    msub w8, w18, w16, w17
-; NONEON-NOSVE-NEXT:    ldr w16, [sp, #144] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.b[10], w3
-; NONEON-NOSVE-NEXT:    mov v5.b[13], w8
-; NONEON-NOSVE-NEXT:    mov v4.b[11], w10
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #188] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w11, w4, w9
-; NONEON-NOSVE-NEXT:    msub w8, w2, w0, w1
-; NONEON-NOSVE-NEXT:    msub w10, w10, w13, w12
-; NONEON-NOSVE-NEXT:    umov w12, v1.b[15]
-; NONEON-NOSVE-NEXT:    umov w13, v0.b[15]
-; NONEON-NOSVE-NEXT:    mov v5.b[14], w8
-; NONEON-NOSVE-NEXT:    mov v4.b[12], w10
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #184] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w10, w10, w15, w14
-; NONEON-NOSVE-NEXT:    ldr w15, [sp, #152] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w14, w13, w12
-; NONEON-NOSVE-NEXT:    msub w8, w11, w9, w4
-; NONEON-NOSVE-NEXT:    mov v4.b[13], w10
-; NONEON-NOSVE-NEXT:    ldr w10, [sp, #160] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.b[15], w8
-; NONEON-NOSVE-NEXT:    ldr x8, [sp, #216] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w10, w10, w16, w15
-; NONEON-NOSVE-NEXT:    mov v4.b[14], w10
-; NONEON-NOSVE-NEXT:    msub w9, w14, w12, w13
-; NONEON-NOSVE-NEXT:    mov v4.b[15], w9
-; NONEON-NOSVE-NEXT:    stp q5, q4, [x8]
-; NONEON-NOSVE-NEXT:    add sp, sp, #320
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -1876,29 +1604,31 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    umov w11, v1.h[0]
-; NONEON-NOSVE-NEXT:    umov w12, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w14, v1.h[2]
-; NONEON-NOSVE-NEXT:    umov w15, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w17, v1.h[3]
-; NONEON-NOSVE-NEXT:    umov w18, v0.h[3]
-; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
 ; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    fmov s0, w11
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v0.h[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w17, w18
-; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = urem <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -1931,47 +1661,50 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    umov w11, v1.h[0]
-; NONEON-NOSVE-NEXT:    umov w12, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w14, v1.h[2]
-; NONEON-NOSVE-NEXT:    umov w15, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w17, v1.h[3]
-; NONEON-NOSVE-NEXT:    umov w18, v0.h[3]
-; NONEON-NOSVE-NEXT:    umov w1, v1.h[4]
-; NONEON-NOSVE-NEXT:    umov w2, v0.h[4]
-; NONEON-NOSVE-NEXT:    umov w4, v1.h[5]
-; NONEON-NOSVE-NEXT:    umov w5, v0.h[5]
-; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
 ; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    umov w13, v1.h[7]
-; NONEON-NOSVE-NEXT:    fmov s2, w11
-; NONEON-NOSVE-NEXT:    umov w11, v0.h[6]
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    umov w10, v1.h[6]
-; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
-; NONEON-NOSVE-NEXT:    udiv w0, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    umov w14, v0.h[7]
-; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
-; NONEON-NOSVE-NEXT:    udiv w3, w2, w1
-; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
-; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w5, w4
-; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
-; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
-; NONEON-NOSVE-NEXT:    udiv w12, w11, w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w4, w5
-; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w14, w13
-; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
-; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
-; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
-; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = urem <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -2020,135 +1753,95 @@ define void @urem_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #144
-; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #48] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #64] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #80] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #96] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #112] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #128] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
-; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
-; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
-; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
-; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
-; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
-; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
-; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0]
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
-; NONEON-NOSVE-NEXT:    umov w20, v1.h[0]
-; NONEON-NOSVE-NEXT:    umov w21, v0.h[0]
-; NONEON-NOSVE-NEXT:    umov w19, v0.h[3]
-; NONEON-NOSVE-NEXT:    umov w5, v1.h[4]
-; NONEON-NOSVE-NEXT:    umov w2, v0.h[4]
-; NONEON-NOSVE-NEXT:    umov w1, v3.h[1]
-; NONEON-NOSVE-NEXT:    umov w23, v2.h[1]
-; NONEON-NOSVE-NEXT:    umov w25, v3.h[0]
-; NONEON-NOSVE-NEXT:    umov w26, v2.h[0]
-; NONEON-NOSVE-NEXT:    umov w6, v1.h[5]
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #36] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[2]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w3, v0.h[5]
-; NONEON-NOSVE-NEXT:    umov w4, v1.h[6]
-; NONEON-NOSVE-NEXT:    umov w7, v0.h[6]
-; NONEON-NOSVE-NEXT:    umov w28, v3.h[2]
-; NONEON-NOSVE-NEXT:    umov w29, v2.h[2]
-; NONEON-NOSVE-NEXT:    umov w15, v3.h[3]
-; NONEON-NOSVE-NEXT:    umov w13, v2.h[3]
-; NONEON-NOSVE-NEXT:    umov w12, v3.h[4]
-; NONEON-NOSVE-NEXT:    umov w14, v3.h[5]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w11, w21, w20
-; NONEON-NOSVE-NEXT:    str w10, [sp, #44] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    umov w8, v1.h[3]
-; NONEON-NOSVE-NEXT:    stp w8, w11, [sp] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w11, v2.h[4]
-; NONEON-NOSVE-NEXT:    ldr w22, [sp, #4] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w20, w22, w20, w21
-; NONEON-NOSVE-NEXT:    udiv w9, w19, w8
-; NONEON-NOSVE-NEXT:    str w10, [sp, #32] // 4-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w10, v3.h[6]
-; NONEON-NOSVE-NEXT:    fmov s5, w20
-; NONEON-NOSVE-NEXT:    umov w20, v3.h[7]
-; NONEON-NOSVE-NEXT:    udiv w8, w2, w5
-; NONEON-NOSVE-NEXT:    udiv w24, w23, w1
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    udiv w27, w26, w25
-; NONEON-NOSVE-NEXT:    msub w1, w24, w1, w23
-; NONEON-NOSVE-NEXT:    ldp w24, w23, [sp, #40] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w9, w3, w6
-; NONEON-NOSVE-NEXT:    msub w21, w27, w25, w26
-; NONEON-NOSVE-NEXT:    ldr w25, [sp, #36] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w23, w23, w25, w24
-; NONEON-NOSVE-NEXT:    ldr w25, [sp, #24] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    fmov s4, w21
-; NONEON-NOSVE-NEXT:    mov v5.h[1], w23
-; NONEON-NOSVE-NEXT:    ldp w23, w21, [sp, #28] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.h[1], w1
-; NONEON-NOSVE-NEXT:    udiv w8, w7, w4
-; NONEON-NOSVE-NEXT:    msub w21, w21, w25, w23
-; NONEON-NOSVE-NEXT:    umov w23, v2.h[7]
-; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #80] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.h[2], w21
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #112] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    udiv w30, w29, w28
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    umov w9, v2.h[5]
-; NONEON-NOSVE-NEXT:    umov w8, v2.h[6]
-; NONEON-NOSVE-NEXT:    udiv w18, w13, w15
-; NONEON-NOSVE-NEXT:    msub w1, w30, w28, w29
-; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #64] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #48] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.h[2], w1
-; NONEON-NOSVE-NEXT:    udiv w16, w11, w12
-; NONEON-NOSVE-NEXT:    msub w13, w18, w15, w13
-; NONEON-NOSVE-NEXT:    ldr w15, [sp, #20] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldr w18, [sp] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w15, w15, w18, w19
-; NONEON-NOSVE-NEXT:    mov v4.h[3], w13
-; NONEON-NOSVE-NEXT:    umov w13, v1.h[7]
-; NONEON-NOSVE-NEXT:    mov v5.h[3], w15
-; NONEON-NOSVE-NEXT:    umov w15, v0.h[7]
-; NONEON-NOSVE-NEXT:    udiv w17, w9, w14
-; NONEON-NOSVE-NEXT:    msub w11, w16, w12, w11
-; NONEON-NOSVE-NEXT:    ldr w12, [sp, #16] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w12, w12, w5, w2
-; NONEON-NOSVE-NEXT:    mov v4.h[4], w11
-; NONEON-NOSVE-NEXT:    ldr w11, [sp, #12] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v5.h[4], w12
-; NONEON-NOSVE-NEXT:    msub w11, w11, w6, w3
-; NONEON-NOSVE-NEXT:    udiv w24, w8, w10
-; NONEON-NOSVE-NEXT:    msub w9, w17, w14, w9
-; NONEON-NOSVE-NEXT:    mov v5.h[5], w11
-; NONEON-NOSVE-NEXT:    mov v4.h[5], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
-; NONEON-NOSVE-NEXT:    msub w9, w9, w4, w7
-; NONEON-NOSVE-NEXT:    udiv w18, w23, w20
-; NONEON-NOSVE-NEXT:    msub w8, w24, w10, w8
-; NONEON-NOSVE-NEXT:    mov v5.h[6], w9
-; NONEON-NOSVE-NEXT:    mov v4.h[6], w8
-; NONEON-NOSVE-NEXT:    udiv w12, w15, w13
-; NONEON-NOSVE-NEXT:    msub w8, w18, w20, w23
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #128] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #96] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v4.h[7], w8
-; NONEON-NOSVE-NEXT:    msub w9, w12, w13, w15
-; NONEON-NOSVE-NEXT:    mov v5.h[7], w9
-; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
-; NONEON-NOSVE-NEXT:    add sp, sp, #144
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -2171,19 +1864,20 @@ define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    mov w11, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w12, v0.s[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    msub w9, w13, w11, w12
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = urem <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -2203,26 +1897,28 @@ define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov w11, s1
-; NONEON-NOSVE-NEXT:    fmov w12, s0
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    mov w14, v1.s[2]
-; NONEON-NOSVE-NEXT:    mov w15, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w17, v1.s[3]
-; NONEON-NOSVE-NEXT:    mov w18, v0.s[3]
-; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
 ; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    fmov s0, w11
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
 ; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
-; NONEON-NOSVE-NEXT:    udiv w9, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w8
-; NONEON-NOSVE-NEXT:    msub w8, w9, w17, w18
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w8
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = urem <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -2246,61 +1942,50 @@ define void @urem_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str x23, [sp, #-48]! // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
-; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
-; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
-; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
-; NONEON-NOSVE-NEXT:    .cfi_offset w23, -48
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    fmov w12, s0
-; NONEON-NOSVE-NEXT:    fmov w3, s2
-; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
-; NONEON-NOSVE-NEXT:    fmov w11, s1
-; NONEON-NOSVE-NEXT:    fmov w2, s3
-; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
-; NONEON-NOSVE-NEXT:    mov w17, v3.s[1]
-; NONEON-NOSVE-NEXT:    mov w18, v2.s[1]
-; NONEON-NOSVE-NEXT:    mov w14, v1.s[2]
-; NONEON-NOSVE-NEXT:    mov w15, v0.s[2]
-; NONEON-NOSVE-NEXT:    mov w5, v3.s[2]
-; NONEON-NOSVE-NEXT:    mov w6, v2.s[2]
-; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
-; NONEON-NOSVE-NEXT:    mov w19, v3.s[3]
-; NONEON-NOSVE-NEXT:    mov w20, v2.s[3]
-; NONEON-NOSVE-NEXT:    mov w22, v1.s[3]
-; NONEON-NOSVE-NEXT:    mov w23, v0.s[3]
-; NONEON-NOSVE-NEXT:    udiv w4, w3, w2
-; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
-; NONEON-NOSVE-NEXT:    fmov s1, w11
-; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
-; NONEON-NOSVE-NEXT:    msub w12, w4, w2, w3
-; NONEON-NOSVE-NEXT:    fmov s0, w12
-; NONEON-NOSVE-NEXT:    udiv w1, w18, w17
-; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
-; NONEON-NOSVE-NEXT:    mov v1.s[1], w8
-; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
-; NONEON-NOSVE-NEXT:    msub w13, w1, w17, w18
-; NONEON-NOSVE-NEXT:    mov v0.s[1], w13
-; NONEON-NOSVE-NEXT:    udiv w7, w6, w5
-; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
-; NONEON-NOSVE-NEXT:    mov v1.s[2], w8
-; NONEON-NOSVE-NEXT:    udiv w21, w20, w19
-; NONEON-NOSVE-NEXT:    msub w10, w7, w5, w6
-; NONEON-NOSVE-NEXT:    mov v0.s[2], w10
-; NONEON-NOSVE-NEXT:    udiv w9, w23, w22
-; NONEON-NOSVE-NEXT:    msub w10, w21, w19, w20
-; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v0.s[3], w10
-; NONEON-NOSVE-NEXT:    msub w8, w9, w22, w23
-; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    mov v1.s[3], w8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    udiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    udiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr x23, [sp], #48 // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -2323,13 +2008,15 @@ define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    fmov x8, d1
 ; NONEON-NOSVE-NEXT:    fmov x9, d0
 ; NONEON-NOSVE-NEXT:    udiv x10, x9, x8
 ; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = urem <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -2349,16 +2036,19 @@ define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    mov x11, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x12, v0.d[1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x11, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv x10, x11, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x11
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    udiv x10, x9, x8
-; NONEON-NOSVE-NEXT:    udiv x13, x12, x11
 ; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    msub x9, x13, x11, x12
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = urem <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -2382,29 +2072,33 @@ define void @urem_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: urem_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    fmov x15, d2
-; NONEON-NOSVE-NEXT:    mov x12, v2.d[1]
-; NONEON-NOSVE-NEXT:    fmov x8, d1
-; NONEON-NOSVE-NEXT:    fmov x14, d3
-; NONEON-NOSVE-NEXT:    mov x11, v3.d[1]
-; NONEON-NOSVE-NEXT:    mov x17, v1.d[1]
-; NONEON-NOSVE-NEXT:    mov x18, v0.d[1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv x10, x11, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x11
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    msub x11, x10, x8, x9
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    udiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
 ; NONEON-NOSVE-NEXT:    udiv x10, x9, x8
-; NONEON-NOSVE-NEXT:    udiv x16, x15, x14
 ; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
-; NONEON-NOSVE-NEXT:    fmov d1, x8
-; NONEON-NOSVE-NEXT:    udiv x13, x12, x11
-; NONEON-NOSVE-NEXT:    msub x10, x16, x14, x15
-; NONEON-NOSVE-NEXT:    fmov d0, x10
-; NONEON-NOSVE-NEXT:    udiv x1, x18, x17
-; NONEON-NOSVE-NEXT:    msub x9, x13, x11, x12
-; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
-; NONEON-NOSVE-NEXT:    msub x11, x1, x17, x18
-; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
index 0108fb580b947b..5cee1360f6f3cf 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
@@ -20,10 +20,28 @@ define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.4h, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x i8> %op1, <4 x i8> %op2
   ret <4 x i8> %sel
@@ -43,10 +61,44 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.8b, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <8 x i8> %op1, <8 x i8> %op2
   ret <8 x i8> %sel
@@ -66,10 +118,75 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.16b, w8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <16 x i8> %op1, <16 x i8> %op2
   ret <16 x i8> %sel
@@ -92,16 +209,147 @@ define void @select_v32i8(ptr %a, ptr %b, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
 ; NONEON-NOSVE-NEXT:    tst w2, #0x1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
-; NONEON-NOSVE-NEXT:    dup v0.16b, w8
-; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #63]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #61]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <32 x i8>, ptr %a
   %op2 = load volatile <32 x i8>, ptr %b
@@ -125,10 +373,18 @@ define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.2s, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x i16> %op1, <2 x i16> %op2
   ret <2 x i16> %sel
@@ -149,10 +405,28 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.4h, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x i16> %op1, <4 x i16> %op2
   ret <4 x i16> %sel
@@ -173,10 +447,43 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.8h, w8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <8 x i16> %op1, <8 x i16> %op2
   ret <8 x i16> %sel
@@ -200,16 +507,83 @@ define void @select_v16i16(ptr %a, ptr %b, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
 ; NONEON-NOSVE-NEXT:    tst w2, #0x1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
-; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <16 x i16>, ptr %a
   %op2 = load volatile <16 x i16>, ptr %b
@@ -233,10 +607,18 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.2s, w8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x i32> %op1, <2 x i32> %op2
   ret <2 x i32> %sel
@@ -257,10 +639,23 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    dup v2.4s, w8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x i32> %op1, <4 x i32> %op2
   ret <4 x i32> %sel
@@ -284,16 +679,43 @@ define void @select_v8i32(ptr %a, ptr %b, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
 ; NONEON-NOSVE-NEXT:    tst w2, #0x1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    csetm w8, ne
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
-; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    csel w11, w8, w10, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w11, w8, w10, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    csel w11, w8, w10, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    csel w11, w8, w10, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <8 x i32>, ptr %a
   %op2 = load volatile <8 x i32>, ptr %b
@@ -318,10 +740,15 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    fmov d2, x8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, ne
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <1 x i64> %op1, <1 x i64> %op2
   ret <1 x i64> %sel
@@ -343,10 +770,17 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    dup v2.2d, x8
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, ne
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, ne
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x i64> %op1, <2 x i64> %op2
   ret <2 x i64> %sel
@@ -371,16 +805,31 @@ define void @select_v4i64(ptr %a, ptr %b, i1 %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
 ; NONEON-NOSVE-NEXT:    tst w2, #0x1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    ldr q3, [x1]
-; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
-; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    csel x11, x8, x10, ne
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel x8, x8, x9, ne
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel x11, x8, x10, ne
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    csel x8, x8, x9, ne
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <4 x i64>, ptr %a
   %op2 = load volatile <4 x i64>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
index f7198e3042ad53..2778e93416a748 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
@@ -23,12 +23,27 @@ define <4 x i8> @ashr_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    neg v1.4h, v1.4h
-; NONEON-NOSVE-NEXT:    sshl v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #10]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr w10, w11, w10
+; NONEON-NOSVE-NEXT:    asr w11, w13, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    asr w8, w14, w9
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = ashr <4 x i8> %op1, %op2
   ret <4 x i8> %res
@@ -46,8 +61,43 @@ define <8 x i8> @ashr_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.8b, v1.8b
-; NONEON-NOSVE-NEXT:    sshl v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = ashr <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -65,8 +115,74 @@ define <16 x i8> @ashr_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    sshl v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = ashr <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -86,13 +202,143 @@ define void @ashr_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    neg v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    sshl v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    sshl v1.16b, v3.16b, v1.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -115,12 +361,18 @@ define <2 x i16> @ashr_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    neg v1.2s, v1.2s
-; NONEON-NOSVE-NEXT:    sshl v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    asr w9, w11, w10
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = ashr <2 x i16> %op1, %op2
   ret <2 x i16> %res
@@ -138,8 +390,27 @@ define <4 x i16> @ashr_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.4h, v1.4h
-; NONEON-NOSVE-NEXT:    sshl v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = ashr <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -157,8 +428,42 @@ define <8 x i16> @ashr_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    sshl v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = ashr <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -178,13 +483,79 @@ define void @ashr_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    neg v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    sshl v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    sshl v1.8h, v3.8h, v1.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -205,8 +576,17 @@ define <2 x i32> @ashr_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.2s, v1.2s
-; NONEON-NOSVE-NEXT:    sshl v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = ashr <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -224,8 +604,22 @@ define <4 x i32> @ashr_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    sshl v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = ashr <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -245,13 +639,39 @@ define void @ashr_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    neg v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    sshl v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sshl v1.4s, v3.4s, v1.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    asr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    asr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -272,8 +692,14 @@ define <1 x i64> @ashr_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg d1, d1
-; NONEON-NOSVE-NEXT:    sshl d0, d0, d1
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    asr x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = ashr <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -291,8 +717,16 @@ define <2 x i64> @ashr_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    sshl v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = ashr <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -312,13 +746,27 @@ define void @ashr_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ashr_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    neg v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    sshl v0.2d, v2.2d, v0.2d
-; NONEON-NOSVE-NEXT:    sshl v1.2d, v3.2d, v1.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    asr x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    asr x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -345,11 +793,27 @@ define <4 x i8> @lshr_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v2.8b
-; NONEON-NOSVE-NEXT:    neg v1.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsr w10, w11, w10
+; NONEON-NOSVE-NEXT:    lsr w11, w13, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w14, w9
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = lshr <4 x i8> %op1, %op2
   ret <4 x i8> %res
@@ -367,8 +831,43 @@ define <8 x i8> @lshr_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushl v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = lshr <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -386,8 +885,74 @@ define <16 x i8> @lshr_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ushl v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = lshr <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -407,13 +972,143 @@ define void @lshr_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    neg v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    ushl v0.16b, v2.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ushl v1.16b, v3.16b, v1.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -436,11 +1131,18 @@ define <2 x i16> @lshr_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v2.8b
-; NONEON-NOSVE-NEXT:    neg v1.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ushl v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w11, w10
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = lshr <2 x i16> %op1, %op2
   ret <2 x i16> %res
@@ -458,8 +1160,27 @@ define <4 x i16> @lshr_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.4h, v1.4h
-; NONEON-NOSVE-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = lshr <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -477,8 +1198,42 @@ define <8 x i16> @lshr_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    ushl v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = lshr <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -498,13 +1253,79 @@ define void @lshr_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    neg v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    ushl v0.8h, v2.8h, v0.8h
-; NONEON-NOSVE-NEXT:    ushl v1.8h, v3.8h, v1.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -525,8 +1346,17 @@ define <2 x i32> @lshr_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.2s, v1.2s
-; NONEON-NOSVE-NEXT:    ushl v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = lshr <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -544,8 +1374,22 @@ define <4 x i32> @lshr_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ushl v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = lshr <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -565,13 +1409,39 @@ define void @lshr_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    neg v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ushl v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ushl v1.4s, v3.4s, v1.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    lsr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    lsr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -592,8 +1462,14 @@ define <1 x i64> @lshr_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg d1, d1
-; NONEON-NOSVE-NEXT:    ushl d0, d0, d1
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    lsr x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = lshr <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -611,8 +1487,16 @@ define <2 x i64> @lshr_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    neg v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ushl v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = lshr <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -632,13 +1516,27 @@ define void @lshr_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: lshr_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    neg v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    neg v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ushl v0.2d, v2.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ushl v1.2d, v3.2d, v1.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsr x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsr x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -664,9 +1562,18 @@ define <2 x i8> @shl_v2i8(<2 x i8> %op1, <2 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v2i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0x0000ff000000ff
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ushl v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shl <2 x i8> %op1, %op2
   ret <2 x i8> %res
@@ -685,9 +1592,27 @@ define <4 x i8> @shl_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d2, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w11, w12, w11
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsl w10, w11, w10
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsl w9, w10, w9
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shl <4 x i8> %op1, %op2
   ret <4 x i8> %res
@@ -705,7 +1630,43 @@ define <8 x i8> @shl_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shl <8 x i8> %op1, %op2
   ret <8 x i8> %res
@@ -723,7 +1684,74 @@ define <16 x i8> @shl_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = shl <16 x i8> %op1, %op2
   ret <16 x i8> %res
@@ -743,11 +1771,143 @@ define void @shl_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    ushl v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ushl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -768,7 +1928,27 @@ define <4 x i16> @shl_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shl <4 x i16> %op1, %op2
   ret <4 x i16> %res
@@ -786,7 +1966,42 @@ define <8 x i16> @shl_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = shl <8 x i16> %op1, %op2
   ret <8 x i16> %res
@@ -806,11 +2021,79 @@ define void @shl_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    ushl v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    ushl v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -831,7 +2114,17 @@ define <2 x i32> @shl_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shl <2 x i32> %op1, %op2
   ret <2 x i32> %res
@@ -849,7 +2142,22 @@ define <4 x i32> @shl_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = shl <4 x i32> %op1, %op2
   ret <4 x i32> %res
@@ -869,11 +2177,39 @@ define void @shl_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    ushl v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ushl v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -894,7 +2230,14 @@ define <1 x i64> @shl_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl d0, d0, d1
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    lsl x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = shl <1 x i64> %op1, %op2
   ret <1 x i64> %res
@@ -912,7 +2255,16 @@ define <2 x i64> @shl_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushl v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %res = shl <2 x i64> %op1, %op2
   ret <2 x i64> %res
@@ -932,11 +2284,27 @@ define void @shl_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shl_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    ushl v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ushl v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsl x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsl x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index 42d3b9d8f71f86..fd2d9a8fb80d17 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -19,9 +19,26 @@ define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <4 x i16> %op1 to <4 x half>
   ret <4 x half> %res
@@ -39,17 +56,43 @@ define void @ucvtf_v8i16_v8f16(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v1.4s
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    str q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = uitofp <8 x i16> %op1 to <8 x half>
@@ -69,25 +112,76 @@ define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ucvtf v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ucvtf v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v3.4s
-; NONEON-NOSVE-NEXT:    stp q2, q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = uitofp <16 x i16> %op1 to <16 x half>
@@ -111,9 +205,15 @@ define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v2i16_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ucvtf v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i16> %op1 to <2 x float>
   ret <2 x float> %res
@@ -131,8 +231,21 @@ define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <4 x i16> %op1 to <4 x float>
   ret <4 x float> %res
@@ -154,15 +267,33 @@ define void @ucvtf_v8i16_v8f32(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = uitofp <8 x i16> %op1 to <8 x float>
@@ -192,21 +323,57 @@ define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ucvtf v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    ucvtf v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = uitofp <16 x i16> %op1 to <16 x float>
@@ -229,9 +396,13 @@ define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v1i16_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    umov w8, v0.h[0]
-; NONEON-NOSVE-NEXT:    ucvtf d0, w8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <1 x i16> %op1 to <1 x double>
   ret <1 x double> %res
@@ -250,10 +421,16 @@ define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v2i16_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d1, #0x00ffff0000ffff
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i16> %op1 to <2 x double>
   ret <2 x double> %res
@@ -275,17 +452,31 @@ define void @ucvtf_v4i16_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %res = uitofp <4 x i16> %op1 to <4 x double>
@@ -318,26 +509,53 @@ define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ucvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    ucvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = uitofp <8 x i16> %op1 to <8 x double>
@@ -390,42 +608,99 @@ define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #336
+; NONEON-NOSVE-NEXT:    str x29, [sp, #320] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 336
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #72]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #40]
-; NONEON-NOSVE-NEXT:    ushll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ushll v6.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v7.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    ucvtf v5.2d, v5.2d
-; NONEON-NOSVE-NEXT:    ucvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    ucvtf v4.2d, v4.2d
-; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v7.2d
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v6.2d
-; NONEON-NOSVE-NEXT:    stp q2, q0, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #96]
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #320] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #104]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #164]
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #176]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #160]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #156]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #152]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #148]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #144]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #140]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #136]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #332]
+; NONEON-NOSVE-NEXT:    ldp q4, q3, [sp, #192]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #328]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #188]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #184]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [sp, #288]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #176]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #172]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #168]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp q2, q5, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #336
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = uitofp <16 x i16> %op1 to <16 x double>
@@ -449,9 +724,18 @@ define <2 x half> @ucvtf_v2i32_v2f16(<2 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #12]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i32> %op1 to <2 x half>
   ret <2 x half> %res
@@ -469,8 +753,24 @@ define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <4 x i32> %op1 to <4 x half>
   ret <4 x half> %res
@@ -492,11 +792,39 @@ define <8 x half> @ucvtf_v8i32_v8f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = uitofp <8 x i32> %op1 to <8 x half>
@@ -525,17 +853,72 @@ define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v16i32_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ucvtf v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ucvtf v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i32>, ptr %a
   %res = uitofp <16 x i32> %op1 to <16 x half>
@@ -558,7 +941,14 @@ define <2 x float> @ucvtf_v2i32_v2f32(<2 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ucvtf v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w9
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i32> %op1 to <2 x float>
   ret <2 x float> %res
@@ -575,7 +965,18 @@ define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w9
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w9
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <4 x i32> %op1 to <4 x float>
   ret <4 x float> %res
@@ -593,10 +994,28 @@ define void @ucvtf_v8i32_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w9
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w9
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w9
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w9
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = uitofp <8 x i32> %op1 to <8 x float>
@@ -620,8 +1039,16 @@ define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i32> %op1 to <2 x double>
   ret <2 x double> %res
@@ -643,15 +1070,23 @@ define void @ucvtf_v4i32_v4f64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %res = uitofp <4 x i32> %op1 to <4 x double>
@@ -681,21 +1116,37 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    ucvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = uitofp <8 x i32> %op1 to <8 x double>
@@ -726,14 +1177,17 @@ define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
 ; NONEON-NOSVE-NEXT:    ucvtf s0, x8
-; NONEON-NOSVE-NEXT:    fcvt h2, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i64> %op1 to <2 x half>
   ret <2 x half> %res
@@ -758,12 +1212,25 @@ define <4 x half> @ucvtf_v4i64_v4f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = uitofp <4 x i64> %op1 to <4 x half>
@@ -801,18 +1268,43 @@ define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v8i64_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ucvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    ucvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn v2.2s, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.4s, v3.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v2.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i64>, ptr %a
   %res = uitofp <8 x i64> %op1 to <8 x half>
@@ -835,8 +1327,14 @@ define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i64> %op1 to <2 x float>
   ret <2 x float> %res
@@ -858,11 +1356,19 @@ define <4 x float> @ucvtf_v4i64_v4f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = uitofp <4 x i64> %op1 to <4 x float>
@@ -891,17 +1397,32 @@ define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v8i64_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    ucvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn v1.2s, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v2.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.4s, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i64>, ptr %a
   %res = uitofp <8 x i64> %op1 to <8 x float>
@@ -924,7 +1445,14 @@ define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf d1, x9
+; NONEON-NOSVE-NEXT:    ucvtf d0, x8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i64> %op1 to <2 x double>
   ret <2 x double> %res
@@ -942,10 +1470,20 @@ define void @ucvtf_v4i64_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ucvtf d1, x9
+; NONEON-NOSVE-NEXT:    ucvtf d0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ucvtf d1, x9
+; NONEON-NOSVE-NEXT:    ucvtf d0, x8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = uitofp <4 x i64> %op1 to <4 x double>
@@ -968,9 +1506,26 @@ define <4 x half> @scvtf_v4i16_v4f16(<4 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <4 x i16> %op1 to <4 x half>
   ret <4 x half> %res
@@ -988,17 +1543,43 @@ define void @scvtf_v8i16_v8f16(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v1.4s
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    str q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = sitofp <8 x i16> %op1 to <8 x half>
@@ -1018,25 +1599,76 @@ define void @scvtf_v16i16_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    scvtf v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    scvtf v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v3.4s
-; NONEON-NOSVE-NEXT:    stp q2, q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = sitofp <16 x i16> %op1 to <16 x half>
@@ -1059,9 +1691,15 @@ define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v2i16_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    scvtf v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    stp s1, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i16> %op1 to <2 x float>
   ret <2 x float> %res
@@ -1079,8 +1717,21 @@ define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <4 x i16> %op1 to <4 x float>
   ret <4 x float> %res
@@ -1102,15 +1753,33 @@ define void @scvtf_v8i16_v8f32(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = sitofp <8 x i16> %op1 to <8 x float>
@@ -1140,21 +1809,57 @@ define void @scvtf_v16i16_v16f32(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    scvtf v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    scvtf v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #88]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #72]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #120]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #112]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = sitofp <16 x i16> %op1 to <16 x float>
@@ -1180,10 +1885,16 @@ define <2 x double> @scvtf_v2i16_v2f64(<2 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v2i16_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i16> %op1 to <2 x double>
   ret <2 x double> %res
@@ -1205,17 +1916,29 @@ define void @scvtf_v4i16_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %res = sitofp <4 x i16> %op1 to <4 x double>
@@ -1248,26 +1971,49 @@ define void @scvtf_v8i16_v8f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    scvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    scvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #144]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #128]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = sitofp <8 x i16> %op1 to <8 x double>
@@ -1320,42 +2066,92 @@ define void @scvtf_v16i16_v16f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #336
+; NONEON-NOSVE-NEXT:    str x29, [sp, #320] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 336
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
-; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #72]
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #40]
-; NONEON-NOSVE-NEXT:    sshll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    sshll v6.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v7.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    scvtf v5.2d, v5.2d
-; NONEON-NOSVE-NEXT:    scvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    scvtf v4.2d, v4.2d
-; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v7.2d
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v6.2d
-; NONEON-NOSVE-NEXT:    stp q2, q0, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #96]
-; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #320] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #176]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #168]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #240]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #224]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #192]
+; NONEON-NOSVE-NEXT:    scvtf d1, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldp q4, q3, [sp, #192]
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #304]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #288]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [sp, #288]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #272]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp q2, q5, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #336
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = sitofp <16 x i16> %op1 to <16 x double>
@@ -1379,9 +2175,18 @@ define <2 x half> @scvtf_v2i32_v2f16(<2 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #12]
+; NONEON-NOSVE-NEXT:    scvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i32> %op1 to <2 x half>
   ret <2 x half> %res
@@ -1399,8 +2204,24 @@ define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    scvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    scvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <4 x i32> %op1 to <4 x half>
   ret <4 x half> %res
@@ -1422,11 +2243,39 @@ define <8 x half> @scvtf_v8i32_v8f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    scvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    scvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    scvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    scvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = sitofp <8 x i32> %op1 to <8 x half>
@@ -1448,7 +2297,14 @@ define <2 x float> @scvtf_v2i32_v2f32(<2 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    scvtf v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i32> %op1 to <2 x float>
   ret <2 x float> %res
@@ -1465,7 +2321,18 @@ define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <4 x i32> %op1 to <4 x float>
   ret <4 x float> %res
@@ -1483,10 +2350,28 @@ define void @scvtf_v8i32_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = sitofp <8 x i32> %op1 to <8 x float>
@@ -1510,8 +2395,15 @@ define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i32> %op1 to <2 x double>
   ret <2 x double> %res
@@ -1533,15 +2425,21 @@ define void @scvtf_v4i32_v4f64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %res = sitofp <4 x i32> %op1 to <4 x double>
@@ -1571,21 +2469,33 @@ define void @scvtf_v8i32_v8f64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    scvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    scvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = sitofp <8 x i32> %op1 to <8 x double>
@@ -1634,36 +2544,68 @@ define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v16i32_v16f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #272
+; NONEON-NOSVE-NEXT:    str x29, [sp, #256] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 272
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #256] // 8-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
-; NONEON-NOSVE-NEXT:    stp q0, q2, [sp, #-64]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
-; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldr d4, [sp, #24]
-; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
-; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d6, [sp, #40]
-; NONEON-NOSVE-NEXT:    sshll v4.2d, v4.2s, #0
-; NONEON-NOSVE-NEXT:    ldr d7, [sp, #8]
-; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v5.2d, v5.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v2.2d, v2.2d
-; NONEON-NOSVE-NEXT:    sshll v6.2d, v6.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v3.2d, v3.2d
-; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
-; NONEON-NOSVE-NEXT:    sshll v7.2d, v7.2s, #0
-; NONEON-NOSVE-NEXT:    scvtf v4.2d, v4.2d
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    scvtf v5.2d, v5.2d
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    stp q2, q4, [x1, #96]
-; NONEON-NOSVE-NEXT:    scvtf v2.2d, v6.2d
-; NONEON-NOSVE-NEXT:    stp q3, q5, [x1, #64]
-; NONEON-NOSVE-NEXT:    scvtf v3.2d, v7.2d
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d2, d0, [sp, #88]
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    scvtf d2, w9
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #120]
+; NONEON-NOSVE-NEXT:    scvtf d0, w9
+; NONEON-NOSVE-NEXT:    str d0, [sp, #152]
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp d2, d0, [sp, #136]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #192]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #224]
+; NONEON-NOSVE-NEXT:    scvtf d1, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldp q4, q6, [sp, #208]
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #240]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr q7, [sp, #240]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #160]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldr q5, [sp, #160]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp q7, q6, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    stp q4, q3, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q2, q5, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #272
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i32>, ptr %a
   %res = sitofp <16 x i32> %op1 to <16 x double>
@@ -1694,14 +2636,17 @@ define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
-; NONEON-NOSVE-NEXT:    fmov x9, d0
-; NONEON-NOSVE-NEXT:    scvtf s1, x9
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    scvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
 ; NONEON-NOSVE-NEXT:    scvtf s0, x8
-; NONEON-NOSVE-NEXT:    fcvt h2, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s1
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i64> %op1 to <2 x half>
   ret <2 x half> %res
@@ -1726,12 +2671,25 @@ define <4 x half> @scvtf_v4i64_v4f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    scvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    scvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    scvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    scvtf s0, x8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = sitofp <4 x i64> %op1 to <4 x half>
@@ -1754,8 +2712,14 @@ define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    scvtf s1, x9
+; NONEON-NOSVE-NEXT:    scvtf s0, x8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i64> %op1 to <2 x float>
   ret <2 x float> %res
@@ -1777,11 +2741,19 @@ define <4 x float> @scvtf_v4i64_v4f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
-; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
-; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    scvtf s1, x9
+; NONEON-NOSVE-NEXT:    scvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    scvtf s1, x9
+; NONEON-NOSVE-NEXT:    scvtf s0, x8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = sitofp <4 x i64> %op1 to <4 x float>
@@ -1803,7 +2775,14 @@ define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    scvtf d1, x9
+; NONEON-NOSVE-NEXT:    scvtf d0, x8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i64> %op1 to <2 x double>
   ret <2 x double> %res
@@ -1821,10 +2800,20 @@ define void @scvtf_v4i64_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    scvtf d1, x9
+; NONEON-NOSVE-NEXT:    scvtf d0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf d1, x9
+; NONEON-NOSVE-NEXT:    scvtf d0, x8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = sitofp <4 x i64> %op1 to <4 x double>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
index 250929df6b3c35..af15d5f67ad15c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
@@ -22,9 +22,40 @@ define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, <4 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.4h, v2.4h, #15
-; NONEON-NOSVE-NEXT:    cmlt v2.4h, v2.4h, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    csel w9, w13, w12, ne
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w11, #0xffff
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2]
+; NONEON-NOSVE-NEXT:    csel w9, w12, w9, ne
+; NONEON-NOSVE-NEXT:    tst w10, #0xffff
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csel w9, w11, w9, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0xffff
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w9, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x i8> %op1, <4 x i8> %op2
   ret <4 x i8> %sel
@@ -47,9 +78,68 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.8b, v2.8b, #7
-; NONEON-NOSVE-NEXT:    cmlt v2.8b, v2.8b, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #23]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #7]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #21]
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w15, w15, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #20]
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    tst w13, #0xff
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #17]
+; NONEON-NOSVE-NEXT:    csel w13, w17, w16, ne
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #6]
+; NONEON-NOSVE-NEXT:    tst w15, #0xff
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #5]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    csel w13, w16, w13, ne
+; NONEON-NOSVE-NEXT:    tst w14, #0xff
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w13, w15, w13, ne
+; NONEON-NOSVE-NEXT:    tst w12, #0xff
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #12]
+; NONEON-NOSVE-NEXT:    csel w12, w14, w13, ne
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #3]
+; NONEON-NOSVE-NEXT:    tst w11, #0xff
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #11]
+; NONEON-NOSVE-NEXT:    csel w11, w13, w12, ne
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #2]
+; NONEON-NOSVE-NEXT:    tst w10, #0xff
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    csel w10, w12, w11, ne
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #1]
+; NONEON-NOSVE-NEXT:    tst w9, #0xff
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #9]
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, ne
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0xff
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w9, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <8 x i1> %mask, <8 x i8> %op1, <8 x i8> %op2
   ret <8 x i8> %sel
@@ -72,9 +162,124 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask)
 ;
 ; NONEON-NOSVE-LABEL: select_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.16b, v2.16b, #7
-; NONEON-NOSVE-NEXT:    cmlt v2.16b, v2.16b, #0
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #47]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #45]
+; NONEON-NOSVE-NEXT:    sbfx w2, w2, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w4, w4, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #44]
+; NONEON-NOSVE-NEXT:    sbfx w3, w3, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #42]
+; NONEON-NOSVE-NEXT:    tst w2, #0xff
+; NONEON-NOSVE-NEXT:    sbfx w1, w1, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #41]
+; NONEON-NOSVE-NEXT:    csel w2, w6, w5, ne
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #14]
+; NONEON-NOSVE-NEXT:    tst w4, #0xff
+; NONEON-NOSVE-NEXT:    strb w2, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #13]
+; NONEON-NOSVE-NEXT:    sbfx w0, w0, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w18, w18, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w17, w17, #0, #1
+; NONEON-NOSVE-NEXT:    csel w2, w5, w2, ne
+; NONEON-NOSVE-NEXT:    tst w3, #0xff
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w2, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #37]
+; NONEON-NOSVE-NEXT:    csel w2, w4, w2, ne
+; NONEON-NOSVE-NEXT:    tst w1, #0xff
+; NONEON-NOSVE-NEXT:    sbfx w16, w16, #0, #1
+; NONEON-NOSVE-NEXT:    strb w2, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #28]
+; NONEON-NOSVE-NEXT:    sbfx w15, w15, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #36]
+; NONEON-NOSVE-NEXT:    csel w1, w3, w2, ne
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #11]
+; NONEON-NOSVE-NEXT:    tst w0, #0xff
+; NONEON-NOSVE-NEXT:    strb w1, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #27]
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    csel w0, w2, w1, ne
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #10]
+; NONEON-NOSVE-NEXT:    tst w18, #0xff
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #26]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    csel w18, w1, w0, ne
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #9]
+; NONEON-NOSVE-NEXT:    tst w17, #0xff
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #25]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    csel w17, w0, w18, ne
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w16, #0xff
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w16, w18, w17, ne
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #7]
+; NONEON-NOSVE-NEXT:    tst w15, #0xff
+; NONEON-NOSVE-NEXT:    strb w16, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #23]
+; NONEON-NOSVE-NEXT:    csel w15, w17, w16, ne
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #6]
+; NONEON-NOSVE-NEXT:    tst w14, #0xff
+; NONEON-NOSVE-NEXT:    strb w15, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #22]
+; NONEON-NOSVE-NEXT:    csel w14, w16, w15, ne
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #5]
+; NONEON-NOSVE-NEXT:    tst w13, #0xff
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #21]
+; NONEON-NOSVE-NEXT:    csel w13, w15, w14, ne
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w12, #0xff
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w12, w14, w13, ne
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #3]
+; NONEON-NOSVE-NEXT:    tst w11, #0xff
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #19]
+; NONEON-NOSVE-NEXT:    csel w11, w13, w12, ne
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #2]
+; NONEON-NOSVE-NEXT:    tst w10, #0xff
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w10, w12, w11, ne
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #1]
+; NONEON-NOSVE-NEXT:    tst w9, #0xff
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #17]
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, ne
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0xff
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w9, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <16 x i1> %mask, <16 x i8> %op1, <16 x i8> %op2
   ret <16 x i8> %sel
@@ -95,14 +300,204 @@ define void @select_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: select_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    cmeq v4.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    cmeq v5.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #208
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #112] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #128] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #144] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #160] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #192] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 208
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, eq
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #37]
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, eq
+; NONEON-NOSVE-NEXT:    cmp w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #21]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #12] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    csel w8, w13, w12, eq
+; NONEON-NOSVE-NEXT:    cmp w16, w14
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #22]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #4] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    csel w8, w16, w14, eq
+; NONEON-NOSVE-NEXT:    cmp w1, w18
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #23]
+; NONEON-NOSVE-NEXT:    csel w12, w1, w18, eq
+; NONEON-NOSVE-NEXT:    cmp w2, w13
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w13, w2, w13, eq
+; NONEON-NOSVE-NEXT:    cmp w16, w14
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #25]
+; NONEON-NOSVE-NEXT:    csel w14, w16, w14, eq
+; NONEON-NOSVE-NEXT:    cmp w1, w18
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #26]
+; NONEON-NOSVE-NEXT:    csel w16, w1, w18, eq
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #42]
+; NONEON-NOSVE-NEXT:    cmp w5, w2
+; NONEON-NOSVE-NEXT:    csel w18, w5, w2, eq
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w6, w1
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #45]
+; NONEON-NOSVE-NEXT:    csel w1, w6, w1, eq
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #44]
+; NONEON-NOSVE-NEXT:    cmp w5, w2
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #29]
+; NONEON-NOSVE-NEXT:    str w8, [sp] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    csel w2, w5, w2, eq
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w19, w6
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #31]
+; NONEON-NOSVE-NEXT:    csel w5, w19, w6, eq
+; NONEON-NOSVE-NEXT:    cmp w30, w29
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #48]
+; NONEON-NOSVE-NEXT:    csel w6, w30, w29, eq
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #49]
+; NONEON-NOSVE-NEXT:    csel w19, w8, w9, eq
+; NONEON-NOSVE-NEXT:    cmp w10, w21
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    csel w21, w10, w21, eq
+; NONEON-NOSVE-NEXT:    cmp w11, w22
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    csel w22, w11, w22, eq
+; NONEON-NOSVE-NEXT:    cmp w29, w28
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #52]
+; NONEON-NOSVE-NEXT:    csel w11, w29, w28, eq
+; NONEON-NOSVE-NEXT:    cmp w8, w27
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #53]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w27, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w26
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #54]
+; NONEON-NOSVE-NEXT:    csel w9, w9, w26, eq
+; NONEON-NOSVE-NEXT:    cmp w10, w25
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #55]
+; NONEON-NOSVE-NEXT:    csel w10, w10, w25, eq
+; NONEON-NOSVE-NEXT:    cmp w28, w24
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #56]
+; NONEON-NOSVE-NEXT:    csel w24, w28, w24, eq
+; NONEON-NOSVE-NEXT:    cmp w27, w23
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #57]
+; NONEON-NOSVE-NEXT:    csel w23, w27, w23, eq
+; NONEON-NOSVE-NEXT:    cmp w26, w20
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #58]
+; NONEON-NOSVE-NEXT:    csel w20, w26, w20, eq
+; NONEON-NOSVE-NEXT:    cmp w25, w7
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #59]
+; NONEON-NOSVE-NEXT:    csel w7, w25, w7, eq
+; NONEON-NOSVE-NEXT:    cmp w28, w4
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #60]
+; NONEON-NOSVE-NEXT:    csel w4, w28, w4, eq
+; NONEON-NOSVE-NEXT:    cmp w27, w3
+; NONEON-NOSVE-NEXT:    csel w3, w27, w3, eq
+; NONEON-NOSVE-NEXT:    cmp w26, w17
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #61]
+; NONEON-NOSVE-NEXT:    csel w17, w26, w17, eq
+; NONEON-NOSVE-NEXT:    cmp w25, w15
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #78]
+; NONEON-NOSVE-NEXT:    csel w15, w25, w15, eq
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #62]
+; NONEON-NOSVE-NEXT:    cmp w27, w28
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #63]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #99]
+; NONEON-NOSVE-NEXT:    csel w27, w27, w28, eq
+; NONEON-NOSVE-NEXT:    cmp w25, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    csel w25, w25, w26, eq
+; NONEON-NOSVE-NEXT:    cmp w30, w29
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    csel w26, w30, w29, eq
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w26, [sp, #111]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #84]
+; NONEON-NOSVE-NEXT:    cmp w29, w28
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w25, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #144] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w27, [sp, #109]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    csel w8, w29, w28, eq
+; NONEON-NOSVE-NEXT:    strb w15, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #128] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #107]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #112] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w3, [sp, #106]
+; NONEON-NOSVE-NEXT:    strb w4, [sp, #105]
+; NONEON-NOSVE-NEXT:    strb w7, [sp, #104]
+; NONEON-NOSVE-NEXT:    strb w20, [sp, #103]
+; NONEON-NOSVE-NEXT:    strb w23, [sp, #102]
+; NONEON-NOSVE-NEXT:    strb w24, [sp, #101]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #160] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #100]
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #97]
+; NONEON-NOSVE-NEXT:    strb w22, [sp, #96]
+; NONEON-NOSVE-NEXT:    strb w21, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #176] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w19, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #93]
+; NONEON-NOSVE-NEXT:    strb w5, [sp, #92]
+; NONEON-NOSVE-NEXT:    strb w2, [sp, #91]
+; NONEON-NOSVE-NEXT:    strb w1, [sp, #90]
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #89]
+; NONEON-NOSVE-NEXT:    strb w16, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #87]
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #86]
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #85]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #80]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #208
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -129,9 +524,25 @@ define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, <2 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.2s, v2.2s, #31
-; NONEON-NOSVE-NEXT:    cmlt v2.2s, v2.2s, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    csel w8, w11, w10, ne
+; NONEON-NOSVE-NEXT:    ldr w10, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w8, ne
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x i16> %op1, <2 x i16> %op2
   ret <2 x i16> %sel
@@ -154,9 +565,40 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, <4 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.4h, v2.4h, #15
-; NONEON-NOSVE-NEXT:    cmlt v2.4h, v2.4h, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    csel w9, w13, w12, ne
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w11, #0xffff
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2]
+; NONEON-NOSVE-NEXT:    csel w9, w12, w9, ne
+; NONEON-NOSVE-NEXT:    tst w10, #0xffff
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csel w9, w11, w9, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0xffff
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w9, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x i16> %op1, <4 x i16> %op2
   ret <4 x i16> %sel
@@ -180,10 +622,68 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v2.8h, v2.8b, #0
-; NONEON-NOSVE-NEXT:    shl v2.8h, v2.8h, #15
-; NONEON-NOSVE-NEXT:    cmlt v2.8h, v2.8h, #0
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #47]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #45]
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w15, w15, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    tst w13, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    csel w13, w17, w16, ne
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #12]
+; NONEON-NOSVE-NEXT:    tst w15, #0xffff
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #10]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    csel w13, w16, w13, ne
+; NONEON-NOSVE-NEXT:    tst w14, #0xffff
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    csel w13, w15, w13, ne
+; NONEON-NOSVE-NEXT:    tst w12, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w12, w14, w13, ne
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #6]
+; NONEON-NOSVE-NEXT:    tst w11, #0xffff
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #22]
+; NONEON-NOSVE-NEXT:    csel w11, w13, w12, ne
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w10, #0xffff
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w10, w12, w11, ne
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2]
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, ne
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0xffff
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w9, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <8 x i1> %mask, <8 x i16> %op1, <8 x i16> %op2
   ret <8 x i16> %sel
@@ -204,14 +704,98 @@ define void @select_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: select_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    cmeq v4.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    cmeq v5.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #112
+; NONEON-NOSVE-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 112
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -16
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, eq
+; NONEON-NOSVE-NEXT:    cmp w13, w12
+; NONEON-NOSVE-NEXT:    ldrh w1, [sp, #12]
+; NONEON-NOSVE-NEXT:    csel w9, w13, w12, eq
+; NONEON-NOSVE-NEXT:    cmp w15, w14
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #10]
+; NONEON-NOSVE-NEXT:    csel w14, w15, w14, eq
+; NONEON-NOSVE-NEXT:    cmp w17, w16
+; NONEON-NOSVE-NEXT:    csel w16, w17, w16, eq
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    cmp w13, w12
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #14]
+; NONEON-NOSVE-NEXT:    csel w12, w13, w12, eq
+; NONEON-NOSVE-NEXT:    cmp w1, w17
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp, #32]
+; NONEON-NOSVE-NEXT:    csel w17, w1, w17, eq
+; NONEON-NOSVE-NEXT:    ldrh w1, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w4, w3
+; NONEON-NOSVE-NEXT:    ldrh w6, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w7, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #52]
+; NONEON-NOSVE-NEXT:    csel w3, w4, w3, eq
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #36]
+; NONEON-NOSVE-NEXT:    cmp w5, w1
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w19, [sp, #38]
+; NONEON-NOSVE-NEXT:    csel w1, w5, w1, eq
+; NONEON-NOSVE-NEXT:    cmp w7, w6
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp, #40]
+; NONEON-NOSVE-NEXT:    csel w6, w7, w6, eq
+; NONEON-NOSVE-NEXT:    cmp w4, w2
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w7, [sp, #42]
+; NONEON-NOSVE-NEXT:    csel w2, w4, w2, eq
+; NONEON-NOSVE-NEXT:    cmp w19, w13
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #44]
+; NONEON-NOSVE-NEXT:    csel w13, w19, w13, eq
+; NONEON-NOSVE-NEXT:    cmp w5, w18
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w19, [sp, #46]
+; NONEON-NOSVE-NEXT:    csel w18, w5, w18, eq
+; NONEON-NOSVE-NEXT:    cmp w7, w15
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp]
+; NONEON-NOSVE-NEXT:    csel w15, w7, w15, eq
+; NONEON-NOSVE-NEXT:    cmp w4, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    csel w11, w4, w11, eq
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w19, w10
+; NONEON-NOSVE-NEXT:    csel w10, w19, w10, eq
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    cmp w5, w4
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #94]
+; NONEON-NOSVE-NEXT:    csel w8, w5, w4, eq
+; NONEON-NOSVE-NEXT:    strh w15, [sp, #90]
+; NONEON-NOSVE-NEXT:    strh w18, [sp, #88]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #86]
+; NONEON-NOSVE-NEXT:    strh w2, [sp, #84]
+; NONEON-NOSVE-NEXT:    strh w6, [sp, #82]
+; NONEON-NOSVE-NEXT:    strh w1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w3, [sp, #78]
+; NONEON-NOSVE-NEXT:    strh w17, [sp, #76]
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #74]
+; NONEON-NOSVE-NEXT:    strh w16, [sp, #72]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #70]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #68]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #112
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -238,9 +822,25 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, <2 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v2.2s, v2.2s, #31
-; NONEON-NOSVE-NEXT:    cmlt v2.2s, v2.2s, #0
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    csel w8, w11, w10, ne
+; NONEON-NOSVE-NEXT:    ldr w10, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w8, ne
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x i32> %op1, <2 x i32> %op2
   ret <2 x i32> %sel
@@ -264,10 +864,40 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
-; NONEON-NOSVE-NEXT:    shl v2.4s, v2.4s, #31
-; NONEON-NOSVE-NEXT:    cmlt v2.4s, v2.4s, #0
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    csel w9, w13, w12, ne
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w11, #0
+; NONEON-NOSVE-NEXT:    str w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    csel w9, w12, w9, ne
+; NONEON-NOSVE-NEXT:    cmp w10, #0
+; NONEON-NOSVE-NEXT:    ldr w10, [sp]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w9, w11, w9, ne
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    str w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w9, ne
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x i32> %op1, <4 x i32> %op2
   ret <4 x i32> %sel
@@ -288,14 +918,43 @@ define void @select_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: select_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    cmeq v4.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    cmeq v5.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w10, w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldp w12, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w10, w9
+; NONEON-NOSVE-NEXT:    csel w9, w10, w9, eq
+; NONEON-NOSVE-NEXT:    cmp w13, w12
+; NONEON-NOSVE-NEXT:    ldp w15, w16, [sp, #48]
+; NONEON-NOSVE-NEXT:    csel w12, w13, w12, eq
+; NONEON-NOSVE-NEXT:    cmp w14, w11
+; NONEON-NOSVE-NEXT:    ldp w10, w13, [sp, #32]
+; NONEON-NOSVE-NEXT:    csel w11, w14, w11, eq
+; NONEON-NOSVE-NEXT:    ldp w17, w14, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w18, w1, [sp, #40]
+; NONEON-NOSVE-NEXT:    cmp w10, w15
+; NONEON-NOSVE-NEXT:    stp w12, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    csel w10, w10, w15, eq
+; NONEON-NOSVE-NEXT:    cmp w13, w16
+; NONEON-NOSVE-NEXT:    ldr w15, [sp]
+; NONEON-NOSVE-NEXT:    csel w13, w13, w16, eq
+; NONEON-NOSVE-NEXT:    cmp w18, w17
+; NONEON-NOSVE-NEXT:    csel w16, w18, w17, eq
+; NONEON-NOSVE-NEXT:    cmp w1, w14
+; NONEON-NOSVE-NEXT:    stp w10, w13, [sp, #80]
+; NONEON-NOSVE-NEXT:    csel w10, w1, w14, eq
+; NONEON-NOSVE-NEXT:    cmp w15, w8
+; NONEON-NOSVE-NEXT:    csel w8, w15, w8, eq
+; NONEON-NOSVE-NEXT:    stp w16, w10, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -321,10 +980,15 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
 ; NONEON-NOSVE-NEXT:    tst w0, #0x1
-; NONEON-NOSVE-NEXT:    csetm x8, ne
-; NONEON-NOSVE-NEXT:    fmov d2, x8
-; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, ne
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <1 x i1> %mask, <1 x i64> %op1, <1 x i64> %op2
   ret <1 x i64> %sel
@@ -348,10 +1012,25 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: select_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
-; NONEON-NOSVE-NEXT:    shl v2.2d, v2.2d, #63
-; NONEON-NOSVE-NEXT:    cmlt v2.2d, v2.2d, #0
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x11, [sp, #8]
+; NONEON-NOSVE-NEXT:    sbfx x8, x8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx x9, x9, #0, #1
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    csel x8, x11, x10, ne
+; NONEON-NOSVE-NEXT:    ldr x10, [sp]
+; NONEON-NOSVE-NEXT:    cmp x9, #0
+; NONEON-NOSVE-NEXT:    str x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel x8, x10, x8, ne
+; NONEON-NOSVE-NEXT:    str x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x i64> %op1, <2 x i64> %op2
   ret <2 x i64> %sel
@@ -372,14 +1051,30 @@ define void @select_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: select_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    cmeq v4.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    cmeq v5.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
-; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp x8, x11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x13, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp x10, x12, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, eq
+; NONEON-NOSVE-NEXT:    cmp x11, x10
+; NONEON-NOSVE-NEXT:    csel x9, x11, x10, eq
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr x11, [sp]
+; NONEON-NOSVE-NEXT:    cmp x13, x12
+; NONEON-NOSVE-NEXT:    csel x12, x13, x12, eq
+; NONEON-NOSVE-NEXT:    cmp x11, x10
+; NONEON-NOSVE-NEXT:    stp x9, x12, [sp, #80]
+; NONEON-NOSVE-NEXT:    csel x9, x11, x10, eq
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index 0b6152340f65ab..66d544d0acbf56 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -33,19 +33,23 @@ define void @alloc_v4i8(ptr %st_ptr) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: alloc_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #32
-; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #32] // 16-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    mov x19, x0
-; NONEON-NOSVE-NEXT:    add x0, sp, #12
+; NONEON-NOSVE-NEXT:    add x0, sp, #28
 ; NONEON-NOSVE-NEXT:    bl def
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    umov w8, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[0]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
 ; NONEON-NOSVE-NEXT:    strb w8, [x19, #1]
 ; NONEON-NOSVE-NEXT:    strb w9, [x19]
-; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %alloc = alloca [4 x i8]
   call void @def(ptr %alloc)
@@ -88,21 +92,25 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: alloc_v6i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #32
-; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #32] // 16-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    mov x19, x0
-; NONEON-NOSVE-NEXT:    add x0, sp, #8
+; NONEON-NOSVE-NEXT:    add x0, sp, #24
 ; NONEON-NOSVE-NEXT:    bl def
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    add x9, x19, #2
-; NONEON-NOSVE-NEXT:    rev16 v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    xtn v1.8b, v1.8h
-; NONEON-NOSVE-NEXT:    str s1, [sp, #4]
-; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
-; NONEON-NOSVE-NEXT:    st1 { v0.b }[5], [x9]
-; NONEON-NOSVE-NEXT:    strh w8, [x19]
-; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str x8, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [x19, #2]
+; NONEON-NOSVE-NEXT:    strh w9, [x19]
+; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %alloc = alloca [6 x i8]
   call void @def(ptr %alloc)
@@ -135,18 +143,38 @@ define void @alloc_v32i8(ptr %st_ptr) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: alloc_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #48
-; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #112
+; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #96] // 16-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    mov x19, x0
-; NONEON-NOSVE-NEXT:    mov x0, sp
+; NONEON-NOSVE-NEXT:    add x0, sp, #64
 ; NONEON-NOSVE-NEXT:    bl def
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp]
-; NONEON-NOSVE-NEXT:    add x8, x19, #8
-; NONEON-NOSVE-NEXT:    xtn v0.8b, v0.8h
-; NONEON-NOSVE-NEXT:    st1 { v1.b }[0], [x8]
-; NONEON-NOSVE-NEXT:    str d0, [x19]
-; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #32] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [x19, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    str x8, [x19]
+; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #96] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #112
 ; NONEON-NOSVE-NEXT:    ret
   %alloc = alloca [32 x i8]
   call void @def(ptr %alloc)
@@ -179,18 +207,26 @@ define void @alloc_v8f64(ptr %st_ptr) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: alloc_v8f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #80
-; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #176
+; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #160] // 16-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    mov x19, x0
-; NONEON-NOSVE-NEXT:    mov x0, sp
+; NONEON-NOSVE-NEXT:    add x0, sp, #96
 ; NONEON-NOSVE-NEXT:    bl def
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp]
-; NONEON-NOSVE-NEXT:    zip1 v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    zip1 v1.2d, v3.2d, v2.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x19]
-; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #160] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #176
 ; NONEON-NOSVE-NEXT:    ret
   %alloc = alloca [8 x double]
   call void @def(ptr %alloc)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
index 42c439ca4b38d4..3b83f982b6bfc5 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
@@ -22,15 +22,68 @@ define <4 x i32> @test(ptr %arg1, ptr %arg2) {
 ;
 ; NONEON-NOSVE-LABEL: test:
 ; NONEON-NOSVE:       // %bb.0: // %entry
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q3, q4, [x0]
-; NONEON-NOSVE-NEXT:    add v2.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v5.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    dup v0.4s, v1.s[2]
-; NONEON-NOSVE-NEXT:    add v1.4s, v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    add v3.4s, v4.4s, v4.4s
-; NONEON-NOSVE-NEXT:    stp q2, q5, [x0, #32]
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #144
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w9, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w9, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w9, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp q4, q2, [sp, #80]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp q4, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #144
 ; NONEON-NOSVE-NEXT:    ret
 entry:
   %0 = load <16 x i32>, ptr %arg1, align 256
@@ -59,15 +112,71 @@ define <2 x i32> @test2(ptr %arg1, ptr %arg2) {
 ;
 ; NONEON-NOSVE-LABEL: test2:
 ; NONEON-NOSVE:       // %bb.0: // %entry
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q3, q4, [x0]
-; NONEON-NOSVE-NEXT:    add v2.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    dup v0.2s, v1.s[2]
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    add v3.4s, v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    add v4.4s, v4.4s, v4.4s
-; NONEON-NOSVE-NEXT:    stp q2, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q4, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #144
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp q4, q2, [sp, #80]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp q4, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #144
 ; NONEON-NOSVE-NEXT:    ret
 entry:
   %0 = load <16 x i32>, ptr %arg1, align 256
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
index 992b667a2eafe1..c97a3c2e721a3d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
@@ -15,9 +15,18 @@ define <4 x i8> @load_v4i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: load_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #3]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #1]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %load = load <4 x i8>, ptr %a
   ret <4 x i8> %load
@@ -75,11 +84,14 @@ define <2 x i16> @load_v2i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: load_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0, #2]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #12]
 ; NONEON-NOSVE-NEXT:    ldrh w8, [x0]
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    add x8, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x8]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %load = load <2 x i16>, ptr %a
   ret <2 x i16> %load
@@ -93,7 +105,12 @@ define <2 x half> @load_v2f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: load_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %load = load <2 x half>, ptr %a
   ret <2 x half> %load
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
index 7abe73f08dfd65..9e1edb817c459a 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
@@ -21,10 +21,17 @@ define i8 @andv_v4i8(<4 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: andv_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    and w0, w10, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %a)
   ret i8 %res
@@ -41,11 +48,25 @@ define i8 @andv_v8i8(<8 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: andv_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    and w12, w13, w12
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    and w10, w12, w10
+; NONEON-NOSVE-NEXT:    and w8, w8, w14
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
 ; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a)
   ret i8 %res
@@ -64,13 +85,37 @@ define i8 @andv_v16i8(<16 x i8> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
-; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #2]
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w11, w14, w13
+; NONEON-NOSVE-NEXT:    and w9, w12, w9
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w10, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    and w9, w9, w16
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #9]
+; NONEON-NOSVE-NEXT:    and w12, w12, w15
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w8, w13, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #15]
+; NONEON-NOSVE-NEXT:    and w12, w12, w14
+; NONEON-NOSVE-NEXT:    and w8, w8, w11
+; NONEON-NOSVE-NEXT:    and w9, w10, w9
+; NONEON-NOSVE-NEXT:    and w10, w12, w16
+; NONEON-NOSVE-NEXT:    and w8, w8, w15
+; NONEON-NOSVE-NEXT:    and w9, w9, w10
+; NONEON-NOSVE-NEXT:    and w8, w8, w13
+; NONEON-NOSVE-NEXT:    and w0, w9, w8
 ; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a)
@@ -90,17 +135,72 @@ define i8 @andv_v32i8(ptr %a) {
 ; NONEON-NOSVE-LABEL: andv_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #2]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #3]
+; NONEON-NOSVE-NEXT:    and w9, w11, w10
+; NONEON-NOSVE-NEXT:    and w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    and w11, w15, w14
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #4]
+; NONEON-NOSVE-NEXT:    and w9, w10, w11
+; NONEON-NOSVE-NEXT:    and w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #6]
+; NONEON-NOSVE-NEXT:    and w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w10, w14, w10
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w8, w9
+; NONEON-NOSVE-NEXT:    and w9, w10, w11
+; NONEON-NOSVE-NEXT:    and w10, w14, w13
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #9]
+; NONEON-NOSVE-NEXT:    and w8, w8, w9
+; NONEON-NOSVE-NEXT:    and w11, w15, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w9, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #10]
+; NONEON-NOSVE-NEXT:    and w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #12]
+; NONEON-NOSVE-NEXT:    and w9, w9, w10
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #15]
+; NONEON-NOSVE-NEXT:    and w10, w13, w10
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #13]
+; NONEON-NOSVE-NEXT:    and w14, w15, w14
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    and w9, w9, w14
+; NONEON-NOSVE-NEXT:    and w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #31]
+; NONEON-NOSVE-NEXT:    and w8, w8, w9
+; NONEON-NOSVE-NEXT:    and w10, w10, w12
+; NONEON-NOSVE-NEXT:    and w11, w16, w11
+; NONEON-NOSVE-NEXT:    and w10, w10, w11
+; NONEON-NOSVE-NEXT:    and w11, w17, w13
+; NONEON-NOSVE-NEXT:    and w9, w10, w11
 ; NONEON-NOSVE-NEXT:    and w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %op)
@@ -118,9 +218,12 @@ define i16 @andv_v2i16(<2 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: andv_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> %a)
   ret i16 %res
@@ -137,10 +240,17 @@ define i16 @andv_v4i16(<4 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: andv_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    and w0, w10, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a)
   ret i16 %res
@@ -159,11 +269,20 @@ define i16 @andv_v8i16(<8 x i16> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w12, w13, w12
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    and w10, w12, w10
+; NONEON-NOSVE-NEXT:    and w8, w8, w14
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
 ; NONEON-NOSVE-NEXT:    and w0, w8, w9
 ; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
@@ -184,16 +303,40 @@ define i16 @andv_v16i16(ptr %a) {
 ; NONEON-NOSVE-LABEL: andv_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #6]
+; NONEON-NOSVE-NEXT:    and w9, w11, w10
+; NONEON-NOSVE-NEXT:    and w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    and w13, w15, w14
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w9, w12, w13
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #12]
+; NONEON-NOSVE-NEXT:    and w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w10, w14, w10
+; NONEON-NOSVE-NEXT:    and w11, w15, w11
+; NONEON-NOSVE-NEXT:    and w8, w8, w9
+; NONEON-NOSVE-NEXT:    and w9, w10, w11
+; NONEON-NOSVE-NEXT:    and w8, w8, w9
+; NONEON-NOSVE-NEXT:    and w9, w13, w12
 ; NONEON-NOSVE-NEXT:    and w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %op)
@@ -211,9 +354,12 @@ define i32 @andv_v2i32(<2 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: andv_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a)
   ret i32 %res
@@ -232,12 +378,11 @@ define i32 @andv_v4i32(<4 x i32> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    and w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp], #16
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    and w0, w10, w8
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
   ret i32 %res
@@ -256,15 +401,20 @@ define i32 @andv_v8i32(ptr %a) {
 ; NONEON-NOSVE-LABEL: andv_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp]
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    and w9, w11, w9
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    and w10, w14, w12
+; NONEON-NOSVE-NEXT:    and w11, w15, w13
+; NONEON-NOSVE-NEXT:    and w9, w10, w11
 ; NONEON-NOSVE-NEXT:    and w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %op)
@@ -284,10 +434,8 @@ define i64 @andv_v2i64(<2 x i64> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    and x0, x9, x8
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a)
   ret i64 %res
@@ -306,13 +454,13 @@ define i64 @andv_v4i64(ptr %a) {
 ; NONEON-NOSVE-LABEL: andv_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp], #32
+; NONEON-NOSVE-NEXT:    and x8, x10, x8
+; NONEON-NOSVE-NEXT:    and x9, x11, x9
+; NONEON-NOSVE-NEXT:    and x0, x9, x8
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %op)
@@ -334,10 +482,17 @@ define i8 @eorv_v4i8(<4 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: eorv_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    eor w0, w10, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> %a)
   ret i8 %res
@@ -354,11 +509,25 @@ define i8 @eorv_v8i8(<8 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: eorv_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    eor w12, w13, w12
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    eor w10, w12, w10
+; NONEON-NOSVE-NEXT:    eor w8, w8, w14
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
 ; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a)
   ret i8 %res
@@ -377,13 +546,37 @@ define i8 @eorv_v16i8(<16 x i8> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
-; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #2]
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w11, w14, w13
+; NONEON-NOSVE-NEXT:    eor w9, w12, w9
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #7]
+; NONEON-NOSVE-NEXT:    eor w10, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    eor w9, w9, w16
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #9]
+; NONEON-NOSVE-NEXT:    eor w12, w12, w15
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w8, w13, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #15]
+; NONEON-NOSVE-NEXT:    eor w12, w12, w14
+; NONEON-NOSVE-NEXT:    eor w8, w8, w11
+; NONEON-NOSVE-NEXT:    eor w9, w10, w9
+; NONEON-NOSVE-NEXT:    eor w10, w12, w16
+; NONEON-NOSVE-NEXT:    eor w8, w8, w15
+; NONEON-NOSVE-NEXT:    eor w9, w9, w10
+; NONEON-NOSVE-NEXT:    eor w8, w8, w13
+; NONEON-NOSVE-NEXT:    eor w0, w9, w8
 ; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a)
@@ -403,17 +596,72 @@ define i8 @eorv_v32i8(ptr %a) {
 ; NONEON-NOSVE-LABEL: eorv_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #2]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #3]
+; NONEON-NOSVE-NEXT:    eor w9, w11, w10
+; NONEON-NOSVE-NEXT:    eor w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    eor w11, w15, w14
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #4]
+; NONEON-NOSVE-NEXT:    eor w9, w10, w11
+; NONEON-NOSVE-NEXT:    eor w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #6]
+; NONEON-NOSVE-NEXT:    eor w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w10, w14, w10
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #7]
+; NONEON-NOSVE-NEXT:    eor w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w8, w9
+; NONEON-NOSVE-NEXT:    eor w9, w10, w11
+; NONEON-NOSVE-NEXT:    eor w10, w14, w13
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #9]
+; NONEON-NOSVE-NEXT:    eor w8, w8, w9
+; NONEON-NOSVE-NEXT:    eor w11, w15, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #26]
+; NONEON-NOSVE-NEXT:    eor w9, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #10]
+; NONEON-NOSVE-NEXT:    eor w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #12]
+; NONEON-NOSVE-NEXT:    eor w9, w9, w10
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #15]
+; NONEON-NOSVE-NEXT:    eor w10, w13, w10
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #13]
+; NONEON-NOSVE-NEXT:    eor w14, w15, w14
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    eor w9, w9, w14
+; NONEON-NOSVE-NEXT:    eor w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #31]
+; NONEON-NOSVE-NEXT:    eor w8, w8, w9
+; NONEON-NOSVE-NEXT:    eor w10, w10, w12
+; NONEON-NOSVE-NEXT:    eor w11, w16, w11
+; NONEON-NOSVE-NEXT:    eor w10, w10, w11
+; NONEON-NOSVE-NEXT:    eor w11, w17, w13
+; NONEON-NOSVE-NEXT:    eor w9, w10, w11
 ; NONEON-NOSVE-NEXT:    eor w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %op)
@@ -431,9 +679,12 @@ define i16 @eorv_v2i16(<2 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: eorv_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> %a)
   ret i16 %res
@@ -450,10 +701,17 @@ define i16 @eorv_v4i16(<4 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: eorv_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    eor w0, w10, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %a)
   ret i16 %res
@@ -472,11 +730,20 @@ define i16 @eorv_v8i16(<8 x i16> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w12, w13, w12
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    eor w10, w12, w10
+; NONEON-NOSVE-NEXT:    eor w8, w8, w14
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
 ; NONEON-NOSVE-NEXT:    eor w0, w8, w9
 ; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
@@ -497,16 +764,40 @@ define i16 @eorv_v16i16(ptr %a) {
 ; NONEON-NOSVE-LABEL: eorv_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #6]
+; NONEON-NOSVE-NEXT:    eor w9, w11, w10
+; NONEON-NOSVE-NEXT:    eor w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    eor w13, w15, w14
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w9, w12, w13
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #12]
+; NONEON-NOSVE-NEXT:    eor w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w10, w14, w10
+; NONEON-NOSVE-NEXT:    eor w11, w15, w11
+; NONEON-NOSVE-NEXT:    eor w8, w8, w9
+; NONEON-NOSVE-NEXT:    eor w9, w10, w11
+; NONEON-NOSVE-NEXT:    eor w8, w8, w9
+; NONEON-NOSVE-NEXT:    eor w9, w13, w12
 ; NONEON-NOSVE-NEXT:    eor w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %op)
@@ -524,9 +815,12 @@ define i32 @eorv_v2i32(<2 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: eorv_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a)
   ret i32 %res
@@ -545,12 +839,11 @@ define i32 @eorv_v4i32(<4 x i32> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    eor w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp], #16
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    eor w0, w10, w8
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
   ret i32 %res
@@ -569,15 +862,20 @@ define i32 @eorv_v8i32(ptr %a) {
 ; NONEON-NOSVE-LABEL: eorv_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp]
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    eor w9, w11, w9
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    eor w10, w14, w12
+; NONEON-NOSVE-NEXT:    eor w11, w15, w13
+; NONEON-NOSVE-NEXT:    eor w9, w10, w11
 ; NONEON-NOSVE-NEXT:    eor w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %op)
@@ -597,10 +895,8 @@ define i64 @eorv_v2i64(<2 x i64> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    eor x0, x9, x8
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %a)
   ret i64 %res
@@ -619,13 +915,13 @@ define i64 @eorv_v4i64(ptr %a) {
 ; NONEON-NOSVE-LABEL: eorv_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp], #32
+; NONEON-NOSVE-NEXT:    eor x8, x10, x8
+; NONEON-NOSVE-NEXT:    eor x9, x11, x9
+; NONEON-NOSVE-NEXT:    eor x0, x9, x8
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %op)
@@ -647,10 +943,17 @@ define i8 @orv_v4i8(<4 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: orv_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    orr w0, w10, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %a)
   ret i8 %res
@@ -667,11 +970,25 @@ define i8 @orv_v8i8(<8 x i8> %a) {
 ;
 ; NONEON-NOSVE-LABEL: orv_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w12, w13, w12
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    orr w10, w12, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w14
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
 ; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a)
   ret i8 %res
@@ -690,13 +1007,37 @@ define i8 @orv_v16i8(<16 x i8> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
-; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w11, w14, w13
+; NONEON-NOSVE-NEXT:    orr w9, w12, w9
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #7]
+; NONEON-NOSVE-NEXT:    orr w10, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    orr w9, w9, w16
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #9]
+; NONEON-NOSVE-NEXT:    orr w12, w12, w15
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w13, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w12, w12, w14
+; NONEON-NOSVE-NEXT:    orr w8, w8, w11
+; NONEON-NOSVE-NEXT:    orr w9, w10, w9
+; NONEON-NOSVE-NEXT:    orr w10, w12, w16
+; NONEON-NOSVE-NEXT:    orr w8, w8, w15
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w13
+; NONEON-NOSVE-NEXT:    orr w0, w9, w8
 ; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a)
@@ -716,17 +1057,72 @@ define i8 @orv_v32i8(ptr %a) {
 ; NONEON-NOSVE-LABEL: orv_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #16
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #3]
+; NONEON-NOSVE-NEXT:    orr w9, w11, w10
+; NONEON-NOSVE-NEXT:    orr w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    orr w11, w15, w14
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w9, w10, w11
+; NONEON-NOSVE-NEXT:    orr w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w10, w14, w10
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #7]
+; NONEON-NOSVE-NEXT:    orr w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    orr w9, w10, w11
+; NONEON-NOSVE-NEXT:    orr w10, w14, w13
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #9]
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    orr w11, w15, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w9, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w10, w13, w10
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #13]
+; NONEON-NOSVE-NEXT:    orr w14, w15, w14
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w9, w9, w14
+; NONEON-NOSVE-NEXT:    orr w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #31]
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    orr w10, w10, w12
+; NONEON-NOSVE-NEXT:    orr w11, w16, w11
+; NONEON-NOSVE-NEXT:    orr w10, w10, w11
+; NONEON-NOSVE-NEXT:    orr w11, w17, w13
+; NONEON-NOSVE-NEXT:    orr w9, w10, w11
 ; NONEON-NOSVE-NEXT:    orr w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %op)
@@ -744,9 +1140,12 @@ define i16 @orv_v2i16(<2 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: orv_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> %a)
   ret i16 %res
@@ -763,10 +1162,17 @@ define i16 @orv_v4i16(<4 x i16> %a) {
 ;
 ; NONEON-NOSVE-LABEL: orv_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
-; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    orr w0, w10, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a)
   ret i16 %res
@@ -785,11 +1191,20 @@ define i16 @orv_v8i16(<8 x i16> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w12, w13, w12
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    orr w10, w12, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w14
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
 ; NONEON-NOSVE-NEXT:    orr w0, w8, w9
 ; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
@@ -810,16 +1225,40 @@ define i16 @orv_v16i16(ptr %a) {
 ; NONEON-NOSVE-LABEL: orv_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w9, w11, w10
+; NONEON-NOSVE-NEXT:    orr w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w13, w15, w14
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w9, w12, w13
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w10, w14, w10
+; NONEON-NOSVE-NEXT:    orr w11, w15, w11
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    orr w9, w10, w11
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    orr w9, w13, w12
 ; NONEON-NOSVE-NEXT:    orr w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %op)
@@ -837,9 +1276,12 @@ define i32 @orv_v2i32(<2 x i32> %a) {
 ;
 ; NONEON-NOSVE-LABEL: orv_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a)
   ret i32 %res
@@ -858,12 +1300,11 @@ define i32 @orv_v4i32(<4 x i32> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
-; NONEON-NOSVE-NEXT:    orr w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp], #16
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    orr w0, w10, w8
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
   ret i32 %res
@@ -882,15 +1323,20 @@ define i32 @orv_v8i32(ptr %a) {
 ; NONEON-NOSVE-LABEL: orv_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x8, d0
-; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp]
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    orr w9, w11, w9
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    orr w10, w14, w12
+; NONEON-NOSVE-NEXT:    orr w11, w15, w13
+; NONEON-NOSVE-NEXT:    orr w9, w10, w11
 ; NONEON-NOSVE-NEXT:    orr w0, w8, w9
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %op)
@@ -910,10 +1356,8 @@ define i64 @orv_v2i64(<2 x i64> %a) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    orr x0, x9, x8
 ; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a)
   ret i64 %res
@@ -932,13 +1376,13 @@ define i64 @orv_v4i64(ptr %a) {
 ; NONEON-NOSVE-LABEL: orv_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
-; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    fmov x0, d0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp], #32
+; NONEON-NOSVE-NEXT:    orr x8, x10, x8
+; NONEON-NOSVE-NEXT:    orr x9, x11, x9
+; NONEON-NOSVE-NEXT:    orr x0, x9, x8
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %op)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
index 6c33613f8e757d..be335c697707de 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
@@ -23,40 +23,83 @@ define <4 x i8> @masked_load_v4i8(ptr %src, <4 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI0_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI0_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    str d0, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #112]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
 ; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB0_2
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[0], [x0]
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #110]
+; NONEON-NOSVE-NEXT:    stur wzr, [sp, #106]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #104]
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB0_3
 ; NONEON-NOSVE-NEXT:    b .LBB0_4
 ; NONEON-NOSVE-NEXT:  .LBB0_2:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI0_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x9, :lo12:.LCPI0_0]
 ; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB0_4
 ; NONEON-NOSVE-NEXT:  .LBB0_3: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #1
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[2], [x9]
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #1]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #96]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #88]
 ; NONEON-NOSVE-NEXT:  .LBB0_4: // %else2
 ; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB0_7
 ; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
 ; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB0_8
 ; NONEON-NOSVE-NEXT:  .LBB0_6: // %else8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB0_7: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[4], [x9]
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #2]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
 ; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB0_6
 ; NONEON-NOSVE-NEXT:  .LBB0_8: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x8, x0, #3
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[6], [x8]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #3]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %load = call <4 x i8> @llvm.masked.load.v4i8(ptr %src, i32 8, <4 x i1> %mask, <4 x i8> zeroinitializer)
   ret <4 x i8> %load
@@ -76,64 +119,183 @@ define <8 x i8> @masked_load_v8i8(ptr %src, <8 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI1_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI1_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB1_2
+; NONEON-NOSVE-NEXT:    sub sp, sp, #272
+; NONEON-NOSVE-NEXT:    str x29, [sp, #256] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 272
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #240]
+; NONEON-NOSVE-NEXT:    add x9, sp, #176
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #242]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #243]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #241]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #244]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #245]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #246]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #240]
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w15, w15, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #247]
+; NONEON-NOSVE-NEXT:    and w11, w11, #0x2
+; NONEON-NOSVE-NEXT:    and w13, w13, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w11, w12, #0, #1
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x20
+; NONEON-NOSVE-NEXT:    orr w8, w8, w13
+; NONEON-NOSVE-NEXT:    and w13, w15, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w11, w8
+; NONEON-NOSVE-NEXT:    orr w11, w12, w13
+; NONEON-NOSVE-NEXT:    orr w8, w8, w11
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x80
+; NONEON-NOSVE-NEXT:    add w10, w8, w10
+; NONEON-NOSVE-NEXT:    and w8, w10, #0xff
+; NONEON-NOSVE-NEXT:    tbz w10, #0, .LBB1_2
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr b0, [x0]
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #239]
+; NONEON-NOSVE-NEXT:    sturh wzr, [x9, #61]
+; NONEON-NOSVE-NEXT:    stur wzr, [x9, #57]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #232]
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB1_3
 ; NONEON-NOSVE-NEXT:    b .LBB1_4
 ; NONEON-NOSVE-NEXT:  .LBB1_2:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x10, .LCPI1_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x10, :lo12:.LCPI1_0]
 ; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB1_4
 ; NONEON-NOSVE-NEXT:  .LBB1_3: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #1
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[1], [x9]
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #1]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #208]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #222]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #34]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #42]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #208]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #200]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #217]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #216]
 ; NONEON-NOSVE-NEXT:  .LBB1_4: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB1_11
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB1_12
 ; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB1_12
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB1_13
 ; NONEON-NOSVE-NEXT:  .LBB1_6: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB1_13
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB1_14
 ; NONEON-NOSVE-NEXT:  .LBB1_7: // %else11
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB1_14
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB1_15
 ; NONEON-NOSVE-NEXT:  .LBB1_8: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB1_15
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB1_16
 ; NONEON-NOSVE-NEXT:  .LBB1_9: // %else17
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB1_16
-; NONEON-NOSVE-NEXT:  .LBB1_10: // %else20
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB1_11
+; NONEON-NOSVE-NEXT:  .LBB1_10: // %cond.load19
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #7]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:  .LBB1_11: // %else20
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #256] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #272
 ; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB1_11: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[2], [x9]
+; NONEON-NOSVE-NEXT:  .LBB1_12: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #2]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #176]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #183]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #191]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #3]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #11]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #186]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #184]
 ; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB1_6
-; NONEON-NOSVE-NEXT:  .LBB1_12: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x9, x0, #3
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[3], [x9]
+; NONEON-NOSVE-NEXT:  .LBB1_13: // %cond.load7
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #3]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #146]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #136]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #155]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #152]
 ; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB1_7
-; NONEON-NOSVE-NEXT:  .LBB1_13: // %cond.load10
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[4], [x9]
+; NONEON-NOSVE-NEXT:  .LBB1_14: // %cond.load10
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #4]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #112]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #119]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #127]
+; NONEON-NOSVE-NEXT:    ldurh w9, [sp, #117]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    sturh w9, [sp, #125]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #120]
 ; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB1_8
-; NONEON-NOSVE-NEXT:  .LBB1_14: // %cond.load13
-; NONEON-NOSVE-NEXT:    add x9, x0, #5
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[5], [x9]
+; NONEON-NOSVE-NEXT:  .LBB1_15: // %cond.load13
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #5]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #84]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #88]
 ; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB1_9
-; NONEON-NOSVE-NEXT:  .LBB1_15: // %cond.load16
-; NONEON-NOSVE-NEXT:    add x9, x0, #6
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[6], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB1_10
-; NONEON-NOSVE-NEXT:  .LBB1_16: // %cond.load19
-; NONEON-NOSVE-NEXT:    add x8, x0, #7
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[7], [x8]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB1_16: // %cond.load16
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #6]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB1_10
+; NONEON-NOSVE-NEXT:    b .LBB1_11
   %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %src, i32 8, <8 x i1> %mask, <8 x i8> zeroinitializer)
   ret <8 x i8> %load
 }
@@ -152,112 +314,413 @@ define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI2_0
-; NONEON-NOSVE-NEXT:    ldr q1, [x8, :lo12:.LCPI2_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #1024
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 1040
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #976]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #984]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1000]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #976]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #992]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #991]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1007]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #990]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1006]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #989]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1005]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #988]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1004]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #987]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1003]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #986]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1002]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #985]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1001]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #983]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #999]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #982]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #998]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #981]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #997]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #980]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #996]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #979]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #995]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #978]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #994]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #977]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #993]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #992]
 ; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    addv h1, v0.8h
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB2_17
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB2_18
-; NONEON-NOSVE-NEXT:  .LBB2_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB2_19
-; NONEON-NOSVE-NEXT:  .LBB2_3: // %else5
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB2_20
-; NONEON-NOSVE-NEXT:  .LBB2_4: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB2_21
-; NONEON-NOSVE-NEXT:  .LBB2_5: // %else11
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB2_22
-; NONEON-NOSVE-NEXT:  .LBB2_6: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB2_23
-; NONEON-NOSVE-NEXT:  .LBB2_7: // %else17
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB2_24
-; NONEON-NOSVE-NEXT:  .LBB2_8: // %else20
-; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB2_25
-; NONEON-NOSVE-NEXT:  .LBB2_9: // %else23
-; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB2_26
-; NONEON-NOSVE-NEXT:  .LBB2_10: // %else26
-; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB2_27
-; NONEON-NOSVE-NEXT:  .LBB2_11: // %else29
-; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB2_28
-; NONEON-NOSVE-NEXT:  .LBB2_12: // %else32
-; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB2_29
-; NONEON-NOSVE-NEXT:  .LBB2_13: // %else35
-; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB2_30
-; NONEON-NOSVE-NEXT:  .LBB2_14: // %else38
-; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB2_31
-; NONEON-NOSVE-NEXT:  .LBB2_15: // %else41
-; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB2_32
-; NONEON-NOSVE-NEXT:  .LBB2_16: // %else44
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB2_17: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr b0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB2_2
-; NONEON-NOSVE-NEXT:  .LBB2_18: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #1
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB2_3
-; NONEON-NOSVE-NEXT:  .LBB2_19: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB2_4
-; NONEON-NOSVE-NEXT:  .LBB2_20: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x9, x0, #3
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[3], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB2_5
-; NONEON-NOSVE-NEXT:  .LBB2_21: // %cond.load10
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[4], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB2_6
-; NONEON-NOSVE-NEXT:  .LBB2_22: // %cond.load13
-; NONEON-NOSVE-NEXT:    add x9, x0, #5
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[5], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB2_7
-; NONEON-NOSVE-NEXT:  .LBB2_23: // %cond.load16
-; NONEON-NOSVE-NEXT:    add x9, x0, #6
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[6], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB2_8
-; NONEON-NOSVE-NEXT:  .LBB2_24: // %cond.load19
-; NONEON-NOSVE-NEXT:    add x9, x0, #7
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[7], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB2_9
-; NONEON-NOSVE-NEXT:  .LBB2_25: // %cond.load22
-; NONEON-NOSVE-NEXT:    add x9, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[8], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB2_10
-; NONEON-NOSVE-NEXT:  .LBB2_26: // %cond.load25
-; NONEON-NOSVE-NEXT:    add x9, x0, #9
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[9], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB2_11
-; NONEON-NOSVE-NEXT:  .LBB2_27: // %cond.load28
-; NONEON-NOSVE-NEXT:    add x9, x0, #10
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[10], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB2_12
-; NONEON-NOSVE-NEXT:  .LBB2_28: // %cond.load31
-; NONEON-NOSVE-NEXT:    add x9, x0, #11
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[11], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB2_13
-; NONEON-NOSVE-NEXT:  .LBB2_29: // %cond.load34
-; NONEON-NOSVE-NEXT:    add x9, x0, #12
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[12], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB2_14
-; NONEON-NOSVE-NEXT:  .LBB2_30: // %cond.load37
-; NONEON-NOSVE-NEXT:    add x9, x0, #13
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[13], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB2_15
-; NONEON-NOSVE-NEXT:  .LBB2_31: // %cond.load40
-; NONEON-NOSVE-NEXT:    add x9, x0, #14
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[14], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB2_16
-; NONEON-NOSVE-NEXT:  .LBB2_32: // %cond.load43
-; NONEON-NOSVE-NEXT:    add x8, x0, #15
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[15], [x8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1008]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #1010]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #1008]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1012]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #1014]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #1016]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #1018]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #1020]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w10, w12, w13
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w10, w14
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1022]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add x9, sp, #720
+; NONEON-NOSVE-NEXT:    add w8, w8, w10
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB2_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #975]
+; NONEON-NOSVE-NEXT:    sturh wzr, [x9, #253]
+; NONEON-NOSVE-NEXT:    stur wzr, [x9, #249]
+; NONEON-NOSVE-NEXT:    stur xzr, [x9, #241]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #960]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #960]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB2_3
+; NONEON-NOSVE-NEXT:    b .LBB2_4
+; NONEON-NOSVE-NEXT:  .LBB2_2:
+; NONEON-NOSVE-NEXT:    adrp x10, .LCPI2_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x10, :lo12:.LCPI2_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB2_4
+; NONEON-NOSVE-NEXT:  .LBB2_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #1]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #912]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #944]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #926]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #944]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #942]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #896]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #202]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #194]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #218]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #210]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #912]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #928]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #896]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #929]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #928]
+; NONEON-NOSVE-NEXT:  .LBB2_4: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB2_20
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB2_21
+; NONEON-NOSVE-NEXT:  .LBB2_6: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB2_22
+; NONEON-NOSVE-NEXT:  .LBB2_7: // %else11
+; NONEON-NOSVE-NEXT:    add x9, sp, #464
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB2_23
+; NONEON-NOSVE-NEXT:  .LBB2_8: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB2_24
+; NONEON-NOSVE-NEXT:  .LBB2_9: // %else17
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB2_25
+; NONEON-NOSVE-NEXT:  .LBB2_10: // %else20
+; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB2_26
+; NONEON-NOSVE-NEXT:  .LBB2_11: // %else23
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB2_27
+; NONEON-NOSVE-NEXT:  .LBB2_12: // %else26
+; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB2_28
+; NONEON-NOSVE-NEXT:  .LBB2_13: // %else29
+; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB2_29
+; NONEON-NOSVE-NEXT:  .LBB2_14: // %else32
+; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB2_30
+; NONEON-NOSVE-NEXT:  .LBB2_15: // %else35
+; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB2_31
+; NONEON-NOSVE-NEXT:  .LBB2_16: // %else38
+; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB2_32
+; NONEON-NOSVE-NEXT:  .LBB2_17: // %else41
+; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB2_19
+; NONEON-NOSVE-NEXT:  .LBB2_18: // %cond.load43
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #15]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:  .LBB2_19: // %else44
+; NONEON-NOSVE-NEXT:    add sp, sp, #1024
+; NONEON-NOSVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB2_20: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #2]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #848]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #880]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #863]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #880]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #879]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #832]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #139]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #131]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #155]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #147]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #848]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #864]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #832]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #866]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #864]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB2_6
+; NONEON-NOSVE-NEXT:  .LBB2_21: // %cond.load7
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #3]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #784]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #816]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #796]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #816]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #812]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #768]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #68]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #84]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #786]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #802]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #784]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #800]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #768]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #803]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #800]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB2_7
+; NONEON-NOSVE-NEXT:  .LBB2_22: // %cond.load10
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #4]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #720]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #752]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #735]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #752]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #751]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #704]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #5]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #21]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #720]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #736]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #704]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #740]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #736]
+; NONEON-NOSVE-NEXT:    add x9, sp, #464
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB2_8
+; NONEON-NOSVE-NEXT:  .LBB2_23: // %cond.load13
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #5]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #656]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #688]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #670]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #688]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #686]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #640]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #198]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #214]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #660]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #676]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #656]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #672]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #640]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #677]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #672]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB2_9
+; NONEON-NOSVE-NEXT:  .LBB2_24: // %cond.load16
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #6]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #592]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #624]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #607]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #624]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #623]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #576]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #135]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #151]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #596]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #612]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #592]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #608]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #576]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #614]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #608]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB2_10
+; NONEON-NOSVE-NEXT:  .LBB2_25: // %cond.load19
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #7]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #528]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #560]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #536]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #560]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #552]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #534]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #512]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #550]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #532]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #548]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #528]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #544]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #512]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #551]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #544]
+; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB2_11
+; NONEON-NOSVE-NEXT:  .LBB2_26: // %cond.load22
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #464]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #496]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #479]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #496]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #495]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #448]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    ldur w11, [x9, #9]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    stur w11, [x9, #25]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #464]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #480]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #448]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #488]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #480]
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB2_12
+; NONEON-NOSVE-NEXT:  .LBB2_27: // %cond.load25
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #9]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #400]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #414]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #432]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #430]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #384]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #202]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #218]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #408]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #424]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #400]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #384]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #425]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #416]
+; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB2_13
+; NONEON-NOSVE-NEXT:  .LBB2_28: // %cond.load28
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #10]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #336]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #351]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #368]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #367]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #139]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #155]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #344]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #336]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #320]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #362]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #352]
+; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB2_14
+; NONEON-NOSVE-NEXT:  .LBB2_29: // %cond.load31
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #11]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #272]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #304]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #282]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #298]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #280]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #272]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #256]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #299]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #288]
+; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB2_15
+; NONEON-NOSVE-NEXT:  .LBB2_30: // %cond.load34
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #208]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #223]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #240]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #239]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #208]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #236]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB2_16
+; NONEON-NOSVE-NEXT:  .LBB2_31: // %cond.load37
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #13]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #156]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #173]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #160]
+; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB2_17
+; NONEON-NOSVE-NEXT:  .LBB2_32: // %cond.load40
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #14]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #111]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #92]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB2_18
+; NONEON-NOSVE-NEXT:    b .LBB2_19
   %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %src, i32 8, <16 x i1> %mask, <16 x i8> zeroinitializer)
   ret <16 x i8> %load
 }
@@ -342,274 +805,815 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72]
-; NONEON-NOSVE-NEXT:    fmov s1, w1
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #80]
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #88]
-; NONEON-NOSVE-NEXT:    mov v1.b[1], w2
-; NONEON-NOSVE-NEXT:    mov v0.b[1], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp]
-; NONEON-NOSVE-NEXT:    mov v1.b[2], w3
-; NONEON-NOSVE-NEXT:    mov v0.b[2], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
-; NONEON-NOSVE-NEXT:    mov v1.b[3], w4
-; NONEON-NOSVE-NEXT:    mov v0.b[3], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #104]
-; NONEON-NOSVE-NEXT:    mov v1.b[4], w5
-; NONEON-NOSVE-NEXT:    mov v0.b[4], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
-; NONEON-NOSVE-NEXT:    mov v1.b[5], w6
-; NONEON-NOSVE-NEXT:    mov v0.b[5], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #120]
-; NONEON-NOSVE-NEXT:    mov v1.b[6], w7
-; NONEON-NOSVE-NEXT:    mov v0.b[6], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128]
-; NONEON-NOSVE-NEXT:    mov v1.b[7], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
-; NONEON-NOSVE-NEXT:    mov v0.b[7], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #136]
-; NONEON-NOSVE-NEXT:    mov v1.b[8], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
-; NONEON-NOSVE-NEXT:    mov v0.b[8], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144]
-; NONEON-NOSVE-NEXT:    mov v1.b[9], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
-; NONEON-NOSVE-NEXT:    mov v0.b[9], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #152]
-; NONEON-NOSVE-NEXT:    mov v1.b[10], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #32]
-; NONEON-NOSVE-NEXT:    mov v0.b[10], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160]
-; NONEON-NOSVE-NEXT:    mov v1.b[11], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #40]
-; NONEON-NOSVE-NEXT:    mov v0.b[11], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #168]
-; NONEON-NOSVE-NEXT:    mov v1.b[12], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
-; NONEON-NOSVE-NEXT:    mov v0.b[12], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176]
-; NONEON-NOSVE-NEXT:    mov v1.b[13], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #56]
-; NONEON-NOSVE-NEXT:    mov v0.b[13], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #184]
-; NONEON-NOSVE-NEXT:    mov v1.b[14], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #64]
-; NONEON-NOSVE-NEXT:    mov v0.b[14], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192]
-; NONEON-NOSVE-NEXT:    mov v1.b[15], w9
-; NONEON-NOSVE-NEXT:    mov v0.b[15], w8
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI3_0
-; NONEON-NOSVE-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
-; NONEON-NOSVE-NEXT:    shl v1.16b, v1.16b, #7
-; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    cmlt v1.16b, v1.16b, #0
-; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
-; NONEON-NOSVE-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    zip1 v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    addv h1, v1.8h
-; NONEON-NOSVE-NEXT:    addv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    movi v1.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    bfi w8, w9, #16, #16
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB3_33
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB3_34
-; NONEON-NOSVE-NEXT:  .LBB3_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB3_35
-; NONEON-NOSVE-NEXT:  .LBB3_3: // %else5
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB3_36
-; NONEON-NOSVE-NEXT:  .LBB3_4: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB3_37
-; NONEON-NOSVE-NEXT:  .LBB3_5: // %else11
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB3_38
-; NONEON-NOSVE-NEXT:  .LBB3_6: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB3_39
-; NONEON-NOSVE-NEXT:  .LBB3_7: // %else17
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB3_40
-; NONEON-NOSVE-NEXT:  .LBB3_8: // %else20
-; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB3_41
-; NONEON-NOSVE-NEXT:  .LBB3_9: // %else23
-; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB3_42
-; NONEON-NOSVE-NEXT:  .LBB3_10: // %else26
-; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB3_43
-; NONEON-NOSVE-NEXT:  .LBB3_11: // %else29
-; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB3_44
-; NONEON-NOSVE-NEXT:  .LBB3_12: // %else32
-; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB3_45
-; NONEON-NOSVE-NEXT:  .LBB3_13: // %else35
-; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB3_46
-; NONEON-NOSVE-NEXT:  .LBB3_14: // %else38
-; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB3_47
-; NONEON-NOSVE-NEXT:  .LBB3_15: // %else41
-; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB3_48
-; NONEON-NOSVE-NEXT:  .LBB3_16: // %else44
-; NONEON-NOSVE-NEXT:    tbnz w8, #16, .LBB3_49
-; NONEON-NOSVE-NEXT:  .LBB3_17: // %else47
-; NONEON-NOSVE-NEXT:    tbnz w8, #17, .LBB3_50
-; NONEON-NOSVE-NEXT:  .LBB3_18: // %else50
-; NONEON-NOSVE-NEXT:    tbnz w8, #18, .LBB3_51
-; NONEON-NOSVE-NEXT:  .LBB3_19: // %else53
-; NONEON-NOSVE-NEXT:    tbnz w8, #19, .LBB3_52
-; NONEON-NOSVE-NEXT:  .LBB3_20: // %else56
-; NONEON-NOSVE-NEXT:    tbnz w8, #20, .LBB3_53
-; NONEON-NOSVE-NEXT:  .LBB3_21: // %else59
-; NONEON-NOSVE-NEXT:    tbnz w8, #21, .LBB3_54
-; NONEON-NOSVE-NEXT:  .LBB3_22: // %else62
-; NONEON-NOSVE-NEXT:    tbnz w8, #22, .LBB3_55
-; NONEON-NOSVE-NEXT:  .LBB3_23: // %else65
-; NONEON-NOSVE-NEXT:    tbnz w8, #23, .LBB3_56
-; NONEON-NOSVE-NEXT:  .LBB3_24: // %else68
-; NONEON-NOSVE-NEXT:    tbnz w8, #24, .LBB3_57
-; NONEON-NOSVE-NEXT:  .LBB3_25: // %else71
-; NONEON-NOSVE-NEXT:    tbnz w8, #25, .LBB3_58
-; NONEON-NOSVE-NEXT:  .LBB3_26: // %else74
-; NONEON-NOSVE-NEXT:    tbnz w8, #26, .LBB3_59
-; NONEON-NOSVE-NEXT:  .LBB3_27: // %else77
-; NONEON-NOSVE-NEXT:    tbnz w8, #27, .LBB3_60
-; NONEON-NOSVE-NEXT:  .LBB3_28: // %else80
-; NONEON-NOSVE-NEXT:    tbnz w8, #28, .LBB3_61
-; NONEON-NOSVE-NEXT:  .LBB3_29: // %else83
-; NONEON-NOSVE-NEXT:    tbnz w8, #29, .LBB3_62
-; NONEON-NOSVE-NEXT:  .LBB3_30: // %else86
-; NONEON-NOSVE-NEXT:    tbnz w8, #30, .LBB3_63
-; NONEON-NOSVE-NEXT:  .LBB3_31: // %else89
-; NONEON-NOSVE-NEXT:    tbnz w8, #31, .LBB3_64
-; NONEON-NOSVE-NEXT:  .LBB3_32: // %else92
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB3_33: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr b0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB3_2
-; NONEON-NOSVE-NEXT:  .LBB3_34: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #1
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB3_3
-; NONEON-NOSVE-NEXT:  .LBB3_35: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB3_4
-; NONEON-NOSVE-NEXT:  .LBB3_36: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x9, x0, #3
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[3], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB3_5
-; NONEON-NOSVE-NEXT:  .LBB3_37: // %cond.load10
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[4], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB3_6
-; NONEON-NOSVE-NEXT:  .LBB3_38: // %cond.load13
-; NONEON-NOSVE-NEXT:    add x9, x0, #5
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[5], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB3_7
-; NONEON-NOSVE-NEXT:  .LBB3_39: // %cond.load16
-; NONEON-NOSVE-NEXT:    add x9, x0, #6
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[6], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB3_8
-; NONEON-NOSVE-NEXT:  .LBB3_40: // %cond.load19
-; NONEON-NOSVE-NEXT:    add x9, x0, #7
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[7], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB3_9
-; NONEON-NOSVE-NEXT:  .LBB3_41: // %cond.load22
-; NONEON-NOSVE-NEXT:    add x9, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[8], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB3_10
-; NONEON-NOSVE-NEXT:  .LBB3_42: // %cond.load25
-; NONEON-NOSVE-NEXT:    add x9, x0, #9
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[9], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB3_11
-; NONEON-NOSVE-NEXT:  .LBB3_43: // %cond.load28
-; NONEON-NOSVE-NEXT:    add x9, x0, #10
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[10], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB3_12
-; NONEON-NOSVE-NEXT:  .LBB3_44: // %cond.load31
-; NONEON-NOSVE-NEXT:    add x9, x0, #11
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[11], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB3_13
-; NONEON-NOSVE-NEXT:  .LBB3_45: // %cond.load34
-; NONEON-NOSVE-NEXT:    add x9, x0, #12
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[12], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB3_14
-; NONEON-NOSVE-NEXT:  .LBB3_46: // %cond.load37
-; NONEON-NOSVE-NEXT:    add x9, x0, #13
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[13], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB3_15
-; NONEON-NOSVE-NEXT:  .LBB3_47: // %cond.load40
-; NONEON-NOSVE-NEXT:    add x9, x0, #14
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[14], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB3_16
-; NONEON-NOSVE-NEXT:  .LBB3_48: // %cond.load43
-; NONEON-NOSVE-NEXT:    add x9, x0, #15
-; NONEON-NOSVE-NEXT:    ld1 { v0.b }[15], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #16, .LBB3_17
-; NONEON-NOSVE-NEXT:  .LBB3_49: // %cond.load46
-; NONEON-NOSVE-NEXT:    add x9, x0, #16
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[0], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #17, .LBB3_18
-; NONEON-NOSVE-NEXT:  .LBB3_50: // %cond.load49
-; NONEON-NOSVE-NEXT:    add x9, x0, #17
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #18, .LBB3_19
-; NONEON-NOSVE-NEXT:  .LBB3_51: // %cond.load52
-; NONEON-NOSVE-NEXT:    add x9, x0, #18
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #19, .LBB3_20
-; NONEON-NOSVE-NEXT:  .LBB3_52: // %cond.load55
-; NONEON-NOSVE-NEXT:    add x9, x0, #19
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[3], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #20, .LBB3_21
-; NONEON-NOSVE-NEXT:  .LBB3_53: // %cond.load58
-; NONEON-NOSVE-NEXT:    add x9, x0, #20
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[4], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #21, .LBB3_22
-; NONEON-NOSVE-NEXT:  .LBB3_54: // %cond.load61
-; NONEON-NOSVE-NEXT:    add x9, x0, #21
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[5], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #22, .LBB3_23
-; NONEON-NOSVE-NEXT:  .LBB3_55: // %cond.load64
-; NONEON-NOSVE-NEXT:    add x9, x0, #22
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[6], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #23, .LBB3_24
-; NONEON-NOSVE-NEXT:  .LBB3_56: // %cond.load67
-; NONEON-NOSVE-NEXT:    add x9, x0, #23
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[7], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #24, .LBB3_25
-; NONEON-NOSVE-NEXT:  .LBB3_57: // %cond.load70
-; NONEON-NOSVE-NEXT:    add x9, x0, #24
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[8], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #25, .LBB3_26
-; NONEON-NOSVE-NEXT:  .LBB3_58: // %cond.load73
-; NONEON-NOSVE-NEXT:    add x9, x0, #25
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[9], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #26, .LBB3_27
-; NONEON-NOSVE-NEXT:  .LBB3_59: // %cond.load76
-; NONEON-NOSVE-NEXT:    add x9, x0, #26
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[10], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #27, .LBB3_28
-; NONEON-NOSVE-NEXT:  .LBB3_60: // %cond.load79
-; NONEON-NOSVE-NEXT:    add x9, x0, #27
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[11], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #28, .LBB3_29
-; NONEON-NOSVE-NEXT:  .LBB3_61: // %cond.load82
-; NONEON-NOSVE-NEXT:    add x9, x0, #28
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[12], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #29, .LBB3_30
-; NONEON-NOSVE-NEXT:  .LBB3_62: // %cond.load85
-; NONEON-NOSVE-NEXT:    add x9, x0, #29
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[13], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #30, .LBB3_31
-; NONEON-NOSVE-NEXT:  .LBB3_63: // %cond.load88
-; NONEON-NOSVE-NEXT:    add x9, x0, #30
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[14], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #31, .LBB3_32
-; NONEON-NOSVE-NEXT:  .LBB3_64: // %cond.load91
-; NONEON-NOSVE-NEXT:    add x8, x0, #31
-; NONEON-NOSVE-NEXT:    ld1 { v1.b }[15], [x8]
+; NONEON-NOSVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #2064
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 2080
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #2216]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2152]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2272]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #2176]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #2160]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2024]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x1
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2264]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2016]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2256]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2031]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2248]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2030]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x20
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2240]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2029]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2232]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2028]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2224]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2027]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2208]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2026]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x2
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2200]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2025]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2192]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2023]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #2184]
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #2022]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2168]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x20
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #2021]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2020]
+; NONEON-NOSVE-NEXT:    and w8, w11, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w10, w12, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2019]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2088]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2018]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x2
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2136]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2017]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x1
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2144]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2008]
+; NONEON-NOSVE-NEXT:    and w8, w1, #0x1
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #2104]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2000]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #2080]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #2016]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2128]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2015]
+; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2120]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2014]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #2112]
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x20
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #2013]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2096]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x10
+; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #2012]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2011]
+; NONEON-NOSVE-NEXT:    and w8, w11, #0x4
+; NONEON-NOSVE-NEXT:    sbfx w10, w12, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2010]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x2
+; NONEON-NOSVE-NEXT:    sbfx w9, w7, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2009]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w10, w6, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2007]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w5, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2006]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x20
+; NONEON-NOSVE-NEXT:    sbfx w10, w4, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2005]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w9, w3, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2004]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w10, w2, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2003]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2002]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2001]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #2048]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #2000]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2050]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2048]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #2052]
+; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2054]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2056]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #2058]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #2060]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w12, w13
+; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    str q0, [sp, #2032]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2034]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2032]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #2036]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #2038]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #2040]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #2042]
+; NONEON-NOSVE-NEXT:    add w10, w12, w11
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2044]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2062]
+; NONEON-NOSVE-NEXT:    add w13, w13, w14
+; NONEON-NOSVE-NEXT:    add w14, w15, w16
+; NONEON-NOSVE-NEXT:    add w10, w10, w13
+; NONEON-NOSVE-NEXT:    add w11, w14, w11
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #2046]
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w10, w8, w12
+; NONEON-NOSVE-NEXT:    add w8, w9, w13
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI3_0
+; NONEON-NOSVE-NEXT:    bfi w8, w10, #16, #16
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI3_0]
+; NONEON-NOSVE-NEXT:    add x9, sp, #1744
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB3_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #1999]
+; NONEON-NOSVE-NEXT:    sturh wzr, [x9, #253]
+; NONEON-NOSVE-NEXT:    stur wzr, [x9, #249]
+; NONEON-NOSVE-NEXT:    stur xzr, [x9, #241]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1984]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1984]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB3_3
+; NONEON-NOSVE-NEXT:    b .LBB3_4
+; NONEON-NOSVE-NEXT:  .LBB3_2:
+; NONEON-NOSVE-NEXT:    mov v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB3_4
+; NONEON-NOSVE-NEXT:  .LBB3_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #1]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1936]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1968]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1950]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1968]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1966]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1920]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #202]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #194]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #218]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #210]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1936]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1952]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1920]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1953]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1952]
+; NONEON-NOSVE-NEXT:  .LBB3_4: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB3_36
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB3_37
+; NONEON-NOSVE-NEXT:  .LBB3_6: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB3_38
+; NONEON-NOSVE-NEXT:  .LBB3_7: // %else11
+; NONEON-NOSVE-NEXT:    add x9, sp, #1488
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB3_39
+; NONEON-NOSVE-NEXT:  .LBB3_8: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB3_40
+; NONEON-NOSVE-NEXT:  .LBB3_9: // %else17
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB3_41
+; NONEON-NOSVE-NEXT:  .LBB3_10: // %else20
+; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB3_42
+; NONEON-NOSVE-NEXT:  .LBB3_11: // %else23
+; NONEON-NOSVE-NEXT:    add x9, sp, #1232
+; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB3_43
+; NONEON-NOSVE-NEXT:  .LBB3_12: // %else26
+; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB3_44
+; NONEON-NOSVE-NEXT:  .LBB3_13: // %else29
+; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB3_45
+; NONEON-NOSVE-NEXT:  .LBB3_14: // %else32
+; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB3_46
+; NONEON-NOSVE-NEXT:  .LBB3_15: // %else35
+; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB3_47
+; NONEON-NOSVE-NEXT:  .LBB3_16: // %else38
+; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB3_48
+; NONEON-NOSVE-NEXT:  .LBB3_17: // %else41
+; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB3_49
+; NONEON-NOSVE-NEXT:  .LBB3_18: // %else44
+; NONEON-NOSVE-NEXT:    tbnz w8, #16, .LBB3_50
+; NONEON-NOSVE-NEXT:  .LBB3_19: // %else47
+; NONEON-NOSVE-NEXT:    add x9, sp, #720
+; NONEON-NOSVE-NEXT:    tbnz w8, #17, .LBB3_51
+; NONEON-NOSVE-NEXT:  .LBB3_20: // %else50
+; NONEON-NOSVE-NEXT:    tbnz w8, #18, .LBB3_52
+; NONEON-NOSVE-NEXT:  .LBB3_21: // %else53
+; NONEON-NOSVE-NEXT:    tbnz w8, #19, .LBB3_53
+; NONEON-NOSVE-NEXT:  .LBB3_22: // %else56
+; NONEON-NOSVE-NEXT:    tbnz w8, #20, .LBB3_54
+; NONEON-NOSVE-NEXT:  .LBB3_23: // %else59
+; NONEON-NOSVE-NEXT:    add x9, sp, #464
+; NONEON-NOSVE-NEXT:    tbnz w8, #21, .LBB3_55
+; NONEON-NOSVE-NEXT:  .LBB3_24: // %else62
+; NONEON-NOSVE-NEXT:    tbnz w8, #22, .LBB3_56
+; NONEON-NOSVE-NEXT:  .LBB3_25: // %else65
+; NONEON-NOSVE-NEXT:    tbnz w8, #23, .LBB3_57
+; NONEON-NOSVE-NEXT:  .LBB3_26: // %else68
+; NONEON-NOSVE-NEXT:    tbnz w8, #24, .LBB3_58
+; NONEON-NOSVE-NEXT:  .LBB3_27: // %else71
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    tbnz w8, #25, .LBB3_59
+; NONEON-NOSVE-NEXT:  .LBB3_28: // %else74
+; NONEON-NOSVE-NEXT:    tbnz w8, #26, .LBB3_60
+; NONEON-NOSVE-NEXT:  .LBB3_29: // %else77
+; NONEON-NOSVE-NEXT:    tbnz w8, #27, .LBB3_61
+; NONEON-NOSVE-NEXT:  .LBB3_30: // %else80
+; NONEON-NOSVE-NEXT:    tbnz w8, #28, .LBB3_62
+; NONEON-NOSVE-NEXT:  .LBB3_31: // %else83
+; NONEON-NOSVE-NEXT:    tbnz w8, #29, .LBB3_63
+; NONEON-NOSVE-NEXT:  .LBB3_32: // %else86
+; NONEON-NOSVE-NEXT:    tbnz w8, #30, .LBB3_64
+; NONEON-NOSVE-NEXT:  .LBB3_33: // %else89
+; NONEON-NOSVE-NEXT:    tbz w8, #31, .LBB3_35
+; NONEON-NOSVE-NEXT:  .LBB3_34: // %cond.load91
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #31]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:  .LBB3_35: // %else92
+; NONEON-NOSVE-NEXT:    add sp, sp, #2064
+; NONEON-NOSVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB3_36: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #2]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1872]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1904]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1887]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1904]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1903]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1856]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #139]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #131]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #155]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #147]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1872]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1888]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1856]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1890]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1888]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB3_6
+; NONEON-NOSVE-NEXT:  .LBB3_37: // %cond.load7
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #3]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1808]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1840]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #1820]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1840]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #1836]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1792]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #68]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #84]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1810]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1826]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1808]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1824]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1792]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1827]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1824]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB3_7
+; NONEON-NOSVE-NEXT:  .LBB3_38: // %cond.load10
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #4]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1744]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1776]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1759]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1776]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1775]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1728]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #5]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #21]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #1744]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #1760]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1728]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1764]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1760]
+; NONEON-NOSVE-NEXT:    add x9, sp, #1488
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB3_8
+; NONEON-NOSVE-NEXT:  .LBB3_39: // %cond.load13
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #5]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1680]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1712]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1694]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1712]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1710]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1664]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #198]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #214]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1684]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1700]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #1680]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #1696]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1664]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1701]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1696]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB3_9
+; NONEON-NOSVE-NEXT:  .LBB3_40: // %cond.load16
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #6]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1616]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1648]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1631]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1648]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1647]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1600]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #135]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #151]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1620]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1636]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #1616]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #1632]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1600]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1638]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1632]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB3_10
+; NONEON-NOSVE-NEXT:  .LBB3_41: // %cond.load19
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #7]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1552]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1584]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #1560]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1584]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #1576]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1558]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1536]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1574]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1556]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1572]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #1552]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #1568]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1536]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1575]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1568]
+; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB3_11
+; NONEON-NOSVE-NEXT:  .LBB3_42: // %cond.load22
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1488]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1520]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1503]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1520]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1519]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1472]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    ldur w11, [x9, #9]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    stur w11, [x9, #25]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #1488]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #1504]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1472]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1512]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1504]
+; NONEON-NOSVE-NEXT:    add x9, sp, #1232
+; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB3_12
+; NONEON-NOSVE-NEXT:  .LBB3_43: // %cond.load25
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #9]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1424]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1456]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1438]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1456]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1454]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1408]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #202]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #218]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1432]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1448]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #1424]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #1440]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1408]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1449]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1440]
+; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB3_13
+; NONEON-NOSVE-NEXT:  .LBB3_44: // %cond.load28
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #10]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1360]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1392]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1375]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1392]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1391]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1344]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #139]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #155]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1368]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1384]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #1360]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #1376]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1344]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1386]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1376]
+; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB3_14
+; NONEON-NOSVE-NEXT:  .LBB3_45: // %cond.load31
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #11]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1296]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1328]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #1308]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1328]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #1324]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1306]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1280]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1322]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1304]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1320]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #1296]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #1312]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1280]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1323]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1312]
+; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB3_15
+; NONEON-NOSVE-NEXT:  .LBB3_46: // %cond.load34
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1232]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1264]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1247]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1264]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1263]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1216]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #1240]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #1256]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #1232]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #1248]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1216]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1260]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1248]
+; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB3_16
+; NONEON-NOSVE-NEXT:  .LBB3_47: // %cond.load37
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #13]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1168]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1200]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #1182]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1200]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #1198]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1180]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1152]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1196]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #1176]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #1192]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #1168]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #1184]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1152]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1197]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1184]
+; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB3_17
+; NONEON-NOSVE-NEXT:  .LBB3_48: // %cond.load40
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #14]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1104]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1136]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1119]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1136]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1135]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #1116]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1088]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #1132]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #1112]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #1128]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #1104]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #1120]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1088]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1134]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1120]
+; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB3_18
+; NONEON-NOSVE-NEXT:  .LBB3_49: // %cond.load43
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #15]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1024]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1072]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1038]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1072]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1070]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #1036]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1040]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #1068]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #1032]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #1064]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #1024]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #1056]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1040]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1071]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1056]
+; NONEON-NOSVE-NEXT:    tbz w8, #16, .LBB3_19
+; NONEON-NOSVE-NEXT:  .LBB3_50: // %cond.load46
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #976]
+; NONEON-NOSVE-NEXT:    add x10, sp, #976
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1008]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #991]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #1008]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1007]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #960]
+; NONEON-NOSVE-NEXT:    ldurh w9, [x10, #13]
+; NONEON-NOSVE-NEXT:    ldur w11, [x10, #9]
+; NONEON-NOSVE-NEXT:    sturh w9, [x10, #29]
+; NONEON-NOSVE-NEXT:    ldur x9, [x10, #1]
+; NONEON-NOSVE-NEXT:    stur w11, [x10, #25]
+; NONEON-NOSVE-NEXT:    stur x9, [x10, #17]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #960]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #992]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #992]
+; NONEON-NOSVE-NEXT:    add x9, sp, #720
+; NONEON-NOSVE-NEXT:    tbz w8, #17, .LBB3_20
+; NONEON-NOSVE-NEXT:  .LBB3_51: // %cond.load49
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #17]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #912]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #944]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #926]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #944]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #942]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #896]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #202]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #194]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #218]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #210]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #912]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #928]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #896]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #929]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #928]
+; NONEON-NOSVE-NEXT:    tbz w8, #18, .LBB3_21
+; NONEON-NOSVE-NEXT:  .LBB3_52: // %cond.load52
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #18]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #848]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #880]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #863]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #880]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #879]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #832]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #139]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #131]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #155]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #147]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #848]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #864]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #832]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #866]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #864]
+; NONEON-NOSVE-NEXT:    tbz w8, #19, .LBB3_22
+; NONEON-NOSVE-NEXT:  .LBB3_53: // %cond.load55
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #19]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #784]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #816]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #796]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #816]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #812]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #768]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #68]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #84]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #786]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #802]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #784]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #800]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #768]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #803]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #800]
+; NONEON-NOSVE-NEXT:    tbz w8, #20, .LBB3_23
+; NONEON-NOSVE-NEXT:  .LBB3_54: // %cond.load58
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #20]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #720]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #752]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #735]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #752]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #751]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #704]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #5]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #21]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #720]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #736]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #704]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #740]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #736]
+; NONEON-NOSVE-NEXT:    add x9, sp, #464
+; NONEON-NOSVE-NEXT:    tbz w8, #21, .LBB3_24
+; NONEON-NOSVE-NEXT:  .LBB3_55: // %cond.load61
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #21]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #656]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #688]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #670]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #688]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #686]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #640]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #198]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #214]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #660]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #676]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #656]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #672]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #640]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #677]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #672]
+; NONEON-NOSVE-NEXT:    tbz w8, #22, .LBB3_25
+; NONEON-NOSVE-NEXT:  .LBB3_56: // %cond.load64
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #22]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #592]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #624]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #607]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #624]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #623]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #576]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #135]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #151]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #596]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #612]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #592]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #608]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #576]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #614]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #608]
+; NONEON-NOSVE-NEXT:    tbz w8, #23, .LBB3_26
+; NONEON-NOSVE-NEXT:  .LBB3_57: // %cond.load67
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #23]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #528]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #560]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #536]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #560]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #552]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #534]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #512]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #550]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #532]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #548]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #528]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #544]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #512]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #551]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #544]
+; NONEON-NOSVE-NEXT:    tbz w8, #24, .LBB3_27
+; NONEON-NOSVE-NEXT:  .LBB3_58: // %cond.load70
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #24]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #464]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #496]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #479]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #496]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #495]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #448]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    ldur w11, [x9, #9]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    stur w11, [x9, #25]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #464]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #480]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #448]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #488]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #480]
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    tbz w8, #25, .LBB3_28
+; NONEON-NOSVE-NEXT:  .LBB3_59: // %cond.load73
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #25]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #400]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #414]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #432]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #430]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #384]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #202]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #218]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #408]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #424]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #400]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #384]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #425]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #416]
+; NONEON-NOSVE-NEXT:    tbz w8, #26, .LBB3_29
+; NONEON-NOSVE-NEXT:  .LBB3_60: // %cond.load76
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #26]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #336]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #351]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #368]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #367]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #139]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #155]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #344]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #336]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #320]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #362]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #352]
+; NONEON-NOSVE-NEXT:    tbz w8, #27, .LBB3_30
+; NONEON-NOSVE-NEXT:  .LBB3_61: // %cond.load79
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #27]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #272]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #304]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #282]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #256]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #298]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #280]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #272]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #256]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #299]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #288]
+; NONEON-NOSVE-NEXT:    tbz w8, #28, .LBB3_31
+; NONEON-NOSVE-NEXT:  .LBB3_62: // %cond.load82
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #28]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #208]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #223]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #240]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #239]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #208]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #236]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #224]
+; NONEON-NOSVE-NEXT:    tbz w8, #29, .LBB3_32
+; NONEON-NOSVE-NEXT:  .LBB3_63: // %cond.load85
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #29]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #144]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #156]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #173]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #160]
+; NONEON-NOSVE-NEXT:    tbz w8, #30, .LBB3_33
+; NONEON-NOSVE-NEXT:  .LBB3_64: // %cond.load88
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #30]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #111]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #92]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    tbnz w8, #31, .LBB3_34
+; NONEON-NOSVE-NEXT:    b .LBB3_35
   %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %src, i32 8, <32 x i1> %mask, <32 x i8> zeroinitializer)
   ret <32 x i8> %load
 }
@@ -638,27 +1642,36 @@ define <2 x half> @masked_load_v2f16(ptr %src, <2 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #31
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI4_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI4_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.2s, v0.2s, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addp v1.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    movi d0, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB4_3
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB4_4
-; NONEON-NOSVE-NEXT:  .LBB4_2: // %else2
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB4_3: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB4_2
-; NONEON-NOSVE-NEXT:  .LBB4_4: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x8, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[1], [x8]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    str d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    bfxil w8, w9, #0, #1
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB4_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    ldr h1, [x0]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #24]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB4_3
+; NONEON-NOSVE-NEXT:    b .LBB4_4
+; NONEON-NOSVE-NEXT:  .LBB4_2:
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI4_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x9, :lo12:.LCPI4_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB4_4
+; NONEON-NOSVE-NEXT:  .LBB4_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #2]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #18]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:  .LBB4_4: // %else2
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %load = call <2 x half> @llvm.masked.load.v2f16(ptr %src, i32 8, <2 x i1> %mask, <2 x half> zeroinitializer)
   ret <2 x half> %load
@@ -678,39 +1691,84 @@ define <4 x half> @masked_load_v4f16(ptr %src, <4 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI5_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI5_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv h1, v0.4h
-; NONEON-NOSVE-NEXT:    movi d0, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB5_5
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB5_6
-; NONEON-NOSVE-NEXT:  .LBB5_2: // %else2
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    str d0, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #112]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB5_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    ldr h1, [x0]
+; NONEON-NOSVE-NEXT:    stur wzr, [sp, #106]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #104]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB5_3
+; NONEON-NOSVE-NEXT:    b .LBB5_4
+; NONEON-NOSVE-NEXT:  .LBB5_2:
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI5_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x9, :lo12:.LCPI5_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB5_4
+; NONEON-NOSVE-NEXT:  .LBB5_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #2]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #84]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #96]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #92]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #88]
+; NONEON-NOSVE-NEXT:  .LBB5_4: // %else2
 ; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB5_7
-; NONEON-NOSVE-NEXT:  .LBB5_3: // %else5
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
 ; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB5_8
-; NONEON-NOSVE-NEXT:  .LBB5_4: // %else8
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:  .LBB5_6: // %else8
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB5_5: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB5_2
-; NONEON-NOSVE-NEXT:  .LBB5_6: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB5_3
 ; NONEON-NOSVE-NEXT:  .LBB5_7: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB5_4
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #4]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB5_6
 ; NONEON-NOSVE-NEXT:  .LBB5_8: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x8, x0, #6
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[3], [x8]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #6]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %load = call <4 x half> @llvm.masked.load.v4f16(ptr %src, i32 8, <4 x i1> %mask, <4 x half> zeroinitializer)
   ret <4 x half> %load
@@ -731,62 +1789,184 @@ define <8 x half> @masked_load_v8f16(ptr %src, <8 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI6_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI6_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv b1, v0.8b
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB6_9
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB6_10
-; NONEON-NOSVE-NEXT:  .LBB6_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB6_11
-; NONEON-NOSVE-NEXT:  .LBB6_3: // %else5
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB6_12
-; NONEON-NOSVE-NEXT:  .LBB6_4: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB6_13
-; NONEON-NOSVE-NEXT:  .LBB6_5: // %else11
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB6_14
-; NONEON-NOSVE-NEXT:  .LBB6_6: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB6_15
-; NONEON-NOSVE-NEXT:  .LBB6_7: // %else17
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB6_16
-; NONEON-NOSVE-NEXT:  .LBB6_8: // %else20
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB6_9: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB6_2
-; NONEON-NOSVE-NEXT:  .LBB6_10: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB6_3
-; NONEON-NOSVE-NEXT:  .LBB6_11: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB6_4
-; NONEON-NOSVE-NEXT:  .LBB6_12: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x9, x0, #6
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[3], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB6_5
-; NONEON-NOSVE-NEXT:  .LBB6_13: // %cond.load10
-; NONEON-NOSVE-NEXT:    add x9, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[4], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB6_6
-; NONEON-NOSVE-NEXT:  .LBB6_14: // %cond.load13
-; NONEON-NOSVE-NEXT:    add x9, x0, #10
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[5], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB6_7
-; NONEON-NOSVE-NEXT:  .LBB6_15: // %cond.load16
-; NONEON-NOSVE-NEXT:    add x9, x0, #12
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[6], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB6_8
-; NONEON-NOSVE-NEXT:  .LBB6_16: // %cond.load19
-; NONEON-NOSVE-NEXT:    add x8, x0, #14
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[7], [x8]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #496
+; NONEON-NOSVE-NEXT:    str x29, [sp, #480] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 496
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #464]
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #466]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #467]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #465]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #468]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #469]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #470]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #464]
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w15, w15, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #471]
+; NONEON-NOSVE-NEXT:    and w11, w11, #0x2
+; NONEON-NOSVE-NEXT:    and w13, w13, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w11, w12, #0, #1
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x20
+; NONEON-NOSVE-NEXT:    orr w8, w8, w13
+; NONEON-NOSVE-NEXT:    and w13, w15, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w11, w8
+; NONEON-NOSVE-NEXT:    orr w11, w12, w13
+; NONEON-NOSVE-NEXT:    orr w8, w8, w11
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x80
+; NONEON-NOSVE-NEXT:    add w10, w8, w10
+; NONEON-NOSVE-NEXT:    and w8, w10, #0xff
+; NONEON-NOSVE-NEXT:    tbz w10, #0, .LBB6_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    ldr h1, [x0]
+; NONEON-NOSVE-NEXT:    stur wzr, [x9, #250]
+; NONEON-NOSVE-NEXT:    stur xzr, [x9, #242]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #448]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #462]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #448]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB6_3
+; NONEON-NOSVE-NEXT:    b .LBB6_4
+; NONEON-NOSVE-NEXT:  .LBB6_2:
+; NONEON-NOSVE-NEXT:    adrp x10, .LCPI6_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x10, :lo12:.LCPI6_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB6_4
+; NONEON-NOSVE-NEXT:  .LBB6_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #2]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #400]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #412]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #432]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #428]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #384]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #196]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #212]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #400]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #384]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #418]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #416]
+; NONEON-NOSVE-NEXT:  .LBB6_4: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB6_12
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB6_13
+; NONEON-NOSVE-NEXT:  .LBB6_6: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB6_14
+; NONEON-NOSVE-NEXT:  .LBB6_7: // %else11
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB6_15
+; NONEON-NOSVE-NEXT:  .LBB6_8: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB6_16
+; NONEON-NOSVE-NEXT:  .LBB6_9: // %else17
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB6_11
+; NONEON-NOSVE-NEXT:  .LBB6_10: // %cond.load19
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #14]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:  .LBB6_11: // %else20
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #480] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #496
 ; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB6_12: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #4]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #336]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #368]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #350]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #366]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #134]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #150]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #320]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #352]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #352]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB6_6
+; NONEON-NOSVE-NEXT:  .LBB6_13: // %cond.load7
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #6]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #280]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #304]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #272]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #276]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #288]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #256]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #294]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #288]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB6_7
+; NONEON-NOSVE-NEXT:  .LBB6_14: // %cond.load10
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #208]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #240]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #222]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #238]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #10]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #26]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #192]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #224]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB6_8
+; NONEON-NOSVE-NEXT:  .LBB6_15: // %cond.load13
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #10]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #156]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #152]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #160]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #128]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #160]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB6_9
+; NONEON-NOSVE-NEXT:  .LBB6_16: // %cond.load16
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB6_10
+; NONEON-NOSVE-NEXT:    b .LBB6_11
   %load = call <8 x half> @llvm.masked.load.v8f16(ptr %src, i32 8, <8 x i1> %mask, <8 x half> zeroinitializer)
   ret <8 x half> %load
 }
@@ -814,113 +1994,383 @@ define <16 x half> @masked_load_v16f16(ptr %src, <16 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI7_0
-; NONEON-NOSVE-NEXT:    ldr q1, [x8, :lo12:.LCPI7_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #1024
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 1040
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #976]
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI7_0
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #984]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1000]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #976]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #992]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #991]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1007]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #990]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1006]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #989]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1005]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #988]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1004]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #987]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1003]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #986]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1002]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #985]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1001]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #983]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #999]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #982]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #998]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #981]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #997]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #980]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #996]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #979]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #995]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #978]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #994]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #977]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #993]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #992]
 ; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    movi v1.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    addv h2, v0.8h
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s2
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB7_17
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB7_18
-; NONEON-NOSVE-NEXT:  .LBB7_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB7_19
-; NONEON-NOSVE-NEXT:  .LBB7_3: // %else5
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB7_20
-; NONEON-NOSVE-NEXT:  .LBB7_4: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB7_21
-; NONEON-NOSVE-NEXT:  .LBB7_5: // %else11
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB7_22
-; NONEON-NOSVE-NEXT:  .LBB7_6: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB7_23
-; NONEON-NOSVE-NEXT:  .LBB7_7: // %else17
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB7_24
-; NONEON-NOSVE-NEXT:  .LBB7_8: // %else20
-; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB7_25
-; NONEON-NOSVE-NEXT:  .LBB7_9: // %else23
-; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB7_26
-; NONEON-NOSVE-NEXT:  .LBB7_10: // %else26
-; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB7_27
-; NONEON-NOSVE-NEXT:  .LBB7_11: // %else29
-; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB7_28
-; NONEON-NOSVE-NEXT:  .LBB7_12: // %else32
-; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB7_29
-; NONEON-NOSVE-NEXT:  .LBB7_13: // %else35
-; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB7_30
-; NONEON-NOSVE-NEXT:  .LBB7_14: // %else38
-; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB7_31
-; NONEON-NOSVE-NEXT:  .LBB7_15: // %else41
-; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB7_32
-; NONEON-NOSVE-NEXT:  .LBB7_16: // %else44
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB7_17: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB7_2
-; NONEON-NOSVE-NEXT:  .LBB7_18: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB7_3
-; NONEON-NOSVE-NEXT:  .LBB7_19: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB7_4
-; NONEON-NOSVE-NEXT:  .LBB7_20: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x9, x0, #6
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[3], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB7_5
-; NONEON-NOSVE-NEXT:  .LBB7_21: // %cond.load10
-; NONEON-NOSVE-NEXT:    add x9, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[4], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB7_6
-; NONEON-NOSVE-NEXT:  .LBB7_22: // %cond.load13
-; NONEON-NOSVE-NEXT:    add x9, x0, #10
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[5], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB7_7
-; NONEON-NOSVE-NEXT:  .LBB7_23: // %cond.load16
-; NONEON-NOSVE-NEXT:    add x9, x0, #12
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[6], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB7_8
-; NONEON-NOSVE-NEXT:  .LBB7_24: // %cond.load19
-; NONEON-NOSVE-NEXT:    add x9, x0, #14
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[7], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB7_9
-; NONEON-NOSVE-NEXT:  .LBB7_25: // %cond.load22
-; NONEON-NOSVE-NEXT:    add x9, x0, #16
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[0], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB7_10
-; NONEON-NOSVE-NEXT:  .LBB7_26: // %cond.load25
-; NONEON-NOSVE-NEXT:    add x9, x0, #18
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB7_11
-; NONEON-NOSVE-NEXT:  .LBB7_27: // %cond.load28
-; NONEON-NOSVE-NEXT:    add x9, x0, #20
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB7_12
-; NONEON-NOSVE-NEXT:  .LBB7_28: // %cond.load31
-; NONEON-NOSVE-NEXT:    add x9, x0, #22
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[3], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB7_13
-; NONEON-NOSVE-NEXT:  .LBB7_29: // %cond.load34
-; NONEON-NOSVE-NEXT:    add x9, x0, #24
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[4], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB7_14
-; NONEON-NOSVE-NEXT:  .LBB7_30: // %cond.load37
-; NONEON-NOSVE-NEXT:    add x9, x0, #26
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[5], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB7_15
-; NONEON-NOSVE-NEXT:  .LBB7_31: // %cond.load40
-; NONEON-NOSVE-NEXT:    add x9, x0, #28
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[6], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB7_16
-; NONEON-NOSVE-NEXT:  .LBB7_32: // %cond.load43
-; NONEON-NOSVE-NEXT:    add x8, x0, #30
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[7], [x8]
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI7_0]
+; NONEON-NOSVE-NEXT:    add x9, sp, #720
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1008]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #1010]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1008]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #1012]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #1014]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #1016]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #1018]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #1020]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    add w10, w11, w12
+; NONEON-NOSVE-NEXT:    add w11, w13, w14
+; NONEON-NOSVE-NEXT:    add w8, w8, w10
+; NONEON-NOSVE-NEXT:    add w10, w11, w15
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #1022]
+; NONEON-NOSVE-NEXT:    add w8, w8, w10
+; NONEON-NOSVE-NEXT:    add w8, w8, w11
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB7_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    ldr h2, [x0]
+; NONEON-NOSVE-NEXT:    stur wzr, [x9, #250]
+; NONEON-NOSVE-NEXT:    stur xzr, [x9, #242]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #960]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #974]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #960]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB7_3
+; NONEON-NOSVE-NEXT:    b .LBB7_4
+; NONEON-NOSVE-NEXT:  .LBB7_2:
+; NONEON-NOSVE-NEXT:    mov v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB7_4
+; NONEON-NOSVE-NEXT:  .LBB7_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #2]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #912]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #924]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #944]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #944]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #940]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #896]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #196]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #212]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #912]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #928]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #896]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #930]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #928]
+; NONEON-NOSVE-NEXT:  .LBB7_4: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB7_20
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB7_21
+; NONEON-NOSVE-NEXT:  .LBB7_6: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB7_22
+; NONEON-NOSVE-NEXT:  .LBB7_7: // %else11
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB7_23
+; NONEON-NOSVE-NEXT:  .LBB7_8: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB7_24
+; NONEON-NOSVE-NEXT:  .LBB7_9: // %else17
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB7_25
+; NONEON-NOSVE-NEXT:  .LBB7_10: // %else20
+; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB7_26
+; NONEON-NOSVE-NEXT:  .LBB7_11: // %else23
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB7_27
+; NONEON-NOSVE-NEXT:  .LBB7_12: // %else26
+; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB7_28
+; NONEON-NOSVE-NEXT:  .LBB7_13: // %else29
+; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB7_29
+; NONEON-NOSVE-NEXT:  .LBB7_14: // %else32
+; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB7_30
+; NONEON-NOSVE-NEXT:  .LBB7_15: // %else35
+; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB7_31
+; NONEON-NOSVE-NEXT:  .LBB7_16: // %else38
+; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB7_32
+; NONEON-NOSVE-NEXT:  .LBB7_17: // %else41
+; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB7_19
+; NONEON-NOSVE-NEXT:  .LBB7_18: // %cond.load43
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #30]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #16]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:  .LBB7_19: // %else44
+; NONEON-NOSVE-NEXT:    add sp, sp, #1024
+; NONEON-NOSVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB7_20: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #4]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #848]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #880]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #880]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #832]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #862]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #878]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #134]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #150]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #848]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #832]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #864]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #868]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #864]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB7_6
+; NONEON-NOSVE-NEXT:  .LBB7_21: // %cond.load7
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #6]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #784]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #792]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #816]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #816]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #808]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #784]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #768]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #788]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #800]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #804]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #768]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #806]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #800]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB7_7
+; NONEON-NOSVE-NEXT:  .LBB7_22: // %cond.load10
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #720]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #752]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #752]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #704]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #734]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #750]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #10]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #26]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #720]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #704]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #736]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #744]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #736]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB7_8
+; NONEON-NOSVE-NEXT:  .LBB7_23: // %cond.load13
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #10]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #656]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #668]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #688]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #688]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #684]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #656]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #640]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #664]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #672]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #680]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #640]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #682]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #672]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB7_9
+; NONEON-NOSVE-NEXT:  .LBB7_24: // %cond.load16
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #592]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #600]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #624]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #624]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #616]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #592]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #576]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #606]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #608]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #622]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #576]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #620]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #608]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB7_10
+; NONEON-NOSVE-NEXT:  .LBB7_25: // %cond.load19
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #14]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #512]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #520]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #560]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #560]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #552]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #512]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #528]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #524]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #544]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #556]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #528]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #558]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #544]
+; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB7_11
+; NONEON-NOSVE-NEXT:  .LBB7_26: // %cond.load22
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #464]
+; NONEON-NOSVE-NEXT:    add x9, sp, #464
+; NONEON-NOSVE-NEXT:    str h2, [sp, #496]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #496]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #448]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #478]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #494]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #10]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #2]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #26]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #448]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #480]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #480]
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB7_12
+; NONEON-NOSVE-NEXT:  .LBB7_27: // %cond.load25
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #18]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #400]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #412]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #432]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #428]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #384]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #196]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #212]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #400]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #384]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #418]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #416]
+; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB7_13
+; NONEON-NOSVE-NEXT:  .LBB7_28: // %cond.load28
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #20]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #336]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #368]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #350]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #366]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #134]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #150]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #320]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #352]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #352]
+; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB7_14
+; NONEON-NOSVE-NEXT:  .LBB7_29: // %cond.load31
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #22]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #280]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #304]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #272]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #276]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #288]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #256]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #294]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #288]
+; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB7_15
+; NONEON-NOSVE-NEXT:  .LBB7_30: // %cond.load34
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #24]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #208]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #240]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #222]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #238]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #10]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #26]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #192]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #224]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #224]
+; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB7_16
+; NONEON-NOSVE-NEXT:  .LBB7_31: // %cond.load37
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #26]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #156]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #176]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #152]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #160]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #128]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #160]
+; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB7_17
+; NONEON-NOSVE-NEXT:  .LBB7_32: // %cond.load40
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #28]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #94]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #64]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB7_18
+; NONEON-NOSVE-NEXT:    b .LBB7_19
   %load = call <16 x half> @llvm.masked.load.v16f16(ptr %src, i32 8, <16 x i1> %mask, <16 x half> zeroinitializer)
   ret <16 x half> %load
 }
@@ -939,27 +2389,38 @@ define <2 x float> @masked_load_v2f32(ptr %src, <2 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #31
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI8_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI8_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.2s, v0.2s, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addp v1.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    movi d0, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB8_3
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB8_4
-; NONEON-NOSVE-NEXT:  .LBB8_2: // %else2
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB8_3: // %cond.load
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    bfxil w8, w9, #0, #1
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB8_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
 ; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB8_2
-; NONEON-NOSVE-NEXT:  .LBB8_4: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x8, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.s }[1], [x8]
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #44]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB8_3
+; NONEON-NOSVE-NEXT:    b .LBB8_4
+; NONEON-NOSVE-NEXT:  .LBB8_2:
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI8_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x9, :lo12:.LCPI8_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB8_4
+; NONEON-NOSVE-NEXT:  .LBB8_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr s1, [x0, #4]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s1, s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:  .LBB8_4: // %else2
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %load = call <2 x float> @llvm.masked.load.v2f32(ptr %src, i32 8, <2 x i1> %mask, <2 x float> zeroinitializer)
   ret <2 x float> %load
@@ -980,37 +2441,80 @@ define <4 x float> @masked_load_v4f32(ptr %src, <4 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI9_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI9_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv h1, v0.4h
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB9_5
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB9_6
-; NONEON-NOSVE-NEXT:  .LBB9_2: // %else2
+; NONEON-NOSVE-NEXT:    sub sp, sp, #224
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 224
+; NONEON-NOSVE-NEXT:    str d0, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #210]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #212]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #208]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB9_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    ldr s0, [x0]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #204]
+; NONEON-NOSVE-NEXT:    stur xzr, [sp, #196]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB9_3
+; NONEON-NOSVE-NEXT:    b .LBB9_4
+; NONEON-NOSVE-NEXT:  .LBB9_2:
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI9_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x9, :lo12:.LCPI9_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB9_4
+; NONEON-NOSVE-NEXT:  .LBB9_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr s1, [x0, #4]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    str s1, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp s1, s0, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #160]
+; NONEON-NOSVE-NEXT:  .LBB9_4: // %else2
 ; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB9_7
-; NONEON-NOSVE-NEXT:  .LBB9_3: // %else5
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
 ; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB9_8
-; NONEON-NOSVE-NEXT:  .LBB9_4: // %else8
+; NONEON-NOSVE-NEXT:  .LBB9_6: // %else8
+; NONEON-NOSVE-NEXT:    add sp, sp, #224
 ; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB9_5: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB9_2
-; NONEON-NOSVE-NEXT:  .LBB9_6: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.s }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB9_3
 ; NONEON-NOSVE-NEXT:  .LBB9_7: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.s }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB9_4
+; NONEON-NOSVE-NEXT:    ldr s1, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    str s1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB9_6
 ; NONEON-NOSVE-NEXT:  .LBB9_8: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x8, x0, #12
-; NONEON-NOSVE-NEXT:    ld1 { v0.s }[3], [x8]
+; NONEON-NOSVE-NEXT:    ldr s1, [x0, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s1, s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #224
 ; NONEON-NOSVE-NEXT:    ret
   %load = call <4 x float> @llvm.masked.load.v4f32(ptr %src, i32 8, <4 x i1> %mask, <4 x float> zeroinitializer)
   ret <4 x float> %load
@@ -1064,63 +2568,170 @@ define <8 x float> @masked_load_v8f32(ptr %src, <8 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
+; NONEON-NOSVE-NEXT:    sub sp, sp, #496
+; NONEON-NOSVE-NEXT:    str x29, [sp, #480] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 496
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #464]
 ; NONEON-NOSVE-NEXT:    adrp x8, .LCPI10_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI10_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    movi v1.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    addv b2, v0.8b
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s2
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB10_9
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB10_10
-; NONEON-NOSVE-NEXT:  .LBB10_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB10_11
-; NONEON-NOSVE-NEXT:  .LBB10_3: // %else5
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB10_12
-; NONEON-NOSVE-NEXT:  .LBB10_4: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB10_13
-; NONEON-NOSVE-NEXT:  .LBB10_5: // %else11
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB10_14
-; NONEON-NOSVE-NEXT:  .LBB10_6: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB10_15
-; NONEON-NOSVE-NEXT:  .LBB10_7: // %else17
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB10_16
-; NONEON-NOSVE-NEXT:  .LBB10_8: // %else20
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB10_9: // %cond.load
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #466]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #467]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #465]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #468]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #469]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #470]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #464]
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w15, w15, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #471]
+; NONEON-NOSVE-NEXT:    and w11, w11, #0x2
+; NONEON-NOSVE-NEXT:    and w13, w13, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w11, w12, #0, #1
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x20
+; NONEON-NOSVE-NEXT:    orr w9, w9, w13
+; NONEON-NOSVE-NEXT:    and w13, w15, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w11, w9
+; NONEON-NOSVE-NEXT:    orr w11, w12, w13
+; NONEON-NOSVE-NEXT:    ldr q1, [x8, :lo12:.LCPI10_0]
+; NONEON-NOSVE-NEXT:    orr w9, w9, w11
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x80
+; NONEON-NOSVE-NEXT:    add w10, w9, w10
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    and w8, w10, #0xff
+; NONEON-NOSVE-NEXT:    tbz w10, #0, .LBB10_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
 ; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB10_2
-; NONEON-NOSVE-NEXT:  .LBB10_10: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.s }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB10_3
-; NONEON-NOSVE-NEXT:  .LBB10_11: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.s }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB10_4
-; NONEON-NOSVE-NEXT:  .LBB10_12: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x9, x0, #12
-; NONEON-NOSVE-NEXT:    ld1 { v0.s }[3], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB10_5
-; NONEON-NOSVE-NEXT:  .LBB10_13: // %cond.load10
-; NONEON-NOSVE-NEXT:    add x9, x0, #16
-; NONEON-NOSVE-NEXT:    ld1 { v1.s }[0], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB10_6
-; NONEON-NOSVE-NEXT:  .LBB10_14: // %cond.load13
-; NONEON-NOSVE-NEXT:    add x9, x0, #20
-; NONEON-NOSVE-NEXT:    ld1 { v1.s }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB10_7
-; NONEON-NOSVE-NEXT:  .LBB10_15: // %cond.load16
-; NONEON-NOSVE-NEXT:    add x9, x0, #24
-; NONEON-NOSVE-NEXT:    ld1 { v1.s }[2], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB10_8
-; NONEON-NOSVE-NEXT:  .LBB10_16: // %cond.load19
-; NONEON-NOSVE-NEXT:    add x8, x0, #28
-; NONEON-NOSVE-NEXT:    ld1 { v1.s }[3], [x8]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #460]
+; NONEON-NOSVE-NEXT:    stur xzr, [x9, #244]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #448]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #448]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB10_3
+; NONEON-NOSVE-NEXT:    b .LBB10_4
+; NONEON-NOSVE-NEXT:  .LBB10_2:
+; NONEON-NOSVE-NEXT:    mov v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB10_4
+; NONEON-NOSVE-NEXT:  .LBB10_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr s2, [x0, #4]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #400]
+; NONEON-NOSVE-NEXT:    str s2, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #432]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #384]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #412]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #428]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #408]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #424]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #400]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #384]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #420]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #416]
+; NONEON-NOSVE-NEXT:  .LBB10_4: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB10_12
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB10_13
+; NONEON-NOSVE-NEXT:  .LBB10_6: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB10_14
+; NONEON-NOSVE-NEXT:  .LBB10_7: // %else11
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB10_15
+; NONEON-NOSVE-NEXT:  .LBB10_8: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB10_16
+; NONEON-NOSVE-NEXT:  .LBB10_9: // %else17
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB10_11
+; NONEON-NOSVE-NEXT:  .LBB10_10: // %cond.load19
+; NONEON-NOSVE-NEXT:    ldr s2, [x0, #28]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str s2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s2, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:  .LBB10_11: // %else20
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #480] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #496
 ; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB10_12: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldr s2, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #336]
+; NONEON-NOSVE-NEXT:    str s2, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #368]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #348]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #340]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #336]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #320]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #352]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB10_6
+; NONEON-NOSVE-NEXT:  .LBB10_13: // %cond.load7
+; NONEON-NOSVE-NEXT:    ldr s2, [x0, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #256]
+; NONEON-NOSVE-NEXT:    str s2, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #304]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #288]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #264]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #272]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #288]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB10_7
+; NONEON-NOSVE-NEXT:  .LBB10_14: // %cond.load10
+; NONEON-NOSVE-NEXT:    ldr s2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #208]
+; NONEON-NOSVE-NEXT:    str s2, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #240]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #220]
+; NONEON-NOSVE-NEXT:    str s1, [sp, #236]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #4]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #20]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #192]
+; NONEON-NOSVE-NEXT:    str s1, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #224]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB10_8
+; NONEON-NOSVE-NEXT:  .LBB10_15: // %cond.load13
+; NONEON-NOSVE-NEXT:    ldr s2, [x0, #20]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #144]
+; NONEON-NOSVE-NEXT:    str s2, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #176]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp s1, s2, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr s2, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp s2, s1, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #160]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB10_9
+; NONEON-NOSVE-NEXT:  .LBB10_16: // %cond.load16
+; NONEON-NOSVE-NEXT:    ldr s2, [x0, #24]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    str s2, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s2, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp s1, s3, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp s1, s2, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB10_10
+; NONEON-NOSVE-NEXT:    b .LBB10_11
   %load = call <8 x float> @llvm.masked.load.v8f32(ptr %src, i32 8, <8 x i1> %mask, <8 x float> zeroinitializer)
   ret <8 x float> %load
 }
@@ -1140,25 +2751,38 @@ define <2 x double> @masked_load_v2f64(ptr %src, <2 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #31
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI11_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI11_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.2s, v0.2s, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addp v1.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB11_3
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB11_4
-; NONEON-NOSVE-NEXT:  .LBB11_2: // %else2
-; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB11_3: // %cond.load
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    str d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    bfxil w8, w9, #0, #1
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB11_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB11_2
-; NONEON-NOSVE-NEXT:  .LBB11_4: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x8, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.d }[1], [x8]
+; NONEON-NOSVE-NEXT:    str xzr, [sp, #72]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB11_3
+; NONEON-NOSVE-NEXT:    b .LBB11_4
+; NONEON-NOSVE-NEXT:  .LBB11_2:
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI11_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x9, :lo12:.LCPI11_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB11_4
+; NONEON-NOSVE-NEXT:  .LBB11_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr d1, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:  .LBB11_4: // %else2
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %load = call <2 x double> @llvm.masked.load.v2f64(ptr %src, i32 8, <2 x i1> %mask, <2 x double> zeroinitializer)
   ret <2 x double> %load
@@ -1188,38 +2812,74 @@ define <4 x double> @masked_load_v4f64(ptr %src, <4 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI12_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI12_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    movi v1.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    addv h2, v0.4h
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
-; NONEON-NOSVE-NEXT:    fmov w8, s2
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB12_5
-; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB12_6
-; NONEON-NOSVE-NEXT:  .LBB12_2: // %else2
+; NONEON-NOSVE-NEXT:    sub sp, sp, #224
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 224
+; NONEON-NOSVE-NEXT:    str d0, [sp, #208]
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI12_0
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #210]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #212]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI12_0]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x4
+; NONEON-NOSVE-NEXT:    and w11, w11, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w12, #0, #1
+; NONEON-NOSVE-NEXT:    orr w10, w10, w11
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB12_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str xzr, [sp, #200]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB12_3
+; NONEON-NOSVE-NEXT:    b .LBB12_4
+; NONEON-NOSVE-NEXT:  .LBB12_2:
+; NONEON-NOSVE-NEXT:    mov v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB12_4
+; NONEON-NOSVE-NEXT:  .LBB12_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr d2, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    stp d2, d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #160]
+; NONEON-NOSVE-NEXT:  .LBB12_4: // %else2
 ; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB12_7
-; NONEON-NOSVE-NEXT:  .LBB12_3: // %else5
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
 ; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB12_8
-; NONEON-NOSVE-NEXT:  .LBB12_4: // %else8
+; NONEON-NOSVE-NEXT:  .LBB12_6: // %else8
+; NONEON-NOSVE-NEXT:    add sp, sp, #224
 ; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB12_5: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB12_2
-; NONEON-NOSVE-NEXT:  .LBB12_6: // %cond.load1
-; NONEON-NOSVE-NEXT:    add x9, x0, #8
-; NONEON-NOSVE-NEXT:    ld1 { v0.d }[1], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB12_3
 ; NONEON-NOSVE-NEXT:  .LBB12_7: // %cond.load4
-; NONEON-NOSVE-NEXT:    add x9, x0, #16
-; NONEON-NOSVE-NEXT:    ld1 { v1.d }[0], [x9]
-; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB12_4
+; NONEON-NOSVE-NEXT:    ldr d2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB12_6
 ; NONEON-NOSVE-NEXT:  .LBB12_8: // %cond.load7
-; NONEON-NOSVE-NEXT:    add x8, x0, #24
-; NONEON-NOSVE-NEXT:    ld1 { v1.d }[1], [x8]
+; NONEON-NOSVE-NEXT:    ldr d2, [x0, #24]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d2, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #224
 ; NONEON-NOSVE-NEXT:    ret
   %load = call <4 x double> @llvm.masked.load.v4f64(ptr %src, i32 8, <4 x i1> %mask, <4 x double> zeroinitializer)
   ret <4 x double> %load
@@ -1249,34 +2909,51 @@ define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_zext_v3i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #16
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
 ; NONEON-NOSVE-NEXT:    and w8, w1, #0x1
 ; NONEON-NOSVE-NEXT:    bfi w8, w2, #1, #1
 ; NONEON-NOSVE-NEXT:    bfi w8, w3, #2, #1
 ; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB13_2
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
+; NONEON-NOSVE-NEXT:    ldrh w9, [x0]
+; NONEON-NOSVE-NEXT:    stur wzr, [sp, #66]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB13_3
 ; NONEON-NOSVE-NEXT:    b .LBB13_4
 ; NONEON-NOSVE-NEXT:  .LBB13_2:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI13_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x9, :lo12:.LCPI13_0]
 ; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB13_4
 ; NONEON-NOSVE-NEXT:  .LBB13_3: // %cond.load1
-; NONEON-NOSVE-NEXT:    mov v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[1], [x9]
-; NONEON-NOSVE-NEXT:    mov v1.h[2], v0.h[2]
-; NONEON-NOSVE-NEXT:    fmov d0, d1
+; NONEON-NOSVE-NEXT:    ldrh w9, [x0, #2]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
 ; NONEON-NOSVE-NEXT:  .LBB13_4: // %else2
 ; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB13_6
 ; NONEON-NOSVE-NEXT:  // %bb.5: // %cond.load4
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v0.h[1]
-; NONEON-NOSVE-NEXT:    add x8, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0, #4]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
 ; NONEON-NOSVE-NEXT:  .LBB13_6: // %else5
-; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %load_value = tail call <3 x i16> @llvm.masked.load.v3i16.p0(ptr %load_ptr, i32 4, <3 x i1> %pm, <3 x i16> zeroinitializer)
   %extend = zext <3 x i16> %load_value to <3 x i32>
@@ -1307,34 +2984,51 @@ define <3 x i32> @masked_load_sext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
 ;
 ; NONEON-NOSVE-LABEL: masked_load_sext_v3i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #16
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
 ; NONEON-NOSVE-NEXT:    and w8, w1, #0x1
 ; NONEON-NOSVE-NEXT:    bfi w8, w2, #1, #1
 ; NONEON-NOSVE-NEXT:    bfi w8, w3, #2, #1
 ; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB14_2
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
-; NONEON-NOSVE-NEXT:    ldr h0, [x0]
+; NONEON-NOSVE-NEXT:    ldrh w9, [x0]
+; NONEON-NOSVE-NEXT:    stur wzr, [sp, #66]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB14_3
 ; NONEON-NOSVE-NEXT:    b .LBB14_4
 ; NONEON-NOSVE-NEXT:  .LBB14_2:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI14_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x9, :lo12:.LCPI14_0]
 ; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB14_4
 ; NONEON-NOSVE-NEXT:  .LBB14_3: // %cond.load1
-; NONEON-NOSVE-NEXT:    mov v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add x9, x0, #2
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[1], [x9]
-; NONEON-NOSVE-NEXT:    mov v1.h[2], v0.h[2]
-; NONEON-NOSVE-NEXT:    fmov d0, d1
+; NONEON-NOSVE-NEXT:    ldrh w9, [x0, #2]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
 ; NONEON-NOSVE-NEXT:  .LBB14_4: // %else2
 ; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB14_6
 ; NONEON-NOSVE-NEXT:  // %bb.5: // %cond.load4
-; NONEON-NOSVE-NEXT:    mov v0.h[1], v0.h[1]
-; NONEON-NOSVE-NEXT:    add x8, x0, #4
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0, #4]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
 ; NONEON-NOSVE-NEXT:  .LBB14_6: // %else5
-; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %load_value = tail call <3 x i16> @llvm.masked.load.v3i16.p0(ptr %load_ptr, i32 4, <3 x i1> %pm, <3 x i16> zeroinitializer)
   %extend = sext <3 x i16> %load_value to <3 x i32>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
index 0904399558aee1..a79ce9db9abfde 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
@@ -23,13 +23,21 @@ define void @masked_store_v4i8(ptr %dst, <4 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI0_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI0_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
 ; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB0_5
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB0_6
@@ -38,6 +46,7 @@ define void @masked_store_v4i8(ptr %dst, <4 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB0_3: // %else4
 ; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB0_8
 ; NONEON-NOSVE-NEXT:  .LBB0_4: // %else6
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB0_5: // %cond.store
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0]
@@ -50,6 +59,7 @@ define void @masked_store_v4i8(ptr %dst, <4 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB0_4
 ; NONEON-NOSVE-NEXT:  .LBB0_8: // %cond.store5
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #3]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v4i8(<4 x i8> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask)
   ret void
@@ -69,14 +79,39 @@ define void @masked_store_v8i8(ptr %dst, <8 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI1_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI1_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB1_9
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #6]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x2
+; NONEON-NOSVE-NEXT:    and w12, w12, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w10, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w11, w13, #0x20
+; NONEON-NOSVE-NEXT:    orr w8, w8, w12
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    orr w10, w11, w12
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x80
+; NONEON-NOSVE-NEXT:    add w9, w8, w9
+; NONEON-NOSVE-NEXT:    and w8, w9, #0xff
+; NONEON-NOSVE-NEXT:    tbnz w9, #0, .LBB1_9
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB1_10
 ; NONEON-NOSVE-NEXT:  .LBB1_2: // %else2
@@ -92,6 +127,7 @@ define void @masked_store_v8i8(ptr %dst, <8 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB1_7: // %else12
 ; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB1_16
 ; NONEON-NOSVE-NEXT:  .LBB1_8: // %else14
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB1_9: // %cond.store
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0]
@@ -116,6 +152,7 @@ define void @masked_store_v8i8(ptr %dst, <8 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB1_8
 ; NONEON-NOSVE-NEXT:  .LBB1_16: // %cond.store13
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #7]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v8i8(<8 x i8> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask)
   ret void
@@ -135,15 +172,89 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI2_0
-; NONEON-NOSVE-NEXT:    ldr q1, [x8, :lo12:.LCPI2_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    addv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w10, w12, w13
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w10, w14
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w10
 ; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB2_17
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB2_18
@@ -176,6 +287,7 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB2_15: // %else28
 ; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB2_32
 ; NONEON-NOSVE-NEXT:  .LBB2_16: // %else30
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB2_17: // %cond.store
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0]
@@ -224,6 +336,7 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB2_16
 ; NONEON-NOSVE-NEXT:  .LBB2_32: // %cond.store29
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #15]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v16i8(<16 x i8> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask)
   ret void
@@ -308,241 +421,328 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72]
-; NONEON-NOSVE-NEXT:    fmov s1, w1
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #80]
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #88]
-; NONEON-NOSVE-NEXT:    mov v1.b[1], w2
-; NONEON-NOSVE-NEXT:    mov v0.b[1], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp]
-; NONEON-NOSVE-NEXT:    mov v1.b[2], w3
-; NONEON-NOSVE-NEXT:    mov v0.b[2], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
-; NONEON-NOSVE-NEXT:    mov v1.b[3], w4
-; NONEON-NOSVE-NEXT:    mov v0.b[3], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #104]
-; NONEON-NOSVE-NEXT:    mov v1.b[4], w5
-; NONEON-NOSVE-NEXT:    mov v0.b[4], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
-; NONEON-NOSVE-NEXT:    mov v1.b[5], w6
-; NONEON-NOSVE-NEXT:    mov v0.b[5], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #120]
-; NONEON-NOSVE-NEXT:    mov v1.b[6], w7
-; NONEON-NOSVE-NEXT:    mov v0.b[6], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128]
-; NONEON-NOSVE-NEXT:    mov v1.b[7], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
-; NONEON-NOSVE-NEXT:    mov v0.b[7], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #136]
-; NONEON-NOSVE-NEXT:    mov v1.b[8], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
-; NONEON-NOSVE-NEXT:    mov v0.b[8], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144]
-; NONEON-NOSVE-NEXT:    mov v1.b[9], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
-; NONEON-NOSVE-NEXT:    mov v0.b[9], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #152]
-; NONEON-NOSVE-NEXT:    mov v1.b[10], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #32]
-; NONEON-NOSVE-NEXT:    mov v0.b[10], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160]
-; NONEON-NOSVE-NEXT:    mov v1.b[11], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #40]
-; NONEON-NOSVE-NEXT:    mov v0.b[11], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #168]
-; NONEON-NOSVE-NEXT:    mov v1.b[12], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
-; NONEON-NOSVE-NEXT:    mov v0.b[12], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176]
-; NONEON-NOSVE-NEXT:    mov v1.b[13], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #56]
-; NONEON-NOSVE-NEXT:    mov v0.b[13], w8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #160]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x1
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #264]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #256]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #248]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x20
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #240]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #232]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #224]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #208]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x2
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #200]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #192]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
 ; NONEON-NOSVE-NEXT:    ldr w8, [sp, #184]
-; NONEON-NOSVE-NEXT:    mov v1.b[14], w9
-; NONEON-NOSVE-NEXT:    ldr w9, [sp, #64]
-; NONEON-NOSVE-NEXT:    mov v0.b[14], w8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192]
-; NONEON-NOSVE-NEXT:    mov v1.b[15], w9
-; NONEON-NOSVE-NEXT:    mov v0.b[15], w8
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI3_0
-; NONEON-NOSVE-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
-; NONEON-NOSVE-NEXT:    shl v1.16b, v1.16b, #7
-; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    cmlt v1.16b, v1.16b, #0
-; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v2.16b
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
-; NONEON-NOSVE-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    zip1 v1.16b, v1.16b, v3.16b
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    addv h1, v1.8h
-; NONEON-NOSVE-NEXT:    addv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w8, s1
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    bfi w8, w9, #16, #16
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB3_33
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x20
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w11, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w10, w12, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x2
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #136]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x1
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w1, #0x1
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #104]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #80]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #120]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x20
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x10
+; NONEON-NOSVE-NEXT:    zip1 v2.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    and w8, w11, #0x4
+; NONEON-NOSVE-NEXT:    sbfx w10, w12, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x2
+; NONEON-NOSVE-NEXT:    sbfx w9, w7, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w10, w6, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w5, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x20
+; NONEON-NOSVE-NEXT:    sbfx w10, w4, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w9, w3, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w10, w2, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #34]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w12, w13
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #42]
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w10, w12, w11
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w12, w13, w14
+; NONEON-NOSVE-NEXT:    add w14, w15, w16
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w10, w10, w12
+; NONEON-NOSVE-NEXT:    add w11, w14, w11
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w10, w8, w13
+; NONEON-NOSVE-NEXT:    add w8, w9, w12
+; NONEON-NOSVE-NEXT:    bfi w8, w10, #16, #16
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB3_34
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
-; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB3_34
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB3_35
 ; NONEON-NOSVE-NEXT:  .LBB3_2: // %else2
-; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB3_35
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB3_36
 ; NONEON-NOSVE-NEXT:  .LBB3_3: // %else4
-; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB3_36
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB3_37
 ; NONEON-NOSVE-NEXT:  .LBB3_4: // %else6
-; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB3_37
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB3_38
 ; NONEON-NOSVE-NEXT:  .LBB3_5: // %else8
-; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB3_38
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB3_39
 ; NONEON-NOSVE-NEXT:  .LBB3_6: // %else10
-; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB3_39
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB3_40
 ; NONEON-NOSVE-NEXT:  .LBB3_7: // %else12
-; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB3_40
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB3_41
 ; NONEON-NOSVE-NEXT:  .LBB3_8: // %else14
-; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB3_41
+; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB3_42
 ; NONEON-NOSVE-NEXT:  .LBB3_9: // %else16
-; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB3_42
+; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB3_43
 ; NONEON-NOSVE-NEXT:  .LBB3_10: // %else18
-; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB3_43
+; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB3_44
 ; NONEON-NOSVE-NEXT:  .LBB3_11: // %else20
-; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB3_44
+; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB3_45
 ; NONEON-NOSVE-NEXT:  .LBB3_12: // %else22
-; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB3_45
+; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB3_46
 ; NONEON-NOSVE-NEXT:  .LBB3_13: // %else24
-; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB3_46
+; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB3_47
 ; NONEON-NOSVE-NEXT:  .LBB3_14: // %else26
-; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB3_47
+; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB3_48
 ; NONEON-NOSVE-NEXT:  .LBB3_15: // %else28
-; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB3_48
+; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB3_49
 ; NONEON-NOSVE-NEXT:  .LBB3_16: // %else30
-; NONEON-NOSVE-NEXT:    tbnz w8, #16, .LBB3_49
+; NONEON-NOSVE-NEXT:    tbnz w8, #16, .LBB3_50
 ; NONEON-NOSVE-NEXT:  .LBB3_17: // %else32
-; NONEON-NOSVE-NEXT:    tbnz w8, #17, .LBB3_50
+; NONEON-NOSVE-NEXT:    tbnz w8, #17, .LBB3_51
 ; NONEON-NOSVE-NEXT:  .LBB3_18: // %else34
-; NONEON-NOSVE-NEXT:    tbnz w8, #18, .LBB3_51
+; NONEON-NOSVE-NEXT:    tbnz w8, #18, .LBB3_52
 ; NONEON-NOSVE-NEXT:  .LBB3_19: // %else36
-; NONEON-NOSVE-NEXT:    tbnz w8, #19, .LBB3_52
+; NONEON-NOSVE-NEXT:    tbnz w8, #19, .LBB3_53
 ; NONEON-NOSVE-NEXT:  .LBB3_20: // %else38
-; NONEON-NOSVE-NEXT:    tbnz w8, #20, .LBB3_53
+; NONEON-NOSVE-NEXT:    tbnz w8, #20, .LBB3_54
 ; NONEON-NOSVE-NEXT:  .LBB3_21: // %else40
-; NONEON-NOSVE-NEXT:    tbnz w8, #21, .LBB3_54
+; NONEON-NOSVE-NEXT:    tbnz w8, #21, .LBB3_55
 ; NONEON-NOSVE-NEXT:  .LBB3_22: // %else42
-; NONEON-NOSVE-NEXT:    tbnz w8, #22, .LBB3_55
+; NONEON-NOSVE-NEXT:    tbnz w8, #22, .LBB3_56
 ; NONEON-NOSVE-NEXT:  .LBB3_23: // %else44
-; NONEON-NOSVE-NEXT:    tbnz w8, #23, .LBB3_56
+; NONEON-NOSVE-NEXT:    tbnz w8, #23, .LBB3_57
 ; NONEON-NOSVE-NEXT:  .LBB3_24: // %else46
-; NONEON-NOSVE-NEXT:    tbnz w8, #24, .LBB3_57
+; NONEON-NOSVE-NEXT:    tbnz w8, #24, .LBB3_58
 ; NONEON-NOSVE-NEXT:  .LBB3_25: // %else48
-; NONEON-NOSVE-NEXT:    tbnz w8, #25, .LBB3_58
+; NONEON-NOSVE-NEXT:    tbnz w8, #25, .LBB3_59
 ; NONEON-NOSVE-NEXT:  .LBB3_26: // %else50
-; NONEON-NOSVE-NEXT:    tbnz w8, #26, .LBB3_59
+; NONEON-NOSVE-NEXT:    tbnz w8, #26, .LBB3_60
 ; NONEON-NOSVE-NEXT:  .LBB3_27: // %else52
-; NONEON-NOSVE-NEXT:    tbnz w8, #27, .LBB3_60
+; NONEON-NOSVE-NEXT:    tbnz w8, #27, .LBB3_61
 ; NONEON-NOSVE-NEXT:  .LBB3_28: // %else54
-; NONEON-NOSVE-NEXT:    tbnz w8, #28, .LBB3_61
+; NONEON-NOSVE-NEXT:    tbnz w8, #28, .LBB3_62
 ; NONEON-NOSVE-NEXT:  .LBB3_29: // %else56
-; NONEON-NOSVE-NEXT:    tbnz w8, #29, .LBB3_62
+; NONEON-NOSVE-NEXT:    tbnz w8, #29, .LBB3_63
 ; NONEON-NOSVE-NEXT:  .LBB3_30: // %else58
-; NONEON-NOSVE-NEXT:    tbnz w8, #30, .LBB3_63
+; NONEON-NOSVE-NEXT:    tbnz w8, #30, .LBB3_64
 ; NONEON-NOSVE-NEXT:  .LBB3_31: // %else60
-; NONEON-NOSVE-NEXT:    tbnz w8, #31, .LBB3_64
-; NONEON-NOSVE-NEXT:  .LBB3_32: // %else62
+; NONEON-NOSVE-NEXT:    tbz w8, #31, .LBB3_33
+; NONEON-NOSVE-NEXT:  .LBB3_32: // %cond.store61
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #31]
+; NONEON-NOSVE-NEXT:  .LBB3_33: // %else62
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
-; NONEON-NOSVE-NEXT:  .LBB3_33: // %cond.store
+; NONEON-NOSVE-NEXT:  .LBB3_34: // %cond.store
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0]
 ; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB3_2
-; NONEON-NOSVE-NEXT:  .LBB3_34: // %cond.store1
+; NONEON-NOSVE-NEXT:  .LBB3_35: // %cond.store1
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #1]
 ; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB3_3
-; NONEON-NOSVE-NEXT:  .LBB3_35: // %cond.store3
+; NONEON-NOSVE-NEXT:  .LBB3_36: // %cond.store3
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #2]
 ; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB3_4
-; NONEON-NOSVE-NEXT:  .LBB3_36: // %cond.store5
+; NONEON-NOSVE-NEXT:  .LBB3_37: // %cond.store5
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #3]
 ; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB3_5
-; NONEON-NOSVE-NEXT:  .LBB3_37: // %cond.store7
+; NONEON-NOSVE-NEXT:  .LBB3_38: // %cond.store7
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #4]
 ; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB3_6
-; NONEON-NOSVE-NEXT:  .LBB3_38: // %cond.store9
+; NONEON-NOSVE-NEXT:  .LBB3_39: // %cond.store9
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #5]
 ; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB3_7
-; NONEON-NOSVE-NEXT:  .LBB3_39: // %cond.store11
+; NONEON-NOSVE-NEXT:  .LBB3_40: // %cond.store11
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #6]
 ; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB3_8
-; NONEON-NOSVE-NEXT:  .LBB3_40: // %cond.store13
+; NONEON-NOSVE-NEXT:  .LBB3_41: // %cond.store13
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #7]
 ; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB3_9
-; NONEON-NOSVE-NEXT:  .LBB3_41: // %cond.store15
+; NONEON-NOSVE-NEXT:  .LBB3_42: // %cond.store15
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #8]
 ; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB3_10
-; NONEON-NOSVE-NEXT:  .LBB3_42: // %cond.store17
+; NONEON-NOSVE-NEXT:  .LBB3_43: // %cond.store17
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #9]
 ; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB3_11
-; NONEON-NOSVE-NEXT:  .LBB3_43: // %cond.store19
+; NONEON-NOSVE-NEXT:  .LBB3_44: // %cond.store19
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #10]
 ; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB3_12
-; NONEON-NOSVE-NEXT:  .LBB3_44: // %cond.store21
+; NONEON-NOSVE-NEXT:  .LBB3_45: // %cond.store21
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #11]
 ; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB3_13
-; NONEON-NOSVE-NEXT:  .LBB3_45: // %cond.store23
+; NONEON-NOSVE-NEXT:  .LBB3_46: // %cond.store23
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #12]
 ; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB3_14
-; NONEON-NOSVE-NEXT:  .LBB3_46: // %cond.store25
+; NONEON-NOSVE-NEXT:  .LBB3_47: // %cond.store25
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #13]
 ; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB3_15
-; NONEON-NOSVE-NEXT:  .LBB3_47: // %cond.store27
+; NONEON-NOSVE-NEXT:  .LBB3_48: // %cond.store27
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #14]
 ; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB3_16
-; NONEON-NOSVE-NEXT:  .LBB3_48: // %cond.store29
+; NONEON-NOSVE-NEXT:  .LBB3_49: // %cond.store29
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #15]
 ; NONEON-NOSVE-NEXT:    tbz w8, #16, .LBB3_17
-; NONEON-NOSVE-NEXT:  .LBB3_49: // %cond.store31
+; NONEON-NOSVE-NEXT:  .LBB3_50: // %cond.store31
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #16]
 ; NONEON-NOSVE-NEXT:    tbz w8, #17, .LBB3_18
-; NONEON-NOSVE-NEXT:  .LBB3_50: // %cond.store33
+; NONEON-NOSVE-NEXT:  .LBB3_51: // %cond.store33
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #17]
 ; NONEON-NOSVE-NEXT:    tbz w8, #18, .LBB3_19
-; NONEON-NOSVE-NEXT:  .LBB3_51: // %cond.store35
+; NONEON-NOSVE-NEXT:  .LBB3_52: // %cond.store35
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #18]
 ; NONEON-NOSVE-NEXT:    tbz w8, #19, .LBB3_20
-; NONEON-NOSVE-NEXT:  .LBB3_52: // %cond.store37
+; NONEON-NOSVE-NEXT:  .LBB3_53: // %cond.store37
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #19]
 ; NONEON-NOSVE-NEXT:    tbz w8, #20, .LBB3_21
-; NONEON-NOSVE-NEXT:  .LBB3_53: // %cond.store39
+; NONEON-NOSVE-NEXT:  .LBB3_54: // %cond.store39
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #20]
 ; NONEON-NOSVE-NEXT:    tbz w8, #21, .LBB3_22
-; NONEON-NOSVE-NEXT:  .LBB3_54: // %cond.store41
+; NONEON-NOSVE-NEXT:  .LBB3_55: // %cond.store41
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #21]
 ; NONEON-NOSVE-NEXT:    tbz w8, #22, .LBB3_23
-; NONEON-NOSVE-NEXT:  .LBB3_55: // %cond.store43
+; NONEON-NOSVE-NEXT:  .LBB3_56: // %cond.store43
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #22]
 ; NONEON-NOSVE-NEXT:    tbz w8, #23, .LBB3_24
-; NONEON-NOSVE-NEXT:  .LBB3_56: // %cond.store45
+; NONEON-NOSVE-NEXT:  .LBB3_57: // %cond.store45
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #23]
 ; NONEON-NOSVE-NEXT:    tbz w8, #24, .LBB3_25
-; NONEON-NOSVE-NEXT:  .LBB3_57: // %cond.store47
+; NONEON-NOSVE-NEXT:  .LBB3_58: // %cond.store47
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #24]
 ; NONEON-NOSVE-NEXT:    tbz w8, #25, .LBB3_26
-; NONEON-NOSVE-NEXT:  .LBB3_58: // %cond.store49
+; NONEON-NOSVE-NEXT:  .LBB3_59: // %cond.store49
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #25]
 ; NONEON-NOSVE-NEXT:    tbz w8, #26, .LBB3_27
-; NONEON-NOSVE-NEXT:  .LBB3_59: // %cond.store51
+; NONEON-NOSVE-NEXT:  .LBB3_60: // %cond.store51
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #26]
 ; NONEON-NOSVE-NEXT:    tbz w8, #27, .LBB3_28
-; NONEON-NOSVE-NEXT:  .LBB3_60: // %cond.store53
+; NONEON-NOSVE-NEXT:  .LBB3_61: // %cond.store53
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #27]
 ; NONEON-NOSVE-NEXT:    tbz w8, #28, .LBB3_29
-; NONEON-NOSVE-NEXT:  .LBB3_61: // %cond.store55
+; NONEON-NOSVE-NEXT:  .LBB3_62: // %cond.store55
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #28]
 ; NONEON-NOSVE-NEXT:    tbz w8, #29, .LBB3_30
-; NONEON-NOSVE-NEXT:  .LBB3_62: // %cond.store57
+; NONEON-NOSVE-NEXT:  .LBB3_63: // %cond.store57
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #29]
 ; NONEON-NOSVE-NEXT:    tbz w8, #30, .LBB3_31
-; NONEON-NOSVE-NEXT:  .LBB3_63: // %cond.store59
+; NONEON-NOSVE-NEXT:  .LBB3_64: // %cond.store59
 ; NONEON-NOSVE-NEXT:    strb wzr, [x0, #30]
-; NONEON-NOSVE-NEXT:    tbz w8, #31, .LBB3_32
-; NONEON-NOSVE-NEXT:  .LBB3_64: // %cond.store61
-; NONEON-NOSVE-NEXT:    strb wzr, [x0, #31]
-; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:    tbnz w8, #31, .LBB3_32
+; NONEON-NOSVE-NEXT:    b .LBB3_33
   call void @llvm.masked.store.v32i8(<32 x i8> zeroinitializer, ptr %dst, i32 8, <32 x i1> %mask)
   ret void
 }
@@ -571,17 +771,18 @@ define void @masked_store_v2f16(ptr %dst, <2 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #31
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI4_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI4_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.2s, v0.2s, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addp v0.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    bfxil w8, w9, #0, #1
 ; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB4_3
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB4_4
 ; NONEON-NOSVE-NEXT:  .LBB4_2: // %else2
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB4_3: // %cond.store
 ; NONEON-NOSVE-NEXT:    fmov s0, wzr
@@ -590,6 +791,7 @@ define void @masked_store_v2f16(ptr %dst, <2 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB4_4: // %cond.store1
 ; NONEON-NOSVE-NEXT:    fmov s0, wzr
 ; NONEON-NOSVE-NEXT:    str h0, [x0, #2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v2f16(<2 x half> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask)
   ret void
@@ -609,13 +811,21 @@ define void @masked_store_v4f16(ptr %dst, <4 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI5_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI5_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
 ; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB5_5
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB5_6
@@ -624,6 +834,7 @@ define void @masked_store_v4f16(ptr %dst, <4 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB5_3: // %else4
 ; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB5_8
 ; NONEON-NOSVE-NEXT:  .LBB5_4: // %else6
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB5_5: // %cond.store
 ; NONEON-NOSVE-NEXT:    fmov s0, wzr
@@ -640,6 +851,7 @@ define void @masked_store_v4f16(ptr %dst, <4 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB5_8: // %cond.store5
 ; NONEON-NOSVE-NEXT:    fmov s0, wzr
 ; NONEON-NOSVE-NEXT:    str h0, [x0, #6]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v4f16(<4 x half> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask)
   ret void
@@ -660,14 +872,39 @@ define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI6_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI6_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB6_9
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #6]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x2
+; NONEON-NOSVE-NEXT:    and w12, w12, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w10, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w11, w13, #0x20
+; NONEON-NOSVE-NEXT:    orr w8, w8, w12
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    orr w10, w11, w12
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x80
+; NONEON-NOSVE-NEXT:    add w9, w8, w9
+; NONEON-NOSVE-NEXT:    and w8, w9, #0xff
+; NONEON-NOSVE-NEXT:    tbnz w9, #0, .LBB6_9
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB6_10
 ; NONEON-NOSVE-NEXT:  .LBB6_2: // %else2
@@ -683,6 +920,7 @@ define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB6_7: // %else12
 ; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB6_16
 ; NONEON-NOSVE-NEXT:  .LBB6_8: // %else14
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB6_9: // %cond.store
 ; NONEON-NOSVE-NEXT:    fmov s0, wzr
@@ -715,6 +953,7 @@ define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB6_16: // %cond.store13
 ; NONEON-NOSVE-NEXT:    fmov s0, wzr
 ; NONEON-NOSVE-NEXT:    str h0, [x0, #14]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v8f16(<8 x half> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask)
   ret void
@@ -743,15 +982,89 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI7_0
-; NONEON-NOSVE-NEXT:    ldr q1, [x8, :lo12:.LCPI7_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    addv h0, v0.8h
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w10, w12, w13
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w10, w14
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w10
 ; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB7_17
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB7_18
@@ -784,6 +1097,7 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB7_15: // %else28
 ; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB7_32
 ; NONEON-NOSVE-NEXT:  .LBB7_16: // %else30
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB7_17: // %cond.store
 ; NONEON-NOSVE-NEXT:    fmov s0, wzr
@@ -848,6 +1162,7 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB7_32: // %cond.store29
 ; NONEON-NOSVE-NEXT:    fmov s0, wzr
 ; NONEON-NOSVE-NEXT:    str h0, [x0, #30]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v16f16(<16 x half> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask)
   ret void
@@ -868,13 +1183,21 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI8_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI8_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
 ; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB8_5
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB8_6
@@ -883,6 +1206,7 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB8_3: // %else4
 ; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB8_8
 ; NONEON-NOSVE-NEXT:  .LBB8_4: // %else6
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB8_5: // %cond.store
 ; NONEON-NOSVE-NEXT:    str wzr, [x0]
@@ -895,6 +1219,7 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB8_4
 ; NONEON-NOSVE-NEXT:  .LBB8_8: // %cond.store5
 ; NONEON-NOSVE-NEXT:    str wzr, [x0, #12]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v4f32(<4 x float> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask)
   ret void
@@ -949,14 +1274,39 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI9_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI9_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv b0, v0.8b
-; NONEON-NOSVE-NEXT:    fmov w8, s0
-; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB9_9
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #6]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x2
+; NONEON-NOSVE-NEXT:    and w12, w12, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w10, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w11, w13, #0x20
+; NONEON-NOSVE-NEXT:    orr w8, w8, w12
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    orr w10, w11, w12
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x80
+; NONEON-NOSVE-NEXT:    add w9, w8, w9
+; NONEON-NOSVE-NEXT:    and w8, w9, #0xff
+; NONEON-NOSVE-NEXT:    tbnz w9, #0, .LBB9_9
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB9_10
 ; NONEON-NOSVE-NEXT:  .LBB9_2: // %else2
@@ -972,6 +1322,7 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB9_7: // %else12
 ; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB9_16
 ; NONEON-NOSVE-NEXT:  .LBB9_8: // %else14
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB9_9: // %cond.store
 ; NONEON-NOSVE-NEXT:    str wzr, [x0]
@@ -996,6 +1347,7 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB9_8
 ; NONEON-NOSVE-NEXT:  .LBB9_16: // %cond.store13
 ; NONEON-NOSVE-NEXT:    str wzr, [x0, #28]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v8f32(<8 x float> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask)
   ret void
@@ -1016,23 +1368,25 @@ define void @masked_store_v2f64(ptr %dst, <2 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #31
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI10_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI10_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.2s, v0.2s, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addp v0.2s, v0.2s, v0.2s
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    bfxil w8, w9, #0, #1
 ; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB10_3
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB10_4
 ; NONEON-NOSVE-NEXT:  .LBB10_2: // %else2
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB10_3: // %cond.store
 ; NONEON-NOSVE-NEXT:    str xzr, [x0]
 ; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB10_2
 ; NONEON-NOSVE-NEXT:  .LBB10_4: // %cond.store1
 ; NONEON-NOSVE-NEXT:    str xzr, [x0, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v2f64(<2 x double> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask)
   ret void
@@ -1061,13 +1415,21 @@ define void @masked_store_v4f64(ptr %dst, <4 x i1> %mask) {
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI11_0
-; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI11_0]
-; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    addv h0, v0.4h
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
 ; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB11_5
 ; NONEON-NOSVE-NEXT:  // %bb.1: // %else
 ; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB11_6
@@ -1076,6 +1438,7 @@ define void @masked_store_v4f64(ptr %dst, <4 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:  .LBB11_3: // %else4
 ; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB11_8
 ; NONEON-NOSVE-NEXT:  .LBB11_4: // %else6
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
 ; NONEON-NOSVE-NEXT:  .LBB11_5: // %cond.store
 ; NONEON-NOSVE-NEXT:    str xzr, [x0]
@@ -1088,6 +1451,7 @@ define void @masked_store_v4f64(ptr %dst, <4 x i1> %mask) {
 ; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB11_4
 ; NONEON-NOSVE-NEXT:  .LBB11_8: // %cond.store5
 ; NONEON-NOSVE-NEXT:    str xzr, [x0, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v4f64(<4 x double> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask)
   ret void
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
index 6a6b47e815ac16..dbdf5f25029998 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
@@ -18,11 +18,22 @@ define void @add_v4i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: add_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    ldr s1, [x1]
-; NONEON-NOSVE-NEXT:    uaddl v0.8h, v0.8b, v1.8b
-; NONEON-NOSVE-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    str s0, [x0]
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #3]
+; NONEON-NOSVE-NEXT:    ldrb w9, [x1, #3]
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #2]
+; NONEON-NOSVE-NEXT:    ldrb w11, [x0, #1]
+; NONEON-NOSVE-NEXT:    ldrb w12, [x1, #2]
+; NONEON-NOSVE-NEXT:    ldrb w13, [x0]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w14, [x1, #1]
+; NONEON-NOSVE-NEXT:    ldrb w9, [x1]
+; NONEON-NOSVE-NEXT:    add w10, w10, w12
+; NONEON-NOSVE-NEXT:    strb w8, [x0, #3]
+; NONEON-NOSVE-NEXT:    add w8, w11, w14
+; NONEON-NOSVE-NEXT:    add w9, w13, w9
+; NONEON-NOSVE-NEXT:    strb w10, [x0, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [x0, #1]
+; NONEON-NOSVE-NEXT:    strb w9, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i8>, ptr %a
   %op2 = load <4 x i8>, ptr %b
@@ -42,10 +53,46 @@ define void @add_v8i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: add_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ldr d1, [x1]
-; NONEON-NOSVE-NEXT:    add v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [x1]
+; NONEON-NOSVE-NEXT:    ldr d1, [x0]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
   %op2 = load <8 x i8>, ptr %b
@@ -65,10 +112,77 @@ define void @add_v16i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: add_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i8>, ptr %a
   %op2 = load <16 x i8>, ptr %b
@@ -89,11 +203,143 @@ define void @add_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: add_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    add v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -116,17 +362,12 @@ define void @add_v2i16(ptr %a, ptr %b, ptr %c) {
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldrh w8, [x0]
 ; NONEON-NOSVE-NEXT:    ldrh w9, [x1]
-; NONEON-NOSVE-NEXT:    fmov s0, w8
-; NONEON-NOSVE-NEXT:    fmov s1, w9
-; NONEON-NOSVE-NEXT:    add x8, x0, #2
-; NONEON-NOSVE-NEXT:    add x9, x1, #2
-; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x8]
-; NONEON-NOSVE-NEXT:    ld1 { v1.h }[2], [x9]
-; NONEON-NOSVE-NEXT:    add v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    mov w8, v0.s[1]
-; NONEON-NOSVE-NEXT:    fmov w9, s0
-; NONEON-NOSVE-NEXT:    strh w9, [x0]
-; NONEON-NOSVE-NEXT:    strh w8, [x0, #2]
+; NONEON-NOSVE-NEXT:    ldrh w10, [x0, #2]
+; NONEON-NOSVE-NEXT:    ldrh w11, [x1, #2]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    strh w8, [x0]
+; NONEON-NOSVE-NEXT:    strh w9, [x0, #2]
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i16>, ptr %a
   %op2 = load <2 x i16>, ptr %b
@@ -146,10 +387,30 @@ define void @add_v4i16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: add_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ldr d1, [x1]
-; NONEON-NOSVE-NEXT:    add v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [x1]
+; NONEON-NOSVE-NEXT:    ldr d1, [x0]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %op2 = load <4 x i16>, ptr %b
@@ -169,10 +430,45 @@ define void @add_v8i16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: add_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %op2 = load <8 x i16>, ptr %b
@@ -193,11 +489,79 @@ define void @add_v16i16(ptr %a, ptr %b, ptr %c) {
 ;
 ; NONEON-NOSVE-LABEL: add_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    add v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -218,8 +582,18 @@ define void @abs_v2i32(ptr %a) {
 ; NONEON-NOSVE-LABEL: abs_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i32>, ptr %a
   %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false)
@@ -239,8 +613,25 @@ define void @abs_v4i32(ptr %a) {
 ; NONEON-NOSVE-LABEL: abs_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false)
@@ -260,10 +651,40 @@ define void @abs_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    abs v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false)
@@ -283,8 +704,18 @@ define void @abs_v2i64(ptr %a) {
 ; NONEON-NOSVE-LABEL: abs_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x9, x8, mi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x8, x8, mi
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false)
@@ -304,10 +735,26 @@ define void @abs_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: abs_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    abs v0.2d, v0.2d
-; NONEON-NOSVE-NEXT:    abs v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x9, x8, mi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x8, x8, mi
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x9, x8, mi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x8, x8, mi
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false)
@@ -328,13 +775,32 @@ define void @fadd_v2f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr s0, [x0]
-; NONEON-NOSVE-NEXT:    ldr s1, [x1]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    str s0, [x0]
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr w8, [x1]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    str w8, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x half>, ptr %a
   %op2 = load <2 x half>, ptr %b
@@ -355,13 +821,42 @@ define void @fadd_v4f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ldr d1, [x1]
-; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [x1]
+; NONEON-NOSVE-NEXT:    ldr d1, [x0]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %op2 = load <4 x half>, ptr %b
@@ -382,17 +877,69 @@ define void @fadd_v8f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fadd v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v2.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %op2 = load <8 x half>, ptr %b
@@ -415,25 +962,127 @@ define void @fadd_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
-; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v3.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
-; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v1.4h
-; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v2.4h
-; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
-; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
-; NONEON-NOSVE-NEXT:    fadd v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    fadd v5.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fadd v2.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
-; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
-; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -454,10 +1103,20 @@ define void @fadd_v2f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ldr d1, [x1]
-; NONEON-NOSVE-NEXT:    fadd v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [x1]
+; NONEON-NOSVE-NEXT:    ldr d1, [x0]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x float>, ptr %a
   %op2 = load <2 x float>, ptr %b
@@ -478,10 +1137,25 @@ define void @fadd_v4f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %op2 = load <4 x float>, ptr %b
@@ -504,11 +1178,39 @@ define void @fadd_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fadd v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -529,10 +1231,19 @@ define void @fadd_v2f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    fadd v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x double>, ptr %a
   %op2 = load <2 x double>, ptr %b
@@ -555,11 +1266,27 @@ define void @fadd_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: fadd_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
-; NONEON-NOSVE-NEXT:    fadd v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fadd v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fadd d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
index 03bb899c517b4e..8c23f5f9922da7 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
@@ -19,10 +19,70 @@ define void @test_revbv16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: test_revbv16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev16 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev16 v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14, i32 17, i32 16, i32 19, i32 18, i32 21, i32 20, i32 23, i32 22, i32 undef, i32 24, i32 27, i32 undef, i32 29, i32 28, i32 undef, i32 undef>
@@ -43,10 +103,70 @@ define void @test_revbv8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: test_revbv8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev32 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev32 v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 undef, i32 undef, i32 undef, i32 31, i32 30, i32 29, i32 undef>
@@ -67,10 +187,70 @@ define void @test_revbv4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: test_revbv4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev64 v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 31, i32 30, i32 29, i32 undef, i32 27, i32 undef, i32 undef, i32 undef>
@@ -91,10 +271,34 @@ define void @test_revhv8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: test_revhv8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev32 v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    rev32 v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
@@ -115,10 +319,34 @@ define void @test_revhv8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: test_revhv8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev32 v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    rev32 v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x half>, ptr %a
   %tmp2 = shufflevector <16 x half> %tmp1, <16 x half> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
@@ -139,10 +367,34 @@ define void @test_revhv4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: test_revhv4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    rev64 v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
@@ -163,10 +415,22 @@ define void @test_revwv4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: test_revwv4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    rev64 v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
@@ -187,10 +451,22 @@ define void @test_revwv4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: test_revwv4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    rev64 v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x float>, ptr %a
   %tmp2 = shufflevector <8 x float> %tmp1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
@@ -210,7 +486,42 @@ define <16 x i8> @test_revv16i8(ptr %a) {
 ; NONEON-NOSVE-LABEL: test_revv16i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %a
   %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
@@ -230,10 +541,22 @@ define void @test_revwv8i32v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: test_revwv8i32v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
-; NONEON-NOSVE-NEXT:    rev64 v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    rev64 v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp2 = load <8 x i32>, ptr %b
@@ -258,14 +581,58 @@ define void @test_revhv32i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: test_revhv32i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    rev64 v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    rev64 v2.8h, v2.8h
-; NONEON-NOSVE-NEXT:    rev64 v3.8h, v3.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr q3, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldur w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x0]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i16>, ptr %a
   %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 undef, i32 undef, i32 undef, i32 31, i32 30, i32 29, i32 undef>
@@ -285,10 +652,18 @@ define void @test_rev_elts_fail(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: test_rev_elts_fail:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x i64>, ptr %a
   %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
@@ -358,12 +733,23 @@ define void @test_revv8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: test_revv8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    rev64 v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
index f254a1f9098f2d..bc6fdd1ecd5a71 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
@@ -72,14 +72,82 @@ define void @zip1_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: zip1_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    zip2 v2.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <32 x i8>, ptr %a
   %tmp2 = load volatile <32 x i8>, ptr %b
@@ -212,24 +280,149 @@ define void @zip_v32i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: zip_v32i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q4, q0, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q5, q1, [x0]
-; NONEON-NOSVE-NEXT:    ldp q6, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    ldp q7, q3, [x1]
-; NONEON-NOSVE-NEXT:    zip1 v17.8h, v0.8h, v2.8h
-; NONEON-NOSVE-NEXT:    zip2 v0.8h, v0.8h, v2.8h
-; NONEON-NOSVE-NEXT:    zip1 v16.8h, v1.8h, v3.8h
-; NONEON-NOSVE-NEXT:    zip2 v1.8h, v1.8h, v3.8h
-; NONEON-NOSVE-NEXT:    zip1 v2.8h, v5.8h, v7.8h
-; NONEON-NOSVE-NEXT:    zip1 v3.8h, v4.8h, v6.8h
-; NONEON-NOSVE-NEXT:    zip2 v5.8h, v5.8h, v7.8h
-; NONEON-NOSVE-NEXT:    zip2 v4.8h, v4.8h, v6.8h
-; NONEON-NOSVE-NEXT:    add v6.8h, v16.8h, v17.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    add v2.8h, v5.8h, v4.8h
-; NONEON-NOSVE-NEXT:    stp q6, q0, [x0, #32]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #192
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 192
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x1]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q5, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    stp q6, q2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q7, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q4, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #126]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #190]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #188]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #186]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #122]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #182]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #178]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #118]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #110]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #160]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #108]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #106]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #102]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #100]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #192
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i16>, ptr %a
   %tmp2 = load <32 x i16>, ptr %b
@@ -282,14 +475,50 @@ define void @zip1_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: zip1_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    zip2 v2.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    zip1 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <16 x i16>, ptr %a
   %tmp2 = load volatile <16 x i16>, ptr %b
@@ -326,14 +555,26 @@ define void @zip1_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: zip1_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    zip2 v2.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w10, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w10, w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <8 x i32>, ptr %a
   %tmp2 = load volatile <8 x i32>, ptr %b
@@ -360,15 +601,28 @@ define void @zip_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: zip_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
-; NONEON-NOSVE-NEXT:    zip1 v4.2d, v1.2d, v3.2d
-; NONEON-NOSVE-NEXT:    zip1 v5.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    zip2 v1.2d, v1.2d, v3.2d
-; NONEON-NOSVE-NEXT:    zip2 v0.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fadd v2.2d, v4.2d, v5.2d
-; NONEON-NOSVE-NEXT:    fadd v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    stp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fadd d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d3, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fadd d0, d3, d0
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fadd d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %a
   %tmp2 = load <4 x double>, ptr %b
@@ -405,12 +659,29 @@ define void @zip_v4i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: zip_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    zip1 v2.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    zip2 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %a
   %tmp2 = load <4 x i32>, ptr %b
@@ -436,12 +707,22 @@ define void @zip1_v8i32_undef(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: zip1_v8i32_undef:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    zip2 v1.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    str q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w8, w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w9, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    stp w10, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load  volatile <8 x i32>, ptr %a
   %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
@@ -465,15 +746,131 @@ define void @trn_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: trn_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    trn1 v4.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    trn2 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    trn1 v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    trn2 v2.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v4.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = load <32 x i8>, ptr %b
@@ -500,15 +897,32 @@ define void @trn_v8i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: trn_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI8_0
-; NONEON-NOSVE-NEXT:    adrp x9, .LCPI8_1
-; NONEON-NOSVE-NEXT:    ldr q1, [x0]
-; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI8_0]
-; NONEON-NOSVE-NEXT:    ldr q2, [x9, :lo12:.LCPI8_1]
-; NONEON-NOSVE-NEXT:    tbl v0.16b, { v1.16b }, v0.16b
-; NONEON-NOSVE-NEXT:    tbl v1.16b, { v1.16b }, v2.16b
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    add w10, w9, w8
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    add w11, w10, w11
+; NONEON-NOSVE-NEXT:    add w8, w8, w10
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    add w11, w12, w11
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %a
   %tmp2 = load <8 x i16>, ptr %b
@@ -535,15 +949,79 @@ define void @trn_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: trn_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    trn1 v4.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    trn2 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    trn1 v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    trn2 v2.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v4.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v2.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = load <16 x i16>, ptr %b
@@ -570,15 +1048,25 @@ define void @trn_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: trn_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    zip1 v4.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    trn2 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    trn1 v1.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    trn2 v2.4s, v2.4s, v3.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v4.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v2.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr q0, [x1, #16]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp2 = load <8 x i32>, ptr %b
@@ -606,15 +1094,25 @@ define void @trn_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: trn_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
-; NONEON-NOSVE-NEXT:    zip1 v4.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    zip2 v0.2d, v0.2d, v1.2d
-; NONEON-NOSVE-NEXT:    zip1 v1.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    zip2 v2.2d, v2.2d, v3.2d
-; NONEON-NOSVE-NEXT:    fadd v0.2d, v4.2d, v0.2d
-; NONEON-NOSVE-NEXT:    fadd v1.2d, v1.2d, v2.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fadd d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %a
   %tmp2 = load <4 x double>, ptr %b
@@ -639,12 +1137,23 @@ define void @trn_v4f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: trn_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    trn1 v2.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    trn2 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x float>, ptr %a
   %tmp2 = load <4 x float>, ptr %b
@@ -670,14 +1179,24 @@ define void @trn_v8i32_undef(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: trn_v8i32_undef:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    trn1 v2.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    trn2 v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    trn1 v3.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    trn2 v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v2.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v3.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
@@ -753,14 +1272,82 @@ define void @zip2_v32i8(ptr %a, ptr %b) #0{
 ;
 ; NONEON-NOSVE-LABEL: zip2_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    zip2 v2.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <32 x i8>, ptr %a
   %tmp2 = load volatile <32 x i8>, ptr %b
@@ -811,14 +1398,50 @@ define void @zip2_v16i16(ptr %a, ptr %b) #0{
 ;
 ; NONEON-NOSVE-LABEL: zip2_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    zip2 v2.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    zip1 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <16 x i16>, ptr %a
   %tmp2 = load volatile <16 x i16>, ptr %b
@@ -855,14 +1478,26 @@ define void @zip2_v8i32(ptr %a, ptr %b) #0{
 ;
 ; NONEON-NOSVE-LABEL: zip2_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1]
 ; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    zip2 v2.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w10, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w10, w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <8 x i32>, ptr %a
   %tmp2 = load volatile <8 x i32>, ptr %b
@@ -886,12 +1521,22 @@ define void @zip2_v8i32_undef(ptr %a) #0{
 ;
 ; NONEON-NOSVE-LABEL: zip2_v8i32_undef:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    zip2 v1.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    str q1, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w8, w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w9, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    stp w10, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <8 x i32>, ptr %a
   %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
@@ -1097,15 +1742,131 @@ define void @uzp_v32i8(ptr %a, ptr %b) #0{
 ;
 ; NONEON-NOSVE-LABEL: uzp_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
-; NONEON-NOSVE-NEXT:    uzp1 v4.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uzp2 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v3.16b, v2.16b
-; NONEON-NOSVE-NEXT:    uzp2 v2.16b, v3.16b, v2.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v4.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = load <32 x i8>, ptr %b
@@ -1133,12 +1894,21 @@ define void @uzp_v4i16(ptr %a, ptr %b) #0{
 ; NONEON-NOSVE-LABEL: uzp_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    ext v1.8b, v0.8b, v0.8b, #6
-; NONEON-NOSVE-NEXT:    ext v2.8b, v0.8b, v0.8b, #2
-; NONEON-NOSVE-NEXT:    trn1 v1.4h, v0.4h, v1.4h
-; NONEON-NOSVE-NEXT:    zip1 v0.4h, v2.4h, v0.4h
-; NONEON-NOSVE-NEXT:    add v0.4h, v1.4h, v0.4h
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    add w9, w9, w8
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %a
   %tmp2 = load <4 x i16>, ptr %b
@@ -1260,15 +2030,79 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{
 ;
 ; NONEON-NOSVE-LABEL: uzp_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
-; NONEON-NOSVE-NEXT:    uzp1 v4.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp2 v2.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v4.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v2.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = load <16 x i16>, ptr %b
@@ -1312,15 +2146,31 @@ define void @uzp_v8f32(ptr %a, ptr %b) #0{
 ;
 ; NONEON-NOSVE-LABEL: uzp_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
-; NONEON-NOSVE-NEXT:    uzp1 v4.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp2 v2.4s, v3.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fadd v0.4s, v4.4s, v0.4s
-; NONEON-NOSVE-NEXT:    fadd v1.4s, v1.4s, v2.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    mov x8, #9205357640488583168 // =0x7fc000007fc00000
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    mov w8, #2143289344 // =0x7fc00000
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    str s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fadd s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    str s0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x float>, ptr %a
   %tmp2 = load <8 x float>, ptr %b
@@ -1347,15 +2197,27 @@ define void @uzp_v4i64(ptr %a, ptr %b) #0{
 ;
 ; NONEON-NOSVE-LABEL: uzp_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
-; NONEON-NOSVE-NEXT:    zip1 v4.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    zip2 v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    zip1 v1.2d, v3.2d, v2.2d
-; NONEON-NOSVE-NEXT:    zip2 v2.2d, v3.2d, v2.2d
-; NONEON-NOSVE-NEXT:    add v0.2d, v4.2d, v0.2d
-; NONEON-NOSVE-NEXT:    add v1.2d, v1.2d, v2.2d
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    add x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp]
+; NONEON-NOSVE-NEXT:    add x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x i64>, ptr %a
   %tmp2 = load <4 x i64>, ptr %b
@@ -1427,12 +2289,45 @@ define void @uzp_v8i16(ptr %a, ptr %b) #0{
 ;
 ; NONEON-NOSVE-LABEL: uzp_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1]
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v2.8h, v0.8h
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %a
   %tmp2 = load <8 x i16>, ptr %b
@@ -1476,10 +2371,23 @@ define void @uzp_v8i32_undef(ptr %a) #0{
 ; NONEON-NOSVE-LABEL: uzp_v8i32_undef:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 0, i32 2, i32 4, i32 6>
@@ -1507,15 +2415,28 @@ define void @zip_vscale2_4(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: zip_vscale2_4:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
-; NONEON-NOSVE-NEXT:    zip1 v4.2d, v1.2d, v3.2d
-; NONEON-NOSVE-NEXT:    zip1 v5.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    zip2 v1.2d, v1.2d, v3.2d
-; NONEON-NOSVE-NEXT:    zip2 v0.2d, v0.2d, v2.2d
-; NONEON-NOSVE-NEXT:    fadd v2.2d, v4.2d, v5.2d
-; NONEON-NOSVE-NEXT:    fadd v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    stp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fadd d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d3, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fadd d0, d3, d0
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fadd d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %a
   %tmp2 = load <4 x double>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
index 41d2cb8a2c7564..8ebf713a671f49 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
@@ -39,19 +39,76 @@ define i1 @ptest_v16i1(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ptest_v16i1:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v1.4s, v1.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v3.4s, v3.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v2.4s, v2.4s, #0.0
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    umaxv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #255 // =0xff
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csel w9, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    csetm w10, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csetm w11, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    csinv w11, w11, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csinv w11, w11, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    csinv w11, w11, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csinv w11, w11, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    csinv w11, w11, wzr, eq
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w10, w11, w10, hi
+; NONEON-NOSVE-NEXT:    and w10, w10, #0xff
+; NONEON-NOSVE-NEXT:    cmp w10, w9
+; NONEON-NOSVE-NEXT:    csel w9, w10, w9, hi
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csel w10, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w10
+; NONEON-NOSVE-NEXT:    csel w9, w9, w10, hi
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    csel w10, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w10
+; NONEON-NOSVE-NEXT:    csel w9, w9, w10, hi
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csel w10, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w10
+; NONEON-NOSVE-NEXT:    csel w9, w9, w10, hi
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w10, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w10
+; NONEON-NOSVE-NEXT:    csel w9, w9, w10, hi
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csel w10, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w10
+; NONEON-NOSVE-NEXT:    csel w9, w9, w10, hi
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w10, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w10
+; NONEON-NOSVE-NEXT:    csel w9, w9, w10, hi
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csel w10, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w10
+; NONEON-NOSVE-NEXT:    csel w9, w9, w10, hi
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    csel w8, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
 ; NONEON-NOSVE-NEXT:    and w0, w8, #0x1
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %v0 = bitcast ptr %a to ptr
   %v1 = load <16 x float>, ptr %v0, align 4
@@ -113,29 +170,144 @@ define i1 @ptest_or_v16i1(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ptest_or_v16i1:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x1, #32]
-; NONEON-NOSVE-NEXT:    fcmeq v1.4s, v1.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v3.4s, v3.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v2.4s, v2.4s, #0.0
-; NONEON-NOSVE-NEXT:    ldp q6, q7, [x1]
-; NONEON-NOSVE-NEXT:    fcmeq v4.4s, v4.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v5.4s, v5.4s, #0.0
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    fcmeq v7.4s, v7.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v6.4s, v6.4s, #0.0
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v5.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v6.8h, v7.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v3.16b, v2.16b
-; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    orn v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    umaxv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr q0, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp s2, s0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    ldr s2, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w9, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    csinv w9, w9, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w10, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csinv w10, w10, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    csetm w11, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    csinv w11, w11, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s2, s0, [sp]
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    csetm w12, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csinv w12, w12, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    csetm w13, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csinv w14, w13, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #56]
+; NONEON-NOSVE-NEXT:    orr w12, w14, w12
+; NONEON-NOSVE-NEXT:    orr w10, w12, w10
+; NONEON-NOSVE-NEXT:    csetm w13, ne
+; NONEON-NOSVE-NEXT:    orr w9, w10, w9
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    csinv w13, w13, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    csetm w15, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #112]
+; NONEON-NOSVE-NEXT:    csinv w15, w15, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    and w11, w15, #0xff
+; NONEON-NOSVE-NEXT:    csetm w16, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #68]
+; NONEON-NOSVE-NEXT:    csinv w16, w16, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csetm w17, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #120]
+; NONEON-NOSVE-NEXT:    csinv w17, w17, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #76]
+; NONEON-NOSVE-NEXT:    csetm w18, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #48]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    csinv w18, w18, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    ldr s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    csetm w0, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    csinv w0, w0, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csetm w1, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csinv w1, w1, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #36]
+; NONEON-NOSVE-NEXT:    csetm w2, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #88]
+; NONEON-NOSVE-NEXT:    csinv w2, w2, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    csetm w3, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    csinv w3, w3, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csetm w4, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csinv w10, w4, wzr, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    and w9, w13, #0xff
+; NONEON-NOSVE-NEXT:    and w10, w10, #0xff
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    and w9, w16, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, hi
+; NONEON-NOSVE-NEXT:    and w11, w17, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    and w9, w18, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, hi
+; NONEON-NOSVE-NEXT:    and w11, w0, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    and w9, w1, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, hi
+; NONEON-NOSVE-NEXT:    and w11, w2, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    and w9, w3, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
 ; NONEON-NOSVE-NEXT:    and w0, w8, #0x1
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %v0 = bitcast ptr %a to ptr
   %v1 = load <16 x float>, ptr %v0, align 4
@@ -207,29 +379,144 @@ define i1 @ptest_and_v16i1(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: ptest_and_v16i1:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x1, #32]
-; NONEON-NOSVE-NEXT:    fcmeq v1.4s, v1.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v3.4s, v3.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v2.4s, v2.4s, #0.0
-; NONEON-NOSVE-NEXT:    ldp q6, q7, [x1]
-; NONEON-NOSVE-NEXT:    fcmeq v4.4s, v4.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v5.4s, v5.4s, #0.0
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; NONEON-NOSVE-NEXT:    fcmeq v7.4s, v7.4s, #0.0
-; NONEON-NOSVE-NEXT:    fcmeq v6.4s, v6.4s, #0.0
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v5.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v6.8h, v7.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v3.16b, v2.16b
-; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    bic v0.16b, v0.16b, v1.16b
-; NONEON-NOSVE-NEXT:    uminv b0, v0.16b
-; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr q0, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp s2, s0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    ldr s2, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w9, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w9, w9, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w10, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csel w10, w10, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    csetm w11, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w11, w11, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s2, s0, [sp]
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    csetm w12, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csel w12, w12, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    csetm w13, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csel w14, w13, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #56]
+; NONEON-NOSVE-NEXT:    and w12, w14, w12
+; NONEON-NOSVE-NEXT:    and w10, w12, w10
+; NONEON-NOSVE-NEXT:    csetm w13, ne
+; NONEON-NOSVE-NEXT:    and w9, w10, w9
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    csel w13, w13, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    csetm w15, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #112]
+; NONEON-NOSVE-NEXT:    csel w15, w15, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    and w11, w15, #0xff
+; NONEON-NOSVE-NEXT:    csetm w16, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #68]
+; NONEON-NOSVE-NEXT:    csel w16, w16, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csetm w17, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #120]
+; NONEON-NOSVE-NEXT:    csel w17, w17, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #76]
+; NONEON-NOSVE-NEXT:    csetm w18, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #48]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    csel w18, w18, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    ldr s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    csetm w0, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    csel w0, w0, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csetm w1, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csel w1, w1, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #36]
+; NONEON-NOSVE-NEXT:    csetm w2, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #88]
+; NONEON-NOSVE-NEXT:    csel w2, w2, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    csetm w3, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    csel w3, w3, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csetm w4, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csel w10, w4, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    and w9, w13, #0xff
+; NONEON-NOSVE-NEXT:    and w10, w10, #0xff
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    and w9, w16, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, lo
+; NONEON-NOSVE-NEXT:    and w11, w17, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    and w9, w18, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, lo
+; NONEON-NOSVE-NEXT:    and w11, w0, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    and w9, w1, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, lo
+; NONEON-NOSVE-NEXT:    and w11, w2, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    and w9, w3, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
 ; NONEON-NOSVE-NEXT:    and w0, w8, #0x1
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %v0 = bitcast ptr %a to ptr
   %v1 = load <16 x float>, ptr %v0, align 4
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
index 5626f77c684f22..bc0fc7c79391d1 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
@@ -22,9 +22,26 @@ define <4 x i8> @bitreverse_v4i8(<4 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev16 v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    ushr v0.4h, v0.4h, #8
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.bitreverse.v4i8(<4 x i8> %op)
   ret <4 x i8> %res
@@ -41,7 +58,42 @@ define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
@@ -58,7 +110,74 @@ define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
@@ -76,10 +195,140 @@ define void @bitreverse_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rbit v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %op)
@@ -99,9 +348,17 @@ define <2 x i16> @bitreverse_v2i16(<2 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev32 v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    ushr v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
@@ -118,8 +375,26 @@ define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev16 v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.bitreverse.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
@@ -136,8 +411,42 @@ define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev16 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
@@ -155,12 +464,76 @@ define void @bitreverse_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev16 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev16 v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rbit v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %op)
@@ -179,8 +552,15 @@ define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev32 v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
@@ -197,8 +577,20 @@ define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev32 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
@@ -216,12 +608,32 @@ define void @bitreverse_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev32 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev32 v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rbit v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    rbit w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    rbit w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %op)
@@ -240,8 +652,13 @@ define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev64 v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.bitreverse.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
@@ -258,8 +675,15 @@ define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
@@ -277,12 +701,22 @@ define void @bitreverse_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: bitreverse_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev64 v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rbit v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    rbit x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %op)
@@ -306,8 +740,31 @@ define <2 x i16> @bswap_v2i16(<2 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bswap_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev32 v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    ushr v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
@@ -324,7 +781,26 @@ define <4 x i16> @bswap_v4i16(<4 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bswap_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev16 v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
@@ -341,7 +817,42 @@ define <8 x i16> @bswap_v8i16(<8 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bswap_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev16 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
@@ -359,10 +870,79 @@ define void @bswap_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: bswap_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev16 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev16 v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %op)
@@ -381,7 +961,26 @@ define <2 x i32> @bswap_v2i32(<2 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bswap_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev32 v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
@@ -398,7 +997,42 @@ define <4 x i32> @bswap_v4i32(<4 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bswap_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev32 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
@@ -416,10 +1050,79 @@ define void @bswap_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: bswap_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev32 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev32 v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %op)
@@ -438,7 +1141,26 @@ define <1 x i64> @bswap_v1i64(<1 x i64> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bswap_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev64 v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.bswap.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
@@ -455,7 +1177,42 @@ define <2 x i64> @bswap_v2i64(<2 x i64> %op) {
 ;
 ; NONEON-NOSVE-LABEL: bswap_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
@@ -473,10 +1230,79 @@ define void @bswap_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: bswap_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    rev64 v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %op)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
index 55f4f5bae641e5..df019ce2e0ad67 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
@@ -18,15 +18,38 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v1.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    movi d2, #0xff00ff00ff00ff
-; NONEON-NOSVE-NEXT:    sshr v1.4h, v1.4h, #8
-; NONEON-NOSVE-NEXT:    ushr v1.4h, v1.4h, #7
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    usra v0.4h, v1.4h, #3
-; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #5
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp]
+; NONEON-NOSVE-NEXT:    sxtb w11, w8
+; NONEON-NOSVE-NEXT:    sxtb w13, w9
+; NONEON-NOSVE-NEXT:    sxtb w14, w10
+; NONEON-NOSVE-NEXT:    sxtb w15, w12
+; NONEON-NOSVE-NEXT:    ubfx w11, w11, #10, #5
+; NONEON-NOSVE-NEXT:    ubfx w13, w13, #10, #5
+; NONEON-NOSVE-NEXT:    ubfx w14, w14, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w11
+; NONEON-NOSVE-NEXT:    ubfx w11, w15, #10, #5
+; NONEON-NOSVE-NEXT:    add w9, w9, w13
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    add w10, w10, w14
+; NONEON-NOSVE-NEXT:    sxtb w9, w9
+; NONEON-NOSVE-NEXT:    add w11, w12, w11
+; NONEON-NOSVE-NEXT:    sxtb w10, w10
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    sxtb w11, w11
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #5
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsr w8, w11, #5
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i8> %op1, shufflevector (<4 x i8> insertelement (<4 x i8> poison, i8 32, i32 0), <4 x i8> poison, <4 x i32> zeroinitializer)
   ret <4 x i8> %res
@@ -43,9 +66,58 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt v1.8b, v0.8b, #0
-; NONEON-NOSVE-NEXT:    usra v0.8b, v1.8b, #3
-; NONEON-NOSVE-NEXT:    sshr v0.8b, v0.8b, #5
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <8 x i8> %op1, shufflevector (<8 x i8> insertelement (<8 x i8> poison, i8 32, i32 0), <8 x i8> poison, <8 x i32> zeroinitializer)
   ret <8 x i8> %res
@@ -62,9 +134,106 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt v1.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    usra v0.16b, v1.16b, #3
-; NONEON-NOSVE-NEXT:    sshr v0.16b, v0.16b, #5
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <16 x i8> %op1, shufflevector (<16 x i8> insertelement (<16 x i8> poison, i8 32, i32 0), <16 x i8> poison, <16 x i32> zeroinitializer)
   ret <16 x i8> %res
@@ -82,14 +251,204 @@ define void @sdiv_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v2.16b, v0.16b, #0
-; NONEON-NOSVE-NEXT:    cmlt v3.16b, v1.16b, #0
-; NONEON-NOSVE-NEXT:    usra v0.16b, v2.16b, #3
-; NONEON-NOSVE-NEXT:    usra v1.16b, v3.16b, #3
-; NONEON-NOSVE-NEXT:    sshr v0.16b, v0.16b, #5
-; NONEON-NOSVE-NEXT:    sshr v1.16b, v1.16b, #5
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %res = sdiv <32 x i8> %op1, shufflevector (<32 x i8> insertelement (<32 x i8> poison, i8 32, i32 0), <32 x i8> poison, <32 x i32> zeroinitializer)
@@ -109,16 +468,20 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    shl v1.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
-; NONEON-NOSVE-NEXT:    dup v2.2s, w8
-; NONEON-NOSVE-NEXT:    sshr v1.2s, v1.2s, #16
-; NONEON-NOSVE-NEXT:    ushr v1.2s, v1.2s, #26
-; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
-; NONEON-NOSVE-NEXT:    add v0.2s, v0.2s, v1.2s
-; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #5
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    sxth w10, w8
+; NONEON-NOSVE-NEXT:    sxth w11, w9
+; NONEON-NOSVE-NEXT:    ubfx w10, w10, #26, #5
+; NONEON-NOSVE-NEXT:    ubfx w11, w11, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w10
+; NONEON-NOSVE-NEXT:    add w9, w9, w11
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #5, #11
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #5, #11
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i16> %op1, shufflevector (<2 x i16> insertelement (<2 x i16> poison, i16 32, i32 0), <2 x i16> poison, <2 x i32> zeroinitializer)
   ret <2 x i16> %res
@@ -135,9 +498,34 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt v1.4h, v0.4h, #0
-; NONEON-NOSVE-NEXT:    usra v0.4h, v1.4h, #11
-; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #5
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i16> %op1, shufflevector (<4 x i16> insertelement (<4 x i16> poison, i16 32, i32 0), <4 x i16> poison, <4 x i32> zeroinitializer)
   ret <4 x i16> %res
@@ -154,9 +542,58 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt v1.8h, v0.8h, #0
-; NONEON-NOSVE-NEXT:    usra v0.8h, v1.8h, #11
-; NONEON-NOSVE-NEXT:    sshr v0.8h, v0.8h, #5
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <8 x i16> %op1, shufflevector (<8 x i16> insertelement (<8 x i16> poison, i16 32, i32 0), <8 x i16> poison, <8 x i32> zeroinitializer)
   ret <8 x i16> %res
@@ -174,14 +611,108 @@ define void @sdiv_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v2.8h, v0.8h, #0
-; NONEON-NOSVE-NEXT:    cmlt v3.8h, v1.8h, #0
-; NONEON-NOSVE-NEXT:    usra v0.8h, v2.8h, #11
-; NONEON-NOSVE-NEXT:    usra v1.8h, v3.8h, #11
-; NONEON-NOSVE-NEXT:    sshr v0.8h, v0.8h, #5
-; NONEON-NOSVE-NEXT:    sshr v1.8h, v1.8h, #5
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = sdiv <16 x i16> %op1, shufflevector (<16 x i16> insertelement (<16 x i16> poison, i16 32, i32 0), <16 x i16> poison, <16 x i32> zeroinitializer)
@@ -200,9 +731,19 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt v1.2s, v0.2s, #0
-; NONEON-NOSVE-NEXT:    usra v0.2s, v1.2s, #27
-; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #5
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w10, w8, #5
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w8, w8, #5
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i32> %op1, shufflevector (<2 x i32> insertelement (<2 x i32> poison, i32 32, i32 0), <2 x i32> poison, <2 x i32> zeroinitializer)
   ret <2 x i32> %res
@@ -219,9 +760,28 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt v1.4s, v0.4s, #0
-; NONEON-NOSVE-NEXT:    usra v0.4s, v1.4s, #27
-; NONEON-NOSVE-NEXT:    sshr v0.4s, v0.4s, #5
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w10, w8, #5
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w8, w8, #5
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w10, w8, #5
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w8, w8, #5
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i32> %op1, shufflevector (<4 x i32> insertelement (<4 x i32> poison, i32 32, i32 0), <4 x i32> poison, <4 x i32> zeroinitializer)
   ret <4 x i32> %res
@@ -239,14 +799,48 @@ define void @sdiv_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v2.4s, v0.4s, #0
-; NONEON-NOSVE-NEXT:    cmlt v3.4s, v1.4s, #0
-; NONEON-NOSVE-NEXT:    usra v0.4s, v2.4s, #27
-; NONEON-NOSVE-NEXT:    usra v1.4s, v3.4s, #27
-; NONEON-NOSVE-NEXT:    sshr v0.4s, v0.4s, #5
-; NONEON-NOSVE-NEXT:    sshr v1.4s, v1.4s, #5
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w10, w8, #5
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w8, w8, #5
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w10, w8, #5
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w8, w8, #5
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w10, w8, #5
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w8, w8, #5
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w10, w8, #5
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w8, w8, #5
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = sdiv <8 x i32> %op1, shufflevector (<8 x i32> insertelement (<8 x i32> poison, i32 32, i32 0), <8 x i32> poison, <8 x i32> zeroinitializer)
@@ -265,9 +859,15 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt d1, d0, #0
-; NONEON-NOSVE-NEXT:    usra d0, d1, #59
-; NONEON-NOSVE-NEXT:    sshr d0, d0, #5
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    add x8, x8, x9, lsr #59
+; NONEON-NOSVE-NEXT:    asr x8, x8, #5
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <1 x i64> %op1, shufflevector (<1 x i64> insertelement (<1 x i64> poison, i64 32, i32 0), <1 x i64> poison, <1 x i32> zeroinitializer)
   ret <1 x i64> %res
@@ -285,9 +885,19 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    cmlt v1.2d, v0.2d, #0
-; NONEON-NOSVE-NEXT:    usra v0.2d, v1.2d, #59
-; NONEON-NOSVE-NEXT:    sshr v0.2d, v0.2d, #5
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    add x8, x8, x9, lsr #59
+; NONEON-NOSVE-NEXT:    asr x10, x8, #5
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    add x8, x8, x9, lsr #59
+; NONEON-NOSVE-NEXT:    asr x8, x8, #5
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i64> %op1, shufflevector (<2 x i64> insertelement (<2 x i64> poison, i64 32, i32 0), <2 x i64> poison, <2 x i32> zeroinitializer)
   ret <2 x i64> %res
@@ -305,14 +915,30 @@ define void @sdiv_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
-; NONEON-NOSVE-NEXT:    cmlt v2.2d, v0.2d, #0
-; NONEON-NOSVE-NEXT:    cmlt v3.2d, v1.2d, #0
-; NONEON-NOSVE-NEXT:    usra v0.2d, v2.2d, #59
-; NONEON-NOSVE-NEXT:    usra v1.2d, v3.2d, #59
-; NONEON-NOSVE-NEXT:    sshr v0.2d, v0.2d, #5
-; NONEON-NOSVE-NEXT:    sshr v1.2d, v1.2d, #5
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    add x8, x8, x9, lsr #59
+; NONEON-NOSVE-NEXT:    asr x10, x8, #5
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    add x8, x8, x9, lsr #59
+; NONEON-NOSVE-NEXT:    asr x8, x8, #5
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    add x8, x8, x9, lsr #59
+; NONEON-NOSVE-NEXT:    asr x10, x8, #5
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    add x8, x8, x9, lsr #59
+; NONEON-NOSVE-NEXT:    asr x8, x8, #5
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = sdiv <4 x i64> %op1, shufflevector (<4 x i64> insertelement (<4 x i64> poison, i64 32, i32 0), <4 x i64> poison, <4 x i32> zeroinitializer)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
index 38aaf860b7298c..b66e6d90135730 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
@@ -18,9 +18,15 @@ define void @hang_when_merging_stores_after_legalisation(ptr %a, <2 x i32> %b) {
 ;
 ; NONEON-NOSVE-LABEL: hang_when_merging_stores_after_legalisation:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.4s, v0.s[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <8 x i32> zeroinitializer
   %interleaved.vec = shufflevector <8 x i32> %splat, <8 x i32> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
@@ -39,9 +45,25 @@ define void @interleave_store_without_splat(ptr %a, <4 x i32> %v1, <4 x i32> %v2
 ;
 ; NONEON-NOSVE-LABEL: interleave_store_without_splat:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    zip2 v2.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %shuffle = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %interleaved = shufflevector <8 x i32> %shuffle, <8 x i32> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
@@ -64,12 +86,40 @@ define void @interleave_store_legalization(ptr %a, <8 x i32> %v1, <8 x i32> %v2)
 ;
 ; NONEON-NOSVE-LABEL: interleave_store_legalization:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    zip2 v4.4s, v1.4s, v3.4s
-; NONEON-NOSVE-NEXT:    zip1 v1.4s, v1.4s, v3.4s
-; NONEON-NOSVE-NEXT:    zip2 v3.4s, v0.4s, v2.4s
-; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v2.4s
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x0, #32]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q0, q2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #100]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr q3, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %interleaved.vec = shufflevector <8 x i32> %v1, <8 x i32> %v2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11,
                                                                              i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
index e15529e1926ac7..a4cf5d608fed6d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
@@ -19,7 +19,14 @@ define <4 x i8> @splat_v4i8(i8 %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.4h, w0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i8> undef, i8 %a, i64 0
   %splat = shufflevector <4 x i8> %insert, <4 x i8> undef, <4 x i32> zeroinitializer
@@ -35,7 +42,18 @@ define <8 x i8> @splat_v8i8(i8 %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.8b, w0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i8> undef, i8 %a, i64 0
   %splat = shufflevector <8 x i8> %insert, <8 x i8> undef, <8 x i32> zeroinitializer
@@ -51,7 +69,25 @@ define <16 x i8> @splat_v16i8(i8 %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.16b, w0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x i8> undef, i8 %a, i64 0
   %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
@@ -67,8 +103,27 @@ define void @splat_v32i8(i8 %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.16b, w0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <32 x i8> undef, i8 %a, i64 0
   %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -85,7 +140,11 @@ define <2 x i16> @splat_v2i16(i16 %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.2s, w0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp w0, w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x i16> undef, i16 %a, i64 0
   %splat = shufflevector <2 x i16> %insert, <2 x i16> undef, <2 x i32> zeroinitializer
@@ -101,7 +160,14 @@ define <4 x i16> @splat_v4i16(i16 %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.4h, w0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i16> undef, i16 %a, i64 0
   %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -117,7 +183,17 @@ define <8 x i16> @splat_v8i16(i16 %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.8h, w0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i16> undef, i16 %a, i64 0
   %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
@@ -133,8 +209,19 @@ define void @splat_v16i16(i16 %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.8h, w0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x i16> undef, i16 %a, i64 0
   %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -151,7 +238,11 @@ define <2 x i32> @splat_v2i32(i32 %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.2s, w0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp w0, w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x i32> undef, i32 %a, i64 0
   %splat = shufflevector <2 x i32> %insert, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -167,7 +258,11 @@ define <4 x i32> @splat_v4i32(i32 %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.4s, w0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp w0, w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w0, w0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i32> undef, i32 %a, i64 0
   %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -183,8 +278,13 @@ define void @splat_v8i32(i32 %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.4s, w0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp w0, w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w0, w0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i32> undef, i32 %a, i64 0
   %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -201,7 +301,11 @@ define <1 x i64> @splat_v1i64(i64 %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov d0, x0
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str x0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <1 x i64> undef, i64 %a, i64 0
   %splat = shufflevector <1 x i64> %insert, <1 x i64> undef, <1 x i32> zeroinitializer
@@ -217,7 +321,9 @@ define <2 x i64> @splat_v2i64(i64 %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.2d, x0
+; NONEON-NOSVE-NEXT:    stp x0, x0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x i64> undef, i64 %a, i64 0
   %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
@@ -233,8 +339,11 @@ define void @splat_v4i64(i64 %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    dup v0.2d, x0
+; NONEON-NOSVE-NEXT:    stp x0, x0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i64> undef, i64 %a, i64 0
   %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -256,8 +365,12 @@ define <2 x half> @splat_v2f16(half %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $h0 killed $h0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.4h, v0.h[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x half> undef, half %a, i64 0
   %splat = shufflevector <2 x half> %insert, <2 x half> undef, <2 x i32> zeroinitializer
@@ -274,8 +387,14 @@ define <4 x half> @splat_v4f16(half %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $h0 killed $h0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.4h, v0.h[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x half> undef, half %a, i64 0
   %splat = shufflevector <4 x half> %insert, <4 x half> undef, <4 x i32> zeroinitializer
@@ -292,8 +411,17 @@ define <8 x half> @splat_v8f16(half %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $h0 killed $h0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.8h, v0.h[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    str h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x half> undef, half %a, i64 0
   %splat = shufflevector <8 x half> %insert, <8 x half> undef, <8 x i32> zeroinitializer
@@ -310,9 +438,19 @@ define void @splat_v16f16(half %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $h0 killed $h0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.8h, v0.h[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    str h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x half> undef, half %a, i64 0
   %splat = shufflevector <16 x half> %insert, <16 x half> undef, <16 x i32> zeroinitializer
@@ -330,8 +468,11 @@ define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $s0 killed $s0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.2s, v0.s[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp s0, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x float> undef, float %a, i64 0
   %splat = shufflevector <2 x float> %insert, <2 x float> undef, <2 x i32> zeroinitializer
@@ -348,8 +489,11 @@ define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $s0 killed $s0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.4s, v0.s[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp s0, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x float> undef, float %a, i64 0
   %splat = shufflevector <4 x float> %insert, <4 x float> undef, <4 x i32> zeroinitializer
@@ -366,9 +510,13 @@ define void @splat_v8f32(float %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $s0 killed $s0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.4s, v0.s[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp s0, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x float> undef, float %a, i64 0
   %splat = shufflevector <8 x float> %insert, <8 x float> undef, <8 x i32> zeroinitializer
@@ -383,6 +531,11 @@ define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <1 x double> undef, double %a, i64 0
   %splat = shufflevector <1 x double> %insert, <1 x double> undef, <1 x i32> zeroinitializer
@@ -399,8 +552,9 @@ define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.2d, v0.d[0]
+; NONEON-NOSVE-NEXT:    stp d0, d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x double> undef, double %a, i64 0
   %splat = shufflevector <2 x double> %insert, <2 x double> undef, <2 x i32> zeroinitializer
@@ -417,9 +571,11 @@ define void @splat_v4f64(double %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: splat_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.2d, v0.d[0]
+; NONEON-NOSVE-NEXT:    stp d0, d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x double> undef, double %a, i64 0
   %splat = shufflevector <4 x double> %insert, <4 x double> undef, <4 x i32> zeroinitializer
@@ -440,7 +596,8 @@ define void @splat_imm_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_imm_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.16b, #1
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI24_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI24_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <32 x i8> undef, i8 1, i64 0
@@ -458,8 +615,8 @@ define void @splat_imm_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_imm_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #2 // =0x2
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI25_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI25_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x i16> undef, i16 2, i64 0
@@ -477,8 +634,8 @@ define void @splat_imm_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_imm_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #3 // =0x3
-; NONEON-NOSVE-NEXT:    dup v0.4s, w8
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI26_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI26_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i32> undef, i32 3, i64 0
@@ -496,8 +653,8 @@ define void @splat_imm_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_imm_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #4 // =0x4
-; NONEON-NOSVE-NEXT:    dup v0.2d, x8
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI27_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI27_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i64> undef, i64 4, i64 0
@@ -519,8 +676,8 @@ define void @splat_imm_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_imm_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #17664 // =0x4500
-; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI28_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI28_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x half> undef, half 5.0, i64 0
@@ -538,7 +695,8 @@ define void @splat_imm_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_imm_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov v0.4s, #6.00000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI29_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI29_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x float> undef, float 6.0, i64 0
@@ -556,7 +714,8 @@ define void @splat_imm_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: splat_imm_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    fmov v0.2d, #7.00000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI30_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI30_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x double> undef, double 7.0, i64 0
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
index f055061b13bed6..a77ac7832e17cb 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
@@ -31,7 +31,8 @@ define void @store_v8i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI1_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x8, :lo12:.LCPI1_0]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <8 x i8> zeroinitializer, ptr %a
@@ -47,7 +48,8 @@ define void @store_v16i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI2_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI2_0]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <16 x i8> zeroinitializer, ptr %a
@@ -63,7 +65,8 @@ define void @store_v32i8(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI3_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI3_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <32 x i8> zeroinitializer, ptr %a
@@ -96,7 +99,14 @@ define void @store_v2f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    str wzr, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI5_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x8, :lo12:.LCPI5_0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   store <2 x half> zeroinitializer, ptr %a
   ret void
@@ -111,7 +121,8 @@ define void @store_v4i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI6_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x8, :lo12:.LCPI6_0]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <4 x i16> zeroinitializer, ptr %a
@@ -127,7 +138,8 @@ define void @store_v4f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d0, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI7_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x8, :lo12:.LCPI7_0]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <4 x half> zeroinitializer, ptr %a
@@ -143,7 +155,8 @@ define void @store_v8i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI8_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI8_0]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <8 x i16> zeroinitializer, ptr %a
@@ -159,7 +172,8 @@ define void @store_v8f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI9_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI9_0]
 ; NONEON-NOSVE-NEXT:    str q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <8 x half> zeroinitializer, ptr %a
@@ -175,7 +189,8 @@ define void @store_v16i16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI10_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI10_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <16 x i16> zeroinitializer, ptr %a
@@ -191,7 +206,8 @@ define void @store_v16f16(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI11_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI11_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <16 x half> zeroinitializer, ptr %a
@@ -263,7 +279,8 @@ define void @store_v8i32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI16_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI16_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <8 x i32> zeroinitializer, ptr %a
@@ -279,7 +296,8 @@ define void @store_v8f32(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI17_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI17_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <8 x float> zeroinitializer, ptr %a
@@ -295,8 +313,12 @@ define void @store_v1i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v1i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str xzr, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   store <1 x i64> zeroinitializer, ptr %a
   ret void
@@ -311,8 +333,12 @@ define void @store_v1f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v1f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi d0, #0000000000000000
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str xzr, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
 ; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   store <1 x double> zeroinitializer, ptr %a
   ret void
@@ -355,7 +381,8 @@ define void @store_v4i64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI22_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI22_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <4 x i64> zeroinitializer, ptr %a
@@ -371,7 +398,8 @@ define void @store_v4f64(ptr %a) {
 ;
 ; NONEON-NOSVE-LABEL: store_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI23_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI23_0]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ret
   store <4 x double> zeroinitializer, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll
index 80c9ef87e9b915..a9f4d92b1e6b64 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll
@@ -27,8 +27,12 @@ define void @subvector_v4i8(ptr %in, ptr %out) {
 ;
 ; NONEON-NOSVE-LABEL: subvector_v4i8:
 ; NONEON-NOSVE:       // %bb.0: // %bb1
-; NONEON-NOSVE-NEXT:    ldr w8, [x0]
-; NONEON-NOSVE-NEXT:    str w8, [x1]
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0, #2]
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0]
+; NONEON-NOSVE-NEXT:    strh w8, [x1, #2]
+; NONEON-NOSVE-NEXT:    strb w9, [x1, #1]
+; NONEON-NOSVE-NEXT:    strb w10, [x1]
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i8>, ptr %in
   br label %bb1
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
index 41b68e10e75ded..30682751037fe5 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
@@ -17,8 +17,27 @@ define void @store_trunc_v8i16i8(ptr %ap, ptr %dest) {
 ; NONEON-NOSVE-LABEL: store_trunc_v8i16i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    xtn v0.8b, v0.8h
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i16>, ptr %ap
   %val = trunc <8 x i16> %a to <8 x i8>
@@ -37,9 +56,15 @@ define void @store_trunc_v4i32i8(ptr %ap, ptr %dest) {
 ; NONEON-NOSVE-LABEL: store_trunc_v4i32i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    str s0, [x1]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [x1, #3]
+; NONEON-NOSVE-NEXT:    strb w9, [x1, #2]
+; NONEON-NOSVE-NEXT:    strb w11, [x1, #1]
+; NONEON-NOSVE-NEXT:    strb w10, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i32>, ptr %ap
   %val = trunc <4 x i32> %a to <4 x i8>
@@ -58,8 +83,17 @@ define void @store_trunc_v4i32i16(ptr %ap, ptr %dest) {
 ; NONEON-NOSVE-LABEL: store_trunc_v4i32i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i32>, ptr %ap
   %val = trunc <4 x i32> %a to <4 x i16>
@@ -78,8 +112,13 @@ define void @store_trunc_v2i64i8(ptr %ap, ptr %dest) {
 ; NONEON-NOSVE-LABEL: store_trunc_v2i64i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldr q0, [x0]
-; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i64>, ptr %ap
   %val = trunc <2 x i64> %a to <2 x i32>
@@ -99,10 +138,15 @@ define void @store_trunc_v2i256i64(ptr %ap, ptr %dest) {
 ;
 ; NONEON-NOSVE-LABEL: store_trunc_v2i256i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr d0, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldr d1, [x0]
-; NONEON-NOSVE-NEXT:    mov v1.d[1], v0.d[0]
-; NONEON-NOSVE-NEXT:    str q1, [x1]
+; NONEON-NOSVE-NEXT:    ldr x8, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldr x9, [x0]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i256>, ptr %ap
   %val = trunc <2 x i256> %a to <2 x i64>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
index 8242b4e26d5057..bc046059f0bd59 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
@@ -24,7 +24,41 @@ define <16 x i8> @trunc_v16i16_v16i8(ptr %in) nounwind {
 ; NONEON-NOSVE-LABEL: trunc_v16i16_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = trunc <16 x i16> %a to <16 x i8>
@@ -51,13 +85,125 @@ define void @trunc_v32i16_v32i8(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v32i16_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #208
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #112] // 16-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v3.16b, v2.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #128] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #144] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #160] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w25, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w26, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w29, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w27, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w28, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w23, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w24, [sp, #26]
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w21, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldrh w22, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #91]
+; NONEON-NOSVE-NEXT:    add w9, w28, w28
+; NONEON-NOSVE-NEXT:    ldrh w7, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #89]
+; NONEON-NOSVE-NEXT:    add w9, w26, w26
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #36]
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #192] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w19, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w20, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #87]
+; NONEON-NOSVE-NEXT:    add w9, w24, w24
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #60]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #85]
+; NONEON-NOSVE-NEXT:    add w9, w22, w22
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w6, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    add w5, w13, w13
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #83]
+; NONEON-NOSVE-NEXT:    add w9, w20, w20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #81]
+; NONEON-NOSVE-NEXT:    add w9, w7, w7
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #76]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #111]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #78]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #110]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #72]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #109]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #74]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    add w8, w18, w18
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #68]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #107]
+; NONEON-NOSVE-NEXT:    add w9, w17, w17
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #70]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #106]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldrh w30, [sp, #58]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #105]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #103]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #102]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #101]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #99]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w5, [sp, #95]
+; NONEON-NOSVE-NEXT:    add w5, w30, w30
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #94]
+; NONEON-NOSVE-NEXT:    strb w5, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #97]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #176] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #160] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #144] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #128] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #112] // 16-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #208
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i16>, ptr %in
   %b = trunc <32 x i16> %a to <32 x i8>
@@ -97,20 +243,276 @@ define void @trunc_v64i16_v64i8(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v64i16_v64i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #448
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #416] // 16-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ldp q6, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v2.16b, v3.16b, v2.16b
-; NONEON-NOSVE-NEXT:    uzp1 v3.16b, v5.16b, v4.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v6.16b, v1.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v2.16b, v2.16b, v2.16b
-; NONEON-NOSVE-NEXT:    add v3.16b, v3.16b, v3.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    str x1, [sp, #152] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #432] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #400] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #238]
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #272]
+; NONEON-NOSVE-NEXT:    stp q5, q7, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #230]
+; NONEON-NOSVE-NEXT:    add w21, w8, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #274]
+; NONEON-NOSVE-NEXT:    stp q6, q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #228]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #226]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #224]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #278]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #270]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #268]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #266]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #282]
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #262]
+; NONEON-NOSVE-NEXT:    ldrh w1, [sp, #260]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #286]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #258]
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #254]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #210]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp, #252]
+; NONEON-NOSVE-NEXT:    ldrh w6, [sp, #250]
+; NONEON-NOSVE-NEXT:    ldrh w7, [sp, #248]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #212]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldrh w19, [sp, #246]
+; NONEON-NOSVE-NEXT:    ldrh w20, [sp, #244]
+; NONEON-NOSVE-NEXT:    ldrh w22, [sp, #242]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #218]
+; NONEON-NOSVE-NEXT:    ldrh w23, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrh w24, [sp, #174]
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #384] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #220]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #222]
+; NONEON-NOSVE-NEXT:    ldrh w25, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrh w26, [sp, #170]
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #368] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #178]
+; NONEON-NOSVE-NEXT:    ldrh w27, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrh w28, [sp, #166]
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #352] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #182]
+; NONEON-NOSVE-NEXT:    ldrh w29, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrh w30, [sp, #162]
+; NONEON-NOSVE-NEXT:    strb w21, [sp, #335]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #186]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #188]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #190]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #194]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #196]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #198]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #202]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #204]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #206]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #236]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w8, [sp, #20] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #234]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #334]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #333]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #331]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #330]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #329]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #328]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #327]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #326]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #325]
+; NONEON-NOSVE-NEXT:    add w8, w18, w18
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #324]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #323]
+; NONEON-NOSVE-NEXT:    add w8, w1, w1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #322]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #321]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #319]
+; NONEON-NOSVE-NEXT:    add w8, w5, w5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #318]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #317]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #315]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #314]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #313]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #312]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #311]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #310]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #309]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #307]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #306]
+; NONEON-NOSVE-NEXT:    add w8, w30, w30
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #305]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #432] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #416] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #400] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #384] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #303]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #368] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #352] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #302]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #301]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #299]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #298]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #297]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #295]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #294]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #293]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #68] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #291]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #76] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #290]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #289]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #84] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #88] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [sp, #288]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #351]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #92] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #350]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #349]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #100] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #348]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #104] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #347]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #108] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #346]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #345]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #116] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #120] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #343]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #124] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #342]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #341]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #132] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #340]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #136] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #339]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #140] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #338]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #337]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #148] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #152] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [sp, #320]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x8]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x8, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #448
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <64 x i16>, ptr %in
   %b = trunc <64 x i16> %a to <64 x i8>
@@ -172,34 +574,598 @@ define void @trunc_v128i16_v128i8(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v128i16_v128i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #192]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #224]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #128]
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    ldp q16, q1, [x0, #160]
-; NONEON-NOSVE-NEXT:    uzp1 v4.16b, v5.16b, v4.16b
-; NONEON-NOSVE-NEXT:    ldp q17, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    uzp1 v6.16b, v7.16b, v6.16b
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #800
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    str x1, [sp, #408] // 8-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q18, q7, [x0, #96]
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v16.16b, v1.16b
-; NONEON-NOSVE-NEXT:    uzp1 v5.16b, v17.16b, v5.16b
-; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v2.16b, v3.16b, v2.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v4.16b, v4.16b, v4.16b
-; NONEON-NOSVE-NEXT:    uzp1 v7.16b, v18.16b, v7.16b
-; NONEON-NOSVE-NEXT:    add v3.16b, v6.16b, v6.16b
-; NONEON-NOSVE-NEXT:    uzp1 v6.16b, v17.16b, v16.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #96]
-; NONEON-NOSVE-NEXT:    add v0.16b, v5.16b, v5.16b
-; NONEON-NOSVE-NEXT:    add v2.16b, v2.16b, v2.16b
-; NONEON-NOSVE-NEXT:    add v4.16b, v7.16b, v7.16b
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #64]
-; NONEON-NOSVE-NEXT:    add v1.16b, v6.16b, v6.16b
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q2, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #96]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #192]
+; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #160]
+; NONEON-NOSVE-NEXT:    ldp q21, q20, [x0, #128]
+; NONEON-NOSVE-NEXT:    ldp q23, q22, [x0, #224]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #592]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #606]
+; NONEON-NOSVE-NEXT:    str q19, [sp, #496]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #600]
+; NONEON-NOSVE-NEXT:    stp q18, q20, [sp, #512]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #598]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #596]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp q17, q23, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #594]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #592]
+; NONEON-NOSVE-NEXT:    stp q22, q16, [sp, #464]
+; NONEON-NOSVE-NEXT:    ldr w30, [sp, #64] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    str w8, [sp, #404] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #434]
+; NONEON-NOSVE-NEXT:    stp q4, q6, [sp, #560]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #400] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #436]
+; NONEON-NOSVE-NEXT:    str q5, [sp, #544]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #396] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #438]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #608]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #392] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #440]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #638]
+; NONEON-NOSVE-NEXT:    stp q7, q21, [sp, #640]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #636]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #634]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #388] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #442]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #666]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #632]
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #630]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #384] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #444]
+; NONEON-NOSVE-NEXT:    ldrh w1, [sp, #628]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #626]
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #624]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #622]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #380] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #446]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp, #620]
+; NONEON-NOSVE-NEXT:    ldrh w6, [sp, #618]
+; NONEON-NOSVE-NEXT:    ldrh w7, [sp, #616]
+; NONEON-NOSVE-NEXT:    ldrh w19, [sp, #614]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #376] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #480]
+; NONEON-NOSVE-NEXT:    ldrh w20, [sp, #612]
+; NONEON-NOSVE-NEXT:    ldrh w21, [sp, #610]
+; NONEON-NOSVE-NEXT:    ldrh w22, [sp, #608]
+; NONEON-NOSVE-NEXT:    ldrh w23, [sp, #430]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #372] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #482]
+; NONEON-NOSVE-NEXT:    ldrh w24, [sp, #428]
+; NONEON-NOSVE-NEXT:    ldrh w25, [sp, #426]
+; NONEON-NOSVE-NEXT:    ldrh w26, [sp, #424]
+; NONEON-NOSVE-NEXT:    ldrh w27, [sp, #422]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #368] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #484]
+; NONEON-NOSVE-NEXT:    ldrh w28, [sp, #420]
+; NONEON-NOSVE-NEXT:    ldrh w29, [sp, #418]
+; NONEON-NOSVE-NEXT:    strb w30, [sp, #767]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #364] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #486]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #360] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #488]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #356] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #490]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #352] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #492]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #348] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #494]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #344] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #448]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #340] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #450]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #336] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #452]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #332] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #454]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #328] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #456]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #324] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #458]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #320] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #460]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #316] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #462]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #312] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #464]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #308] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #466]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #468]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #300] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #470]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #296] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #472]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #292] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #474]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #476]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #284] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #478]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #280] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #656]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #276] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #658]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #660]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #268] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #662]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #264] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #664]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #260] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #668]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #252] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #670]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #528]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #244] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #530]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #532]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #236] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #534]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #536]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #228] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #538]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #540]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #220] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #542]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #496]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #212] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #498]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #500]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #204] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #502]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #504]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #196] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #506]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #508]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #188] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #510]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #512]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #180] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #514]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #516]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #172] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #518]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #520]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #164] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #522]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #524]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #156] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #526]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #640]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #148] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #642]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #644]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #140] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #646]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #648]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #132] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #650]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #652]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #124] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #654]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #576]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #116] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #578]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #580]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #108] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #582]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #584]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #100] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #586]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #588]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #92] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #590]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #544]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #84] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #546]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #548]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #76] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #550]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #552]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #68] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #554]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #556]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #558]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #560]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #562]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #564]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #566]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #568]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #570]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #572]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #574]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #416]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #602]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #604]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #765]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #764]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #763]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #762]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #761]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #760]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #759]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #758]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #757]
+; NONEON-NOSVE-NEXT:    add w8, w18, w18
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #756]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #755]
+; NONEON-NOSVE-NEXT:    add w8, w1, w1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #754]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #753]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #752]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #751]
+; NONEON-NOSVE-NEXT:    add w8, w5, w5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #750]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #749]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #748]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #747]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #746]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #745]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #744]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #743]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #742]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #741]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #740]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #739]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #738]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #737]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #766]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #736]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #736]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #735]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #734]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #733]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #732]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #731]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #730]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #729]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #728]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #727]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #726]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #725]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #68] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #724]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #723]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #76] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #722]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #721]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #84] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #720]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #88] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #783]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #92] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #782]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #781]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #100] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #780]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #104] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #779]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #108] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #778]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #777]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #116] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #776]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #120] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #775]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #124] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #774]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #773]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #132] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #772]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #136] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #771]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #140] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #770]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #769]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #148] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #768]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #152] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #719]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #156] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #718]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #717]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #164] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #716]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #168] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #715]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #172] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #714]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #713]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #180] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #712]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #184] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #711]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #188] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #710]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #709]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #196] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #708]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #200] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #707]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #204] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #706]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #208] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #705]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #212] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #704]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #216] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q6, q3, [sp, #704]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #799]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #220] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #798]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #224] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #797]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #228] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #796]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #232] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #795]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #236] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #794]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #240] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #793]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #244] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #792]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #248] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #791]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #252] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #790]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #256] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #789]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #260] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #788]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #264] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #787]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #268] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #786]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #272] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #785]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #276] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #784]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #280] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q4, q7, [sp, #768]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #687]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #284] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #686]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #288] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #685]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #292] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #684]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #296] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #683]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #300] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #682]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #304] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #681]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #308] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #680]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #312] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #679]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #316] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #678]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #320] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #677]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #324] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #676]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #328] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #675]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #332] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #674]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #336] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #673]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #340] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #672]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #344] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #703]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #348] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #702]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #352] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #701]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #356] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #700]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #360] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #699]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #364] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #698]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #368] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #697]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #372] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #696]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #376] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #695]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #380] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #694]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #384] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #693]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #388] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #692]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #392] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #691]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #396] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #690]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #400] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #689]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #404] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #688]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #408] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #672]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x8]
+; NONEON-NOSVE-NEXT:    stp q4, q3, [x8, #32]
+; NONEON-NOSVE-NEXT:    stp q7, q6, [x8, #64]
+; NONEON-NOSVE-NEXT:    stp q2, q5, [x8, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #800
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <128 x i16>, ptr %in
   %b = trunc <128 x i16> %a to <128 x i8>
@@ -227,8 +1193,21 @@ define <8 x i8> @trunc_v8i32_v8i8(ptr %in) nounwind {
 ; NONEON-NOSVE-LABEL: trunc_v8i32_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    xtn v0.8b, v0.8h
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   %b = trunc <8 x i32> %a to <8 x i8>
@@ -256,11 +1235,38 @@ define <16 x i8> @trunc_v16i32_v16i8(ptr %in) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v16i32_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #77]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #75]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #73]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #71]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #69]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #67]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i32>, ptr %in
   %b = trunc <16 x i32> %a to <16 x i8>
@@ -302,19 +1308,113 @@ define void @trunc_v32i32_v32i8(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v32i32_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #272
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #192] // 16-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v5.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v7.8h, v6.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v3.16b, v1.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #208] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #224] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #240] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp q5, q7, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp w27, w28, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp w25, w26, [sp, #104]
+; NONEON-NOSVE-NEXT:    add w6, w8, w8
+; NONEON-NOSVE-NEXT:    add w5, w9, w9
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #256] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w10, w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp w23, w24, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp w21, w22, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp q6, q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp w19, w20, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #155]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #154]
+; NONEON-NOSVE-NEXT:    add w9, w27, w27
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #153]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    add w9, w25, w25
+; NONEON-NOSVE-NEXT:    ldp w4, w7, [sp, #56]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #151]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #150]
+; NONEON-NOSVE-NEXT:    add w9, w23, w23
+; NONEON-NOSVE-NEXT:    ldp w2, w3, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #149]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #148]
+; NONEON-NOSVE-NEXT:    add w9, w21, w21
+; NONEON-NOSVE-NEXT:    ldp w18, w0, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #147]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #146]
+; NONEON-NOSVE-NEXT:    add w9, w19, w19
+; NONEON-NOSVE-NEXT:    ldp w16, w17, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #145]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    add w9, w4, w4
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #72]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #175]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #174]
+; NONEON-NOSVE-NEXT:    add w9, w2, w2
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #64]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #173]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #172]
+; NONEON-NOSVE-NEXT:    add w9, w18, w18
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp, #136]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #171]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #170]
+; NONEON-NOSVE-NEXT:    add w9, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #169]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    add w9, w14, w14
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #167]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #166]
+; NONEON-NOSVE-NEXT:    add w9, w12, w12
+; NONEON-NOSVE-NEXT:    ldp w29, w30, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #165]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #164]
+; NONEON-NOSVE-NEXT:    add w9, w10, w10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #163]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w5, [sp, #159]
+; NONEON-NOSVE-NEXT:    add w5, w30, w30
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #158]
+; NONEON-NOSVE-NEXT:    add w6, w29, w29
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w5, [sp, #157]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #256] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #240] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #161]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #224] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #208] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #176] // 16-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #272
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i32>, ptr %in
   %b = trunc <32 x i32> %a to <32 x i8>
@@ -383,32 +1483,273 @@ define void @trunc_v64i32_v64i8(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v64i32_v64i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #128]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #160]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #192]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #224]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v4.8h, v5.8h, v4.8h
-; NONEON-NOSVE-NEXT:    ldp q17, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    uzp1 v6.8h, v7.8h, v6.8h
-; NONEON-NOSVE-NEXT:    ldp q16, q7, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #96]
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v3.8h, v1.8h
-; NONEON-NOSVE-NEXT:    uzp1 v5.8h, v17.8h, v5.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
-; NONEON-NOSVE-NEXT:    uzp1 v7.8h, v16.8h, v7.8h
-; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v19.8h, v18.8h
-; NONEON-NOSVE-NEXT:    uzp1 v2.16b, v4.16b, v6.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v1.16b, v7.16b
-; NONEON-NOSVE-NEXT:    uzp1 v3.16b, v5.16b, v3.16b
-; NONEON-NOSVE-NEXT:    add v2.16b, v2.16b, v2.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add v3.16b, v3.16b, v3.16b
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #480
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #96]
+; NONEON-NOSVE-NEXT:    str x1, [sp, #152] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #128]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0]
+; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #224]
+; NONEON-NOSVE-NEXT:    ldp q21, q20, [x0, #192]
+; NONEON-NOSVE-NEXT:    ldp q23, q22, [x0, #160]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    str q18, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #304]
+; NONEON-NOSVE-NEXT:    stp q21, q19, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #292]
+; NONEON-NOSVE-NEXT:    add w20, w8, w8
+; NONEON-NOSVE-NEXT:    stp q20, q23, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #288]
+; NONEON-NOSVE-NEXT:    stp q22, q16, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr w22, [sp, #312]
+; NONEON-NOSVE-NEXT:    stp q3, q17, [sp, #384]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #400]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #404]
+; NONEON-NOSVE-NEXT:    str q7, [sp, #160]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr w18, [sp, #396]
+; NONEON-NOSVE-NEXT:    ldr w0, [sp, #392]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #408]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #412]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #332]
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #324]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr w1, [sp, #388]
+; NONEON-NOSVE-NEXT:    ldr w2, [sp, #384]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldr w3, [sp, #348]
+; NONEON-NOSVE-NEXT:    ldr w4, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldr w5, [sp, #340]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w6, [sp, #336]
+; NONEON-NOSVE-NEXT:    stp q6, q5, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldr w7, [sp, #380]
+; NONEON-NOSVE-NEXT:    ldr w19, [sp, #376]
+; NONEON-NOSVE-NEXT:    ldr w21, [sp, #372]
+; NONEON-NOSVE-NEXT:    ldr w23, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldr w24, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldr w26, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldr w27, [sp, #352]
+; NONEON-NOSVE-NEXT:    strb w20, [sp, #463]
+; NONEON-NOSVE-NEXT:    add w20, w22, w22
+; NONEON-NOSVE-NEXT:    strb w20, [sp, #462]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldp w29, w28, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #248]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #232]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #200]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldp w8, w30, [sp, #160]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #20] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #461]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #460]
+; NONEON-NOSVE-NEXT:    add w8, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #459]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #458]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #457]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #456]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #455]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #454]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #453]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #452]
+; NONEON-NOSVE-NEXT:    add w8, w18, w18
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #451]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #450]
+; NONEON-NOSVE-NEXT:    add w8, w1, w1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #449]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #448]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #447]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #446]
+; NONEON-NOSVE-NEXT:    add w8, w5, w5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #445]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #444]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #443]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #442]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #441]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #440]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #439]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #438]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #437]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #436]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #435]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #434]
+; NONEON-NOSVE-NEXT:    add w8, w30, w30
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #433]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #431]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #430]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #429]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #428]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #427]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #426]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #425]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #424]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #423]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #422]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #421]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #68] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #420]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #419]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #76] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #418]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #417]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #84] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #88] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [sp, #416]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #479]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #92] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #478]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #477]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #100] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #476]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #104] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #475]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #108] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #474]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #473]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #116] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #472]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #120] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #471]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #124] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #470]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #469]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #132] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #468]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #136] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #467]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #140] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #466]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #465]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #148] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #464]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #152] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [sp, #448]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x8]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x8, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #480
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <64 x i32>, ptr %in
   %b = trunc <64 x i32> %a to <64 x i8>
@@ -435,7 +1776,21 @@ define <8 x i16> @trunc_v8i32_v8i16(ptr %in) nounwind {
 ; NONEON-NOSVE-LABEL: trunc_v8i32_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   %b = trunc <8 x i32> %a to <8 x i16>
@@ -462,13 +1817,54 @@ define void @trunc_v16i32_v16i16(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v16i32_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
 ; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp w2, w3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w4, w5, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldp w18, w0, [sp]
+; NONEON-NOSVE-NEXT:    ldp w16, w17, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w9, w2, w2
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w5, w5
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w9, w4, w4
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w9, w18, w18
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w9, w16, w16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #92]
+; NONEON-NOSVE-NEXT:    add w9, w14, w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    add w9, w12, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #84]
+; NONEON-NOSVE-NEXT:    add w9, w10, w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i32>, ptr %in
   %b = trunc <16 x i32> %a to <16 x i16>
@@ -508,20 +1904,115 @@ define void @trunc_v32i32_v32i16(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v32i32_v32i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #304
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #224] // 16-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    ldp q6, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v5.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v6.8h, v1.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v2.8h, v2.8h, v2.8h
-; NONEON-NOSVE-NEXT:    add v3.8h, v3.8h, v3.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #240] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #256] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #272] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp q5, q7, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp w27, w28, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp w25, w26, [sp, #104]
+; NONEON-NOSVE-NEXT:    add w6, w8, w8
+; NONEON-NOSVE-NEXT:    add w5, w9, w9
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #288] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w10, w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp w23, w24, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp w21, w22, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp q6, q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp w19, w20, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #208] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #182]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #180]
+; NONEON-NOSVE-NEXT:    add w9, w27, w27
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #178]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    add w9, w25, w25
+; NONEON-NOSVE-NEXT:    ldp w4, w7, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #172]
+; NONEON-NOSVE-NEXT:    add w9, w23, w23
+; NONEON-NOSVE-NEXT:    ldp w2, w3, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    add w9, w21, w21
+; NONEON-NOSVE-NEXT:    ldp w18, w0, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #164]
+; NONEON-NOSVE-NEXT:    add w9, w19, w19
+; NONEON-NOSVE-NEXT:    ldp w16, w17, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    add w9, w4, w4
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #72]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #156]
+; NONEON-NOSVE-NEXT:    add w9, w2, w2
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    add w9, w18, w18
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp, #136]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #148]
+; NONEON-NOSVE-NEXT:    add w9, w16, w16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    add w9, w14, w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #206]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #204]
+; NONEON-NOSVE-NEXT:    add w9, w12, w12
+; NONEON-NOSVE-NEXT:    ldp w29, w30, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #202]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #200]
+; NONEON-NOSVE-NEXT:    add w9, w10, w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #198]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #196]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w5, [sp, #190]
+; NONEON-NOSVE-NEXT:    add w5, w30, w30
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w6, [sp, #188]
+; NONEON-NOSVE-NEXT:    add w6, w29, w29
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strh w5, [sp, #186]
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [sp, #144]
+; NONEON-NOSVE-NEXT:    strh w6, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #288] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #194]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #272] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #256] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #240] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #224] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #208] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #304
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i32>, ptr %in
   %b = trunc <32 x i32> %a to <32 x i16>
@@ -583,34 +2074,276 @@ define void @trunc_v64i32_v64i16(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v64i32_v64i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #192]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #224]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #128]
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    ldp q16, q1, [x0, #160]
-; NONEON-NOSVE-NEXT:    uzp1 v4.8h, v5.8h, v4.8h
-; NONEON-NOSVE-NEXT:    ldp q17, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    uzp1 v6.8h, v7.8h, v6.8h
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #528
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    mov x5, x1
+; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #192]
+; NONEON-NOSVE-NEXT:    ldp q23, q22, [x0, #224]
 ; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q18, q7, [x0, #96]
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v16.8h, v1.8h
-; NONEON-NOSVE-NEXT:    uzp1 v5.8h, v17.8h, v5.8h
-; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v4.8h, v4.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp1 v7.8h, v18.8h, v7.8h
-; NONEON-NOSVE-NEXT:    add v3.8h, v6.8h, v6.8h
-; NONEON-NOSVE-NEXT:    uzp1 v6.8h, v17.8h, v16.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #96]
-; NONEON-NOSVE-NEXT:    add v0.8h, v5.8h, v5.8h
-; NONEON-NOSVE-NEXT:    add v2.8h, v2.8h, v2.8h
-; NONEON-NOSVE-NEXT:    add v4.8h, v7.8h, v7.8h
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #64]
-; NONEON-NOSVE-NEXT:    add v1.8h, v6.8h, v6.8h
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q2, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #96]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #160]
+; NONEON-NOSVE-NEXT:    ldp q21, q20, [x0, #128]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    stp q17, q23, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #320]
+; NONEON-NOSVE-NEXT:    stp q22, q16, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr w23, [sp, #328]
+; NONEON-NOSVE-NEXT:    add w21, w8, w8
+; NONEON-NOSVE-NEXT:    stp q18, q20, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    stp q7, q21, [sp, #368]
+; NONEON-NOSVE-NEXT:    str q19, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr w29, [sp, #380]
+; NONEON-NOSVE-NEXT:    ldr w30, [sp, #376]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp q4, q6, [sp, #288]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldr w3, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldr w4, [sp, #296]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #348]
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #344]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #340]
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #336]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w6, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldr w7, [sp, #288]
+; NONEON-NOSVE-NEXT:    str q5, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #316]
+; NONEON-NOSVE-NEXT:    ldr w26, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldr w19, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldr w20, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldr w22, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldr w24, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr w27, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldr w28, [sp, #304]
+; NONEON-NOSVE-NEXT:    strh w21, [sp, #494]
+; NONEON-NOSVE-NEXT:    add w21, w23, w23
+; NONEON-NOSVE-NEXT:    strh w21, [sp, #492]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldp w0, w18, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldp w2, w1, [sp, #144]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #200]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #384]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #388]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #392]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #396]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #232]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #240]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #248]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #372]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #324]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #364]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #490]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #488]
+; NONEON-NOSVE-NEXT:    add w8, w9, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #486]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #484]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #482]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #480]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #478]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #476]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #474]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #472]
+; NONEON-NOSVE-NEXT:    add w8, w18, w18
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #470]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #468]
+; NONEON-NOSVE-NEXT:    add w8, w1, w1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #466]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #464]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #462]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #460]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #458]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #456]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #454]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #452]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #450]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #448]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #510]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #508]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #506]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #504]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #502]
+; NONEON-NOSVE-NEXT:    add w8, w30, w30
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #500]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #464]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #498]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #496]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #446]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #444]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #442]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #440]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #438]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #436]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #434]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q6, q3, [sp, #432]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #526]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #524]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #522]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #520]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #518]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #68] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #516]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #514]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #76] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #512]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q4, q7, [sp, #496]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #414]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #84] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #412]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #88] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #410]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #92] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #408]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #406]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #100] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #404]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #104] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #402]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #108] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #400]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #430]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #116] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #428]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #120] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #426]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #124] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #424]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #422]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #132] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #420]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #136] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #418]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #140] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #400]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x5]
+; NONEON-NOSVE-NEXT:    stp q4, q3, [x5, #32]
+; NONEON-NOSVE-NEXT:    stp q7, q6, [x5, #64]
+; NONEON-NOSVE-NEXT:    stp q2, q5, [x5, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #528
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <64 x i32>, ptr %in
   %b = trunc <64 x i32> %a to <64 x i16>
@@ -639,8 +2372,15 @@ define <4 x i8> @trunc_v4i64_v4i8(ptr %in) nounwind {
 ; NONEON-NOSVE-LABEL: trunc_v4i64_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i64>, ptr %in
   %b = trunc <4 x i64> %a to <4 x i8>
@@ -669,12 +2409,27 @@ define <8 x i8> @trunc_v8i64_v8i8(ptr %in) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v8i64_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
-; NONEON-NOSVE-NEXT:    xtn v0.8b, v0.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #77]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #75]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i64>, ptr %in
   %b = trunc <8 x i64> %a to <8 x i8>
@@ -717,17 +2472,47 @@ define <16 x i8> @trunc_v16i64_v16i8(ptr %in) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v16i64_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #64]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp1 v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    uzp1 v3.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v4.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v3.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    sub sp, sp, #144
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #96]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #80]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    stp q7, q5, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q4, q6, [sp, #96]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #143]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldp x8, x11, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #141]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #120]
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #139]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #137]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #112]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #72]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #135]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #104]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #133]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #131]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #129]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    add sp, sp, #144
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i64>, ptr %in
   %b = trunc <16 x i64> %a to <16 x i8>
@@ -798,31 +2583,139 @@ define void @trunc_v32i64_v32i8(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v32i64_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #224]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #192]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #96]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #128]
-; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #160]
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q21, q20, [x0, #64]
-; NONEON-NOSVE-NEXT:    uzp1 v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    uzp1 v16.4s, v17.4s, v16.4s
-; NONEON-NOSVE-NEXT:    uzp1 v5.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v7.4s, v19.4s, v18.4s
-; NONEON-NOSVE-NEXT:    uzp1 v6.4s, v21.4s, v20.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v4.8h, v16.8h
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v2.8h, v7.8h
-; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v6.8h, v5.8h
-; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v2.16b, v3.16b
-; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
-; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #416
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #336] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #352] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #368] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #384] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #128]
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #400] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0]
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #320] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #224]
+; NONEON-NOSVE-NEXT:    str x1, [sp, #24] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q21, q20, [x0, #192]
+; NONEON-NOSVE-NEXT:    ldp q23, q22, [x0, #160]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp q21, q19, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr w26, [sp, #216]
+; NONEON-NOSVE-NEXT:    add w5, w9, w9
+; NONEON-NOSVE-NEXT:    add w6, w8, w8
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp q20, q23, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldr w18, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q22, q16, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q6, q5, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr w3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q3, q17, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr w23, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr w21, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr w27, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr w28, [sp, #264]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #298]
+; NONEON-NOSVE-NEXT:    ldr w24, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldr w22, [sp, #232]
+; NONEON-NOSVE-NEXT:    add w9, w27, w27
+; NONEON-NOSVE-NEXT:    str w8, [sp, #20] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #200]
+; NONEON-NOSVE-NEXT:    str q7, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w0, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #112]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #296]
+; NONEON-NOSVE-NEXT:    add w9, w25, w25
+; NONEON-NOSVE-NEXT:    str q18, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w19, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w20, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #299]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    ldr w4, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #294]
+; NONEON-NOSVE-NEXT:    add w9, w23, w23
+; NONEON-NOSVE-NEXT:    ldr w7, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #297]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #56]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #292]
+; NONEON-NOSVE-NEXT:    add w9, w21, w21
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #144]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #295]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #136]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #290]
+; NONEON-NOSVE-NEXT:    add w9, w19, w19
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #120]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #293]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #152]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #288]
+; NONEON-NOSVE-NEXT:    add w9, w4, w4
+; NONEON-NOSVE-NEXT:    ldr w1, [sp, #280]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #291]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    ldr w29, [sp, #160]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #318]
+; NONEON-NOSVE-NEXT:    add w9, w2, w2
+; NONEON-NOSVE-NEXT:    ldr w30, [sp, #168]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #289]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #316]
+; NONEON-NOSVE-NEXT:    add w9, w18, w18
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #319]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #314]
+; NONEON-NOSVE-NEXT:    add w9, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #317]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #312]
+; NONEON-NOSVE-NEXT:    add w9, w14, w14
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #315]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #310]
+; NONEON-NOSVE-NEXT:    add w9, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #313]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #308]
+; NONEON-NOSVE-NEXT:    add w9, w10, w10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #311]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #306]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #309]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w5, [sp, #303]
+; NONEON-NOSVE-NEXT:    add w5, w30, w30
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #302]
+; NONEON-NOSVE-NEXT:    add w6, w29, w29
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #307]
+; NONEON-NOSVE-NEXT:    add w8, w1, w1
+; NONEON-NOSVE-NEXT:    strb w5, [sp, #301]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #400] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #384] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #305]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #368] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #352] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #336] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x8]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #320] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #416
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i64>, ptr %in
   %b = trunc <32 x i64> %a to <32 x i8>
@@ -850,8 +2743,15 @@ define <4 x i16> @trunc_v4i64_v4i16(ptr %in) nounwind {
 ; NONEON-NOSVE-LABEL: trunc_v4i64_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i64>, ptr %in
   %b = trunc <4 x i64> %a to <4 x i16>
@@ -879,11 +2779,27 @@ define <8 x i16> @trunc_v8i64_v8i16(ptr %in) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v8i64_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #74]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #70]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i64>, ptr %in
   %b = trunc <8 x i64> %a to <8 x i16>
@@ -925,19 +2841,66 @@ define void @trunc_v16i64_v16i16(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v16i64_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
 ; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp1 v3.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v3.8h, v1.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w2, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w3, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp q5, q7, [sp]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldr w4, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w5, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp q6, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w18, [sp]
+; NONEON-NOSVE-NEXT:    ldr w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #142]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #138]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #134]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    add w8, w18, w18
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #130]
+; NONEON-NOSVE-NEXT:    add w9, w17, w17
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #120]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #158]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #154]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #150]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #146]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #128]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i64>, ptr %in
   %b = trunc <16 x i64> %a to <16 x i16>
@@ -1006,32 +2969,140 @@ define void @trunc_v32i64_v32i16(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v32i64_v32i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #128]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #160]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #192]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #224]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    ldp q3, q1, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    ldp q17, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    uzp1 v6.4s, v7.4s, v6.4s
-; NONEON-NOSVE-NEXT:    ldp q16, q7, [x0, #32]
-; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #96]
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v3.4s, v1.4s
-; NONEON-NOSVE-NEXT:    uzp1 v5.4s, v17.4s, v5.4s
-; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
-; NONEON-NOSVE-NEXT:    uzp1 v7.4s, v16.4s, v7.4s
-; NONEON-NOSVE-NEXT:    uzp1 v3.4s, v19.4s, v18.4s
-; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v4.8h, v6.8h
-; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
-; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v1.8h, v7.8h
-; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v5.8h, v3.8h
-; NONEON-NOSVE-NEXT:    add v2.8h, v2.8h, v2.8h
-; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    add v3.8h, v3.8h, v3.8h
-; NONEON-NOSVE-NEXT:    stp q1, q3, [x1]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #432
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #352] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #368] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #384] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #128]
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #400] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #416] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0]
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #336] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #224]
+; NONEON-NOSVE-NEXT:    ldp q21, q20, [x0, #192]
+; NONEON-NOSVE-NEXT:    ldp q23, q22, [x0, #160]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp q21, q19, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr w26, [sp, #200]
+; NONEON-NOSVE-NEXT:    add w6, w8, w8
+; NONEON-NOSVE-NEXT:    add w5, w9, w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp q20, q23, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w3, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldr w18, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp q22, q16, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp q6, q5, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr w0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q17, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr w23, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr w24, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldr w27, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr w28, [sp, #248]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldr w21, [sp, #208]
+; NONEON-NOSVE-NEXT:    add w9, w27, w27
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    str q7, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w22, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #304]
+; NONEON-NOSVE-NEXT:    add w9, w25, w25
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #310]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    ldr w19, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #306]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    ldr w20, [sp, #24]
+; NONEON-NOSVE-NEXT:    str q18, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #120]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #302]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    ldr w4, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #300]
+; NONEON-NOSVE-NEXT:    add w9, w23, w23
+; NONEON-NOSVE-NEXT:    ldr w7, [sp, #72]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #298]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #296]
+; NONEON-NOSVE-NEXT:    add w9, w21, w21
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #104]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #294]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #128]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #292]
+; NONEON-NOSVE-NEXT:    add w9, w19, w19
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #136]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #290]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    ldr w29, [sp, #144]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #288]
+; NONEON-NOSVE-NEXT:    add w9, w4, w4
+; NONEON-NOSVE-NEXT:    ldr w30, [sp, #152]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #286]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #284]
+; NONEON-NOSVE-NEXT:    add w9, w2, w2
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #282]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    add w9, w18, w18
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #278]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #276]
+; NONEON-NOSVE-NEXT:    add w9, w16, w16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #274]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #272]
+; NONEON-NOSVE-NEXT:    add w9, w14, w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #334]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #332]
+; NONEON-NOSVE-NEXT:    add w9, w12, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #330]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #328]
+; NONEON-NOSVE-NEXT:    add w9, w10, w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #326]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #324]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w5, [sp, #318]
+; NONEON-NOSVE-NEXT:    add w5, w30, w30
+; NONEON-NOSVE-NEXT:    strh w6, [sp, #316]
+; NONEON-NOSVE-NEXT:    add w6, w29, w29
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strh w5, [sp, #314]
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [sp, #272]
+; NONEON-NOSVE-NEXT:    strh w6, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #416] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #322]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #400] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #384] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #368] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #352] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #336] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #432
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i64>, ptr %in
   %b = trunc <32 x i64> %a to <32 x i16>
@@ -1058,7 +3129,13 @@ define <4 x i32> @trunc_v4i64_v4i32(ptr %in) nounwind {
 ; NONEON-NOSVE-LABEL: trunc_v4i64_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i64>, ptr %in
   %b = trunc <4 x i64> %a to <4 x i32>
@@ -1085,13 +3162,34 @@ define void @trunc_v8i64_v8i32(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v8i64_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
 ; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp]
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i64>, ptr %in
   %b = trunc <8 x i64> %a to <8 x i32>
@@ -1131,20 +3229,60 @@ define void @trunc_v16i64_v16i32(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v16i64_v16i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #192
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
 ; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ldp q6, q1, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    uzp1 v3.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v6.4s, v1.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v2.4s, v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    add v3.4s, v3.4s, v3.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w2, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w3, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp q5, q7, [sp]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldr w4, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w5, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp q6, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w18, [sp]
+; NONEON-NOSVE-NEXT:    ldr w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    add w8, w18, w18
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    add w9, w17, w17
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [sp, #160]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #192
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i64>, ptr %in
   %b = trunc <16 x i64> %a to <16 x i32>
@@ -1206,34 +3344,145 @@ define void @trunc_v32i64_v32i32(ptr %in, ptr %out) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: trunc_v32i64_v32i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #192]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #224]
-; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #128]
-; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; NONEON-NOSVE-NEXT:    ldp q16, q1, [x0, #160]
-; NONEON-NOSVE-NEXT:    uzp1 v4.4s, v5.4s, v4.4s
-; NONEON-NOSVE-NEXT:    ldp q17, q5, [x0, #64]
-; NONEON-NOSVE-NEXT:    uzp1 v6.4s, v7.4s, v6.4s
+; NONEON-NOSVE-NEXT:    sub sp, sp, #496
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #416] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #192]
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #432] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #448] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q23, q22, [x0, #224]
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #464] // 16-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldp q18, q7, [x0, #96]
-; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v16.4s, v1.4s
-; NONEON-NOSVE-NEXT:    uzp1 v5.4s, v17.4s, v5.4s
-; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #32]
-; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v0.4s
-; NONEON-NOSVE-NEXT:    add v4.4s, v4.4s, v4.4s
-; NONEON-NOSVE-NEXT:    uzp1 v7.4s, v18.4s, v7.4s
-; NONEON-NOSVE-NEXT:    add v3.4s, v6.4s, v6.4s
-; NONEON-NOSVE-NEXT:    uzp1 v6.4s, v17.4s, v16.4s
-; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #96]
-; NONEON-NOSVE-NEXT:    add v0.4s, v5.4s, v5.4s
-; NONEON-NOSVE-NEXT:    add v2.4s, v2.4s, v2.4s
-; NONEON-NOSVE-NEXT:    add v4.4s, v7.4s, v7.4s
-; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #64]
-; NONEON-NOSVE-NEXT:    add v1.4s, v6.4s, v6.4s
-; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #32]
-; NONEON-NOSVE-NEXT:    stp q2, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #480] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #400] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #160]
+; NONEON-NOSVE-NEXT:    ldp q21, q20, [x0, #128]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp q17, q23, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q4, q6, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w6, w8, w8
+; NONEON-NOSVE-NEXT:    add w5, w9, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp q18, q20, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr w26, [sp, #168]
+; NONEON-NOSVE-NEXT:    str q5, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr w21, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr w22, [sp, #184]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr w23, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr w24, [sp, #152]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr w4, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldr w27, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldr w28, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp q22, q16, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w9, [sp, #344]
+; NONEON-NOSVE-NEXT:    add w9, w27, w27
+; NONEON-NOSVE-NEXT:    str w8, [sp, #348]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    ldr w7, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp q7, q21, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr w18, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr w0, [sp, #136]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #340]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    ldr w19, [sp, #240]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #336]
+; NONEON-NOSVE-NEXT:    add w9, w25, w25
+; NONEON-NOSVE-NEXT:    ldr w20, [sp, #248]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #256]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #328]
+; NONEON-NOSVE-NEXT:    add w9, w23, w23
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #264]
+; NONEON-NOSVE-NEXT:    str q19, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #72]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #324]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    ldr w2, [sp, #96]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #320]
+; NONEON-NOSVE-NEXT:    add w9, w21, w21
+; NONEON-NOSVE-NEXT:    ldr w3, [sp, #104]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #380]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #56]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #376]
+; NONEON-NOSVE-NEXT:    add w9, w19, w19
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #80]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #372]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #368]
+; NONEON-NOSVE-NEXT:    add w9, w4, w4
+; NONEON-NOSVE-NEXT:    ldr w29, [sp, #224]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    ldr w30, [sp, #232]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #312]
+; NONEON-NOSVE-NEXT:    add w9, w2, w2
+; NONEON-NOSVE-NEXT:    str w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    str w9, [sp, #304]
+; NONEON-NOSVE-NEXT:    add w9, w18, w18
+; NONEON-NOSVE-NEXT:    str w8, [sp, #396]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    str w9, [sp, #392]
+; NONEON-NOSVE-NEXT:    add w9, w16, w16
+; NONEON-NOSVE-NEXT:    str w8, [sp, #388]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    str w9, [sp, #384]
+; NONEON-NOSVE-NEXT:    add w9, w14, w14
+; NONEON-NOSVE-NEXT:    str w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    str w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    add w9, w12, w12
+; NONEON-NOSVE-NEXT:    str w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    str w9, [sp, #272]
+; NONEON-NOSVE-NEXT:    add w9, w10, w10
+; NONEON-NOSVE-NEXT:    str w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    str w9, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w5, [sp, #364]
+; NONEON-NOSVE-NEXT:    add w5, w30, w30
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w6, [sp, #360]
+; NONEON-NOSVE-NEXT:    add w6, w29, w29
+; NONEON-NOSVE-NEXT:    str w5, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldp q6, q3, [sp, #304]
+; NONEON-NOSVE-NEXT:    str w6, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldp q4, q7, [sp, #368]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #336]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #480] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #272]
+; NONEON-NOSVE-NEXT:    stp q4, q3, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #464] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q7, q6, [x1, #64]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #448] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q2, q5, [x1, #96]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #432] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #416] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #400] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #496
 ; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i64>, ptr %in
   %b = trunc <32 x i64> %a to <32 x i32>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
index 874af15e211177..323f5f56a2c085 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
@@ -18,8 +18,17 @@ define <4 x i8> @shuffle_ext_byone_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v1.8b, v0.8b, v0.8b, #6
-; NONEON-NOSVE-NEXT:    trn1 v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldur w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    str w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x i8> %op1, <4 x i8> %op2, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
   ret <4 x i8> %ret
@@ -38,7 +47,19 @@ define <8 x i8> @shuffle_ext_byone_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.8b, v0.8b, v1.8b, #7
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sturh w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
   ret <8 x i8> %ret
@@ -57,7 +78,20 @@ define <16 x i8> @shuffle_ext_byone_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #15
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sturh w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <16 x i8> %op1, <16 x i8> %op2, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22,
                                                                    i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
@@ -80,11 +114,35 @@ define void @shuffle_ext_byone_v32i8(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #15
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #15
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    sturh w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sturh w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -107,7 +165,12 @@ define <2 x i16> @shuffle_ext_byone_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    rev64 v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x i16> %op1, <2 x i16> %op2, <2 x i32> <i32 1, i32 0>
   ret <2 x i16> %ret
@@ -126,7 +189,17 @@ define <4 x i16> @shuffle_ext_byone_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.8b, v0.8b, v1.8b, #6
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x i16> %op1, <4 x i16> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
   ret <4 x i16> %ret
@@ -145,7 +218,18 @@ define <8 x i16> @shuffle_ext_byone_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #14
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <8 x i16> %op1, <8 x i16> %op2, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
   ret <8 x i16> %ret
@@ -167,11 +251,31 @@ define void @shuffle_ext_byone_v16i16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #14
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #14
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -194,7 +298,13 @@ define <2 x i32> @shuffle_ext_byone_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x i32> %op1, <2 x i32> %op2, <2 x i32> <i32 1, i32 2>
   ret <2 x i32> %ret
@@ -213,7 +323,16 @@ define <4 x i32> @shuffle_ext_byone_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #12
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x i32> %op1, <4 x i32> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
   ret <4 x i32> %ret
@@ -235,11 +354,26 @@ define void @shuffle_ext_byone_v8i32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #12
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #12
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -261,7 +395,12 @@ define <2 x i64> @shuffle_ext_byone_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x i64> %op1, <2 x i64> %op2, <2 x i32> <i32 1, i32 2>
   ret <2 x i64> %ret
@@ -283,11 +422,20 @@ define void @shuffle_ext_byone_v4i64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #8
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp x10, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -309,7 +457,17 @@ define <4 x half> @shuffle_ext_byone_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.8b, v0.8b, v1.8b, #6
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x half> %op1, <4 x half> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
   ret <4 x half> %ret
@@ -327,7 +485,18 @@ define <8 x half> @shuffle_ext_byone_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #14
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <8 x half> %op1, <8 x half> %op2, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
   ret <8 x half> %ret
@@ -347,11 +516,31 @@ define void @shuffle_ext_byone_v16f16(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #14
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #14
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -373,7 +562,13 @@ define <2 x float> @shuffle_ext_byone_v2f32(<2 x float> %op1, <2 x float> %op2)
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x float> %op1, <2 x float> %op2, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %ret
@@ -391,7 +586,16 @@ define <4 x float> @shuffle_ext_byone_v4f32(<4 x float> %op1, <4 x float> %op2)
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #12
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #36]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x float> %op1, <4 x float> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
   ret <4 x float> %ret
@@ -411,11 +615,26 @@ define void @shuffle_ext_byone_v8f32(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #12
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #12
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    str s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -436,7 +655,12 @@ define <2 x double> @shuffle_ext_byone_v2f64(<2 x double> %op1, <2 x double> %op
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x double> %op1, <2 x double> %op2, <2 x i32> <i32 1, i32 2>
   ret <2 x double> %ret
@@ -456,11 +680,20 @@ define void @shuffle_ext_byone_v4f64(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #8
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d3, d2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp d2, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
@@ -483,11 +716,21 @@ define void @shuffle_ext_byone_reverse(ptr %a, ptr %b) {
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_reverse:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
-; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
-; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v0.16b, #8
-; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v2.16b, #8
-; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll
index e69f59aedc026f..67cdde718e391f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll
@@ -43,7 +43,8 @@ define <2 x i64> @fixed_vec_zero_constant() {
 ;
 ; NONEON-NOSVE-LABEL: fixed_vec_zero_constant:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI2_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI2_0]
 ; NONEON-NOSVE-NEXT:    ret
   ret <2 x i64> zeroinitializer
 }
@@ -57,7 +58,8 @@ define <2 x double> @fixed_vec_fp_zero_constant() {
 ;
 ; NONEON-NOSVE-LABEL: fixed_vec_fp_zero_constant:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI3_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI3_0]
 ; NONEON-NOSVE-NEXT:    ret
   ret <2 x double> <double 0.0, double 0.0>
 }

From a72a90677d2b320e3bca553698e99143034387d9 Mon Sep 17 00:00:00 2001
From: Tulio Magno Quites Machado Filho <tuliom@redhat.com>
Date: Wed, 29 May 2024 06:19:17 -0300
Subject: [PATCH 083/230] [Nomination] Add an extra Red Hat representative to
 the security group (#92174)

I'd like to nominate myself as an additional Red Hat representative. I work on
the LLVM team at Red Hat, contributing both upstream and downstream.
---
 llvm/docs/Security.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/docs/Security.rst b/llvm/docs/Security.rst
index 9140923e5e8c9d..a468ff51d2a6aa 100644
--- a/llvm/docs/Security.rst
+++ b/llvm/docs/Security.rst
@@ -55,6 +55,7 @@ username for an individual isn't available, the brackets will be empty.
 * Serge Guelton (Mozilla) [@serge-sans-paille]
 * Shayne Hiet-Block (Microsoft) [@GreatKeeper]
 * Tim Penge (Sony) []
+* Tulio Magno Quites Machado Filho (Red Hat) [@tuliom]
 * Will Huhn (Intel) [@wphuhn-intel]
 
 Criteria

From 0f7b4b04a548e10d0f552f13bebc21972d55d7f6 Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan@intel.com>
Date: Wed, 29 May 2024 17:30:14 +0800
Subject: [PATCH 084/230] [X86][Driver] Enable feature ccmp,nf for -mapxf

This is a follow-up to #78901 after validation of the ccmp and nf features.
---
 clang/include/clang/Driver/Options.td         | 8 +++-----
 clang/lib/Basic/Targets/X86.cpp               | 2 +-
 clang/test/Driver/x86-target-features.c       | 4 ++--
 clang/test/Preprocessor/x86_target_features.c | 2 +-
 4 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index de2f245fb29f8e..4119e69c85540e 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6277,11 +6277,9 @@ def mapx_features_EQ : CommaJoined<["-"], "mapx-features=">, Group<m_x86_Feature
     HelpText<"Enable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf">;
 def mno_apx_features_EQ : CommaJoined<["-"], "mno-apx-features=">, Group<m_x86_Features_Group>,
     HelpText<"Disable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf">;
-// Features egpr, push2pop2, ppx and ndd are validated with llvm-test-suite && cpu2017 on Intel SDE.
-// For stability, we turn on these features only for -mapxf. After a feature pass the validation,
-// we will add it to -mapxf.
-def mapxf : Flag<["-"], "mapxf">, Alias<mapx_features_EQ>, AliasArgs<["egpr","push2pop2","ppx", "ndd"]>;
-def mno_apxf : Flag<["-"], "mno-apxf">, Alias<mno_apx_features_EQ>, AliasArgs<["egpr","push2pop2","ppx","ndd"]>;
+// For stability, we only add a feature to -mapxf after it passes the validation of llvm-test-suite && cpu2017 on Intel SDE.
+def mapxf : Flag<["-"], "mapxf">, Alias<mapx_features_EQ>, AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf"]>;
+def mno_apxf : Flag<["-"], "mno-apxf">, Alias<mno_apx_features_EQ>, AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf"]>;
 } // let Flags = [TargetSpecific]
 
 // VE feature flags
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 3a30cff917bb4f..08e44360bfbe38 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -961,7 +961,7 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
   if (HasCF)
     Builder.defineMacro("__CF__");
   // Condition here is aligned with the feature set of mapxf in Options.td
-  if (HasEGPR && HasPush2Pop2 && HasPPX && HasNDD)
+  if (HasEGPR && HasPush2Pop2 && HasPPX && HasNDD && HasCCMP && HasNF)
     Builder.defineMacro("__APX_F__");
 
   // Each case falls through to the previous one here.
diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
index 1d5f001c23fcc0..3022ed1250d590 100644
--- a/clang/test/Driver/x86-target-features.c
+++ b/clang/test/Driver/x86-target-features.c
@@ -423,8 +423,8 @@
 // RUN: %clang -target x86_64-unknown-linux-gnu -mno-apxf -mapxf %s -### -o %t.o 2>&1 | FileCheck -check-prefix=APXF %s
 // RUN: %clang -target x86_64-unknown-linux-gnu -mapxf -mno-apxf %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-APXF %s
 //
-// APXF: "-target-feature" "+egpr" "-target-feature" "+push2pop2" "-target-feature" "+ppx" "-target-feature" "+ndd"
-// NO-APXF: "-target-feature" "-egpr" "-target-feature" "-push2pop2" "-target-feature" "-ppx" "-target-feature" "-ndd"
+// APXF: "-target-feature" "+egpr" "-target-feature" "+push2pop2" "-target-feature" "+ppx" "-target-feature" "+ndd" "-target-feature" "+ccmp" "-target-feature" "+nf"
+// NO-APXF: "-target-feature" "-egpr" "-target-feature" "-push2pop2" "-target-feature" "-ppx" "-target-feature" "-ndd" "-target-feature" "-ccmp" "-target-feature" "-nf"
 
 // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=egpr %s -### -o %t.o 2>&1 | FileCheck -check-prefix=EGPR %s
 // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=push2pop2 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=PUSH2POP2 %s
diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c
index 7567267be26b42..6c08b379c93860 100644
--- a/clang/test/Preprocessor/x86_target_features.c
+++ b/clang/test/Preprocessor/x86_target_features.c
@@ -754,7 +754,7 @@
 // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=ccmp -x c -E -dM -o - %s | FileCheck --check-prefix=CCMP %s
 // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=nf -x c -E -dM -o - %s | FileCheck --check-prefix=NF %s
 // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=cf -x c -E -dM -o - %s | FileCheck --check-prefix=CF %s
-// RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapxf -x c -E -dM -o - %s | FileCheck --check-prefixes=EGPR,PUSH2POP2,PPX,NDD,APXF %s
+// RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapxf -x c -E -dM -o - %s | FileCheck --check-prefixes=EGPR,PUSH2POP2,PPX,NDD,CCMP,NF,APXF %s
 // APXF: #define __APX_F__ 1
 // CCMP: #define __CCMP__ 1
 // CF: #define __CF__ 1

From f3fb7f569936db418feef98e4ae68777a9a4cd2a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 29 May 2024 10:31:40 +0100
Subject: [PATCH 085/230] [X86] x86-atomic-float.c - cleanup unused check
 prefixes

---
 clang/test/CodeGen/X86/x86-atomic-float.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/clang/test/CodeGen/X86/x86-atomic-float.c b/clang/test/CodeGen/X86/x86-atomic-float.c
index 2d3c72d2a0299f..6ee441c2dd7a8c 100644
--- a/clang/test/CodeGen/X86/x86-atomic-float.c
+++ b/clang/test/CodeGen/X86/x86-atomic-float.c
@@ -1,11 +1,11 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck -check-prefixes=CHECK,CHECK64 %s
-// RUN: %clang_cc1 -triple i686-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck -check-prefixes=CHECK,CHECK32 %s
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple i686-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck %s
 
 
 // CHECK-LABEL: define dso_local i32 @test_int_inc(
 // CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-// CHECK-NEXT:  entry:
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = atomicrmw add ptr @test_int_inc.n, i32 1 seq_cst, align 4
 // CHECK-NEXT:    ret i32 [[TMP0]]
 //
@@ -17,7 +17,7 @@ int test_int_inc()
 
 // CHECK-LABEL: define dso_local float @test_float_post_inc(
 // CHECK-SAME: ) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr @test_float_post_inc.n, float 1.000000e+00 seq_cst, align 4
 // CHECK-NEXT:    ret float [[TMP0]]
 //
@@ -29,7 +29,7 @@ float test_float_post_inc()
 
 // CHECK-LABEL: define dso_local float @test_float_post_dc(
 // CHECK-SAME: ) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr @test_float_post_dc.n, float 1.000000e+00 seq_cst, align 4
 // CHECK-NEXT:    ret float [[TMP0]]
 //
@@ -41,7 +41,7 @@ float test_float_post_dc()
 
 // CHECK-LABEL: define dso_local float @test_float_pre_dc(
 // CHECK-SAME: ) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr @test_float_pre_dc.n, float 1.000000e+00 seq_cst, align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
 // CHECK-NEXT:    ret float [[TMP1]]
@@ -54,7 +54,7 @@ float test_float_pre_dc()
 
 // CHECK-LABEL: define dso_local float @test_float_pre_inc(
 // CHECK-SAME: ) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr @test_float_pre_inc.n, float 1.000000e+00 seq_cst, align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
 // CHECK-NEXT:    ret float [[TMP1]]
@@ -64,6 +64,3 @@ float test_float_pre_inc()
     static _Atomic float n;
     return ++n;
 }
-//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-// CHECK32: {{.*}}
-// CHECK64: {{.*}}

From 4bb6974a87e495f19faea4b13475a65e842473f0 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 29 May 2024 10:32:49 +0100
Subject: [PATCH 086/230] [X86] x86-atomic-long_double.c - cleanup check
 prefixes

---
 .../test/CodeGen/X86/x86-atomic-long_double.c | 573 +++++++++---------
 1 file changed, 287 insertions(+), 286 deletions(-)

diff --git a/clang/test/CodeGen/X86/x86-atomic-long_double.c b/clang/test/CodeGen/X86/x86-atomic-long_double.c
index 74a22d5db151eb..2c3f381f13511e 100644
--- a/clang/test/CodeGen/X86/x86-atomic-long_double.c
+++ b/clang/test/CodeGen/X86/x86-atomic-long_double.c
@@ -1,170 +1,171 @@
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -triple i686-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck -check-prefix=CHECK32 %s
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck --check-prefixes=X64 %s
+// RUN: %clang_cc1 -triple i686-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck --check-prefixes=X86 %s
 
-// CHECK-LABEL: define dso_local x86_fp80 @testinc(
-// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00
-// CHECK-NEXT:    store float [[TMP2]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret x86_fp80 [[TMP3]]
+// X64-LABEL: define dso_local x86_fp80 @testinc(
+// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] {
+// X64-NEXT:  [[ENTRY:.*:]]
+// X64-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// X64-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16
+// X64-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00
+// X64-NEXT:    store float [[TMP2]], ptr [[RETVAL]], align 16
+// X64-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16
+// X64-NEXT:    ret x86_fp80 [[TMP3]]
 //
-// CHECK32-LABEL: define dso_local x86_fp80 @testinc(
-// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] {
-// CHECK32-NEXT:  entry:
-// CHECK32-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
-// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
-// CHECK32-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00
-// CHECK32-NEXT:    store float [[TMP2]], ptr [[RETVAL]], align 4
-// CHECK32-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4
-// CHECK32-NEXT:    ret x86_fp80 [[TMP3]]
+// X86-LABEL: define dso_local x86_fp80 @testinc(
+// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] {
+// X86-NEXT:  [[ENTRY:.*:]]
+// X86-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
+// X86-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00
+// X86-NEXT:    store float [[TMP2]], ptr [[RETVAL]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4
+// X86-NEXT:    ret x86_fp80 [[TMP3]]
 //
 long double testinc(_Atomic long double *addr) {
 
   return ++*addr;
 }
 
-// CHECK-LABEL: define dso_local x86_fp80 @testdec(
-// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16
-// CHECK-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret x86_fp80 [[TMP2]]
+// X64-LABEL: define dso_local x86_fp80 @testdec(
+// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X64-NEXT:  [[ENTRY:.*:]]
+// X64-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// X64-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16
+// X64-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 16
+// X64-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16
+// X64-NEXT:    ret x86_fp80 [[TMP2]]
 //
-// CHECK32-LABEL: define dso_local x86_fp80 @testdec(
-// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK32-NEXT:  entry:
-// CHECK32-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
-// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
-// CHECK32-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 4
-// CHECK32-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4
-// CHECK32-NEXT:    ret x86_fp80 [[TMP2]]
+// X86-LABEL: define dso_local x86_fp80 @testdec(
+// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  [[ENTRY:.*:]]
+// X86-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
+// X86-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4
+// X86-NEXT:    ret x86_fp80 [[TMP2]]
 //
 long double testdec(_Atomic long double *addr) {
 
   return (*addr)--;
 }
 
-// CHECK-LABEL: define dso_local x86_fp80 @testcompassign(
-// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ATOMIC_TEMP5:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[ATOMIC_LOAD:%.*]] = load atomic i128, ptr [[TMP0]] seq_cst, align 16
-// CHECK-NEXT:    store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16
-// CHECK-NEXT:    br label [[ATOMIC_OP:%.*]]
-// CHECK:       atomic_op:
-// CHECK-NEXT:    [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[ATOMIC_OP]] ]
-// CHECK-NEXT:    [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false)
-// CHECK-NEXT:    store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = load i128, ptr [[ATOMIC_TEMP1]], align 16
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false)
-// CHECK-NEXT:    store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr [[TMP0]], i128 [[TMP3]], i128 [[TMP4]] seq_cst seq_cst, align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i128, i1 } [[TMP5]], 0
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1
-// CHECK-NEXT:    store i128 [[TMP6]], ptr [[ATOMIC_TEMP3]], align 16
-// CHECK-NEXT:    [[TMP8]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16
-// CHECK-NEXT:    br i1 [[TMP7]], label [[ATOMIC_CONT:%.*]], label [[ATOMIC_OP]]
-// CHECK:       atomic_cont:
-// CHECK-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[ATOMIC_LOAD4:%.*]] = load atomic i128, ptr [[TMP9]] seq_cst, align 16
-// CHECK-NEXT:    store i128 [[ATOMIC_LOAD4]], ptr [[ATOMIC_TEMP5]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP5]], align 16
-// CHECK-NEXT:    ret x86_fp80 [[TMP10]]
+// X64-LABEL: define dso_local x86_fp80 @testcompassign(
+// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X64-NEXT:  [[ENTRY:.*]]:
+// X64-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// X64-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ATOMIC_TEMP5:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[ATOMIC_LOAD:%.*]] = load atomic i128, ptr [[TMP0]] seq_cst, align 16
+// X64-NEXT:    store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16
+// X64-NEXT:    [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16
+// X64-NEXT:    br label %[[ATOMIC_OP:.*]]
+// X64:       [[ATOMIC_OP]]:
+// X64-NEXT:    [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[ATOMIC_OP]] ]
+// X64-NEXT:    [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000
+// X64-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false)
+// X64-NEXT:    store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 16
+// X64-NEXT:    [[TMP3:%.*]] = load i128, ptr [[ATOMIC_TEMP1]], align 16
+// X64-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false)
+// X64-NEXT:    store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 16
+// X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16
+// X64-NEXT:    [[TMP5:%.*]] = cmpxchg ptr [[TMP0]], i128 [[TMP3]], i128 [[TMP4]] seq_cst seq_cst, align 16
+// X64-NEXT:    [[TMP6:%.*]] = extractvalue { i128, i1 } [[TMP5]], 0
+// X64-NEXT:    [[TMP7:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1
+// X64-NEXT:    store i128 [[TMP6]], ptr [[ATOMIC_TEMP3]], align 16
+// X64-NEXT:    [[TMP8]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16
+// X64-NEXT:    br i1 [[TMP7]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]]
+// X64:       [[ATOMIC_CONT]]:
+// X64-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[ATOMIC_LOAD4:%.*]] = load atomic i128, ptr [[TMP9]] seq_cst, align 16
+// X64-NEXT:    store i128 [[ATOMIC_LOAD4]], ptr [[ATOMIC_TEMP5]], align 16
+// X64-NEXT:    [[TMP10:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP5]], align 16
+// X64-NEXT:    ret x86_fp80 [[TMP10]]
 //
-// CHECK32-LABEL: define dso_local x86_fp80 @testcompassign(
-// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK32-NEXT:  entry:
-// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5)
-// CHECK32-NEXT:    [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 4
-// CHECK32-NEXT:    br label [[ATOMIC_OP:%.*]]
-// CHECK32:       atomic_op:
-// CHECK32-NEXT:    [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[ATOMIC_OP]] ]
-// CHECK32-NEXT:    [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000
-// CHECK32-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false)
-// CHECK32-NEXT:    store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 4
-// CHECK32-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false)
-// CHECK32-NEXT:    store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 4
-// CHECK32-NEXT:    [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5)
-// CHECK32-NEXT:    [[TMP3]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4
-// CHECK32-NEXT:    br i1 [[CALL]], label [[ATOMIC_CONT:%.*]], label [[ATOMIC_OP]]
-// CHECK32:       atomic_cont:
-// CHECK32-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP4]], ptr noundef [[ATOMIC_TEMP3]], i32 noundef 5)
-// CHECK32-NEXT:    [[TMP5:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 4
-// CHECK32-NEXT:    ret x86_fp80 [[TMP5]]
+// X86-LABEL: define dso_local x86_fp80 @testcompassign(
+// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  [[ENTRY:.*]]:
+// X86-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5)
+// X86-NEXT:    [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 4
+// X86-NEXT:    br label %[[ATOMIC_OP:.*]]
+// X86:       [[ATOMIC_OP]]:
+// X86-NEXT:    [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP3:%.*]], %[[ATOMIC_OP]] ]
+// X86-NEXT:    [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000
+// X86-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false)
+// X86-NEXT:    store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 4
+// X86-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false)
+// X86-NEXT:    store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 4
+// X86-NEXT:    [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5)
+// X86-NEXT:    [[TMP3]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4
+// X86-NEXT:    br i1 [[CALL]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]]
+// X86:       [[ATOMIC_CONT]]:
+// X86-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP4]], ptr noundef [[ATOMIC_TEMP3]], i32 noundef 5)
+// X86-NEXT:    [[TMP5:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 4
+// X86-NEXT:    ret x86_fp80 [[TMP5]]
 //
 long double testcompassign(_Atomic long double *addr) {
   *addr -= 25;
   return *addr;
 }
 
-// CHECK-LABEL: define dso_local x86_fp80 @testassign(
-// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP]], i8 0, i64 16, i1 false)
-// CHECK-NEXT:    store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[ATOMIC_TEMP]], align 16
-// CHECK-NEXT:    store atomic i128 [[TMP1]], ptr [[TMP0]] seq_cst, align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[ATOMIC_LOAD:%.*]] = load atomic i128, ptr [[TMP2]] seq_cst, align 16
-// CHECK-NEXT:    store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP1]], align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 16
-// CHECK-NEXT:    ret x86_fp80 [[TMP3]]
+// X64-LABEL: define dso_local x86_fp80 @testassign(
+// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X64-NEXT:  [[ENTRY:.*:]]
+// X64-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// X64-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP]], i8 0, i64 16, i1 false)
+// X64-NEXT:    store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 16
+// X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[ATOMIC_TEMP]], align 16
+// X64-NEXT:    store atomic i128 [[TMP1]], ptr [[TMP0]] seq_cst, align 16
+// X64-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[ATOMIC_LOAD:%.*]] = load atomic i128, ptr [[TMP2]] seq_cst, align 16
+// X64-NEXT:    store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP1]], align 16
+// X64-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 16
+// X64-NEXT:    ret x86_fp80 [[TMP3]]
 //
-// CHECK32-LABEL: define dso_local x86_fp80 @testassign(
-// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK32-NEXT:  entry:
-// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP]], i8 0, i64 12, i1 false)
-// CHECK32-NEXT:    store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 4
-// CHECK32-NEXT:    call void @__atomic_store(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5)
-// CHECK32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP1]], ptr noundef [[ATOMIC_TEMP1]], i32 noundef 5)
-// CHECK32-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4
-// CHECK32-NEXT:    ret x86_fp80 [[TMP2]]
+// X86-LABEL: define dso_local x86_fp80 @testassign(
+// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  [[ENTRY:.*:]]
+// X86-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP]], i8 0, i64 12, i1 false)
+// X86-NEXT:    store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 4
+// X86-NEXT:    call void @__atomic_store(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5)
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP1]], ptr noundef [[ATOMIC_TEMP1]], i32 noundef 5)
+// X86-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4
+// X86-NEXT:    ret x86_fp80 [[TMP2]]
 //
 long double testassign(_Atomic long double *addr) {
   *addr = 115;
@@ -172,168 +173,168 @@ long double testassign(_Atomic long double *addr) {
   return *addr;
 }
 
-// CHECK-LABEL: define dso_local x86_fp80 @test_volatile_inc(
-// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00
-// CHECK-NEXT:    store float [[TMP2]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret x86_fp80 [[TMP3]]
+// X64-LABEL: define dso_local x86_fp80 @test_volatile_inc(
+// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X64-NEXT:  [[ENTRY:.*:]]
+// X64-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// X64-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16
+// X64-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00
+// X64-NEXT:    store float [[TMP2]], ptr [[RETVAL]], align 16
+// X64-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16
+// X64-NEXT:    ret x86_fp80 [[TMP3]]
 //
-// CHECK32-LABEL: define dso_local x86_fp80 @test_volatile_inc(
-// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK32-NEXT:  entry:
-// CHECK32-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
-// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
-// CHECK32-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00
-// CHECK32-NEXT:    store float [[TMP2]], ptr [[RETVAL]], align 4
-// CHECK32-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4
-// CHECK32-NEXT:    ret x86_fp80 [[TMP3]]
+// X86-LABEL: define dso_local x86_fp80 @test_volatile_inc(
+// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  [[ENTRY:.*:]]
+// X86-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
+// X86-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00
+// X86-NEXT:    store float [[TMP2]], ptr [[RETVAL]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4
+// X86-NEXT:    ret x86_fp80 [[TMP3]]
 //
 long double test_volatile_inc(volatile _Atomic long double *addr) {
   return ++*addr;
 }
 
-// CHECK-LABEL: define dso_local x86_fp80 @test_volatile_dec(
-// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16
-// CHECK-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret x86_fp80 [[TMP2]]
+// X64-LABEL: define dso_local x86_fp80 @test_volatile_dec(
+// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X64-NEXT:  [[ENTRY:.*:]]
+// X64-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// X64-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16
+// X64-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 16
+// X64-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16
+// X64-NEXT:    ret x86_fp80 [[TMP2]]
 //
-// CHECK32-LABEL: define dso_local x86_fp80 @test_volatile_dec(
-// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK32-NEXT:  entry:
-// CHECK32-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
-// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
-// CHECK32-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 4
-// CHECK32-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4
-// CHECK32-NEXT:    ret x86_fp80 [[TMP2]]
+// X86-LABEL: define dso_local x86_fp80 @test_volatile_dec(
+// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  [[ENTRY:.*:]]
+// X86-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
+// X86-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4
+// X86-NEXT:    ret x86_fp80 [[TMP2]]
 //
 long double test_volatile_dec(volatile _Atomic long double *addr) {
   return (*addr)--;
 }
 
-// CHECK-LABEL: define dso_local x86_fp80 @test_volatile_compassign(
-// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ATOMIC_TEMP5:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[ATOMIC_LOAD:%.*]] = load atomic volatile i128, ptr [[TMP0]] seq_cst, align 16
-// CHECK-NEXT:    store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16
-// CHECK-NEXT:    br label [[ATOMIC_OP:%.*]]
-// CHECK:       atomic_op:
-// CHECK-NEXT:    [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[ATOMIC_OP]] ]
-// CHECK-NEXT:    [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false)
-// CHECK-NEXT:    store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = load i128, ptr [[ATOMIC_TEMP1]], align 16
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false)
-// CHECK-NEXT:    store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg volatile ptr [[TMP0]], i128 [[TMP3]], i128 [[TMP4]] seq_cst seq_cst, align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i128, i1 } [[TMP5]], 0
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1
-// CHECK-NEXT:    store i128 [[TMP6]], ptr [[ATOMIC_TEMP3]], align 16
-// CHECK-NEXT:    [[TMP8]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16
-// CHECK-NEXT:    br i1 [[TMP7]], label [[ATOMIC_CONT:%.*]], label [[ATOMIC_OP]]
-// CHECK:       atomic_cont:
-// CHECK-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[ATOMIC_LOAD4:%.*]] = load atomic volatile i128, ptr [[TMP9]] seq_cst, align 16
-// CHECK-NEXT:    store i128 [[ATOMIC_LOAD4]], ptr [[ATOMIC_TEMP5]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP5]], align 16
-// CHECK-NEXT:    ret x86_fp80 [[TMP10]]
+// X64-LABEL: define dso_local x86_fp80 @test_volatile_compassign(
+// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X64-NEXT:  [[ENTRY:.*]]:
+// X64-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// X64-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ATOMIC_TEMP5:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[ATOMIC_LOAD:%.*]] = load atomic volatile i128, ptr [[TMP0]] seq_cst, align 16
+// X64-NEXT:    store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16
+// X64-NEXT:    [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16
+// X64-NEXT:    br label %[[ATOMIC_OP:.*]]
+// X64:       [[ATOMIC_OP]]:
+// X64-NEXT:    [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[ATOMIC_OP]] ]
+// X64-NEXT:    [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000
+// X64-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false)
+// X64-NEXT:    store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 16
+// X64-NEXT:    [[TMP3:%.*]] = load i128, ptr [[ATOMIC_TEMP1]], align 16
+// X64-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false)
+// X64-NEXT:    store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 16
+// X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16
+// X64-NEXT:    [[TMP5:%.*]] = cmpxchg volatile ptr [[TMP0]], i128 [[TMP3]], i128 [[TMP4]] seq_cst seq_cst, align 16
+// X64-NEXT:    [[TMP6:%.*]] = extractvalue { i128, i1 } [[TMP5]], 0
+// X64-NEXT:    [[TMP7:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1
+// X64-NEXT:    store i128 [[TMP6]], ptr [[ATOMIC_TEMP3]], align 16
+// X64-NEXT:    [[TMP8]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16
+// X64-NEXT:    br i1 [[TMP7]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]]
+// X64:       [[ATOMIC_CONT]]:
+// X64-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[ATOMIC_LOAD4:%.*]] = load atomic volatile i128, ptr [[TMP9]] seq_cst, align 16
+// X64-NEXT:    store i128 [[ATOMIC_LOAD4]], ptr [[ATOMIC_TEMP5]], align 16
+// X64-NEXT:    [[TMP10:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP5]], align 16
+// X64-NEXT:    ret x86_fp80 [[TMP10]]
 //
-// CHECK32-LABEL: define dso_local x86_fp80 @test_volatile_compassign(
-// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK32-NEXT:  entry:
-// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5)
-// CHECK32-NEXT:    [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 4
-// CHECK32-NEXT:    br label [[ATOMIC_OP:%.*]]
-// CHECK32:       atomic_op:
-// CHECK32-NEXT:    [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[ATOMIC_OP]] ]
-// CHECK32-NEXT:    [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000
-// CHECK32-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false)
-// CHECK32-NEXT:    store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 4
-// CHECK32-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false)
-// CHECK32-NEXT:    store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 4
-// CHECK32-NEXT:    [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5)
-// CHECK32-NEXT:    [[TMP3]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4
-// CHECK32-NEXT:    br i1 [[CALL]], label [[ATOMIC_CONT:%.*]], label [[ATOMIC_OP]]
-// CHECK32:       atomic_cont:
-// CHECK32-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP4]], ptr noundef [[ATOMIC_TEMP3]], i32 noundef 5)
-// CHECK32-NEXT:    [[TMP5:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 4
-// CHECK32-NEXT:    ret x86_fp80 [[TMP5]]
+// X86-LABEL: define dso_local x86_fp80 @test_volatile_compassign(
+// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  [[ENTRY:.*]]:
+// X86-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5)
+// X86-NEXT:    [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 4
+// X86-NEXT:    br label %[[ATOMIC_OP:.*]]
+// X86:       [[ATOMIC_OP]]:
+// X86-NEXT:    [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP3:%.*]], %[[ATOMIC_OP]] ]
+// X86-NEXT:    [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000
+// X86-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false)
+// X86-NEXT:    store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 4
+// X86-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false)
+// X86-NEXT:    store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 4
+// X86-NEXT:    [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5)
+// X86-NEXT:    [[TMP3]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4
+// X86-NEXT:    br i1 [[CALL]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]]
+// X86:       [[ATOMIC_CONT]]:
+// X86-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP4]], ptr noundef [[ATOMIC_TEMP3]], i32 noundef 5)
+// X86-NEXT:    [[TMP5:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 4
+// X86-NEXT:    ret x86_fp80 [[TMP5]]
 //
 long double test_volatile_compassign(volatile _Atomic long double *addr) {
   *addr -= 25;
   return *addr;
 }
 
-// CHECK-LABEL: define dso_local x86_fp80 @test_volatile_assign(
-// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP]], i8 0, i64 16, i1 false)
-// CHECK-NEXT:    store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[ATOMIC_TEMP]], align 16
-// CHECK-NEXT:    store atomic volatile i128 [[TMP1]], ptr [[TMP0]] seq_cst, align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[ATOMIC_LOAD:%.*]] = load atomic volatile i128, ptr [[TMP2]] seq_cst, align 16
-// CHECK-NEXT:    store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP1]], align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 16
-// CHECK-NEXT:    ret x86_fp80 [[TMP3]]
+// X64-LABEL: define dso_local x86_fp80 @test_volatile_assign(
+// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X64-NEXT:  [[ENTRY:.*:]]
+// X64-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// X64-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP]], i8 0, i64 16, i1 false)
+// X64-NEXT:    store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 16
+// X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[ATOMIC_TEMP]], align 16
+// X64-NEXT:    store atomic volatile i128 [[TMP1]], ptr [[TMP0]] seq_cst, align 16
+// X64-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[ATOMIC_LOAD:%.*]] = load atomic volatile i128, ptr [[TMP2]] seq_cst, align 16
+// X64-NEXT:    store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP1]], align 16
+// X64-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 16
+// X64-NEXT:    ret x86_fp80 [[TMP3]]
 //
-// CHECK32-LABEL: define dso_local x86_fp80 @test_volatile_assign(
-// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK32-NEXT:  entry:
-// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP]], i8 0, i64 12, i1 false)
-// CHECK32-NEXT:    store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 4
-// CHECK32-NEXT:    call void @__atomic_store(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5)
-// CHECK32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP1]], ptr noundef [[ATOMIC_TEMP1]], i32 noundef 5)
-// CHECK32-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4
-// CHECK32-NEXT:    ret x86_fp80 [[TMP2]]
+// X86-LABEL: define dso_local x86_fp80 @test_volatile_assign(
+// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  [[ENTRY:.*:]]
+// X86-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP]], i8 0, i64 12, i1 false)
+// X86-NEXT:    store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 4
+// X86-NEXT:    call void @__atomic_store(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5)
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP1]], ptr noundef [[ATOMIC_TEMP1]], i32 noundef 5)
+// X86-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4
+// X86-NEXT:    ret x86_fp80 [[TMP2]]
 //
 long double test_volatile_assign(volatile _Atomic long double *addr) {
   *addr = 115;

From 9c42ed1371ee8c211aedcfe8aed16662a9befb69 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 29 May 2024 10:34:49 +0100
Subject: [PATCH 087/230] [X86] Add x86-atomic-double.c double test coverage

---
 clang/test/CodeGen/X86/x86-atomic-double.c | 104 +++++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100644 clang/test/CodeGen/X86/x86-atomic-double.c

diff --git a/clang/test/CodeGen/X86/x86-atomic-double.c b/clang/test/CodeGen/X86/x86-atomic-double.c
new file mode 100644
index 00000000000000..2354c89cc2b170
--- /dev/null
+++ b/clang/test/CodeGen/X86/x86-atomic-double.c
@@ -0,0 +1,104 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck -check-prefixes=X64 %s
+// RUN: %clang_cc1 -triple i686-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck -check-prefixes=X86 %s
+
+
+// X64-LABEL: define dso_local double @test_double_post_inc(
+// X64-SAME: ) #[[ATTR0:[0-9]+]] {
+// X64-NEXT:  entry:
+// X64-NEXT:    [[RETVAL:%.*]] = alloca double, align 8
+// X64-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_post_inc.n, float 1.000000e+00 seq_cst, align 8
+// X64-NEXT:    store float [[TMP0]], ptr [[RETVAL]], align 8
+// X64-NEXT:    [[TMP1:%.*]] = load double, ptr [[RETVAL]], align 8
+// X64-NEXT:    ret double [[TMP1]]
+//
+// X86-LABEL: define dso_local double @test_double_post_inc(
+// X86-SAME: ) #[[ATTR0:[0-9]+]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca double, align 4
+// X86-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_post_inc.n, float 1.000000e+00 seq_cst, align 8
+// X86-NEXT:    store float [[TMP0]], ptr [[RETVAL]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load double, ptr [[RETVAL]], align 4
+// X86-NEXT:    ret double [[TMP1]]
+//
+double test_double_post_inc()
+{
+    static _Atomic double n;
+    return n++;
+}
+
+// X64-LABEL: define dso_local double @test_double_post_dc(
+// X64-SAME: ) #[[ATTR0]] {
+// X64-NEXT:  entry:
+// X64-NEXT:    [[RETVAL:%.*]] = alloca double, align 8
+// X64-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_post_dc.n, float 1.000000e+00 seq_cst, align 8
+// X64-NEXT:    store float [[TMP0]], ptr [[RETVAL]], align 8
+// X64-NEXT:    [[TMP1:%.*]] = load double, ptr [[RETVAL]], align 8
+// X64-NEXT:    ret double [[TMP1]]
+//
+// X86-LABEL: define dso_local double @test_double_post_dc(
+// X86-SAME: ) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca double, align 4
+// X86-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_post_dc.n, float 1.000000e+00 seq_cst, align 8
+// X86-NEXT:    store float [[TMP0]], ptr [[RETVAL]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load double, ptr [[RETVAL]], align 4
+// X86-NEXT:    ret double [[TMP1]]
+//
+double test_double_post_dc()
+{
+    static _Atomic double n;
+    return n--;
+}
+
+// X64-LABEL: define dso_local double @test_double_pre_dc(
+// X64-SAME: ) #[[ATTR0]] {
+// X64-NEXT:  entry:
+// X64-NEXT:    [[RETVAL:%.*]] = alloca double, align 8
+// X64-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_pre_dc.n, float 1.000000e+00 seq_cst, align 8
+// X64-NEXT:    [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// X64-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 8
+// X64-NEXT:    [[TMP2:%.*]] = load double, ptr [[RETVAL]], align 8
+// X64-NEXT:    ret double [[TMP2]]
+//
+// X86-LABEL: define dso_local double @test_double_pre_dc(
+// X86-SAME: ) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca double, align 4
+// X86-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_pre_dc.n, float 1.000000e+00 seq_cst, align 8
+// X86-NEXT:    [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// X86-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load double, ptr [[RETVAL]], align 4
+// X86-NEXT:    ret double [[TMP2]]
+//
+double test_double_pre_dc()
+{
+    static _Atomic double n;
+    return --n;
+}
+
+// X64-LABEL: define dso_local double @test_double_pre_inc(
+// X64-SAME: ) #[[ATTR0]] {
+// X64-NEXT:  entry:
+// X64-NEXT:    [[RETVAL:%.*]] = alloca double, align 8
+// X64-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_pre_inc.n, float 1.000000e+00 seq_cst, align 8
+// X64-NEXT:    [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// X64-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 8
+// X64-NEXT:    [[TMP2:%.*]] = load double, ptr [[RETVAL]], align 8
+// X64-NEXT:    ret double [[TMP2]]
+//
+// X86-LABEL: define dso_local double @test_double_pre_inc(
+// X86-SAME: ) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca double, align 4
+// X86-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_pre_inc.n, float 1.000000e+00 seq_cst, align 8
+// X86-NEXT:    [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// X86-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load double, ptr [[RETVAL]], align 4
+// X86-NEXT:    ret double [[TMP2]]
+//
+double test_double_pre_inc()
+{
+    static _Atomic double n;
+    return ++n;
+}

From f42de69213890f1203c1c3418a962e50de4ed73c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 29 May 2024 10:37:46 +0100
Subject: [PATCH 088/230] [X86] vector-shuffle-512-v16.ll - add fast shuffle
 test coverage

---
 .../CodeGen/X86/vector-shuffle-512-v16.ll     | 181 ++++++++++++------
 1 file changed, 125 insertions(+), 56 deletions(-)

diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
index c981d973fef3ed..bad0b411f68a95 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512F
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512BW
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=ALL,SLOW,AVX512F
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq | FileCheck %s --check-prefixes=ALL,SLOW,AVX512BW
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,FAST,AVX512F
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,FAST,AVX512BW
 
 target triple = "x86_64-unknown-unknown"
 
@@ -14,21 +16,33 @@ define <16 x float> @shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
 }
 
 define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08(<16 x float> %a, <16 x float> %b) {
-; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
-; ALL-NEXT:    vbroadcastss %xmm0, %zmm0
-; ALL-NEXT:    retq
+; SLOW-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
+; SLOW:       # %bb.0:
+; SLOW-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
+; SLOW-NEXT:    vbroadcastss %xmm0, %zmm0
+; SLOW-NEXT:    retq
+;
+; FAST-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
+; FAST:       # %bb.0:
+; FAST-NEXT:    vbroadcastss {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; FAST-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT:    retq
   %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x float> %shuffle
 }
 
 define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc(<16 x i32> %a, <16 x i32> %b) {
-; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
-; ALL-NEXT:    vbroadcastss %xmm0, %zmm0
-; ALL-NEXT:    retq
+; SLOW-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
+; SLOW:       # %bb.0:
+; SLOW-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
+; SLOW-NEXT:    vbroadcastss %xmm0, %zmm0
+; SLOW-NEXT:    retq
+;
+; FAST-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
+; FAST:       # %bb.0:
+; FAST-NEXT:    vbroadcastss {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; FAST-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT:    retq
   %tmp0 = bitcast <16 x i32> %a to <16 x float>
   %tmp1 = bitcast <16 x i32> %b to <16 x float>
   %shuffle = shufflevector <16 x float> %tmp0, <16 x float> %tmp1, <16 x i32><i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
@@ -196,11 +210,20 @@ define <16 x float> @shuffle_f32_v16f32_00_08_01_09_02_10_03_11_04_12_05_13_06_1
 
 ; PR86076
 define <16 x float> @shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08(float %a0, float %a1) {
-; ALL-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; ALL-NEXT:    vbroadcastsd %xmm0, %zmm0
-; ALL-NEXT:    retq
+; SLOW-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
+; SLOW:       # %bb.0:
+; SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; SLOW-NEXT:    vbroadcastsd %xmm0, %zmm0
+; SLOW-NEXT:    retq
+;
+; FAST-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
+; FAST:       # %bb.0:
+; FAST-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; FAST-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; FAST-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16]
+; FAST-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
+; FAST-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; FAST-NEXT:    retq
   %v0 = insertelement <8 x float> poison, float %a0, i64 0
   %v1 = insertelement <8 x float> poison, float %a1, i64 0
   %sv = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
@@ -217,11 +240,17 @@ define <16 x i32> @shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
 }
 
 define <16 x i32> @shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<16 x i32> %a, <16 x i32> %b) {
-; ALL-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT:    vbroadcastss %xmm0, %zmm0
-; ALL-NEXT:    retq
+; SLOW-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
+; SLOW:       # %bb.0:
+; SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; SLOW-NEXT:    vbroadcastss %xmm0, %zmm0
+; SLOW-NEXT:    retq
+;
+; FAST-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
+; FAST:       # %bb.0:
+; FAST-NEXT:    vbroadcastss {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; FAST-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
   ret <16 x i32> %shuffle
 }
@@ -302,21 +331,33 @@ define <16 x float> @shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08
 
 ; PR46249
 define <16 x i32> @shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04(<16 x i32> %a) {
-; ALL-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vpshufd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; ALL-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
-; ALL-NEXT:    retq
+; SLOW-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
+; SLOW:       # %bb.0:
+; SLOW-NEXT:    vpshufd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
+; SLOW-NEXT:    retq
+;
+; FAST-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
+; FAST:       # %bb.0:
+; FAST-NEXT:    vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
+; FAST-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT:    retq
   %1 = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
   ret <16 x i32> %1
 }
 
 define <16 x float> @shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04(<16 x float> %a) {
-; ALL-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; ALL-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
-; ALL-NEXT:    retq
+; SLOW-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
+; SLOW:       # %bb.0:
+; SLOW-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; SLOW-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
+; SLOW-NEXT:    retq
+;
+; FAST-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
+; FAST:       # %bb.0:
+; FAST-NEXT:    vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
+; FAST-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT:    retq
   %1 = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
   ret <16 x float> %1
 }
@@ -333,11 +374,17 @@ define <16 x float> @shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_
 }
 
 define <16 x float> @shuffle_v16f32_load_08_11_10_00_12_15_14_04(<16 x float> %a0, ptr %a1) {
-; ALL-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vshufps {{.*#+}} zmm1 = zmm0[2,0],mem[0,0],zmm0[6,4],mem[4,4],zmm0[10,8],mem[8,8],zmm0[14,12],mem[12,12]
-; ALL-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,2],zmm0[4,7],zmm1[4,6],zmm0[8,11],zmm1[8,10],zmm0[12,15],zmm1[12,14]
-; ALL-NEXT:    retq
+; SLOW-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
+; SLOW:       # %bb.0:
+; SLOW-NEXT:    vshufps {{.*#+}} zmm1 = zmm0[2,0],mem[0,0],zmm0[6,4],mem[4,4],zmm0[10,8],mem[8,8],zmm0[14,12],mem[12,12]
+; SLOW-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,2],zmm0[4,7],zmm1[4,6],zmm0[8,11],zmm1[8,10],zmm0[12,15],zmm1[12,14]
+; SLOW-NEXT:    retq
+;
+; FAST-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
+; FAST:       # %bb.0:
+; FAST-NEXT:    vmovaps {{.*#+}} zmm1 = [0,3,2,16,4,7,6,20,8,11,10,24,12,15,14,28]
+; FAST-NEXT:    vpermt2ps (%rdi), %zmm1, %zmm0
+; FAST-NEXT:    retq
   %1 = load <16 x float>, ptr %a1
   %2 = shufflevector <16 x float> %1, <16 x float> %a0, <16 x i32> <i32 16, i32 19, i32 18, i32 0, i32 20, i32 23, i32 22, i32 4, i32 24, i32 27, i32 26, i32 8, i32 28, i32 31, i32 30, i32 12>
   ret <16 x float> %2
@@ -365,26 +412,41 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a
 
 ;FIXME: can do better with vpcompress
 define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) {
-; ALL-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; ALL-NEXT:    retq
+; SLOW-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
+; SLOW:       # %bb.0:
+; SLOW-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; SLOW-NEXT:    retq
+;
+; FAST-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
+; FAST:       # %bb.0:
+; FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
+; FAST-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; FAST-NEXT:    retq
   %res = shufflevector <16 x i32> %v, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
   ret <8 x i32> %res
 }
 
 ;FIXME: can do better with vpcompress
 define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
-; ALL-LABEL: test_v16i32_0_1_2_12:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; ALL-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; ALL-NEXT:    vbroadcastss %xmm1, %xmm1
-; ALL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
+; SLOW-LABEL: test_v16i32_0_1_2_12:
+; SLOW:       # %bb.0:
+; SLOW-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; SLOW-NEXT:    vbroadcastss %xmm1, %xmm1
+; SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; SLOW-NEXT:    vzeroupper
+; SLOW-NEXT:    retq
+;
+; FAST-LABEL: test_v16i32_0_1_2_12:
+; FAST:       # %bb.0:
+; FAST-NEXT:    vmovaps {{.*#+}} xmm1 = [0,1,2,12]
+; FAST-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; FAST-NEXT:    vzeroupper
+; FAST-NEXT:    retq
   %res = shufflevector <16 x i32> %v, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 12>
   ret <4 x i32> %res
 }
@@ -568,11 +630,18 @@ define <16 x i32> @shuffle_v8i32_17_16_01_00_21_20_05_04_25_24_09_08_29_28_13_12
 }
 
 define <16 x float> @shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<8 x float> %a) {
-; ALL-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT:    vbroadcastss %xmm0, %zmm0
-; ALL-NEXT:    retq
+; SLOW-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
+; SLOW:       # %bb.0:
+; SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; SLOW-NEXT:    vbroadcastss %xmm0, %zmm0
+; SLOW-NEXT:    retq
+;
+; FAST-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
+; FAST:       # %bb.0:
+; FAST-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; FAST-NEXT:    vbroadcastss {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; FAST-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
   ret <16 x float> %shuffle
 }

From 74014b5a3497c1e9c7f0652d26f78fffea9bf51c Mon Sep 17 00:00:00 2001
From: Lu Weining <luweining@loongson.cn>
Date: Wed, 29 May 2024 17:39:38 +0800
Subject: [PATCH 089/230] Fix typo in AMDGPUUsage. NFC (#93652)

The vendor name is ``mesa``, not ``mesa3d``.
---
 llvm/docs/AMDGPUUsage.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 1004956ac8f103..b827524e6b8db4 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -64,7 +64,7 @@ to specify the target triple:
      Vendor       Description
      ============ ==============================================================
      ``amd``      Can be used for all AMD GPU usage.
-     ``mesa3d``   Can be used if the OS is ``mesa3d``.
+     ``mesa``     Can be used if the OS is ``mesa3d``.
      ============ ==============================================================
 
   .. table:: AMDGPU Operating Systems

From dc8da7ddeaa595a34827fc9e39322a8109e177f0 Mon Sep 17 00:00:00 2001
From: Pankaj Dwivedi <167427157+PankajDwivedi-25@users.noreply.github.com>
Date: Wed, 29 May 2024 15:10:44 +0530
Subject: [PATCH 090/230] [AMDGPU] Reserved private memory register during PEI
 (#93536)

- Reserve the newly selected private memory register during entry function
  prologue generation.
- Add an assertion in eliminateFrameIndex to ensure the register is
  reserved.

Co-authored-by: PankajDwivedi-25 <pankajkumar.divedi@amd.com>
---
 llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 1 +
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp  | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index eae666ab0e7d77..97a8ff44866095 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -579,6 +579,7 @@ Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
         (!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) {
       MRI.replaceRegWith(ScratchRsrcReg, Reg);
       MFI->setScratchRSrcReg(Reg);
+      MRI.reserveReg(Reg, TRI);
       return Reg;
     }
   }
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ddb5f719356855..4b5f9bdd82b8db 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2083,6 +2083,9 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
   assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
 
+  assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
+         "unreserved scratch RSRC register");
+
   MachineOperand &FIOp = MI->getOperand(FIOperandNum);
   int Index = MI->getOperand(FIOperandNum).getIndex();
 

From 1594cebedd60a08f408e3fa975116ef4db86bf9b Mon Sep 17 00:00:00 2001
From: Simon Camphausen <simon.camphausen@iml.fraunhofer.de>
Date: Wed, 29 May 2024 11:42:06 +0200
Subject: [PATCH 091/230] [mlir][EmitC] Fix evaluation order of expressions
 (#93549)

Expressions with the same precedence were not parenthesized and
therefore were possibly evaluated in the wrong order depending on the
shape of the expression tree.

---------

Co-authored-by: Matthias Gehre <matthias.gehre@amd.com>
Co-authored-by: Corentin Ferry <corentin.ferry@amd.com>
---
 mlir/lib/Target/Cpp/TranslateToCpp.cpp |  6 +++++-
 mlir/test/Target/Cpp/expressions.mlir  | 23 ++++++++++++++++++++---
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/mlir/lib/Target/Cpp/TranslateToCpp.cpp b/mlir/lib/Target/Cpp/TranslateToCpp.cpp
index 7db7163bac4ab6..f19e0f8c4c2a42 100644
--- a/mlir/lib/Target/Cpp/TranslateToCpp.cpp
+++ b/mlir/lib/Target/Cpp/TranslateToCpp.cpp
@@ -1316,7 +1316,11 @@ LogicalResult CppEmitter::emitOperand(Value value) {
     FailureOr<int> precedence = getOperatorPrecedence(def);
     if (failed(precedence))
       return failure();
-    bool encloseInParenthesis = precedence.value() < getExpressionPrecedence();
+
+    // Sub-expressions with equal or lower precedence need to be parenthesized,
+    // as they might be evaluated in the wrong order depending on the shape of
+    // the expression tree.
+    bool encloseInParenthesis = precedence.value() <= getExpressionPrecedence();
     if (encloseInParenthesis) {
       os << "(";
       pushExpressionPrecedence(lowestPrecedence());
diff --git a/mlir/test/Target/Cpp/expressions.mlir b/mlir/test/Target/Cpp/expressions.mlir
index 2eda58902cb1d1..aaddd5af874a91 100644
--- a/mlir/test/Target/Cpp/expressions.mlir
+++ b/mlir/test/Target/Cpp/expressions.mlir
@@ -65,15 +65,15 @@ func.func @do_not_inline(%arg0: i32, %arg1: i32, %arg2 : i32) -> i32 {
   return %e : i32
 }
 
-// CPP-DEFAULT:      float paranthesis_for_low_precedence(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]]) {
+// CPP-DEFAULT:      float parentheses_for_low_precedence(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]]) {
 // CPP-DEFAULT-NEXT:   return (float) ([[VAL_1]] + [[VAL_2]] * [[VAL_3]]);
 // CPP-DEFAULT-NEXT: }
 
-// CPP-DECLTOP:      float paranthesis_for_low_precedence(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]]) {
+// CPP-DECLTOP:      float parentheses_for_low_precedence(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]]) {
 // CPP-DECLTOP-NEXT:   return (float) ([[VAL_1]] + [[VAL_2]] * [[VAL_3]]);
 // CPP-DECLTOP-NEXT: }
 
-func.func @paranthesis_for_low_precedence(%arg0: i32, %arg1: i32, %arg2: i32) -> f32 {
+func.func @parentheses_for_low_precedence(%arg0: i32, %arg1: i32, %arg2: i32) -> f32 {
   %e = emitc.expression : f32 {
     %a = emitc.add %arg0, %arg1 : (i32, i32) -> i32
     %b = emitc.mul %a, %arg2 : (i32, i32) -> i32
@@ -83,6 +83,23 @@ func.func @paranthesis_for_low_precedence(%arg0: i32, %arg1: i32, %arg2: i32) ->
   return %e : f32
 }
 
+// CPP-DEFAULT:      int32_t parentheses_for_same_precedence(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]]) {
+// CPP-DEFAULT-NEXT:   return [[VAL_3]] / ([[VAL_1]] * [[VAL_2]]);
+// CPP-DEFAULT-NEXT: }
+
+// CPP-DECLTOP:      int32_t parentheses_for_same_precedence(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]]) {
+// CPP-DECLTOP-NEXT:   return [[VAL_3]] / ([[VAL_1]] * [[VAL_2]]);
+// CPP-DECLTOP-NEXT: }
+func.func @parentheses_for_same_precedence(%arg0: i32, %arg1: i32, %arg2: i32) -> i32 {
+  %e = emitc.expression : i32 {
+      %0 = emitc.mul %arg0, %arg1 : (i32, i32) -> i32
+      %1 = emitc.div %arg2, %0 : (i32, i32) -> i32
+      emitc.yield %1 : i32
+    }
+
+  return %e : i32
+}
+
 // CPP-DEFAULT:      int32_t multiple_uses(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]], int32_t [[VAL_4:v[0-9]+]]) {
 // CPP-DEFAULT-NEXT:   bool [[VAL_5:v[0-9]+]] = bar([[VAL_1]] * [[VAL_2]], [[VAL_3]]) - [[VAL_4]] < [[VAL_2]];
 // CPP-DEFAULT-NEXT:   int32_t [[VAL_6:v[0-9]+]];

From 5553f27d5a45e702415fa2f91d842bf4a1f4a8b5 Mon Sep 17 00:00:00 2001
From: Lu Weining <luweining@loongson.cn>
Date: Wed, 29 May 2024 17:42:41 +0800
Subject: [PATCH 092/230] [AMDGPU][test] Fix the wrong triples in
 lower-work-group-id-intrinsics-{hsa,pal}.ll. NFC (#93501)

- hsa -> amdhsa
- Use amdgcn-amd-amd{hsa,pal} for lower-work-group-id-intrinsics-{hsa,pal}.ll respectively
---
 .../lower-work-group-id-intrinsics-hsa.ll     | 170 +++++++-----------
 .../lower-work-group-id-intrinsics-pal.ll     |  65 +++----
 2 files changed, 88 insertions(+), 147 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
index 9547f08d3eba6b..1429251fc64211 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
@@ -1,17 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
-; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
 
 define amdgpu_kernel void @workgroup_ids_kernel() {
 ; GFX9-LABEL: workgroup_ids_kernel:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -72,27 +72,20 @@ define amdgpu_kernel void @workgroup_ids_kernel() {
 define amdgpu_kernel void @caller() {
 ; GFX9-SDAG-LABEL: caller:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-SDAG-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-SDAG-NEXT:    s_mov_b32 s38, -1
-; GFX9-SDAG-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-SDAG-NEXT:    s_add_u32 s36, s36, s7
-; GFX9-SDAG-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-SDAG-NEXT:    s_add_u32 s8, s2, 36
-; GFX9-SDAG-NEXT:    s_addc_u32 s9, s3, 0
-; GFX9-SDAG-NEXT:    s_getpc_b64 s[2:3]
-; GFX9-SDAG-NEXT:    s_add_u32 s2, s2, callee@gotpcrel32@lo+4
-; GFX9-SDAG-NEXT:    s_addc_u32 s3, s3, callee@gotpcrel32@hi+12
-; GFX9-SDAG-NEXT:    s_load_dwordx2 s[14:15], s[2:3], 0x0
-; GFX9-SDAG-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX9-SDAG-NEXT:    s_add_u32 flat_scratch_lo, s10, s13
+; GFX9-SDAG-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s13
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX9-SDAG-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-SDAG-NEXT:    s_add_u32 s8, s8, callee@gotpcrel32@lo+4
+; GFX9-SDAG-NEXT:    s_addc_u32 s9, s9, callee@gotpcrel32@hi+12
+; GFX9-SDAG-NEXT:    s_load_dwordx2 s[14:15], s[8:9], 0x0
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-SDAG-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-SDAG-NEXT:    v_or3_b32 v31, v0, v1, v2
-; GFX9-SDAG-NEXT:    s_mov_b32 s12, s6
-; GFX9-SDAG-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s12
 ; GFX9-SDAG-NEXT:    s_mov_b32 s32, 0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_swappc_b64 s[30:31], s[14:15]
@@ -100,27 +93,20 @@ define amdgpu_kernel void @caller() {
 ;
 ; GFX9-GISEL-LABEL: caller:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-GISEL-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-GISEL-NEXT:    s_mov_b32 s38, -1
-; GFX9-GISEL-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-GISEL-NEXT:    s_add_u32 s36, s36, s7
-; GFX9-GISEL-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-GISEL-NEXT:    s_add_u32 s8, s2, 36
-; GFX9-GISEL-NEXT:    s_addc_u32 s9, s3, 0
-; GFX9-GISEL-NEXT:    s_mov_b64 s[10:11], s[4:5]
-; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-GISEL-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, callee@gotpcrel32@lo+4
-; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, callee@gotpcrel32@hi+12
-; GFX9-GISEL-NEXT:    s_load_dwordx2 s[14:15], s[0:1], 0x0
+; GFX9-GISEL-NEXT:    s_add_u32 flat_scratch_lo, s10, s13
+; GFX9-GISEL-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s13
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX9-GISEL-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-GISEL-NEXT:    s_add_u32 s8, s8, callee@gotpcrel32@lo+4
+; GFX9-GISEL-NEXT:    s_addc_u32 s9, s9, callee@gotpcrel32@hi+12
+; GFX9-GISEL-NEXT:    s_load_dwordx2 s[14:15], s[8:9], 0x0
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX9-GISEL-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-GISEL-NEXT:    v_or3_b32 v31, v0, v1, v2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-GISEL-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-GISEL-NEXT:    s_mov_b32 s12, s6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX9-GISEL-NEXT:    s_mov_b32 s32, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_swappc_b64 s[30:31], s[14:15]
@@ -128,81 +114,61 @@ define amdgpu_kernel void @caller() {
 ;
 ; GFX9ARCH-SDAG-LABEL: caller:
 ; GFX9ARCH-SDAG:       ; %bb.0:
-; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s38, -1
-; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9ARCH-SDAG-NEXT:    s_add_u32 s36, s36, s6
-; GFX9ARCH-SDAG-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9ARCH-SDAG-NEXT:    s_add_u32 s8, s2, 36
-; GFX9ARCH-SDAG-NEXT:    s_addc_u32 s9, s3, 0
-; GFX9ARCH-SDAG-NEXT:    s_getpc_b64 s[2:3]
-; GFX9ARCH-SDAG-NEXT:    s_add_u32 s2, s2, callee@gotpcrel32@lo+4
-; GFX9ARCH-SDAG-NEXT:    s_addc_u32 s3, s3, callee@gotpcrel32@hi+12
-; GFX9ARCH-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x0
-; GFX9ARCH-SDAG-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX9ARCH-SDAG-NEXT:    s_add_u32 flat_scratch_lo, s10, s12
+; GFX9ARCH-SDAG-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; GFX9ARCH-SDAG-NEXT:    s_add_u32 s0, s0, s12
+; GFX9ARCH-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9ARCH-SDAG-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX9ARCH-SDAG-NEXT:    s_getpc_b64 s[8:9]
+; GFX9ARCH-SDAG-NEXT:    s_add_u32 s8, s8, callee@gotpcrel32@lo+4
+; GFX9ARCH-SDAG-NEXT:    s_addc_u32 s9, s9, callee@gotpcrel32@hi+12
+; GFX9ARCH-SDAG-NEXT:    s_load_dwordx2 s[12:13], s[8:9], 0x0
 ; GFX9ARCH-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9ARCH-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX9ARCH-SDAG-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9ARCH-SDAG-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9ARCH-SDAG-NEXT:    v_or3_b32 v31, v0, v1, v2
-; GFX9ARCH-SDAG-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9ARCH-SDAG-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX9ARCH-SDAG-NEXT:    v_mov_b32_e32 v0, ttmp9
 ; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s32, 0
 ; GFX9ARCH-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9ARCH-SDAG-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9ARCH-SDAG-NEXT:    s_swappc_b64 s[30:31], s[12:13]
 ; GFX9ARCH-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9ARCH-GISEL-LABEL: caller:
 ; GFX9ARCH-GISEL:       ; %bb.0:
-; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s38, -1
-; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9ARCH-GISEL-NEXT:    s_add_u32 s36, s36, s6
-; GFX9ARCH-GISEL-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9ARCH-GISEL-NEXT:    s_add_u32 s8, s2, 36
-; GFX9ARCH-GISEL-NEXT:    s_addc_u32 s9, s3, 0
-; GFX9ARCH-GISEL-NEXT:    s_mov_b64 s[10:11], s[4:5]
-; GFX9ARCH-GISEL-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9ARCH-GISEL-NEXT:    s_getpc_b64 s[0:1]
-; GFX9ARCH-GISEL-NEXT:    s_add_u32 s0, s0, callee@gotpcrel32@lo+4
-; GFX9ARCH-GISEL-NEXT:    s_addc_u32 s1, s1, callee@gotpcrel32@hi+12
-; GFX9ARCH-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9ARCH-GISEL-NEXT:    s_add_u32 flat_scratch_lo, s10, s12
+; GFX9ARCH-GISEL-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; GFX9ARCH-GISEL-NEXT:    s_add_u32 s0, s0, s12
+; GFX9ARCH-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9ARCH-GISEL-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX9ARCH-GISEL-NEXT:    s_getpc_b64 s[8:9]
+; GFX9ARCH-GISEL-NEXT:    s_add_u32 s8, s8, callee@gotpcrel32@lo+4
+; GFX9ARCH-GISEL-NEXT:    s_addc_u32 s9, s9, callee@gotpcrel32@hi+12
+; GFX9ARCH-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[8:9], 0x0
 ; GFX9ARCH-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9ARCH-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX9ARCH-GISEL-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9ARCH-GISEL-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9ARCH-GISEL-NEXT:    v_mov_b32_e32 v0, ttmp9
-; GFX9ARCH-GISEL-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9ARCH-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s32, 0
 ; GFX9ARCH-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9ARCH-GISEL-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9ARCH-GISEL-NEXT:    s_swappc_b64 s[30:31], s[12:13]
 ; GFX9ARCH-GISEL-NEXT:    s_endpgm
 ;
-; GFX12-SDAG-LABEL: caller:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9
-; GFX12-SDAG-NEXT:    s_mov_b64 s[10:11], s[4:5]
-; GFX12-SDAG-NEXT:    s_mov_b32 s7, callee@abs32@hi
-; GFX12-SDAG-NEXT:    s_mov_b32 s6, callee@abs32@lo
-; GFX12-SDAG-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX12-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; GFX12-SDAG-NEXT:    s_mov_b32 s32, 0
-; GFX12-SDAG-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX12-SDAG-NEXT:    s_endpgm
-;
-; GFX12-GISEL-LABEL: caller:
-; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9
-; GFX12-GISEL-NEXT:    s_mov_b64 s[10:11], s[4:5]
-; GFX12-GISEL-NEXT:    s_mov_b32 s6, callee@abs32@lo
-; GFX12-GISEL-NEXT:    s_mov_b32 s7, callee@abs32@hi
-; GFX12-GISEL-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX12-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; GFX12-GISEL-NEXT:    s_mov_b32 s32, 0
-; GFX12-GISEL-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX12-GISEL-NEXT:    s_endpgm
+; GFX12-LABEL: caller:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX12-NEXT:    s_getpc_b64 s[4:5]
+; GFX12-NEXT:    s_sext_i32_i16 s5, s5
+; GFX12-NEXT:    s_add_co_u32 s4, s4, callee@gotpcrel32@lo+8
+; GFX12-NEXT:    s_add_co_ci_u32 s5, s5, callee@gotpcrel32@hi+16
+; GFX12-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9
+; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x0
+; GFX12-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX12-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX12-NEXT:    s_mov_b32 s32, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX12-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workgroup.id.x()
   call void @callee(i32 %idx) #0
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
index 14fe4e5f48c67c..8009f917aef5a7 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH-GISEL %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
 
@@ -67,62 +67,37 @@ define amdgpu_cs void @_amdgpu_cs_main() {
 }
 
 define amdgpu_cs void @caller() {
-; GFX9-LABEL: caller:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s10, -1
-; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
-; GFX9-NEXT:    s_add_u32 s8, s8, s0
-; GFX9-NEXT:    s_addc_u32 s9, s9, 0
-; GFX9-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s0, callee@gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s1, s1, callee@gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[8:9]
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    s_endpgm
-;
 ; GFX9ARCH-SDAG-LABEL: caller:
 ; GFX9ARCH-SDAG:       ; %bb.0:
-; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s10, -1
-; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s11, 0xe00000
+; GFX9ARCH-SDAG-NEXT:    s_getpc_b64 s[8:9]
+; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s8, s0
+; GFX9ARCH-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x10
+; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s5, callee@abs32@hi
+; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s4, callee@abs32@lo
+; GFX9ARCH-SDAG-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s32, 0
+; GFX9ARCH-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9ARCH-SDAG-NEXT:    s_add_u32 s8, s8, s0
 ; GFX9ARCH-SDAG-NEXT:    s_addc_u32 s9, s9, 0
-; GFX9ARCH-SDAG-NEXT:    s_getpc_b64 s[0:1]
-; GFX9ARCH-SDAG-NEXT:    s_add_u32 s0, s0, callee@gotpcrel32@lo+4
-; GFX9ARCH-SDAG-NEXT:    s_addc_u32 s1, s1, callee@gotpcrel32@hi+12
-; GFX9ARCH-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9ARCH-SDAG-NEXT:    s_mov_b64 s[0:1], s[8:9]
 ; GFX9ARCH-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; GFX9ARCH-SDAG-NEXT:    v_mov_b32_e32 v0, ttmp9
-; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s32, 0
-; GFX9ARCH-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9ARCH-SDAG-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9ARCH-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9ARCH-GISEL-LABEL: caller:
 ; GFX9ARCH-GISEL:       ; %bb.0:
-; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s10, -1
-; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s11, 0xe00000
+; GFX9ARCH-GISEL-NEXT:    s_getpc_b64 s[8:9]
+; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s8, s0
+; GFX9ARCH-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x10
+; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s4, callee@abs32@lo
+; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s5, callee@abs32@hi
+; GFX9ARCH-GISEL-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s32, 0
+; GFX9ARCH-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9ARCH-GISEL-NEXT:    s_add_u32 s8, s8, s0
 ; GFX9ARCH-GISEL-NEXT:    s_addc_u32 s9, s9, 0
-; GFX9ARCH-GISEL-NEXT:    s_getpc_b64 s[0:1]
-; GFX9ARCH-GISEL-NEXT:    s_add_u32 s0, s0, callee@gotpcrel32@lo+4
-; GFX9ARCH-GISEL-NEXT:    s_addc_u32 s1, s1, callee@gotpcrel32@hi+12
-; GFX9ARCH-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9ARCH-GISEL-NEXT:    s_mov_b64 s[0:1], s[8:9]
-; GFX9ARCH-GISEL-NEXT:    v_mov_b32_e32 v0, ttmp9
 ; GFX9ARCH-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s32, 0
-; GFX9ARCH-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9ARCH-GISEL-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9ARCH-GISEL-NEXT:    s_endpgm
 ;

From 78cc9cbba23fd1783a9b233ae745f126ece56cc7 Mon Sep 17 00:00:00 2001
From: Lukacma <Marian.Lukac@arm.com>
Date: Wed, 29 May 2024 10:44:58 +0100
Subject: [PATCH 093/230] [AArch64][SME] Add intrinsics for multi-vector
 BFCLAMP (#93532)

According to the specification in
https://github.com/ARM-software/acle/pull/309 this adds the intrinsics:

```
  svbfloat16x2_t svclamp[_single_bf16_x2](svbfloat16x2_t zd, svbfloat16_t zn,
                                        svbfloat16_t zm)  __arm_streaming;

  svbfloat16x4_t svclamp[_single_bf16_x4](svbfloat16x4_t zd, svbfloat16_t zn,
                                        svbfloat16_t zm)  __arm_streaming;
```
These are available only if __ARM_FEATURE_SME_B16B16 is enabled.
---
 clang/include/clang/Basic/arm_sve.td          |  5 ++
 .../aarch64-sme2-intrinsics/acle_sme2_clamp.c | 74 +++++++++++++++++--
 .../acle_sme2_b16b16.cpp                      | 13 ++++
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  2 +
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    |  6 ++
 .../AArch64/sve2p1-intrinsics-bfclamp.ll      | 26 ++++++-
 6 files changed, 120 insertions(+), 6 deletions(-)
 create mode 100644 clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_b16b16.cpp

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 03570f94de6666..078ef576342a7c 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2151,6 +2151,11 @@ let TargetGuard = "sme2" in {
   def SVFCLAMP_X4 : SInst<"svclamp[_single_{d}_x4]",  "44dd",   "hfd",      MergeNone, "aarch64_sve_fclamp_single_x4",  [IsStreaming], []>;
 }
 
+let TargetGuard = "sme2,b16b16"in {
+  def SVBFCLAMP_X2 : SInst<"svclamp[_single_{d}_x2]",  "22dd",   "b",      MergeNone, "aarch64_sve_bfclamp_single_x2",  [IsStreaming], []>;
+  def SVBFCLAMP_X4 : SInst<"svclamp[_single_{d}_x4]",  "44dd",   "b",      MergeNone, "aarch64_sve_bfclamp_single_x4",  [IsStreaming], []>;
+}
+
 let TargetGuard = "sme2" in {
 // == ADD (vectors) ==
   def SVADD_SINGLE_X2 : SInst<"svadd[_single_{d}_x2]", "22d", "cUcsUsiUilUl", MergeNone, "aarch64_sve_add_single_x2", [IsStreaming], []>;
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
index 57ea4d2a1ac47a..21a8229bbf244e 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
@@ -1,14 +1,14 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 \
 // RUN:  -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 \
 // RUN:  -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 \
 // RUN:  -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 \
 // RUN:  -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 \
 // RUN:  -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
@@ -745,3 +745,67 @@ svfloat32x4_t test_svclamp_single_f32_x4(svfloat32x4_t op1, svfloat32_t op2, svf
 svfloat64x4_t test_svclamp_single_f64_x4(svfloat64x4_t op1, svfloat64_t op2, svfloat64_t op3) __arm_streaming {
   return SVE_ACLE_FUNC(svclamp, _single_f64_x4, , )(op1, op2, op3);
 }
+
+// CHECK-LABEL: @test_svclamp_single_bf16_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[OP1:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[OP1]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x2.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP3]], i64 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x bfloat> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z27test_svclamp_single_bf16_x214svbfloat16x2_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[OP1:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[OP1]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x2.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 0
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 1
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x bfloat> [[TMP6]]
+//
+svbfloat16x2_t test_svclamp_single_bf16_x2(svbfloat16x2_t op1, svbfloat16_t op2, svbfloat16_t op3) __arm_streaming {
+  return SVE_ACLE_FUNC(svclamp, _single_bf16_x2, , )(op1, op2, op3);
+}
+
+// CHECK-LABEL: @test_svclamp_single_bf16_x4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 24)
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x4.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 8)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP8]], <vscale x 8 x bfloat> [[TMP9]], i64 16)
+// CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 3
+// CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP10]], <vscale x 8 x bfloat> [[TMP11]], i64 24)
+// CHECK-NEXT:    ret <vscale x 32 x bfloat> [[TMP12]]
+//
+// CPP-CHECK-LABEL: @_Z27test_svclamp_single_bf16_x414svbfloat16x4_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 24)
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x4.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP8]], <vscale x 8 x bfloat> [[TMP9]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 3
+// CPP-CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP10]], <vscale x 8 x bfloat> [[TMP11]], i64 24)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x bfloat> [[TMP12]]
+//
+svbfloat16x4_t test_svclamp_single_bf16_x4(svbfloat16x4_t op1, svbfloat16_t op2, svbfloat16_t op3) __arm_streaming {
+  return SVE_ACLE_FUNC(svclamp, _single_bf16_x4, , )(op1, op2, op3);
+}
diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_b16b16.cpp b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_b16b16.cpp
new file mode 100644
index 00000000000000..62a1f8e6de1d79
--- /dev/null
+++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_b16b16.cpp
@@ -0,0 +1,13 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -emit-llvm-only -verify -verify-ignore-unexpected=error,note -o - %s
+
+#include <arm_sme.h>
+
+void test_b16b16( svbfloat16_t bf16, svbfloat16x2_t bf16x2, svbfloat16x4_t bf16x4) __arm_streaming
+{
+  // expected-error@+1 {{'svclamp_single_bf16_x2' needs target feature sme2,b16b16}}
+  svclamp_single_bf16_x2(bf16x2, bf16, bf16);
+  // expected-error@+1 {{'svclamp_single_bf16_x4' needs target feature sme2,b16b16}}
+  svclamp_single_bf16_x4(bf16x4, bf16, bf16);
+}
\ No newline at end of file
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 4544cf35fb7b37..57d0dfb698b383 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3472,10 +3472,12 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sve_sclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
   def int_aarch64_sve_uclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
   def int_aarch64_sve_fclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
+  def int_aarch64_sve_bfclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
 
   def int_aarch64_sve_sclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
   def int_aarch64_sve_uclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
   def int_aarch64_sve_fclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
+  def int_aarch64_sve_bfclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
 
   //
   // Multi-vector add/sub and accumulate into ZA
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 25f2e4d7c4de63..660675cf8f3895 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -5738,6 +5738,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
                AArch64::FCLAMP_VG2_2Z2Z_D}))
         SelectClamp(Node, 2, Op);
       return;
+    case Intrinsic::aarch64_sve_bfclamp_single_x2:
+      SelectClamp(Node, 2, AArch64::BFCLAMP_VG2_2ZZZ_H);
+      return;
     case Intrinsic::aarch64_sve_sclamp_single_x4:
       if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
               Node->getValueType(0),
@@ -5759,6 +5762,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
                AArch64::FCLAMP_VG4_4Z4Z_D}))
         SelectClamp(Node, 4, Op);
       return;
+    case Intrinsic::aarch64_sve_bfclamp_single_x4:
+      SelectClamp(Node, 4, AArch64::BFCLAMP_VG4_4ZZZ_H);
+      return;
     case Intrinsic::aarch64_sve_add_single_x2:
       if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
               Node->getValueType(0),
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll
index 61b67755a35441..7934f831a7e62f 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+sme2 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
 
 define <vscale x 8 x bfloat> @bfclamp(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
 ; CHECK-LABEL: bfclamp:
@@ -11,3 +11,27 @@ define <vscale x 8 x bfloat> @bfclamp(<vscale x 8 x bfloat> %a, <vscale x 8 x bf
 }
 
 declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fclamp.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_bfclamp_single_x2_f16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x bfloat> %d){
+; CHECK-LABEL: test_bfclamp_single_x2_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    bfclamp { z0.h, z1.h }, z2.h, z3.h
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x2.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x bfloat> %d)
+  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
+}
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_bfclamp_single_x4_f16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x bfloat> %d, <vscale x 8 x bfloat> %e, <vscale x 8 x bfloat> %f){
+; CHECK-LABEL: test_bfclamp_single_x4_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    bfclamp { z0.h - z3.h }, z4.h, z5.h
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x4.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x bfloat> %d, <vscale x 8 x bfloat> %e, <vscale x 8 x bfloat> %f)
+  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
+}

From e1aa8ad6faa1524f12338ca58d1eadfde6f29f34 Mon Sep 17 00:00:00 2001
From: Kareem Ergawy <kareem.ergawy@amd.com>
Date: Wed, 29 May 2024 11:58:59 +0200
Subject: [PATCH 094/230] [flang][OpenMP] Fix bug in emitting `dealloc` logic
 (#93641)

Fixes a bug in emitting deallocation logic when delayed privatization is
disabled. I introduced the bug when implementing delayed privatization
for allocatables: when delayed privatization is disabled, the
deallocation ops are emitted for only one of the allocatable variables.
---
 .../lib/Lower/OpenMP/DataSharingProcessor.cpp |  2 +-
 .../OpenMP/allocatable-multiple-vars.f90      | 28 +++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)
 create mode 100644 flang/test/Lower/OpenMP/allocatable-multiple-vars.f90

diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index b722e19272ca11..557a9685024c5e 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -86,7 +86,7 @@ void DataSharingProcessor::insertDeallocs() {
     if (semantics::IsAllocatable(sym->GetUltimate())) {
       if (!useDelayedPrivatization) {
         converter.createHostAssociateVarCloneDealloc(*sym);
-        return;
+        continue;
       }
 
       lower::SymbolBox hsb = converter.lookupOneLevelUpSymbol(*sym);
diff --git a/flang/test/Lower/OpenMP/allocatable-multiple-vars.f90 b/flang/test/Lower/OpenMP/allocatable-multiple-vars.f90
new file mode 100644
index 00000000000000..e6450a13e13a05
--- /dev/null
+++ b/flang/test/Lower/OpenMP/allocatable-multiple-vars.f90
@@ -0,0 +1,28 @@
+! Test early privatization for multiple allocatable variables.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization=false \
+! RUN:   -o - %s 2>&1 | FileCheck %s
+
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization=false -o - %s 2>&1 |\
+! RUN:   FileCheck %s
+
+subroutine delayed_privatization_allocatable
+  implicit none
+  integer, allocatable :: var1, var2
+
+!$omp parallel private(var1, var2)
+  var1 = 10
+  var2 = 20
+!$omp end parallel
+end subroutine
+
+! Verify that private versions of each variable are both allocated and freed
+! within the parallel region.
+
+! CHECK:      omp.parallel {
+! CHECK:        fir.allocmem
+! CHECK:        fir.allocmem
+! CHECK:        fir.freemem
+! CHECK:        fir.freemem
+! CHECK:        omp.terminator
+! CHECK-NEXT: }

From 5c214eb0c628c874f2c9496e663be4067e64442a Mon Sep 17 00:00:00 2001
From: Andreas Jonson <andjo403@hotmail.com>
Date: Wed, 29 May 2024 12:05:05 +0200
Subject: [PATCH 095/230] [Inline] Clone return range attribute on the callsite
 into inlined call (#92666)

---
 clang/test/Headers/__clang_hip_math.hip       |  6 +-
 llvm/lib/Transforms/Utils/InlineFunction.cpp  | 13 +++-
 .../Inline/ret_attr_align_and_noundef.ll      | 73 +++++++++++++++++++
 3 files changed, 88 insertions(+), 4 deletions(-)

diff --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip
index 1271868a53b866..26da82843c5124 100644
--- a/clang/test/Headers/__clang_hip_math.hip
+++ b/clang/test/Headers/__clang_hip_math.hip
@@ -231,7 +231,7 @@ extern "C" __device__ uint64_t test___make_mantissa(const char *p) {
 
 // CHECK-LABEL: @test_abs(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef range(i32 0, -2147483648) i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
 // CHECK-NEXT:    ret i32 [[TMP0]]
 //
 extern "C" __device__ int test_abs(int x) {
@@ -240,7 +240,7 @@ extern "C" __device__ int test_abs(int x) {
 
 // CHECK-LABEL: @test_labs(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef range(i64 0, -9223372036854775808) i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true)
 // CHECK-NEXT:    ret i64 [[TMP0]]
 //
 extern "C" __device__ long test_labs(long x) {
@@ -249,7 +249,7 @@ extern "C" __device__ long test_labs(long x) {
 
 // CHECK-LABEL: @test_llabs(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef range(i64 0, -9223372036854775808) i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true)
 // CHECK-NEXT:    ret i64 [[TMP0]]
 //
 extern "C" __device__ long long test_llabs(long x) {
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 7b846f2d2d72d6..eb471b259c7d4e 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -30,11 +30,12 @@
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/AttributeMask.h"
 #include "llvm/IR/Argument.h"
+#include "llvm/IR/AttributeMask.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfo.h"
@@ -1450,6 +1451,8 @@ static AttrBuilder IdentifyValidPoisonGeneratingAttributes(CallBase &CB) {
     Valid.addAttribute(Attribute::NonNull);
   if (CB.hasRetAttr(Attribute::Alignment))
     Valid.addAlignmentAttr(CB.getRetAlign());
+  if (std::optional<ConstantRange> Range = CB.getRange())
+    Valid.addRangeAttr(*Range);
   return Valid;
 }
 
@@ -1541,6 +1544,14 @@ static void AddReturnAttributes(CallBase &CB, ValueToValueMapTy &VMap) {
     if (ValidPG.getAlignment().valueOrOne() < AL.getRetAlignment().valueOrOne())
       ValidPG.removeAttribute(Attribute::Alignment);
     if (ValidPG.hasAttributes()) {
+      Attribute CBRange = ValidPG.getAttribute(Attribute::Range);
+      if (CBRange.isValid()) {
+        Attribute NewRange = AL.getRetAttr(Attribute::Range);
+        if (NewRange.isValid()) {
+          ValidPG.addRangeAttr(
+              CBRange.getRange().intersectWith(NewRange.getRange()));
+        }
+      }
       // Three checks.
       // If the callsite has `noundef`, then a poison due to violating the
       // return attribute will create UB anyways so we can always propagate.
diff --git a/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll b/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll
index c038ffccf3e96d..f4cebf1fcb5da0 100644
--- a/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll
+++ b/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll
@@ -5,10 +5,12 @@
 
 declare ptr @foo()
 declare void @use.ptr(ptr) willreturn nounwind
+declare void @use.val(i8) willreturn nounwind
 declare void @bar()
 declare void @baz()
 declare ptr @llvm.ptrmask.p0.i64(ptr, i64)
 declare i1 @val()
+declare i8 @val8()
 
 define ptr @callee0123() {
 ; CHECK-LABEL: define ptr @callee0123() {
@@ -337,3 +339,74 @@ define ptr @caller12_todo() {
   %r = call nonnull ptr @callee12()
   ret ptr %r
 }
+
+define i8 @callee13() {
+; CHECK-LABEL: define i8 @callee13() {
+; CHECK-NEXT:    [[R:%.*]] = call i8 @val8()
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %r = call i8 @val8()
+  ret i8 %r
+}
+
+define i8 @caller13_okay_use_after_poison_anyways() {
+; CHECK-LABEL: define i8 @caller13_okay_use_after_poison_anyways() {
+; CHECK-NEXT:    [[R_I:%.*]] = call range(i8 0, 10) i8 @val8()
+; CHECK-NEXT:    call void @use.val(i8 [[R_I]])
+; CHECK-NEXT:    ret i8 [[R_I]]
+;
+  %r = call range(i8 0, 10) i8 @callee13()
+  call void @use.val(i8 %r)
+  ret i8 %r
+}
+
+define i8 @callee14() {
+; CHECK-LABEL: define i8 @callee14() {
+; CHECK-NEXT:    [[R:%.*]] = call noundef i8 @val8()
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %r = call noundef i8 @val8()
+  ret i8 %r
+}
+
+define i8 @caller14_fail_creates_ub() {
+; CHECK-LABEL: define i8 @caller14_fail_creates_ub() {
+; CHECK-NEXT:    [[R_I:%.*]] = call noundef i8 @val8()
+; CHECK-NEXT:    call void @use.val(i8 [[R_I]])
+; CHECK-NEXT:    ret i8 [[R_I]]
+;
+  %r = call range(i8 0, 10) i8 @callee14()
+  call void @use.val(i8 %r)
+  ret i8 %r
+}
+
+define i8 @caller14_okay_is_ub_anyways() {
+; CHECK-LABEL: define i8 @caller14_okay_is_ub_anyways() {
+; CHECK-NEXT:    [[R_I:%.*]] = call noundef range(i8 0, 10) i8 @val8()
+; CHECK-NEXT:    call void @use.val(i8 [[R_I]])
+; CHECK-NEXT:    ret i8 [[R_I]]
+;
+  %r = call noundef range(i8 0, 10) i8 @callee14()
+  call void @use.val(i8 %r)
+  ret i8 %r
+}
+
+define i8 @callee15() {
+; CHECK-LABEL: define i8 @callee15() {
+; CHECK-NEXT:    [[R:%.*]] = call range(i8 5, 10) i8 @val8()
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %r = call range(i8 5, 10) i8 @val8()
+  ret i8 %r
+}
+
+define i8 @caller15_okay_intersect_ranges() {
+; CHECK-LABEL: define i8 @caller15_okay_intersect_ranges() {
+; CHECK-NEXT:    [[R_I:%.*]] = call range(i8 5, 7) i8 @val8()
+; CHECK-NEXT:    call void @use.val(i8 [[R_I]])
+; CHECK-NEXT:    ret i8 [[R_I]]
+;
+  %r = call range(i8 0, 7) i8 @callee15()
+  call void @use.val(i8 %r)
+  ret i8 %r
+}

From 971f1aaad3ca3680bfbab76212f498ca15b280a2 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Wed, 29 May 2024 10:05:43 +0000
Subject: [PATCH 096/230] [lldb][Test][Windows] Fix flaky address range API
 tests

The new tests added in #92014 have been flaky on Linaro's
Windows on Arm bot. They appear to be hitting a deadlock trying
to clean up the test process.

This only happens in async mode and I don't see why this test
case needs async mode, so the simple workaround is to stick to
sync mode.
---
 lldb/test/API/python_api/address_range/TestAddressRange.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lldb/test/API/python_api/address_range/TestAddressRange.py b/lldb/test/API/python_api/address_range/TestAddressRange.py
index 8c27558af4752d..65221e3f1b0e91 100644
--- a/lldb/test/API/python_api/address_range/TestAddressRange.py
+++ b/lldb/test/API/python_api/address_range/TestAddressRange.py
@@ -15,8 +15,6 @@ def setUp(self):
         self.build()
         exe = self.getBuildArtifact("a.out")
 
-        self.dbg.SetAsync(True)
-
         self.target = self.dbg.CreateTarget(exe)
         self.assertTrue(self.target, VALID_TARGET)
         self.launch_info = self.target.GetLaunchInfo()

From 3bcccb6af685c3132a9ee578b9e11b2503c35a5c Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Wed, 29 May 2024 18:09:23 +0800
Subject: [PATCH 097/230] [Reassociate] Drop weight reduction to fix issue
 91417 (#91469)

See the following case: https://alive2.llvm.org/ce/z/A-fBki
```
define i3 @src(i3 %0) {
  %2 = mul i3 %0, %0
  %3 = mul i3 %2, %0
  %4 = mul i3 %3, %0
  %5 = mul nsw i3 %4, %0
  ret i3 %5
}

define i3 @tgt(i3 %0) {
  %2 = mul i3 %0, %0
  %5 = mul nsw i3 %2, %0
  ret i3 %5
}
```


https://github.com/llvm/llvm-project/commit/d7aeefebd6b049f017711cd7c6ef5f217a17b673
introduced weight reduction during weights combination of the same
operand. As the weight of `%0` changes from 5 to 3, the nsw flag in `%5`
should be dropped.

However, the nsw flag isn't cleared by `RewriteExprTree` since `%5 = mul
nsw i3 %0, %4` is not included in the range of `[ExpressionChangedStart,
ExpressionChangedEnd)`.
```
Calculated Rank[] = 3
Combine negations for:   %2 = mul i3 %0, %0
Calculated Rank[] = 4
Combine negations for:   %3 = mul i3 %0, %2
Calculated Rank[] = 5
Combine negations for:   %4 = mul i3 %0, %3
Calculated Rank[] = 6
Combine negations for:   %5 = mul nsw i3 %0, %4
LINEARIZE:   %5 = mul nsw i3 %0, %4
OPERAND: i3 %0 (1)
ADD USES LEAF: i3 %0 (1)
OPERAND:   %4 = mul i3 %0, %3 (1)
DIRECT ADD:   %4 = mul i3 %0, %3 (1)
OPERAND: i3 %0 (1)
OPERAND:   %3 = mul i3 %0, %2 (1)
DIRECT ADD:   %3 = mul i3 %0, %2 (1)
OPERAND: i3 %0 (1)
OPERAND:   %2 = mul i3 %0, %0 (1)
DIRECT ADD:   %2 = mul i3 %0, %0 (1)
OPERAND: i3 %0 (1)
OPERAND: i3 %0 (1)
RAIn:   mul i3  [ %0, #3] [ %0, #3] [ %0, #3]
RAOut:  mul i3  [ %0, #3] [ %0, #3] [ %0, #3]
RAOut after CSE reorder:        mul i3  [ %0, #3] [ %0, #3] [ %0, #3]
RA:   %5 = mul nsw i3 %0, %4
TO:   %5 = mul nsw i3 %4, %0
RA:   %4 = mul i3 %0, %3
TO:   %4 = mul i3 %0, %0
```

The best way to fix this is to inform `RewriteExprTree` to clear flags
of the whole expr tree when weight reduction happens.

But I find that weight reduction based on the Carmichael number never
happens in practice.
See the coverage result
https://dtcxzyw.github.io/llvm-opt-benchmark/coverage/home/dtcxzyw/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp.html#L323

I think it would be better to drop `IncorporateWeight`.

Fixes #91417
---
 llvm/lib/Transforms/Scalar/Reassociate.cpp  | 112 +-----------
 llvm/test/Transforms/Reassociate/repeats.ll | 187 +++++++++++++-------
 2 files changed, 126 insertions(+), 173 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp
index c903e47a93cafd..04c54ed69e93f1 100644
--- a/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -302,97 +302,6 @@ static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) {
   return Res;
 }
 
-/// Returns k such that lambda(2^Bitwidth) = 2^k, where lambda is the Carmichael
-/// function. This means that x^(2^k) === 1 mod 2^Bitwidth for
-/// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic.
-/// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every
-/// even x in Bitwidth-bit arithmetic.
-static unsigned CarmichaelShift(unsigned Bitwidth) {
-  if (Bitwidth < 3)
-    return Bitwidth - 1;
-  return Bitwidth - 2;
-}
-
-/// Add the extra weight 'RHS' to the existing weight 'LHS',
-/// reducing the combined weight using any special properties of the operation.
-/// The existing weight LHS represents the computation X op X op ... op X where
-/// X occurs LHS times.  The combined weight represents  X op X op ... op X with
-/// X occurring LHS + RHS times.  If op is "Xor" for example then the combined
-/// operation is equivalent to X if LHS + RHS is odd, or 0 if LHS + RHS is even;
-/// the routine returns 1 in LHS in the first case, and 0 in LHS in the second.
-static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) {
-  // If we were working with infinite precision arithmetic then the combined
-  // weight would be LHS + RHS.  But we are using finite precision arithmetic,
-  // and the APInt sum LHS + RHS may not be correct if it wraps (it is correct
-  // for nilpotent operations and addition, but not for idempotent operations
-  // and multiplication), so it is important to correctly reduce the combined
-  // weight back into range if wrapping would be wrong.
-
-  // If RHS is zero then the weight didn't change.
-  if (RHS.isMinValue())
-    return;
-  // If LHS is zero then the combined weight is RHS.
-  if (LHS.isMinValue()) {
-    LHS = RHS;
-    return;
-  }
-  // From this point on we know that neither LHS nor RHS is zero.
-
-  if (Instruction::isIdempotent(Opcode)) {
-    // Idempotent means X op X === X, so any non-zero weight is equivalent to a
-    // weight of 1.  Keeping weights at zero or one also means that wrapping is
-    // not a problem.
-    assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
-    return; // Return a weight of 1.
-  }
-  if (Instruction::isNilpotent(Opcode)) {
-    // Nilpotent means X op X === 0, so reduce weights modulo 2.
-    assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
-    LHS = 0; // 1 + 1 === 0 modulo 2.
-    return;
-  }
-  if (Opcode == Instruction::Add || Opcode == Instruction::FAdd) {
-    // TODO: Reduce the weight by exploiting nsw/nuw?
-    LHS += RHS;
-    return;
-  }
-
-  assert((Opcode == Instruction::Mul || Opcode == Instruction::FMul) &&
-         "Unknown associative operation!");
-  unsigned Bitwidth = LHS.getBitWidth();
-  // If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth
-  // can be replaced with W-CM.  That's because x^W=x^(W-CM) for every Bitwidth
-  // bit number x, since either x is odd in which case x^CM = 1, or x is even in
-  // which case both x^W and x^(W - CM) are zero.  By subtracting off multiples
-  // of CM like this weights can always be reduced to the range [0, CM+Bitwidth)
-  // which by a happy accident means that they can always be represented using
-  // Bitwidth bits.
-  // TODO: Reduce the weight by exploiting nsw/nuw?  (Could do much better than
-  // the Carmichael number).
-  if (Bitwidth > 3) {
-    /// CM - The value of Carmichael's lambda function.
-    APInt CM = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth));
-    // Any weight W >= Threshold can be replaced with W - CM.
-    APInt Threshold = CM + Bitwidth;
-    assert(LHS.ult(Threshold) && RHS.ult(Threshold) && "Weights not reduced!");
-    // For Bitwidth 4 or more the following sum does not overflow.
-    LHS += RHS;
-    while (LHS.uge(Threshold))
-      LHS -= CM;
-  } else {
-    // To avoid problems with overflow do everything the same as above but using
-    // a larger type.
-    unsigned CM = 1U << CarmichaelShift(Bitwidth);
-    unsigned Threshold = CM + Bitwidth;
-    assert(LHS.getZExtValue() < Threshold && RHS.getZExtValue() < Threshold &&
-           "Weights not reduced!");
-    unsigned Total = LHS.getZExtValue() + RHS.getZExtValue();
-    while (Total >= Threshold)
-      Total -= CM;
-    LHS = Total;
-  }
-}
-
 using RepeatedValue = std::pair<Value*, APInt>;
 
 /// Given an associative binary expression, return the leaf
@@ -562,26 +471,7 @@ static bool LinearizeExprTree(Instruction *I,
                "In leaf map but not visited!");
 
         // Update the number of paths to the leaf.
-        IncorporateWeight(It->second, Weight, Opcode);
-
-#if 0   // TODO: Re-enable once PR13021 is fixed.
-        // The leaf already has one use from inside the expression.  As we want
-        // exactly one such use, drop this new use of the leaf.
-        assert(!Op->hasOneUse() && "Only one use, but we got here twice!");
-        I->setOperand(OpIdx, UndefValue::get(I->getType()));
-        Changed = true;
-
-        // If the leaf is a binary operation of the right kind and we now see
-        // that its multiple original uses were in fact all by nodes belonging
-        // to the expression, then no longer consider it to be a leaf and add
-        // its operands to the expression.
-        if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
-          LLVM_DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n");
-          Worklist.push_back(std::make_pair(BO, It->second));
-          Leaves.erase(It);
-          continue;
-        }
-#endif
+        It->second += Weight;
 
         // If we still have uses that are not accounted for by the expression
         // then it is not safe to modify the value.
diff --git a/llvm/test/Transforms/Reassociate/repeats.ll b/llvm/test/Transforms/Reassociate/repeats.ll
index c18db19fa73e35..28177f1c0ba5ee 100644
--- a/llvm/test/Transforms/Reassociate/repeats.ll
+++ b/llvm/test/Transforms/Reassociate/repeats.ll
@@ -1,56 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=reassociate -S | FileCheck %s
 
 ; Tests involving repeated operations on the same value.
 
 define i8 @nilpotent(i8 %x) {
-; CHECK-LABEL: @nilpotent(
+; CHECK-LABEL: define i8 @nilpotent(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    ret i8 0
+;
   %tmp = xor i8 %x, %x
   ret i8 %tmp
-; CHECK: ret i8 0
 }
 
 define i2 @idempotent(i2 %x) {
-; CHECK-LABEL: @idempotent(
+; CHECK-LABEL: define i2 @idempotent(
+; CHECK-SAME: i2 [[X:%.*]]) {
+; CHECK-NEXT:    ret i2 -1
+;
   %tmp1 = and i2 %x, %x
   %tmp2 = and i2 %tmp1, %x
   %tmp3 = and i2 %tmp2, %x
   ret i2 %tmp3
-; CHECK: ret i2 %x
 }
 
 define i2 @add(i2 %x) {
-; CHECK-LABEL: @add(
+; CHECK-LABEL: define i2 @add(
+; CHECK-SAME: i2 [[X:%.*]]) {
+; CHECK-NEXT:    ret i2 0
+;
   %tmp1 = add i2 %x, %x
   %tmp2 = add i2 %tmp1, %x
   %tmp3 = add i2 %tmp2, %x
   ret i2 %tmp3
-; CHECK: ret i2 0
 }
 
 define i2 @cst_add() {
-; CHECK-LABEL: @cst_add(
+; CHECK-LABEL: define i2 @cst_add() {
+; CHECK-NEXT:    ret i2 -1
+;
   %tmp1 = add i2 1, 1
   %tmp2 = add i2 %tmp1, 1
   ret i2 %tmp2
-; CHECK: ret i2 -1
 }
 
 define i8 @cst_mul() {
-; CHECK-LABEL: @cst_mul(
+; CHECK-LABEL: define i8 @cst_mul() {
+; CHECK-NEXT:    ret i8 -13
+;
   %tmp1 = mul i8 3, 3
   %tmp2 = mul i8 %tmp1, 3
   %tmp3 = mul i8 %tmp2, 3
   %tmp4 = mul i8 %tmp3, 3
   ret i8 %tmp4
-; CHECK: ret i8 -13
 }
 
 define i3 @foo3x5(i3 %x) {
 ; Can be done with two multiplies.
-; CHECK-LABEL: @foo3x5(
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: ret
+; CHECK-LABEL: define i3 @foo3x5(
+; CHECK-SAME: i3 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i3 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i3 [[TMP3]], [[X]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i3 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    ret i3 [[TMP5]]
+;
   %tmp1 = mul i3 %x, %x
   %tmp2 = mul i3 %tmp1, %x
   %tmp3 = mul i3 %tmp2, %x
@@ -58,12 +70,31 @@ define i3 @foo3x5(i3 %x) {
   ret i3 %tmp4
 }
 
+define i3 @foo3x5_nsw(i3 %x) {
+; Can be done with two multiplies.
+; CHECK-LABEL: define i3 @foo3x5_nsw(
+; CHECK-SAME: i3 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i3 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i3 [[TMP3]], [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i3 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret i3 [[TMP4]]
+;
+  %tmp1 = mul i3 %x, %x
+  %tmp2 = mul i3 %tmp1, %x
+  %tmp3 = mul i3 %tmp2, %x
+  %tmp4 = mul nsw i3 %tmp3, %x
+  ret i3 %tmp4
+}
+
 define i3 @foo3x6(i3 %x) {
 ; Can be done with two multiplies.
-; CHECK-LABEL: @foo3x6(
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: ret
+; CHECK-LABEL: define i3 @foo3x6(
+; CHECK-SAME: i3 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i3 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i3 [[TMP1]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i3 [[TMP3]], [[TMP3]]
+; CHECK-NEXT:    ret i3 [[TMP2]]
+;
   %tmp1 = mul i3 %x, %x
   %tmp2 = mul i3 %tmp1, %x
   %tmp3 = mul i3 %tmp2, %x
@@ -74,10 +105,14 @@ define i3 @foo3x6(i3 %x) {
 
 define i3 @foo3x7(i3 %x) {
 ; Can be done with two multiplies.
-; CHECK-LABEL: @foo3x7(
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: ret
+; CHECK-LABEL: define i3 @foo3x7(
+; CHECK-SAME: i3 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i3 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i3 [[TMP5]], [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i3 [[TMP7]], [[X]]
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i3 [[TMP3]], [[TMP7]]
+; CHECK-NEXT:    ret i3 [[TMP6]]
+;
   %tmp1 = mul i3 %x, %x
   %tmp2 = mul i3 %tmp1, %x
   %tmp3 = mul i3 %tmp2, %x
@@ -89,10 +124,13 @@ define i3 @foo3x7(i3 %x) {
 
 define i4 @foo4x8(i4 %x) {
 ; Can be done with two multiplies.
-; CHECK-LABEL: @foo4x8(
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: ret
+; CHECK-LABEL: define i4 @foo4x8(
+; CHECK-SAME: i4 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i4 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i4 [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i4 [[TMP3]], [[TMP3]]
+; CHECK-NEXT:    ret i4 [[TMP4]]
+;
   %tmp1 = mul i4 %x, %x
   %tmp2 = mul i4 %tmp1, %x
   %tmp3 = mul i4 %tmp2, %x
@@ -105,11 +143,14 @@ define i4 @foo4x8(i4 %x) {
 
 define i4 @foo4x9(i4 %x) {
 ; Can be done with three multiplies.
-; CHECK-LABEL: @foo4x9(
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: ret
+; CHECK-LABEL: define i4 @foo4x9(
+; CHECK-SAME: i4 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i4 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i4 [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i4 [[TMP2]], [[X]]
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i4 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    ret i4 [[TMP8]]
+;
   %tmp1 = mul i4 %x, %x
   %tmp2 = mul i4 %tmp1, %x
   %tmp3 = mul i4 %tmp2, %x
@@ -123,11 +164,14 @@ define i4 @foo4x9(i4 %x) {
 
 define i4 @foo4x10(i4 %x) {
 ; Can be done with three multiplies.
-; CHECK-LABEL: @foo4x10(
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: ret
+; CHECK-LABEL: define i4 @foo4x10(
+; CHECK-SAME: i4 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i4 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i4 [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i4 [[TMP4]], [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i4 [[TMP2]], [[TMP2]]
+; CHECK-NEXT:    ret i4 [[TMP3]]
+;
   %tmp1 = mul i4 %x, %x
   %tmp2 = mul i4 %tmp1, %x
   %tmp3 = mul i4 %tmp2, %x
@@ -142,12 +186,15 @@ define i4 @foo4x10(i4 %x) {
 
 define i4 @foo4x11(i4 %x) {
 ; Can be done with four multiplies.
-; CHECK-LABEL: @foo4x11(
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: ret
+; CHECK-LABEL: define i4 @foo4x11(
+; CHECK-SAME: i4 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i4 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i4 [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i4 [[TMP4]], [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i4 [[TMP2]], [[X]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i4 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    ret i4 [[TMP10]]
+;
   %tmp1 = mul i4 %x, %x
   %tmp2 = mul i4 %tmp1, %x
   %tmp3 = mul i4 %tmp2, %x
@@ -163,10 +210,14 @@ define i4 @foo4x11(i4 %x) {
 
 define i4 @foo4x12(i4 %x) {
 ; Can be done with two multiplies.
-; CHECK-LABEL: @foo4x12(
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: ret
+; CHECK-LABEL: define i4 @foo4x12(
+; CHECK-SAME: i4 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i4 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i4 [[TMP1]], [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i4 [[TMP4]], [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i4 [[TMP3]], [[TMP3]]
+; CHECK-NEXT:    ret i4 [[TMP2]]
+;
   %tmp1 = mul i4 %x, %x
   %tmp2 = mul i4 %tmp1, %x
   %tmp3 = mul i4 %tmp2, %x
@@ -183,11 +234,15 @@ define i4 @foo4x12(i4 %x) {
 
 define i4 @foo4x13(i4 %x) {
 ; Can be done with three multiplies.
-; CHECK-LABEL: @foo4x13(
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: ret
+; CHECK-LABEL: define i4 @foo4x13(
+; CHECK-SAME: i4 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i4 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i4 [[TMP1]], [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i4 [[TMP2]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i4 [[TMP3]], [[X]]
+; CHECK-NEXT:    [[TMP12:%.*]] = mul i4 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    ret i4 [[TMP12]]
+;
   %tmp1 = mul i4 %x, %x
   %tmp2 = mul i4 %tmp1, %x
   %tmp3 = mul i4 %tmp2, %x
@@ -205,11 +260,15 @@ define i4 @foo4x13(i4 %x) {
 
 define i4 @foo4x14(i4 %x) {
 ; Can be done with three multiplies.
-; CHECK-LABEL: @foo4x14(
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: ret
+; CHECK-LABEL: define i4 @foo4x14(
+; CHECK-SAME: i4 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i4 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i4 [[TMP1]], [[X]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i4 [[TMP4]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i4 [[TMP5]], [[X]]
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i4 [[TMP6]], [[TMP6]]
+; CHECK-NEXT:    ret i4 [[TMP7]]
+;
   %tmp1 = mul i4 %x, %x
   %tmp2 = mul i4 %tmp1, %x
   %tmp3 = mul i4 %tmp2, %x
@@ -228,12 +287,16 @@ define i4 @foo4x14(i4 %x) {
 
 define i4 @foo4x15(i4 %x) {
 ; Can be done with four multiplies.
-; CHECK-LABEL: @foo4x15(
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: ret
+; CHECK-LABEL: define i4 @foo4x15(
+; CHECK-SAME: i4 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i4 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i4 [[TMP1]], [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i4 [[TMP4]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i4 [[TMP3]], [[X]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i4 [[TMP6]], [[X]]
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i4 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    ret i4 [[TMP14]]
+;
   %tmp1 = mul i4 %x, %x
   %tmp2 = mul i4 %tmp1, %x
   %tmp3 = mul i4 %tmp2, %x

From 718ba5a58452f013f40fab94f967064919bf13ff Mon Sep 17 00:00:00 2001
From: AtariDreams <gfunni234@gmail.com>
Date: Sun, 26 May 2024 17:28:29 -0400
Subject: [PATCH 098/230] Reapply [InstCombine] lshr (mul (X, 2^N + 1)), N ->
 add (X, lshr(X, N)) (#92907)

Alive2 Proofs:
https://alive2.llvm.org/ce/z/eSinJY
https://alive2.llvm.org/ce/z/vyKvde
https://alive2.llvm.org/ce/z/dRFsfV

I mistakenly reverted this commit as part of a larger set of
reverts. Reapplied without changes.
---
 .../InstCombine/InstCombineShifts.cpp         |  50 +++-
 llvm/test/Transforms/InstCombine/ashr-lshr.ll | 259 ++++++++++++++++++
 llvm/test/Transforms/InstCombine/lshr.ll      |  19 +-
 3 files changed, 318 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 0f1979fbe0c769..4f91993750fd27 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -1461,13 +1461,24 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
 
     const APInt *MulC;
     if (match(Op0, m_NUWMul(m_Value(X), m_APInt(MulC)))) {
-      // Look for a "splat" mul pattern - it replicates bits across each half of
-      // a value, so a right shift is just a mask of the low bits:
-      // lshr i[2N] (mul nuw X, (2^N)+1), N --> and iN X, (2^N)-1
-      // TODO: Generalize to allow more than just half-width shifts?
-      if (BitWidth > 2 && ShAmtC * 2 == BitWidth && (*MulC - 1).isPowerOf2() &&
-          MulC->logBase2() == ShAmtC)
-        return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, *MulC - 2));
+      if (BitWidth > 2 && (*MulC - 1).isPowerOf2() &&
+          MulC->logBase2() == ShAmtC) {
+        // Look for a "splat" mul pattern - it replicates bits across each half
+        // of a value, so a right shift is just a mask of the low bits:
+        // lshr i[2N] (mul nuw X, (2^N)+1), N --> and iN X, (2^N)-1
+        if (ShAmtC * 2 == BitWidth)
+          return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, *MulC - 2));
+
+        // lshr (mul nuw (X, 2^N + 1)), N -> add nuw (X, lshr(X, N))
+        if (Op0->hasOneUse()) {
+          auto *NewAdd = BinaryOperator::CreateNUWAdd(
+              X, Builder.CreateLShr(X, ConstantInt::get(Ty, ShAmtC), "",
+                                    I.isExact()));
+          NewAdd->setHasNoSignedWrap(
+              cast<OverflowingBinaryOperator>(Op0)->hasNoSignedWrap());
+          return NewAdd;
+        }
+      }
 
       // The one-use check is not strictly necessary, but codegen may not be
       // able to invert the transform and perf may suffer with an extra mul
@@ -1487,6 +1498,16 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
       }
     }
 
+    // lshr (mul nsw (X, 2^N + 1)), N -> add nsw (X, lshr(X, N))
+    if (match(Op0, m_OneUse(m_NSWMul(m_Value(X), m_APInt(MulC))))) {
+      if (BitWidth > 2 && (*MulC - 1).isPowerOf2() &&
+          MulC->logBase2() == ShAmtC) {
+        return BinaryOperator::CreateNSWAdd(
+            X, Builder.CreateLShr(X, ConstantInt::get(Ty, ShAmtC), "",
+                                  I.isExact()));
+      }
+    }
+
     // Try to narrow bswap.
     // In the case where the shift amount equals the bitwidth difference, the
     // shift is eliminated.
@@ -1690,6 +1711,21 @@ Instruction *InstCombinerImpl::visitAShr(BinaryOperator &I) {
       if (match(Op0, m_OneUse(m_NSWSub(m_Value(X), m_Value(Y)))))
         return new SExtInst(Builder.CreateICmpSLT(X, Y), Ty);
     }
+
+    const APInt *MulC;
+    if (match(Op0, m_OneUse(m_NSWMul(m_Value(X), m_APInt(MulC)))) &&
+        (BitWidth > 2 && (*MulC - 1).isPowerOf2() &&
+         MulC->logBase2() == ShAmt &&
+         (ShAmt < BitWidth - 1))) /* Minus 1 for the sign bit */ {
+
+      // ashr (mul nsw (X, 2^N + 1)), N -> add nsw (X, ashr(X, N))
+      auto *NewAdd = BinaryOperator::CreateNSWAdd(
+          X,
+          Builder.CreateAShr(X, ConstantInt::get(Ty, ShAmt), "", I.isExact()));
+      NewAdd->setHasNoUnsignedWrap(
+          cast<OverflowingBinaryOperator>(Op0)->hasNoUnsignedWrap());
+      return NewAdd;
+    }
   }
 
   const SimplifyQuery Q = SQ.getWithInstruction(&I);
diff --git a/llvm/test/Transforms/InstCombine/ashr-lshr.ll b/llvm/test/Transforms/InstCombine/ashr-lshr.ll
index ac206dc7999dd2..c2a4f35412670b 100644
--- a/llvm/test/Transforms/InstCombine/ashr-lshr.ll
+++ b/llvm/test/Transforms/InstCombine/ashr-lshr.ll
@@ -604,3 +604,262 @@ define <2 x i8> @ashr_known_pos_exact_vec(<2 x i8> %x, <2 x i8> %y) {
   %r = ashr exact <2 x i8> %p, %y
   ret <2 x i8> %r
 }
+
+define i32 @lshr_mul_times_3_div_2(i32 %0) {
+; CHECK-LABEL: @lshr_mul_times_3_div_2(
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP0:%.*]], 1
+; CHECK-NEXT:    [[LSHR:%.*]] = add nuw nsw i32 [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %mul = mul nsw nuw i32 %0, 3
+  %lshr = lshr i32 %mul, 1
+  ret i32 %lshr
+}
+
+define i32 @lshr_mul_times_3_div_2_exact(i32 %x) {
+; CHECK-LABEL: @lshr_mul_times_3_div_2_exact(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[LSHR:%.*]] = add nsw i32 [[TMP1]], [[X]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %mul = mul nsw i32 %x, 3
+  %lshr = lshr exact i32 %mul, 1
+  ret i32 %lshr
+}
+
+; Negative test
+
+define i32 @lshr_mul_times_3_div_2_no_flags(i32 %0) {
+; CHECK-LABEL: @lshr_mul_times_3_div_2_no_flags(
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[TMP0:%.*]], 3
+; CHECK-NEXT:    [[LSHR:%.*]] = lshr i32 [[MUL]], 1
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %mul = mul i32 %0, 3
+  %lshr = lshr i32 %mul, 1
+  ret i32 %lshr
+}
+
+; Negative test
+
+define i32 @mul_times_3_div_2_multiuse_lshr(i32 %x) {
+; CHECK-LABEL: @mul_times_3_div_2_multiuse_lshr(
+; CHECK-NEXT:    [[MUL:%.*]] = mul nuw i32 [[X:%.*]], 3
+; CHECK-NEXT:    [[RES:%.*]] = lshr i32 [[MUL]], 1
+; CHECK-NEXT:    call void @use(i32 [[MUL]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %mul = mul nuw i32 %x, 3
+  %res = lshr i32 %mul, 1
+  call void @use(i32 %mul)
+  ret i32 %res
+}
+
+define i32 @lshr_mul_times_3_div_2_exact_2(i32 %x) {
+; CHECK-LABEL: @lshr_mul_times_3_div_2_exact_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[LSHR:%.*]] = add nuw i32 [[TMP1]], [[X]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %mul = mul nuw i32 %x, 3
+  %lshr = lshr exact i32 %mul, 1
+  ret i32 %lshr
+}
+
+define i32 @lshr_mul_times_5_div_4(i32 %0) {
+; CHECK-LABEL: @lshr_mul_times_5_div_4(
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP0:%.*]], 2
+; CHECK-NEXT:    [[LSHR:%.*]] = add nuw nsw i32 [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %mul = mul nsw nuw i32 %0, 5
+  %lshr = lshr i32 %mul, 2
+  ret i32 %lshr
+}
+
+define i32 @lshr_mul_times_5_div_4_exact(i32 %x) {
+; CHECK-LABEL: @lshr_mul_times_5_div_4_exact(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact i32 [[X:%.*]], 2
+; CHECK-NEXT:    [[LSHR:%.*]] = add nsw i32 [[TMP1]], [[X]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %mul = mul nsw i32 %x, 5
+  %lshr = lshr exact i32 %mul, 2
+  ret i32 %lshr
+}
+
+; Negative test
+
+define i32 @lshr_mul_times_5_div_4_no_flags(i32 %0) {
+; CHECK-LABEL: @lshr_mul_times_5_div_4_no_flags(
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[TMP0:%.*]], 5
+; CHECK-NEXT:    [[LSHR:%.*]] = lshr i32 [[MUL]], 2
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %mul = mul i32 %0, 5
+  %lshr = lshr i32 %mul, 2
+  ret i32 %lshr
+}
+
+; Negative test
+
+define i32 @mul_times_5_div_4_multiuse_lshr(i32 %x) {
+; CHECK-LABEL: @mul_times_5_div_4_multiuse_lshr(
+; CHECK-NEXT:    [[MUL:%.*]] = mul nuw i32 [[X:%.*]], 5
+; CHECK-NEXT:    [[RES:%.*]] = lshr i32 [[MUL]], 2
+; CHECK-NEXT:    call void @use(i32 [[MUL]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %mul = mul nuw i32 %x, 5
+  %res = lshr i32 %mul, 2
+  call void @use(i32 %mul)
+  ret i32 %res
+}
+
+define i32 @lshr_mul_times_5_div_4_exact_2(i32 %x) {
+; CHECK-LABEL: @lshr_mul_times_5_div_4_exact_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact i32 [[X:%.*]], 2
+; CHECK-NEXT:    [[LSHR:%.*]] = add nuw i32 [[TMP1]], [[X]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %mul = mul nuw i32 %x, 5
+  %lshr = lshr exact i32 %mul, 2
+  ret i32 %lshr
+}
+
+define i32 @ashr_mul_times_3_div_2(i32 %0) {
+; CHECK-LABEL: @ashr_mul_times_3_div_2(
+; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[TMP0:%.*]], 1
+; CHECK-NEXT:    [[ASHR:%.*]] = add nuw nsw i32 [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    ret i32 [[ASHR]]
+;
+  %mul = mul nuw nsw i32 %0, 3
+  %ashr = ashr i32 %mul, 1
+  ret i32 %ashr
+}
+
+define i32 @ashr_mul_times_3_div_2_exact(i32 %x) {
+; CHECK-LABEL: @ashr_mul_times_3_div_2_exact(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr exact i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[ASHR:%.*]] = add nsw i32 [[TMP1]], [[X]]
+; CHECK-NEXT:    ret i32 [[ASHR]]
+;
+  %mul = mul nsw i32 %x, 3
+  %ashr = ashr exact i32 %mul, 1
+  ret i32 %ashr
+}
+
+; Negative test
+
+define i32 @ashr_mul_times_3_div_2_no_flags(i32 %0) {
+; CHECK-LABEL: @ashr_mul_times_3_div_2_no_flags(
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[TMP0:%.*]], 3
+; CHECK-NEXT:    [[ASHR:%.*]] = ashr i32 [[MUL]], 1
+; CHECK-NEXT:    ret i32 [[ASHR]]
+;
+  %mul = mul i32 %0, 3
+  %ashr = ashr i32 %mul, 1
+  ret i32 %ashr
+}
+
+; Negative test
+
+define i32 @ashr_mul_times_3_div_2_no_nsw(i32 %0) {
+; CHECK-LABEL: @ashr_mul_times_3_div_2_no_nsw(
+; CHECK-NEXT:    [[MUL:%.*]] = mul nuw i32 [[TMP0:%.*]], 3
+; CHECK-NEXT:    [[ASHR:%.*]] = ashr i32 [[MUL]], 1
+; CHECK-NEXT:    ret i32 [[ASHR]]
+;
+  %mul = mul nuw i32 %0, 3
+  %ashr = ashr i32 %mul, 1
+  ret i32 %ashr
+}
+
+; Negative test
+
+define i32 @mul_times_3_div_2_multiuse_ashr(i32 %x) {
+; CHECK-LABEL: @mul_times_3_div_2_multiuse_ashr(
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[X:%.*]], 3
+; CHECK-NEXT:    [[RES:%.*]] = ashr i32 [[MUL]], 1
+; CHECK-NEXT:    call void @use(i32 [[MUL]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %mul = mul nsw i32 %x, 3
+  %res = ashr i32 %mul, 1
+  call void @use(i32 %mul)
+  ret i32 %res
+}
+
+define i32 @ashr_mul_times_3_div_2_exact_2(i32 %x) {
+; CHECK-LABEL: @ashr_mul_times_3_div_2_exact_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr exact i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[ASHR:%.*]] = add nsw i32 [[TMP1]], [[X]]
+; CHECK-NEXT:    ret i32 [[ASHR]]
+;
+  %mul = mul nsw i32 %x, 3
+  %ashr = ashr exact i32 %mul, 1
+  ret i32 %ashr
+}
+
+define i32 @ashr_mul_times_5_div_4(i32 %0) {
+; CHECK-LABEL: @ashr_mul_times_5_div_4(
+; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[TMP0:%.*]], 2
+; CHECK-NEXT:    [[ASHR:%.*]] = add nuw nsw i32 [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    ret i32 [[ASHR]]
+;
+  %mul = mul nuw nsw i32 %0, 5
+  %ashr = ashr i32 %mul, 2
+  ret i32 %ashr
+}
+
+define i32 @ashr_mul_times_5_div_4_exact(i32 %x) {
+; CHECK-LABEL: @ashr_mul_times_5_div_4_exact(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr exact i32 [[X:%.*]], 2
+; CHECK-NEXT:    [[ASHR:%.*]] = add nsw i32 [[TMP1]], [[X]]
+; CHECK-NEXT:    ret i32 [[ASHR]]
+;
+  %mul = mul nsw i32 %x, 5
+  %ashr = ashr exact i32 %mul, 2
+  ret i32 %ashr
+}
+
+; Negative test
+
+define i32 @ashr_mul_times_5_div_4_no_flags(i32 %0) {
+; CHECK-LABEL: @ashr_mul_times_5_div_4_no_flags(
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[TMP0:%.*]], 5
+; CHECK-NEXT:    [[ASHR:%.*]] = ashr i32 [[MUL]], 2
+; CHECK-NEXT:    ret i32 [[ASHR]]
+;
+  %mul = mul i32 %0, 5
+  %ashr = ashr i32 %mul, 2
+  ret i32 %ashr
+}
+
+; Negative test
+
+define i32 @mul_times_5_div_4_multiuse_ashr(i32 %x) {
+; CHECK-LABEL: @mul_times_5_div_4_multiuse_ashr(
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[X:%.*]], 5
+; CHECK-NEXT:    [[RES:%.*]] = ashr i32 [[MUL]], 2
+; CHECK-NEXT:    call void @use(i32 [[MUL]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %mul = mul nsw i32 %x, 5
+  %res = ashr i32 %mul, 2
+  call void @use(i32 %mul)
+  ret i32 %res
+}
+
+define i32 @ashr_mul_times_5_div_4_exact_2(i32 %x) {
+; CHECK-LABEL: @ashr_mul_times_5_div_4_exact_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr exact i32 [[X:%.*]], 2
+; CHECK-NEXT:    [[ASHR:%.*]] = add nsw i32 [[TMP1]], [[X]]
+; CHECK-NEXT:    ret i32 [[ASHR]]
+;
+  %mul = mul nsw i32 %x, 5
+  %ashr = ashr exact i32 %mul, 2
+  ret i32 %ashr
+}
+
+declare void @use(i32)
diff --git a/llvm/test/Transforms/InstCombine/lshr.ll b/llvm/test/Transforms/InstCombine/lshr.ll
index fa92c1c4b3be4b..dfdb6c7b4b2689 100644
--- a/llvm/test/Transforms/InstCombine/lshr.ll
+++ b/llvm/test/Transforms/InstCombine/lshr.ll
@@ -628,12 +628,12 @@ define i32 @mul_splat_fold_wrong_lshr_const(i32 %x) {
   ret i32 %t
 }
 
-; Negative test
+; Negative test (but simplifies into a different transform)
 
 define i32 @mul_splat_fold_no_nuw(i32 %x) {
 ; CHECK-LABEL: @mul_splat_fold_no_nuw(
-; CHECK-NEXT:    [[M:%.*]] = mul nsw i32 [[X:%.*]], 65537
-; CHECK-NEXT:    [[T:%.*]] = lshr i32 [[M]], 16
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[X:%.*]], 16
+; CHECK-NEXT:    [[T:%.*]] = add nsw i32 [[TMP1]], [[X]]
 ; CHECK-NEXT:    ret i32 [[T]]
 ;
   %m = mul nsw i32 %x, 65537
@@ -641,6 +641,19 @@ define i32 @mul_splat_fold_no_nuw(i32 %x) {
   ret i32 %t
 }
 
+; Negative test - the mul has neither nuw nor nsw, so no fold applies
+
+define i32 @mul_splat_fold_no_flags(i32 %x) {
+; CHECK-LABEL: @mul_splat_fold_no_flags(
+; CHECK-NEXT:    [[M:%.*]] = mul i32 [[X:%.*]], 65537
+; CHECK-NEXT:    [[T:%.*]] = lshr i32 [[M]], 16
+; CHECK-NEXT:    ret i32 [[T]]
+;
+  %m = mul i32 %x, 65537
+  %t = lshr i32 %m, 16
+  ret i32 %t
+}
+
 ; Negative test (but simplifies before we reach the mul_splat transform)- need more than 2 bits
 
 define i2 @mul_splat_fold_too_narrow(i2 %x) {

From 6543453c3604c5532666a9bad2bf3d261099dab5 Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh@arm.com>
Date: Wed, 29 May 2024 09:05:09 +0000
Subject: [PATCH 099/230] [AArch64][NFC] Pre-commit test update for Select
 TBL/TBX instructions (#92914)

---
 llvm/test/CodeGen/AArch64/arm64-tbl.ll | 28 +++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/AArch64/arm64-tbl.ll b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
index b89232c03f1363..96b2af7274b5bf 100644
--- a/llvm/test/CodeGen/AArch64/arm64-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
@@ -1,5 +1,28 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI:         warning: Instruction selection used fallback path for tbl2_8b
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbl2_16b
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbl3_8b
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbl3_16b
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbl4_8b
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbl4_16b
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_v8i8
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_nonconst_first_mask
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_nonconst_first_mask2
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_nonconst_second_mask
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_nonconst_second_mask2
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_mixed_shuffle
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_mixed_tbl2_mask1
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_mixed_tbl2_mask2
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbx2_8b
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbx2_16b
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbx3_8b
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbx3_16b
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbx4_8b
+; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbx4_16b
 
 define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind {
 ; CHECK-LABEL: tbl1_8b:
@@ -571,3 +594,6 @@ declare <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>,
 declare <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
 declare <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
 
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-GI: {{.*}}
+; CHECK-SD: {{.*}}

From aef0bdd36d888edd1575713e4976162daf81ff5b Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 29 May 2024 12:26:27 +0200
Subject: [PATCH 100/230] DAG: Preserve flags when expanding fminimum/fmaximum
 (#93550)

The operation selection logic here doesn't really work when vector types
need to be split. This was also dropping the flags, and losing nnan made
the combine from select back to fmin/fmax unrecoverable. Preserve the
flags to assist a future commit.
---
 llvm/include/llvm/CodeGen/SelectionDAG.h         |  4 ++--
 llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 15 ++++++++-------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 96a62706904686..0dc237301abb48 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1241,11 +1241,11 @@ class SelectionDAG {
   /// Helper function to make it easier to build Select's if you just have
   /// operands and don't want to check for vector.
   SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS,
-                    SDValue RHS) {
+                    SDValue RHS, SDNodeFlags Flags = SDNodeFlags()) {
     assert(LHS.getValueType() == VT && RHS.getValueType() == VT &&
            "Cannot use select on differing types");
     auto Opcode = Cond.getValueType().isVector() ? ISD::VSELECT : ISD::SELECT;
-    return getNode(Opcode, DL, VT, Cond, LHS, RHS);
+    return getNode(Opcode, DL, VT, Cond, LHS, RHS, Flags);
   }
 
   /// Helper function to make it easier to build SelectCC's if you just have an
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 4e47f50ee42894..623b6343994a41 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8428,6 +8428,7 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
   EVT VT = N->getValueType(0);
   EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
   bool IsMax = Opc == ISD::FMAXIMUM;
+  SDNodeFlags Flags = N->getFlags();
 
   if (VT.isVector() &&
       isOperationLegalOrCustomOrPromote(Opc, VT.getScalarType()))
@@ -8444,15 +8445,15 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
   bool MinMaxMustRespectOrderedZero = false;
 
   if (isOperationLegalOrCustom(CompOpcIeee, VT)) {
-    MinMax = DAG.getNode(CompOpcIeee, DL, VT, LHS, RHS);
+    MinMax = DAG.getNode(CompOpcIeee, DL, VT, LHS, RHS, Flags);
     MinMaxMustRespectOrderedZero = true;
   } else if (isOperationLegalOrCustom(CompOpc, VT)) {
-    MinMax = DAG.getNode(CompOpc, DL, VT, LHS, RHS);
+    MinMax = DAG.getNode(CompOpc, DL, VT, LHS, RHS, Flags);
   } else {
     // NaN (if exists) will be propagated later, so orderness doesn't matter.
     SDValue Compare =
         DAG.getSetCC(DL, CCVT, LHS, RHS, IsMax ? ISD::SETGT : ISD::SETLT);
-    MinMax = DAG.getSelect(DL, VT, Compare, LHS, RHS);
+    MinMax = DAG.getSelect(DL, VT, Compare, LHS, RHS, Flags);
   }
 
   // Propagate any NaN of both operands
@@ -8461,7 +8462,7 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
     ConstantFP *FPNaN = ConstantFP::get(
         *DAG.getContext(), APFloat::getNaN(DAG.EVTToAPFloatSemantics(VT)));
     MinMax = DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, LHS, RHS, ISD::SETUO),
-                           DAG.getConstantFP(*FPNaN, DL, VT), MinMax);
+                           DAG.getConstantFP(*FPNaN, DL, VT), MinMax, Flags);
   }
 
   // fminimum/fmaximum requires -0.0 less than +0.0
@@ -8473,11 +8474,11 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
         DAG.getTargetConstant(IsMax ? fcPosZero : fcNegZero, DL, MVT::i32);
     SDValue LCmp = DAG.getSelect(
         DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, LHS, TestZero), LHS,
-        MinMax);
+        MinMax, Flags);
     SDValue RCmp = DAG.getSelect(
         DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, RHS, TestZero), RHS,
-        LCmp);
-    MinMax = DAG.getSelect(DL, VT, IsZero, RCmp, MinMax);
+        LCmp, Flags);
+    MinMax = DAG.getSelect(DL, VT, IsZero, RCmp, MinMax, Flags);
   }
 
   return MinMax;

From 9e8ecce88ef65a2953db8071746720dd78bd1632 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Wed, 29 May 2024 18:26:54 +0800
Subject: [PATCH 101/230] [DAGCombine] Transform `shl X, cttz(Y)` to `mul (Y &
 -Y), X` if cttz is unsupported (#85066)

This patch folds `shl X, cttz(Y)` to `mul (Y & -Y), X` if cttz is
unsupported by the target.
Alive2: https://alive2.llvm.org/ce/z/AtLN5Y
Fixes https://github.com/llvm/llvm-project/issues/84763.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  12 +
 llvm/test/CodeGen/RISCV/shl-cttz.ll           | 807 ++++++++++++++++++
 2 files changed, 819 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/shl-cttz.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2f4fdf5208d076..42e861e61201c2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10107,6 +10107,18 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
     if (SDValue NewSHL = visitShiftByConstant(N))
       return NewSHL;
 
+  // fold (shl X, cttz(Y)) -> (mul (Y & -Y), X) if cttz is unsupported on the
+  // target.
+  if ((N1.getOpcode() == ISD::CTTZ || N1.getOpcode() == ISD::CTTZ_ZERO_UNDEF) &&
+      N1.hasOneUse() && !TLI.isOperationLegalOrCustom(ISD::CTTZ, VT) &&
+      TLI.isOperationLegalOrCustom(ISD::MUL, VT)) {
+    SDValue Y = N1.getOperand(0);
+    SDLoc DL(N);
+    SDValue NegY = DAG.getNegative(Y, DL, VT);
+    SDValue And = DAG.getNode(ISD::AND, DL, VT, Y, NegY);
+    return DAG.getNode(ISD::MUL, DL, VT, And, N0);
+  }
+
   if (SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
 
diff --git a/llvm/test/CodeGen/RISCV/shl-cttz.ll b/llvm/test/CodeGen/RISCV/shl-cttz.ll
new file mode 100644
index 00000000000000..0eeb8b04c7e5d5
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/shl-cttz.ll
@@ -0,0 +1,807 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=riscv32 -mattr=+m < %s \
+; RUN:   | FileCheck %s -check-prefix=RV32I
+; RUN: llc -mtriple=riscv32 -mattr=+m,+zbb < %s \
+; RUN:   | FileCheck %s -check-prefix=RV32ZBB
+; RUN: llc -mtriple=riscv64 -mattr=+m < %s \
+; RUN:   | FileCheck %s -check-prefixes=RV64I,RV64IILLEGALI32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+zbb < %s \
+; RUN:   | FileCheck %s -check-prefixes=RV64ZBB,RV64ZBBILLEGALI32
+; RUN: llc -mtriple=riscv64 -mattr=+m -riscv-experimental-rv64-legal-i32 < %s \
+; RUN:   | FileCheck %s -check-prefixes=RV64I,RV64ILEGALI32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+zbb -riscv-experimental-rv64-legal-i32 < %s \
+; RUN:   | FileCheck %s -check-prefixes=RV64ZBB,RV64ZBBLEGALI32
+
+define i8 @shl_cttz_i8(i8 %x, i8 %y) {
+; RV32I-LABEL: shl_cttz_i8:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi a2, a1, -1
+; RV32I-NEXT:    not a1, a1
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    srli a2, a1, 1
+; RV32I-NEXT:    andi a2, a2, 85
+; RV32I-NEXT:    sub a1, a1, a2
+; RV32I-NEXT:    andi a2, a1, 51
+; RV32I-NEXT:    srli a1, a1, 2
+; RV32I-NEXT:    andi a1, a1, 51
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    srli a2, a1, 4
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    andi a1, a1, 15
+; RV32I-NEXT:    sll a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: shl_cttz_i8:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    ctz a1, a1
+; RV32ZBB-NEXT:    sll a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64IILLEGALI32-LABEL: shl_cttz_i8:
+; RV64IILLEGALI32:       # %bb.0: # %entry
+; RV64IILLEGALI32-NEXT:    addi a2, a1, -1
+; RV64IILLEGALI32-NEXT:    not a1, a1
+; RV64IILLEGALI32-NEXT:    and a1, a1, a2
+; RV64IILLEGALI32-NEXT:    srli a2, a1, 1
+; RV64IILLEGALI32-NEXT:    andi a2, a2, 85
+; RV64IILLEGALI32-NEXT:    subw a1, a1, a2
+; RV64IILLEGALI32-NEXT:    andi a2, a1, 51
+; RV64IILLEGALI32-NEXT:    srli a1, a1, 2
+; RV64IILLEGALI32-NEXT:    andi a1, a1, 51
+; RV64IILLEGALI32-NEXT:    add a1, a2, a1
+; RV64IILLEGALI32-NEXT:    srli a2, a1, 4
+; RV64IILLEGALI32-NEXT:    add a1, a1, a2
+; RV64IILLEGALI32-NEXT:    andi a1, a1, 15
+; RV64IILLEGALI32-NEXT:    sll a0, a0, a1
+; RV64IILLEGALI32-NEXT:    ret
+;
+; RV64ZBBILLEGALI32-LABEL: shl_cttz_i8:
+; RV64ZBBILLEGALI32:       # %bb.0: # %entry
+; RV64ZBBILLEGALI32-NEXT:    ctz a1, a1
+; RV64ZBBILLEGALI32-NEXT:    sll a0, a0, a1
+; RV64ZBBILLEGALI32-NEXT:    ret
+;
+; RV64ILEGALI32-LABEL: shl_cttz_i8:
+; RV64ILEGALI32:       # %bb.0: # %entry
+; RV64ILEGALI32-NEXT:    addi a2, a1, -1
+; RV64ILEGALI32-NEXT:    not a1, a1
+; RV64ILEGALI32-NEXT:    and a1, a1, a2
+; RV64ILEGALI32-NEXT:    srliw a2, a1, 1
+; RV64ILEGALI32-NEXT:    andi a2, a2, 85
+; RV64ILEGALI32-NEXT:    subw a1, a1, a2
+; RV64ILEGALI32-NEXT:    andi a2, a1, 51
+; RV64ILEGALI32-NEXT:    srliw a1, a1, 2
+; RV64ILEGALI32-NEXT:    andi a1, a1, 51
+; RV64ILEGALI32-NEXT:    add a1, a2, a1
+; RV64ILEGALI32-NEXT:    srliw a2, a1, 4
+; RV64ILEGALI32-NEXT:    add a1, a1, a2
+; RV64ILEGALI32-NEXT:    andi a1, a1, 15
+; RV64ILEGALI32-NEXT:    sllw a0, a0, a1
+; RV64ILEGALI32-NEXT:    ret
+;
+; RV64ZBBLEGALI32-LABEL: shl_cttz_i8:
+; RV64ZBBLEGALI32:       # %bb.0: # %entry
+; RV64ZBBLEGALI32-NEXT:    ctzw a1, a1
+; RV64ZBBLEGALI32-NEXT:    sllw a0, a0, a1
+; RV64ZBBLEGALI32-NEXT:    ret
+entry:
+  %cttz = call i8 @llvm.cttz.i8(i8 %y, i1 true)
+  %res = shl i8 %x, %cttz
+  ret i8 %res
+}
+
+define i8 @shl_cttz_constant_i8(i8 %y) {
+; RV32I-LABEL: shl_cttz_constant_i8:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi a1, a0, -1
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    andi a1, a1, 85
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 51
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    andi a0, a0, 51
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a0, a0, 15
+; RV32I-NEXT:    li a1, 4
+; RV32I-NEXT:    sll a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: shl_cttz_constant_i8:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    li a1, 4
+; RV32ZBB-NEXT:    sll a0, a1, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV64IILLEGALI32-LABEL: shl_cttz_constant_i8:
+; RV64IILLEGALI32:       # %bb.0: # %entry
+; RV64IILLEGALI32-NEXT:    addi a1, a0, -1
+; RV64IILLEGALI32-NEXT:    not a0, a0
+; RV64IILLEGALI32-NEXT:    and a0, a0, a1
+; RV64IILLEGALI32-NEXT:    srli a1, a0, 1
+; RV64IILLEGALI32-NEXT:    andi a1, a1, 85
+; RV64IILLEGALI32-NEXT:    subw a0, a0, a1
+; RV64IILLEGALI32-NEXT:    andi a1, a0, 51
+; RV64IILLEGALI32-NEXT:    srli a0, a0, 2
+; RV64IILLEGALI32-NEXT:    andi a0, a0, 51
+; RV64IILLEGALI32-NEXT:    add a0, a1, a0
+; RV64IILLEGALI32-NEXT:    srli a1, a0, 4
+; RV64IILLEGALI32-NEXT:    add a0, a0, a1
+; RV64IILLEGALI32-NEXT:    andi a0, a0, 15
+; RV64IILLEGALI32-NEXT:    li a1, 4
+; RV64IILLEGALI32-NEXT:    sll a0, a1, a0
+; RV64IILLEGALI32-NEXT:    ret
+;
+; RV64ZBBILLEGALI32-LABEL: shl_cttz_constant_i8:
+; RV64ZBBILLEGALI32:       # %bb.0: # %entry
+; RV64ZBBILLEGALI32-NEXT:    ctz a0, a0
+; RV64ZBBILLEGALI32-NEXT:    li a1, 4
+; RV64ZBBILLEGALI32-NEXT:    sll a0, a1, a0
+; RV64ZBBILLEGALI32-NEXT:    ret
+;
+; RV64ILEGALI32-LABEL: shl_cttz_constant_i8:
+; RV64ILEGALI32:       # %bb.0: # %entry
+; RV64ILEGALI32-NEXT:    addi a1, a0, -1
+; RV64ILEGALI32-NEXT:    not a0, a0
+; RV64ILEGALI32-NEXT:    and a0, a0, a1
+; RV64ILEGALI32-NEXT:    srliw a1, a0, 1
+; RV64ILEGALI32-NEXT:    andi a1, a1, 85
+; RV64ILEGALI32-NEXT:    subw a0, a0, a1
+; RV64ILEGALI32-NEXT:    andi a1, a0, 51
+; RV64ILEGALI32-NEXT:    srliw a0, a0, 2
+; RV64ILEGALI32-NEXT:    andi a0, a0, 51
+; RV64ILEGALI32-NEXT:    add a0, a1, a0
+; RV64ILEGALI32-NEXT:    srliw a1, a0, 4
+; RV64ILEGALI32-NEXT:    add a0, a0, a1
+; RV64ILEGALI32-NEXT:    andi a0, a0, 15
+; RV64ILEGALI32-NEXT:    li a1, 4
+; RV64ILEGALI32-NEXT:    sllw a0, a1, a0
+; RV64ILEGALI32-NEXT:    ret
+;
+; RV64ZBBLEGALI32-LABEL: shl_cttz_constant_i8:
+; RV64ZBBLEGALI32:       # %bb.0: # %entry
+; RV64ZBBLEGALI32-NEXT:    ctzw a0, a0
+; RV64ZBBLEGALI32-NEXT:    li a1, 4
+; RV64ZBBLEGALI32-NEXT:    sllw a0, a1, a0
+; RV64ZBBLEGALI32-NEXT:    ret
+entry:
+  %cttz = call i8 @llvm.cttz.i8(i8 %y, i1 true)
+  %res = shl i8 4, %cttz
+  ret i8 %res
+}
+
+define i16 @shl_cttz_i16(i16 %x, i16 %y) {
+; RV32I-LABEL: shl_cttz_i16:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi a2, a1, -1
+; RV32I-NEXT:    not a1, a1
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    srli a2, a1, 1
+; RV32I-NEXT:    lui a3, 5
+; RV32I-NEXT:    addi a3, a3, 1365
+; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    sub a1, a1, a2
+; RV32I-NEXT:    lui a2, 3
+; RV32I-NEXT:    addi a2, a2, 819
+; RV32I-NEXT:    and a3, a1, a2
+; RV32I-NEXT:    srli a1, a1, 2
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    srli a2, a1, 4
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    andi a2, a1, 15
+; RV32I-NEXT:    slli a1, a1, 20
+; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    sll a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: shl_cttz_i16:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    ctz a1, a1
+; RV32ZBB-NEXT:    sll a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64IILLEGALI32-LABEL: shl_cttz_i16:
+; RV64IILLEGALI32:       # %bb.0: # %entry
+; RV64IILLEGALI32-NEXT:    addi a2, a1, -1
+; RV64IILLEGALI32-NEXT:    not a1, a1
+; RV64IILLEGALI32-NEXT:    and a1, a1, a2
+; RV64IILLEGALI32-NEXT:    srli a2, a1, 1
+; RV64IILLEGALI32-NEXT:    lui a3, 5
+; RV64IILLEGALI32-NEXT:    addiw a3, a3, 1365
+; RV64IILLEGALI32-NEXT:    and a2, a2, a3
+; RV64IILLEGALI32-NEXT:    sub a1, a1, a2
+; RV64IILLEGALI32-NEXT:    lui a2, 3
+; RV64IILLEGALI32-NEXT:    addiw a2, a2, 819
+; RV64IILLEGALI32-NEXT:    and a3, a1, a2
+; RV64IILLEGALI32-NEXT:    srli a1, a1, 2
+; RV64IILLEGALI32-NEXT:    and a1, a1, a2
+; RV64IILLEGALI32-NEXT:    add a1, a3, a1
+; RV64IILLEGALI32-NEXT:    srli a2, a1, 4
+; RV64IILLEGALI32-NEXT:    add a1, a1, a2
+; RV64IILLEGALI32-NEXT:    andi a2, a1, 15
+; RV64IILLEGALI32-NEXT:    slli a1, a1, 52
+; RV64IILLEGALI32-NEXT:    srli a1, a1, 60
+; RV64IILLEGALI32-NEXT:    add a1, a2, a1
+; RV64IILLEGALI32-NEXT:    sll a0, a0, a1
+; RV64IILLEGALI32-NEXT:    ret
+;
+; RV64ZBBILLEGALI32-LABEL: shl_cttz_i16:
+; RV64ZBBILLEGALI32:       # %bb.0: # %entry
+; RV64ZBBILLEGALI32-NEXT:    ctz a1, a1
+; RV64ZBBILLEGALI32-NEXT:    sll a0, a0, a1
+; RV64ZBBILLEGALI32-NEXT:    ret
+;
+; RV64ILEGALI32-LABEL: shl_cttz_i16:
+; RV64ILEGALI32:       # %bb.0: # %entry
+; RV64ILEGALI32-NEXT:    addi a2, a1, -1
+; RV64ILEGALI32-NEXT:    not a1, a1
+; RV64ILEGALI32-NEXT:    and a1, a1, a2
+; RV64ILEGALI32-NEXT:    srliw a2, a1, 1
+; RV64ILEGALI32-NEXT:    lui a3, 5
+; RV64ILEGALI32-NEXT:    addi a3, a3, 1365
+; RV64ILEGALI32-NEXT:    and a2, a2, a3
+; RV64ILEGALI32-NEXT:    subw a1, a1, a2
+; RV64ILEGALI32-NEXT:    lui a2, 3
+; RV64ILEGALI32-NEXT:    addi a2, a2, 819
+; RV64ILEGALI32-NEXT:    and a3, a1, a2
+; RV64ILEGALI32-NEXT:    srliw a1, a1, 2
+; RV64ILEGALI32-NEXT:    and a1, a1, a2
+; RV64ILEGALI32-NEXT:    add a1, a3, a1
+; RV64ILEGALI32-NEXT:    srliw a2, a1, 4
+; RV64ILEGALI32-NEXT:    add a1, a1, a2
+; RV64ILEGALI32-NEXT:    andi a2, a1, 15
+; RV64ILEGALI32-NEXT:    slli a1, a1, 52
+; RV64ILEGALI32-NEXT:    srli a1, a1, 60
+; RV64ILEGALI32-NEXT:    add a1, a2, a1
+; RV64ILEGALI32-NEXT:    sllw a0, a0, a1
+; RV64ILEGALI32-NEXT:    ret
+;
+; RV64ZBBLEGALI32-LABEL: shl_cttz_i16:
+; RV64ZBBLEGALI32:       # %bb.0: # %entry
+; RV64ZBBLEGALI32-NEXT:    ctzw a1, a1
+; RV64ZBBLEGALI32-NEXT:    sllw a0, a0, a1
+; RV64ZBBLEGALI32-NEXT:    ret
+entry:
+  %cttz = call i16 @llvm.cttz.i16(i16 %y, i1 true)
+  %res = shl i16 %x, %cttz
+  ret i16 %res
+}
+
+define i16 @shl_cttz_constant_i16(i16 %y) {
+; RV32I-LABEL: shl_cttz_constant_i16:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi a1, a0, -1
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    lui a2, 5
+; RV32I-NEXT:    addi a2, a2, 1365
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    lui a1, 3
+; RV32I-NEXT:    addi a1, a1, 819
+; RV32I-NEXT:    and a2, a0, a1
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 15
+; RV32I-NEXT:    slli a0, a0, 20
+; RV32I-NEXT:    srli a0, a0, 28
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    li a1, 4
+; RV32I-NEXT:    sll a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: shl_cttz_constant_i16:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    li a1, 4
+; RV32ZBB-NEXT:    sll a0, a1, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV64IILLEGALI32-LABEL: shl_cttz_constant_i16:
+; RV64IILLEGALI32:       # %bb.0: # %entry
+; RV64IILLEGALI32-NEXT:    addi a1, a0, -1
+; RV64IILLEGALI32-NEXT:    not a0, a0
+; RV64IILLEGALI32-NEXT:    and a0, a0, a1
+; RV64IILLEGALI32-NEXT:    srli a1, a0, 1
+; RV64IILLEGALI32-NEXT:    lui a2, 5
+; RV64IILLEGALI32-NEXT:    addiw a2, a2, 1365
+; RV64IILLEGALI32-NEXT:    and a1, a1, a2
+; RV64IILLEGALI32-NEXT:    sub a0, a0, a1
+; RV64IILLEGALI32-NEXT:    lui a1, 3
+; RV64IILLEGALI32-NEXT:    addiw a1, a1, 819
+; RV64IILLEGALI32-NEXT:    and a2, a0, a1
+; RV64IILLEGALI32-NEXT:    srli a0, a0, 2
+; RV64IILLEGALI32-NEXT:    and a0, a0, a1
+; RV64IILLEGALI32-NEXT:    add a0, a2, a0
+; RV64IILLEGALI32-NEXT:    srli a1, a0, 4
+; RV64IILLEGALI32-NEXT:    add a0, a0, a1
+; RV64IILLEGALI32-NEXT:    andi a1, a0, 15
+; RV64IILLEGALI32-NEXT:    slli a0, a0, 52
+; RV64IILLEGALI32-NEXT:    srli a0, a0, 60
+; RV64IILLEGALI32-NEXT:    add a0, a1, a0
+; RV64IILLEGALI32-NEXT:    li a1, 4
+; RV64IILLEGALI32-NEXT:    sll a0, a1, a0
+; RV64IILLEGALI32-NEXT:    ret
+;
+; RV64ZBBILLEGALI32-LABEL: shl_cttz_constant_i16:
+; RV64ZBBILLEGALI32:       # %bb.0: # %entry
+; RV64ZBBILLEGALI32-NEXT:    ctz a0, a0
+; RV64ZBBILLEGALI32-NEXT:    li a1, 4
+; RV64ZBBILLEGALI32-NEXT:    sll a0, a1, a0
+; RV64ZBBILLEGALI32-NEXT:    ret
+;
+; RV64ILEGALI32-LABEL: shl_cttz_constant_i16:
+; RV64ILEGALI32:       # %bb.0: # %entry
+; RV64ILEGALI32-NEXT:    addi a1, a0, -1
+; RV64ILEGALI32-NEXT:    not a0, a0
+; RV64ILEGALI32-NEXT:    and a0, a0, a1
+; RV64ILEGALI32-NEXT:    srliw a1, a0, 1
+; RV64ILEGALI32-NEXT:    lui a2, 5
+; RV64ILEGALI32-NEXT:    addi a2, a2, 1365
+; RV64ILEGALI32-NEXT:    and a1, a1, a2
+; RV64ILEGALI32-NEXT:    subw a0, a0, a1
+; RV64ILEGALI32-NEXT:    lui a1, 3
+; RV64ILEGALI32-NEXT:    addi a1, a1, 819
+; RV64ILEGALI32-NEXT:    and a2, a0, a1
+; RV64ILEGALI32-NEXT:    srliw a0, a0, 2
+; RV64ILEGALI32-NEXT:    and a0, a0, a1
+; RV64ILEGALI32-NEXT:    add a0, a2, a0
+; RV64ILEGALI32-NEXT:    srliw a1, a0, 4
+; RV64ILEGALI32-NEXT:    add a0, a0, a1
+; RV64ILEGALI32-NEXT:    andi a1, a0, 15
+; RV64ILEGALI32-NEXT:    slli a0, a0, 52
+; RV64ILEGALI32-NEXT:    srli a0, a0, 60
+; RV64ILEGALI32-NEXT:    add a0, a1, a0
+; RV64ILEGALI32-NEXT:    li a1, 4
+; RV64ILEGALI32-NEXT:    sllw a0, a1, a0
+; RV64ILEGALI32-NEXT:    ret
+;
+; RV64ZBBLEGALI32-LABEL: shl_cttz_constant_i16:
+; RV64ZBBLEGALI32:       # %bb.0: # %entry
+; RV64ZBBLEGALI32-NEXT:    ctzw a0, a0
+; RV64ZBBLEGALI32-NEXT:    li a1, 4
+; RV64ZBBLEGALI32-NEXT:    sllw a0, a1, a0
+; RV64ZBBLEGALI32-NEXT:    ret
+entry:
+  %cttz = call i16 @llvm.cttz.i16(i16 %y, i1 true)
+  %res = shl i16 4, %cttz
+  ret i16 %res
+}
+
+define i32 @shl_cttz_i32(i32 %x, i32 %y) {
+; RV32I-LABEL: shl_cttz_i32:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    neg a2, a1
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    mul a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: shl_cttz_i32:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    ctz a1, a1
+; RV32ZBB-NEXT:    sll a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: shl_cttz_i32:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    negw a2, a1
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 30667
+; RV64I-NEXT:    addi a2, a2, 1329
+; RV64I-NEXT:    mul a1, a1, a2
+; RV64I-NEXT:    srliw a1, a1, 27
+; RV64I-NEXT:    lui a2, %hi(.LCPI4_0)
+; RV64I-NEXT:    addi a2, a2, %lo(.LCPI4_0)
+; RV64I-NEXT:    add a1, a2, a1
+; RV64I-NEXT:    lbu a1, 0(a1)
+; RV64I-NEXT:    sllw a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: shl_cttz_i32:
+; RV64ZBB:       # %bb.0: # %entry
+; RV64ZBB-NEXT:    ctzw a1, a1
+; RV64ZBB-NEXT:    sllw a0, a0, a1
+; RV64ZBB-NEXT:    ret
+entry:
+  %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 true)
+  %res = shl i32 %x, %cttz
+  ret i32 %res
+}
+
+define i32 @shl_cttz_i32_zero_is_defined(i32 %x, i32 %y) {
+; RV32I-LABEL: shl_cttz_i32_zero_is_defined:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    beqz a1, .LBB5_2
+; RV32I-NEXT:  # %bb.1: # %cond.false
+; RV32I-NEXT:    neg a2, a1
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 30667
+; RV32I-NEXT:    addi a2, a2, 1329
+; RV32I-NEXT:    mul a1, a1, a2
+; RV32I-NEXT:    srli a1, a1, 27
+; RV32I-NEXT:    lui a2, %hi(.LCPI5_0)
+; RV32I-NEXT:    addi a2, a2, %lo(.LCPI5_0)
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    sll a0, a0, a1
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB5_2:
+; RV32I-NEXT:    li a1, 32
+; RV32I-NEXT:    sll a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: shl_cttz_i32_zero_is_defined:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    ctz a1, a1
+; RV32ZBB-NEXT:    sll a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: shl_cttz_i32_zero_is_defined:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    sext.w a2, a1
+; RV64I-NEXT:    beqz a2, .LBB5_2
+; RV64I-NEXT:  # %bb.1: # %cond.false
+; RV64I-NEXT:    negw a2, a1
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 30667
+; RV64I-NEXT:    addi a2, a2, 1329
+; RV64I-NEXT:    mul a1, a1, a2
+; RV64I-NEXT:    srliw a1, a1, 27
+; RV64I-NEXT:    lui a2, %hi(.LCPI5_0)
+; RV64I-NEXT:    addi a2, a2, %lo(.LCPI5_0)
+; RV64I-NEXT:    add a1, a2, a1
+; RV64I-NEXT:    lbu a1, 0(a1)
+; RV64I-NEXT:    sllw a0, a0, a1
+; RV64I-NEXT:    ret
+; RV64I-NEXT:  .LBB5_2:
+; RV64I-NEXT:    li a1, 32
+; RV64I-NEXT:    sllw a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: shl_cttz_i32_zero_is_defined:
+; RV64ZBB:       # %bb.0: # %entry
+; RV64ZBB-NEXT:    ctzw a1, a1
+; RV64ZBB-NEXT:    sllw a0, a0, a1
+; RV64ZBB-NEXT:    ret
+entry:
+  %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 false)
+  %res = shl i32 %x, %cttz
+  ret i32 %res
+}
+
+define i32 @shl_cttz_constant_i32(i32 %y) {
+; RV32I-LABEL: shl_cttz_constant_i32:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    slli a0, a0, 2
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: shl_cttz_constant_i32:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    li a1, 4
+; RV32ZBB-NEXT:    sll a0, a1, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: shl_cttz_constant_i32:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    negw a1, a0
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    lui a1, 30667
+; RV64I-NEXT:    addi a1, a1, 1329
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    srliw a0, a0, 27
+; RV64I-NEXT:    lui a1, %hi(.LCPI6_0)
+; RV64I-NEXT:    addi a1, a1, %lo(.LCPI6_0)
+; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    li a1, 4
+; RV64I-NEXT:    sllw a0, a1, a0
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: shl_cttz_constant_i32:
+; RV64ZBB:       # %bb.0: # %entry
+; RV64ZBB-NEXT:    ctzw a0, a0
+; RV64ZBB-NEXT:    li a1, 4
+; RV64ZBB-NEXT:    sllw a0, a1, a0
+; RV64ZBB-NEXT:    ret
+entry:
+  %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 true)
+  %res = shl i32 4, %cttz
+  ret i32 %res
+}
+
+define i32 @shl_cttz_multiuse_i32(i32 %x, i32 %y) {
+; RV32I-LABEL: shl_cttz_multiuse_i32:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    neg a2, a1
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 30667
+; RV32I-NEXT:    addi a2, a2, 1329
+; RV32I-NEXT:    mul a1, a1, a2
+; RV32I-NEXT:    srli a1, a1, 27
+; RV32I-NEXT:    lui a2, %hi(.LCPI7_0)
+; RV32I-NEXT:    addi a2, a2, %lo(.LCPI7_0)
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    lbu s0, 0(a1)
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    call use32
+; RV32I-NEXT:    sll a0, s1, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: shl_cttz_multiuse_i32:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    addi sp, sp, -16
+; RV32ZBB-NEXT:    .cfi_def_cfa_offset 16
+; RV32ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32ZBB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32ZBB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32ZBB-NEXT:    .cfi_offset ra, -4
+; RV32ZBB-NEXT:    .cfi_offset s0, -8
+; RV32ZBB-NEXT:    .cfi_offset s1, -12
+; RV32ZBB-NEXT:    mv s0, a0
+; RV32ZBB-NEXT:    ctz s1, a1
+; RV32ZBB-NEXT:    mv a0, s1
+; RV32ZBB-NEXT:    call use32
+; RV32ZBB-NEXT:    sll a0, s0, s1
+; RV32ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32ZBB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32ZBB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32ZBB-NEXT:    addi sp, sp, 16
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: shl_cttz_multiuse_i32:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    negw a2, a1
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 30667
+; RV64I-NEXT:    addi a2, a2, 1329
+; RV64I-NEXT:    mul a1, a1, a2
+; RV64I-NEXT:    srliw a1, a1, 27
+; RV64I-NEXT:    lui a2, %hi(.LCPI7_0)
+; RV64I-NEXT:    addi a2, a2, %lo(.LCPI7_0)
+; RV64I-NEXT:    add a1, a2, a1
+; RV64I-NEXT:    lbu s0, 0(a1)
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    call use32
+; RV64I-NEXT:    sllw a0, s1, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: shl_cttz_multiuse_i32:
+; RV64ZBB:       # %bb.0: # %entry
+; RV64ZBB-NEXT:    addi sp, sp, -32
+; RV64ZBB-NEXT:    .cfi_def_cfa_offset 32
+; RV64ZBB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64ZBB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64ZBB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64ZBB-NEXT:    .cfi_offset ra, -8
+; RV64ZBB-NEXT:    .cfi_offset s0, -16
+; RV64ZBB-NEXT:    .cfi_offset s1, -24
+; RV64ZBB-NEXT:    mv s0, a0
+; RV64ZBB-NEXT:    ctzw s1, a1
+; RV64ZBB-NEXT:    mv a0, s1
+; RV64ZBB-NEXT:    call use32
+; RV64ZBB-NEXT:    sllw a0, s0, s1
+; RV64ZBB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64ZBB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64ZBB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64ZBB-NEXT:    addi sp, sp, 32
+; RV64ZBB-NEXT:    ret
+entry:
+  %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 true)
+  call void @use32(i32 %cttz)
+  %res = shl i32 %x, %cttz
+  ret i32 %res
+}
+
+define i64 @shl_cttz_i64(i64 %x, i64 %y) {
+; RV32I-LABEL: shl_cttz_i64:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a4, 30667
+; RV32I-NEXT:    addi a5, a4, 1329
+; RV32I-NEXT:    lui a4, %hi(.LCPI8_0)
+; RV32I-NEXT:    addi a4, a4, %lo(.LCPI8_0)
+; RV32I-NEXT:    bnez a2, .LBB8_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    neg a2, a3
+; RV32I-NEXT:    and a2, a3, a2
+; RV32I-NEXT:    mul a2, a2, a5
+; RV32I-NEXT:    srli a2, a2, 27
+; RV32I-NEXT:    add a2, a4, a2
+; RV32I-NEXT:    lbu a2, 0(a2)
+; RV32I-NEXT:    addi a4, a2, 32
+; RV32I-NEXT:    j .LBB8_3
+; RV32I-NEXT:  .LBB8_2:
+; RV32I-NEXT:    neg a3, a2
+; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    mul a2, a2, a5
+; RV32I-NEXT:    srli a2, a2, 27
+; RV32I-NEXT:    add a2, a4, a2
+; RV32I-NEXT:    lbu a4, 0(a2)
+; RV32I-NEXT:  .LBB8_3: # %entry
+; RV32I-NEXT:    addi a3, a4, -32
+; RV32I-NEXT:    sll a2, a0, a4
+; RV32I-NEXT:    bltz a3, .LBB8_5
+; RV32I-NEXT:  # %bb.4: # %entry
+; RV32I-NEXT:    mv a1, a2
+; RV32I-NEXT:    j .LBB8_6
+; RV32I-NEXT:  .LBB8_5:
+; RV32I-NEXT:    sll a1, a1, a4
+; RV32I-NEXT:    not a4, a4
+; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    srl a0, a0, a4
+; RV32I-NEXT:    or a1, a1, a0
+; RV32I-NEXT:  .LBB8_6: # %entry
+; RV32I-NEXT:    srai a0, a3, 31
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: shl_cttz_i64:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    bnez a2, .LBB8_2
+; RV32ZBB-NEXT:  # %bb.1: # %entry
+; RV32ZBB-NEXT:    ctz a2, a3
+; RV32ZBB-NEXT:    addi a4, a2, 32
+; RV32ZBB-NEXT:    j .LBB8_3
+; RV32ZBB-NEXT:  .LBB8_2:
+; RV32ZBB-NEXT:    ctz a4, a2
+; RV32ZBB-NEXT:  .LBB8_3: # %entry
+; RV32ZBB-NEXT:    addi a3, a4, -32
+; RV32ZBB-NEXT:    sll a2, a0, a4
+; RV32ZBB-NEXT:    bltz a3, .LBB8_5
+; RV32ZBB-NEXT:  # %bb.4: # %entry
+; RV32ZBB-NEXT:    mv a1, a2
+; RV32ZBB-NEXT:    j .LBB8_6
+; RV32ZBB-NEXT:  .LBB8_5:
+; RV32ZBB-NEXT:    sll a1, a1, a4
+; RV32ZBB-NEXT:    not a4, a4
+; RV32ZBB-NEXT:    srli a0, a0, 1
+; RV32ZBB-NEXT:    srl a0, a0, a4
+; RV32ZBB-NEXT:    or a1, a1, a0
+; RV32ZBB-NEXT:  .LBB8_6: # %entry
+; RV32ZBB-NEXT:    srai a0, a3, 31
+; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: shl_cttz_i64:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    neg a2, a1
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    mul a0, a1, a0
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: shl_cttz_i64:
+; RV64ZBB:       # %bb.0: # %entry
+; RV64ZBB-NEXT:    ctz a1, a1
+; RV64ZBB-NEXT:    sll a0, a0, a1
+; RV64ZBB-NEXT:    ret
+entry:
+  %cttz = call i64 @llvm.cttz.i64(i64 %y, i1 true)
+  %res = shl i64 %x, %cttz
+  ret i64 %res
+}
+
+define i64 @shl_cttz_constant_i64(i64 %y) {
+; RV32I-LABEL: shl_cttz_constant_i64:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a2, 30667
+; RV32I-NEXT:    addi a3, a2, 1329
+; RV32I-NEXT:    lui a2, %hi(.LCPI9_0)
+; RV32I-NEXT:    addi a2, a2, %lo(.LCPI9_0)
+; RV32I-NEXT:    bnez a0, .LBB9_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    and a0, a1, a0
+; RV32I-NEXT:    mul a0, a0, a3
+; RV32I-NEXT:    srli a0, a0, 27
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    addi a1, a0, 32
+; RV32I-NEXT:    j .LBB9_3
+; RV32I-NEXT:  .LBB9_2:
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    mul a0, a0, a3
+; RV32I-NEXT:    srli a0, a0, 27
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:  .LBB9_3: # %entry
+; RV32I-NEXT:    li a0, 4
+; RV32I-NEXT:    addi a2, a1, -32
+; RV32I-NEXT:    sll a0, a0, a1
+; RV32I-NEXT:    bltz a2, .LBB9_5
+; RV32I-NEXT:  # %bb.4: # %entry
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    j .LBB9_6
+; RV32I-NEXT:  .LBB9_5:
+; RV32I-NEXT:    not a1, a1
+; RV32I-NEXT:    li a3, 2
+; RV32I-NEXT:    srl a1, a3, a1
+; RV32I-NEXT:  .LBB9_6: # %entry
+; RV32I-NEXT:    srai a2, a2, 31
+; RV32I-NEXT:    and a0, a2, a0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: shl_cttz_constant_i64:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    bnez a0, .LBB9_2
+; RV32ZBB-NEXT:  # %bb.1: # %entry
+; RV32ZBB-NEXT:    ctz a0, a1
+; RV32ZBB-NEXT:    addi a1, a0, 32
+; RV32ZBB-NEXT:    j .LBB9_3
+; RV32ZBB-NEXT:  .LBB9_2:
+; RV32ZBB-NEXT:    ctz a1, a0
+; RV32ZBB-NEXT:  .LBB9_3: # %entry
+; RV32ZBB-NEXT:    li a0, 4
+; RV32ZBB-NEXT:    addi a2, a1, -32
+; RV32ZBB-NEXT:    sll a0, a0, a1
+; RV32ZBB-NEXT:    bltz a2, .LBB9_5
+; RV32ZBB-NEXT:  # %bb.4: # %entry
+; RV32ZBB-NEXT:    mv a1, a0
+; RV32ZBB-NEXT:    j .LBB9_6
+; RV32ZBB-NEXT:  .LBB9_5:
+; RV32ZBB-NEXT:    not a1, a1
+; RV32ZBB-NEXT:    li a3, 2
+; RV32ZBB-NEXT:    srl a1, a3, a1
+; RV32ZBB-NEXT:  .LBB9_6: # %entry
+; RV32ZBB-NEXT:    srai a2, a2, 31
+; RV32ZBB-NEXT:    and a0, a2, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: shl_cttz_constant_i64:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 2
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: shl_cttz_constant_i64:
+; RV64ZBB:       # %bb.0: # %entry
+; RV64ZBB-NEXT:    ctz a0, a0
+; RV64ZBB-NEXT:    li a1, 4
+; RV64ZBB-NEXT:    sll a0, a1, a0
+; RV64ZBB-NEXT:    ret
+entry:
+  %cttz = call i64 @llvm.cttz.i64(i64 %y, i1 true)
+  %res = shl i64 4, %cttz
+  ret i64 %res
+}
+
+declare void @use32(i32 signext)

From 23a09b99313edb67d267a974be6cebfdfd97c7c8 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Wed, 29 May 2024 10:25:19 +0000
Subject: [PATCH 102/230] [lldb][Test] Remove some xfails for AArch64 Linux

PR #92245 fixed these tests on Linux. They likely work on FreeBSD too,
but the xfail is left in place there so it can be confirmed later.

Also updated a Bugzilla link to one that redirects to GitHub issues.

Relates to issues #43398 and #48751.
---
 lldb/test/API/commands/expression/fixits/TestFixIts.py        | 3 +--
 .../expression/static-initializers/TestStaticInitializers.py  | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/lldb/test/API/commands/expression/fixits/TestFixIts.py b/lldb/test/API/commands/expression/fixits/TestFixIts.py
index bc53b72fe611b9..1b22ed1c0077c4 100644
--- a/lldb/test/API/commands/expression/fixits/TestFixIts.py
+++ b/lldb/test/API/commands/expression/fixits/TestFixIts.py
@@ -106,9 +106,8 @@ def test_with_target_error_applies_fixit(self):
         )
         self.assertIn("null_pointer->first", ret_val.GetError())
 
-    # The final function call runs into SIGILL on aarch64-linux.
     @expectedFailureAll(
-        archs=["aarch64"], oslist=["freebsd", "linux"], bugnumber="llvm.org/pr49407"
+        archs=["aarch64"], oslist=["freebsd"], bugnumber="llvm.org/pr49407"
     )
     def test_with_multiple_retries(self):
         """Test calling expressions with errors that can be fixed by the FixIts."""
diff --git a/lldb/test/API/commands/expression/static-initializers/TestStaticInitializers.py b/lldb/test/API/commands/expression/static-initializers/TestStaticInitializers.py
index 5fc37ac6a5818a..ea3aa6a4608c41 100644
--- a/lldb/test/API/commands/expression/static-initializers/TestStaticInitializers.py
+++ b/lldb/test/API/commands/expression/static-initializers/TestStaticInitializers.py
@@ -7,8 +7,8 @@
 class StaticInitializers(TestBase):
     @expectedFailureAll(
         archs="aarch64",
-        oslist=["freebsd", "linux"],
-        bugnumber="https://bugs.llvm.org/show_bug.cgi?id=44053",
+        oslist=["freebsd"],
+        bugnumber="llvm.org/pr44053",
     )
     def test(self):
         """Test a static initializer."""

From e93799f260e881ff2f7c0fd7afc78374d615d70e Mon Sep 17 00:00:00 2001
From: Lukacma <Marian.Lukac@arm.com>
Date: Wed, 29 May 2024 11:34:24 +0100
Subject: [PATCH 103/230] [SME] Add intrinsics for FCVT(wid.) and FCVTL
 (#93202)

According to the specification in
https://github.com/ARM-software/acle/pull/309, this adds the intrinsics:
```
svfloat32x2_t svcvt_f32[_f16_x2](svfloat16_t zn) __arm_streaming;
svfloat32x2_t svcvtl_f32[_f16_x2](svfloat16_t zn) __arm_streaming;

```
These are available only if __ARM_FEATURE_SME_F16F16 is enabled.

---------

Co-authored-by: Caroline Concatto <caroline.concatto@arm.com>
---
 clang/include/clang/Basic/arm_sve.td          | 11 +++++
 .../aarch64-sme2-intrinsics/acle_sme2_cvt.c   | 22 ++++++++++
 .../aarch64-sme2-intrinsics/acle_sme2_cvtl.c  | 40 +++++++++++++++++++
 llvm/include/llvm/IR/IntrinsicsAArch64.td     | 14 ++++++-
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    |  6 +++
 .../CodeGen/AArch64/sme2-intrinsics-cvt.ll    | 11 ++++-
 .../CodeGen/AArch64/sme2-intrinsics-cvtl.ll   | 11 +++++
 7 files changed, 113 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c
 create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-cvtl.ll

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 078ef576342a7c..88938a981fd8ae 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2270,6 +2270,10 @@ let TargetGuard = "sme2" in {
   def SVCVT_S32_F32_X4 : SInst<"svcvt_{d}[_f32_x4]", "4.d4.M", "i",  MergeNone, "aarch64_sve_fcvtzs_x4", [IsStreaming, IsOverloadWhileOrMultiVecCvt], []>;
 }
 
+let TargetGuard = "sme-f16f16" in {
+  def SVCVT_F32_X2 : SInst<"svcvt_{d}[_f16_x2]", "2h", "f", MergeNone, "aarch64_sve_fcvt_widen_x2", [ IsStreaming],[]>;
+}
+
 //
 // Multi-vector floating-point convert from single-precision to interleaved half-precision/BFloat16
 //
@@ -2278,6 +2282,13 @@ let TargetGuard = "sme2" in {
   def SVCVTN_BF16_X2 : SInst<"svcvtn_bf16[_f32_x2]", "$2", "f", MergeNone, "aarch64_sve_bfcvtn_x2", [IsOverloadNone, IsStreaming],[]>;
 }
 
+//
+// Multi-vector floating-point convert from half-precision to deinterleaved single-precision.
+//
+let TargetGuard = "sme-f16f16" in {
+  def SVCVTL_F32_X2 : SInst<"svcvtl_f32[_f16_x2]", "2h", "f", MergeNone, "aarch64_sve_fcvtl_widen_x2", [ IsStreaming],[]>;
+}
+
 //
 // Multi-vector saturating extract narrow
 //
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c
index 4a5ee7e021f748..e26499d3a63cc4 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c
@@ -497,3 +497,25 @@ svuint8_t test_qcvt_u8_s32_x4(svint32x4_t zn) __arm_streaming {
 svuint16_t test_qcvt_u16_s64_x4(svint64x4_t zn) __arm_streaming {
   return SVE_ACLE_FUNC(svqcvt_u16,_s64_x4,,)(zn);
 }
+
+// CHECK-LABEL: @test_cvt_f32_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvt.widen.x2.nxv4f32(<vscale x 8 x half> [[ZN:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z15test_cvt_f32_x2u13__SVFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvt.widen.x2.nxv4f32(<vscale x 8 x half> [[ZN:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
+//
+__attribute__((target("sme-f16f16"))) svfloat32x2_t test_cvt_f32_x2(svfloat16_t zn)  __arm_streaming {
+  return SVE_ACLE_FUNC(svcvt_f32,_f16_x2,,)(zn);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c
new file mode 100644
index 00000000000000..453dd3db6adf09
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c
@@ -0,0 +1,40 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: @test_cvtl_f32_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvtl.widen.x2.nxv4f32(<vscale x 8 x half> [[ZN:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z16test_cvtl_f32_x2u13__SVFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvtl.widen.x2.nxv4f32(<vscale x 8 x half> [[ZN:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
+//
+svfloat32x2_t test_cvtl_f32_x2(svfloat16_t zn)  __arm_streaming {
+  return SVE_ACLE_FUNC(svcvtl_f32,_f16_x2,,)(zn);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 57d0dfb698b383..f2028f8e8fd05a 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3121,6 +3121,11 @@ let TargetPrefix = "aarch64" in {
     : DefaultAttrsIntrinsic<[llvm_nxv8bf16_ty],
                             [llvm_nxv4f32_ty, llvm_nxv4f32_ty],
                             [IntrNoMem]>;
+  
+  class SME2_CVT_WIDENING_VG2_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+                            [LLVMSubdivide2VectorType<0>], [IntrNoMem]>;
+  
 
   class SME2_CVT_VG4_SINGLE_Intrinsic
     : DefaultAttrsIntrinsic<[LLVMSubdivide4VectorType<0>],
@@ -3412,6 +3417,13 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_suvdot_lane_za32_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
   def int_aarch64_sme_usvdot_lane_za32_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
 
+
+  //
+  // Multi-vector floating-point convert from half-precision to deinterleaved single-precision.
+  //
+  
+  def int_aarch64_sve_fcvtl_widen_x2  : SME2_CVT_WIDENING_VG2_Intrinsic;
+
   //
   // Multi-vector floating-point CVT from single-precision to interleaved half-precision/BFloat16
   //
@@ -3431,7 +3443,7 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sve_fcvtzu_x4 : SME2_CVT_X4_Intrinsic;
   def int_aarch64_sve_scvtf_x4  : SME2_CVT_X4_Intrinsic;
   def int_aarch64_sve_ucvtf_x4  : SME2_CVT_X4_Intrinsic;
-
+  def int_aarch64_sve_fcvt_widen_x2 : SME2_CVT_WIDENING_VG2_Intrinsic;
   //
   // Multi-vector saturating extract narrow
   //
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 660675cf8f3895..8fd58f4698d280 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -5717,6 +5717,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
     case Intrinsic::aarch64_sve_ucvtf_x4:
       SelectCVTIntrinsic(Node, 4, AArch64::UCVTF_4Z4Z_StoS);
       return;
+    case Intrinsic::aarch64_sve_fcvt_widen_x2:
+      SelectUnaryMultiIntrinsic(Node, 2, false, AArch64::FCVT_2ZZ_H_S);
+      return;
+    case Intrinsic::aarch64_sve_fcvtl_widen_x2:
+      SelectUnaryMultiIntrinsic(Node, 2, false, AArch64::FCVTL_2ZZ_H_S);
+      return;
     case Intrinsic::aarch64_sve_sclamp_single_x2:
       if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
               Node->getValueType(0),
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll
index bc1db878cbd313..611cdcda157e21 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme-f16f16 -verify-machineinstrs < %s | FileCheck %s
 
 ;
 ; FCVT
@@ -139,6 +139,15 @@ define {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale
   ret {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} %res
 }
 
+define {<vscale x 4 x float>, <vscale x 4 x float>}  @multi_vector_cvt_widen_x2_f16(<vscale x 8 x half> %zn0) {
+; CHECK-LABEL: multi_vector_cvt_widen_x2_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvt { z0.s, z1.s }, z0.h
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvt.widen.x2.nxv4f32(<vscale x 8 x half> %zn0)
+  ret {<vscale x 4 x float>, <vscale x 4 x float>} %res
+}
+
 declare <vscale x 8 x half> @llvm.aarch64.sve.fcvt.x2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
 declare <vscale x 8 x bfloat> @llvm.aarch64.sve.bfcvt.x2(<vscale x 4 x float>, <vscale x 4 x float>)
 declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvtzs.x2.nxv4i32.nxv4f32(<vscale x 4 x float>,<vscale x 4 x float>)
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvtl.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvtl.ll
new file mode 100644
index 00000000000000..30dc7cbfaea6c9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvtl.ll
@@ -0,0 +1,11 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme-f16f16 -verify-machineinstrs < %s | FileCheck %s
+
+define {<vscale x 4 x float>, <vscale x 4 x float>}  @multi_vector_cvtl_widen_x2_f16(<vscale x 8 x half> %zn0) {
+; CHECK-LABEL: multi_vector_cvtl_widen_x2_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl { z0.s, z1.s }, z0.h
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvtl.widen.x2.nxv4f32(<vscale x 8 x half> %zn0)
+  ret {<vscale x 4 x float>, <vscale x 4 x float>} %res
+}

From 1e44a9690915e8acf7b2a0e67b56aaf4509e9257 Mon Sep 17 00:00:00 2001
From: Lukacma <Marian.Lukac@arm.com>
Date: Wed, 29 May 2024 11:35:21 +0100
Subject: [PATCH 104/230] [AArch64][SME] Add intrinsics for vector groups ZERO
 (#93201)

As specified in https://github.com/ARM-software/acle/pull/309, this patch
adds the following intrinsics:

  void svzero_za64_vg1x2(uint32_t slice)
    __arm_streaming __arm_inout("za");

  void svzero_za64_vg1x4(uint32_t slice)
    __arm_streaming __arm_inout("za");

  void svzero_za64_vg2x1(uint32_t slice)
    __arm_streaming __arm_inout("za");

  void svzero_za64_vg2x2(uint32_t slice)
    __arm_streaming __arm_inout("za");

  void svzero_za64_vg2x4(uint32_t slice)
    __arm_streaming __arm_inout("za");

  void svzero_za64_vg4x1(uint32_t slice)
    __arm_streaming __arm_inout("za");

  void svzero_za64_vg4x2(uint32_t slice)
    __arm_streaming __arm_inout("za");

  void svzero_za64_vg4x4(uint32_t slice)
    __arm_streaming __arm_inout("za");
---
 clang/include/clang/Basic/arm_sme.td          |  19 ++
 .../acle_sme2p1_zero.c                        | 139 +++++++++++++
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |   6 +
 llvm/lib/Target/AArch64/SMEInstrFormats.td    |  46 ++++-
 .../CodeGen/AArch64/sme2p1-intrinsics-zero.ll | 190 ++++++++++++++++++
 5 files changed, 391 insertions(+), 9 deletions(-)
 create mode 100644 clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_zero.c
 create mode 100644 llvm/test/CodeGen/AArch64/sme2p1-intrinsics-zero.ll

diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 80e635e4a57eca..564a58e4eb6709 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -146,6 +146,25 @@ let TargetGuard = "sme" in {
                              [IsOverloadNone, IsStreamingCompatible, IsOutZA]>;
 }
 
+let TargetGuard = "sme2p1" in {
+  def SVZERO_ZA64_VG1x2 : SInst<"svzero_za64_vg1x2", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg1x2",
+                            [IsOverloadNone, IsStreaming, IsInOutZA]>;
+  def SVZERO_ZA64_VG1x4 : SInst<"svzero_za64_vg1x4", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg1x4",
+                            [IsOverloadNone, IsStreaming, IsInOutZA]>;
+  def SVZERO_ZA64_VG2x1 : SInst<"svzero_za64_vg2x1", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg2x1",
+                            [IsOverloadNone, IsStreaming, IsInOutZA]>;
+  def SVZERO_ZA64_VG2x2 : SInst<"svzero_za64_vg2x2", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg2x2",
+                            [IsOverloadNone, IsStreaming, IsInOutZA]>;
+  def SVZERO_ZA64_VG2x4 : SInst<"svzero_za64_vg2x4", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg2x4",
+                            [IsOverloadNone, IsStreaming, IsInOutZA]>;
+  def SVZERO_ZA64_VG4x1 : SInst<"svzero_za64_vg4x1", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg4x1",
+                            [IsOverloadNone, IsStreaming, IsInOutZA]>;
+  def SVZERO_ZA64_VG4x2 : SInst<"svzero_za64_vg4x2", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg4x2",
+                            [IsOverloadNone, IsStreaming, IsInOutZA]>;
+  def SVZERO_ZA64_VG4x4 : SInst<"svzero_za64_vg4x4", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg4x4",
+                            [IsOverloadNone, IsStreaming, IsInOutZA]>;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // SME - Counting elements in a streaming vector
 
diff --git a/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_zero.c b/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_zero.c
new file mode 100644
index 00000000000000..2ad2044c267ed0
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_zero.c
@@ -0,0 +1,139 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme.h>
+
+#define SVE_ACLE_FUNC(A1,A2) A1##A2
+
+// CHECK-LABEL: define dso_local void @test_svzero_za64_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg1x2(i32 [[SLICE]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg1x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg1x2(i32 [[SLICE]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svzero_za64_vg1x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   SVE_ACLE_FUNC(svzero_za64,_vg1x2)(slice);
+}
+
+// CHECK-LABEL: define dso_local void @test_svzero_za64_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg1x4(i32 [[SLICE]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg1x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg1x4(i32 [[SLICE]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svzero_za64_vg1x4(uint32_t slice) __arm_streaming __arm_inout("za"){
+   SVE_ACLE_FUNC(svzero_za64,_vg1x4)(slice);
+}
+
+// CHECK-LABEL: define dso_local void @test_svzero_za64_vg2x1(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg2x1(i32 [[SLICE]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg2x1j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg2x1(i32 [[SLICE]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svzero_za64_vg2x1(uint32_t slice) __arm_streaming __arm_inout("za"){
+   SVE_ACLE_FUNC(svzero_za64,_vg2x1)(slice);
+}
+
+// CHECK-LABEL: define dso_local void @test_svzero_za64_vg2x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg2x2(i32 [[SLICE]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg2x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg2x2(i32 [[SLICE]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svzero_za64_vg2x2(uint32_t slice) __arm_streaming __arm_inout("za"){
+   SVE_ACLE_FUNC(svzero_za64,_vg2x2)(slice);
+}
+
+// CHECK-LABEL: define dso_local void @test_svzero_za64_vg2x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg2x4(i32 [[SLICE]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg2x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg2x4(i32 [[SLICE]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svzero_za64_vg2x4(uint32_t slice)  __arm_streaming __arm_inout("za"){
+   SVE_ACLE_FUNC(svzero_za64,_vg2x4)(slice);
+}
+
+// CHECK-LABEL: define dso_local void @test_svzero_za64_vg4x1(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg4x1(i32 [[SLICE]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg4x1j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg4x1(i32 [[SLICE]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svzero_za64_vg4x1(uint32_t slice) __arm_streaming __arm_inout("za"){
+   SVE_ACLE_FUNC(svzero_za64,_vg4x1)(slice);
+}
+
+// CHECK-LABEL: define dso_local void @test_svzero_za64_vg4x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg4x2(i32 [[SLICE]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg4x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg4x2(i32 [[SLICE]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svzero_za64_vg4x2(uint32_t slice) __arm_streaming __arm_inout("za"){
+   SVE_ACLE_FUNC(svzero_za64,_vg4x2)(slice);
+}
+
+// CHECK-LABEL: define dso_local void @test_svzero_za64_vg4x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg4x4(i32 [[SLICE]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg4x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg4x4(i32 [[SLICE]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svzero_za64_vg4x4(uint32_t slice) __arm_streaming __arm_inout("za"){
+   SVE_ACLE_FUNC(svzero_za64,_vg4x4)(slice);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index f2028f8e8fd05a..9a71aaa9f44349 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3361,6 +3361,12 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sve_bfmlslt : SME2_BFMLS_Intrinsic;
   def int_aarch64_sve_bfmlslt_lane : SME2_BFMLS_Lane_Intrinsic;
 
+  // Multi-vector zeroing
+
+  foreach vg = ["vg1x2", "vg1x4", "vg2x1", "vg2x2", "vg2x4", "vg4x1", "vg4x2", "vg4x4"] in {
+    def int_aarch64_sme_zero_za64_ # vg : DefaultAttrsIntrinsic<[], [llvm_i32_ty],  [IntrNoMem, IntrHasSideEffects]>;
+  }
+  
   // Multi-vector signed saturating doubling multiply high
 
   def int_aarch64_sve_sqdmulh_single_vgx2 : SME2_VG2_Multi_Single_Intrinsic;
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 50ee37b0dfebc8..b21b1faf5c9622 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -104,6 +104,13 @@ class sme2_move_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty, Re
   let usesCustomInserter = 1;
 }
 
+class sem2p1_zero_matrix_pseudo<string name, Operand index_ty, SMEMatrixTypeEnum za_flag>
+    : SMEPseudo2Instr<name, 0>,
+      Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rs, index_ty:$imm), []> {
+  let SMEMatrixType = za_flag;
+  let usesCustomInserter = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // SME pattern match helpers.
 //===----------------------------------------------------------------------===//
@@ -189,6 +196,9 @@ class SME2_Tile_VG4_Multi_Pat<string name, SDPatternOperator intrinsic, Operand
     : Pat<(intrinsic tile_imm:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4),
           (!cast<Instruction>(name # _PSEUDO) $tile, $base, $offset, (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3))>;
 
+class SME2_Zero_Matrix_Pat<string name, SDPatternOperator intrinsic, Operand offset_ty, ComplexPattern tileslice>
+    : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, offset_ty:$offset))),
+    (!cast<Instruction>(name) $base, $offset)>; 
 //===----------------------------------------------------------------------===//
 // SME pattern match helpers.
 //===----------------------------------------------------------------------===//
@@ -4815,39 +4825,57 @@ class sme2p1_zero_matrix<bits<6> opc, Operand index_ty, string mnemonic,
 }
 
 multiclass sme2p1_zero_matrix<string mnemonic> {
-  def _VG2_Z : sme2p1_zero_matrix<{0b000,?,?,?}, sme_elm_idx0_7, mnemonic, "vgx2"> {
+  def _VG2_Z : sme2p1_zero_matrix<{0b000,?,?,?}, sme_elm_idx0_7, mnemonic, "vgx2">, SMEPseudo2Instr<NAME # _VG2_Z , 1> {
     bits<3> imm;
     let Inst{2-0} = imm;
   }
-  def _2Z : sme2p1_zero_matrix<{0b001,?,?,?}, uimm3s2range, mnemonic> {
+  def _2Z : sme2p1_zero_matrix<{0b001,?,?,?}, uimm3s2range, mnemonic>, SMEPseudo2Instr<NAME # _2Z, 1> {
     bits<3> imm;
     let Inst{2-0} = imm;
   }
-  def _VG2_2Z : sme2p1_zero_matrix<{0b0100,?,?}, uimm2s2range, mnemonic, "vgx2"> {
+  def _VG2_2Z : sme2p1_zero_matrix<{0b0100,?,?}, uimm2s2range, mnemonic, "vgx2">, SMEPseudo2Instr<NAME # _VG2_2Z, 1> {
     bits<2> imm;
     let Inst{1-0} = imm;
   }
-  def _VG4_2Z : sme2p1_zero_matrix<{0b0110,?,?}, uimm2s2range, mnemonic, "vgx4"> {
+  def _VG4_2Z : sme2p1_zero_matrix<{0b0110,?,?}, uimm2s2range, mnemonic, "vgx4">, SMEPseudo2Instr<NAME # _VG4_2Z, 1> {
     bits<2> imm;
     let Inst{1-0} = imm;
   }
-  def _VG4_Z : sme2p1_zero_matrix<{0b100,?,?,?}, sme_elm_idx0_7, mnemonic, "vgx4"> {
+  def _VG4_Z : sme2p1_zero_matrix<{0b100,?,?,?}, sme_elm_idx0_7, mnemonic, "vgx4">, SMEPseudo2Instr<NAME # _VG4_Z, 1> {
     bits<3> imm;
     let Inst{2-0} = imm;
   }
-  def _4Z : sme2p1_zero_matrix<{0b1010,?,?}, uimm2s4range, mnemonic> {
+  def _4Z : sme2p1_zero_matrix<{0b1010,?,?}, uimm2s4range, mnemonic>, SMEPseudo2Instr<NAME # _4Z, 1> {
     bits<2> imm;
     let Inst{1-0} = imm;
   }
-  def _VG2_4Z :sme2p1_zero_matrix<{0b11000,?}, uimm1s4range, mnemonic, "vgx2"> {
+  def _VG2_4Z : sme2p1_zero_matrix<{0b11000,?}, uimm1s4range, mnemonic, "vgx2">, SMEPseudo2Instr<NAME # _VG2_4Z, 1> {
     bits<1> imm;
     let Inst{0}   = imm;
   }
-  def _VG4_4Z :sme2p1_zero_matrix<{0b11100,?}, uimm1s4range, mnemonic, "vgx4"> {
+  def _VG4_4Z : sme2p1_zero_matrix<{0b11100,?}, uimm1s4range, mnemonic, "vgx4">, SMEPseudo2Instr<NAME # _VG4_4Z, 1> {
     bits<1> imm;
     let Inst{0}   = imm;
   }
-}
+
+  def NAME # _VG2_Z_PSEUDO : sem2p1_zero_matrix_pseudo<NAME # _VG2_Z, sme_elm_idx0_7, SMEMatrixArray>;
+  def NAME # _VG4_Z_PSEUDO : sem2p1_zero_matrix_pseudo<NAME # _VG4_Z, sme_elm_idx0_7, SMEMatrixArray>;
+  def NAME # _2Z_PSEUDO : sem2p1_zero_matrix_pseudo<NAME # _2Z, uimm2s2range, SMEMatrixArray>;
+  def NAME # _VG2_2Z_PSEUDO : sem2p1_zero_matrix_pseudo<NAME # _VG2_2Z, uimm1s2range, SMEMatrixArray>;
+  def NAME # _VG4_2Z_PSEUDO : sem2p1_zero_matrix_pseudo<NAME # _VG4_2Z, uimm1s2range, SMEMatrixArray>;
+  def NAME # _4Z_PSEUDO : sem2p1_zero_matrix_pseudo<NAME # _4Z, uimm1s4range, SMEMatrixArray>;
+  def NAME # _VG2_4Z_PSEUDO : sem2p1_zero_matrix_pseudo<NAME # _VG2_4Z, uimm0s4range, SMEMatrixArray>;
+  def NAME # _VG4_4Z_PSEUDO : sem2p1_zero_matrix_pseudo<NAME # _VG4_4Z, uimm0s4range, SMEMatrixArray>;
+
+  def : SME2_Zero_Matrix_Pat<NAME # _VG2_Z_PSEUDO, int_aarch64_sme_zero_za64_vg1x2, sme_elm_idx0_7, tileslice16>;
+  def : SME2_Zero_Matrix_Pat<NAME # _VG4_Z_PSEUDO, int_aarch64_sme_zero_za64_vg1x4, sme_elm_idx0_7, tileslice16>;
+  def : SME2_Zero_Matrix_Pat<NAME # _2Z_PSEUDO, int_aarch64_sme_zero_za64_vg2x1, uimm2s2range, tileslicerange2s2>;
+  def : SME2_Zero_Matrix_Pat<NAME # _VG2_2Z_PSEUDO, int_aarch64_sme_zero_za64_vg2x2, uimm1s2range, tileslicerange1s2>;
+  def : SME2_Zero_Matrix_Pat<NAME # _VG4_2Z_PSEUDO, int_aarch64_sme_zero_za64_vg2x4, uimm1s2range, tileslicerange1s2>;
+  def : SME2_Zero_Matrix_Pat<NAME # _4Z_PSEUDO, int_aarch64_sme_zero_za64_vg4x1, uimm1s4range, tileslicerange1s4>;
+  def : SME2_Zero_Matrix_Pat<NAME # _VG2_4Z_PSEUDO, int_aarch64_sme_zero_za64_vg4x2, uimm0s4range, tileslicerange0s4>;
+  def : SME2_Zero_Matrix_Pat<NAME # _VG4_4Z_PSEUDO, int_aarch64_sme_zero_za64_vg4x4, uimm0s4range, tileslicerange0s4>;
+} 
 
 //===----------------------------------------------------------------------===//
 // SME2.1 lookup table expand two non-contiguous registers
diff --git a/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-zero.ll b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-zero.ll
new file mode 100644
index 00000000000000..ba77637580f4cb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-zero.ll
@@ -0,0 +1,190 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define  void @test_svzero_za64_vg1x2(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg1x2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 0, vgx2]
+; CHECK-NEXT:    ret
+entry:
+  tail call void @llvm.aarch64.sme.zero.za64.vg1x2(i32 %slice)
+  ret void
+}
+
+define  void @test_svzero_za64_vg1x2_offset(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg1x2_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 7, vgx2]
+; CHECK-NEXT:    ret
+entry:
+  %slice.max = add i32 %slice, 7
+  tail call void @llvm.aarch64.sme.zero.za64.vg1x2(i32 %slice.max)
+  ret void
+}
+
+define  void @test_svzero_za64_vg1x4(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg1x4:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 0, vgx4]
+; CHECK-NEXT:    ret
+entry:
+  tail call void @llvm.aarch64.sme.zero.za64.vg1x4(i32 %slice)
+  ret void
+}
+
+define  void @test_svzero_za64_vg1x4_offset(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg1x4_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 1, vgx4]
+; CHECK-NEXT:    ret
+entry:
+  %slice.min = add i32 %slice, 1
+  tail call void @llvm.aarch64.sme.zero.za64.vg1x4(i32 %slice.min)
+  ret void
+}
+
+define  void @test_svzero_za64_vg2x1(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg2x1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 0:1]
+; CHECK-NEXT:    ret
+entry:
+  tail call void @llvm.aarch64.sme.zero.za64.vg2x1(i32 %slice)
+  ret void
+}
+
+define  void @test_svzero_za64_vg2x1_offset(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg2x1_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 6:7]
+; CHECK-NEXT:    ret
+entry:
+  %slice.max = add i32 %slice, 6
+  tail call void @llvm.aarch64.sme.zero.za64.vg2x1(i32 %slice.max)
+  ret void
+}
+
+define  void @test_svzero_za64_vg2x2(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg2x2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 0:1, vgx2]
+; CHECK-NEXT:    ret
+entry:
+  tail call void @llvm.aarch64.sme.zero.za64.vg2x2(i32 %slice)
+  ret void
+}
+
+define  void @test_svzero_za64_vg2x2_offset(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg2x2_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 2:3, vgx2]
+; CHECK-NEXT:    ret
+entry:
+  %slice.max = add i32 %slice, 2
+  tail call void @llvm.aarch64.sme.zero.za64.vg2x2(i32 %slice.max)
+  ret void
+}
+
+define  void @test_svzero_za64_vg2x4(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg2x4:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 0:1, vgx4]
+; CHECK-NEXT:    ret
+entry:
+  tail call void @llvm.aarch64.sme.zero.za64.vg2x4(i32 %slice)
+  ret void
+}
+
+define  void @test_svzero_za64_vg2x4_offset(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg2x4_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add w8, w0, #1
+; CHECK-NEXT:    zero za.d[w8, 0:1, vgx4]
+; CHECK-NEXT:    ret
+entry:
+  %slice.min = add i32 %slice, 1
+  tail call void @llvm.aarch64.sme.zero.za64.vg2x4(i32 %slice.min)
+  ret void
+}
+
+define  void @test_svzero_za64_vg4x1(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg4x1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 0:3]
+; CHECK-NEXT:    ret
+entry:
+  tail call void @llvm.aarch64.sme.zero.za64.vg4x1(i32 %slice)
+  ret void
+}
+
+define  void @test_svzero_za64_vg4x1_offset(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg4x1_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 4:7]
+; CHECK-NEXT:    ret
+entry:
+  %slice.max = add i32 %slice, 4
+  tail call void @llvm.aarch64.sme.zero.za64.vg4x1(i32 %slice.max)
+  ret void
+}
+
+define  void @test_svzero_za64_vg4x2(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg4x2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 0:3, vgx2]
+; CHECK-NEXT:    ret
+entry:
+  tail call void @llvm.aarch64.sme.zero.za64.vg4x2(i32 %slice)
+  ret void
+}
+
+define  void @test_svzero_za64_vg4x2_offset(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg4x2_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 0:3, vgx2]
+; CHECK-NEXT:    ret
+entry:
+  %slice.max = add i32 %slice, 0
+  tail call void @llvm.aarch64.sme.zero.za64.vg4x2(i32 %slice.max)
+  ret void
+}
+
+define  void @test_svzero_za64_vg4x4(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg4x4:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 0:3, vgx4]
+; CHECK-NEXT:    ret
+entry:
+  tail call void @llvm.aarch64.sme.zero.za64.vg4x4(i32 %slice)
+  ret void
+}
+
+define  void @test_svzero_za64_vg4x4_offset(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg4x4_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add w8, w0, #1
+; CHECK-NEXT:    zero za.d[w8, 0:3, vgx4]
+; CHECK-NEXT:    ret
+entry:
+  %slice.min = add i32 %slice, 1
+  tail call void @llvm.aarch64.sme.zero.za64.vg4x4(i32 %slice.min)
+  ret void
+}
+
+attributes #0 = { nounwind "target-features" = "+sme2p1"}

From 7fa45afa938e0feb0030b14a8633de7dd8e529cb Mon Sep 17 00:00:00 2001
From: Vyacheslav Levytskyy <vyacheslav.levytskyy@intel.com>
Date: Wed, 29 May 2024 12:52:55 +0200
Subject: [PATCH 105/230] [SPIR-V] Ensure that internal intrinsic functions are
 inserted at the correct positions (#93552)

The goal of this PR is to ensure that newly created internal intrinsic
calls are inserted at the correct positions, so that they break neither the
rules of instruction dominance nor the requirement that PHI nodes are
grouped at the top of a basic block.
This is a continuation of
https://github.com/llvm/llvm-project/pull/92316 and
https://github.com/llvm/llvm-project/pull/92536
---
 llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 43 ++++++++++++++-----
 .../SPIRV/phi-spvintrinsic-dominate.ll        | 39 +++++++++++++++++
 2 files changed, 71 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/CodeGen/SPIRV/phi-spvintrinsic-dominate.ll

diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index ea53fe55e7ab57..e4bbeb53d16913 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -181,6 +181,14 @@ static void setInsertPointSkippingPhis(IRBuilder<> &B, Instruction *I) {
     B.SetInsertPoint(I);
 }
 
+static void setInsertPointAfterDef(IRBuilder<> &B, Instruction *I) {
+  B.SetCurrentDebugLocation(I->getDebugLoc());
+  if (I->getType()->isVoidTy())
+    B.SetInsertPoint(I->getNextNode());
+  else
+    B.SetInsertPoint(*I->getInsertionPointAfterDef());
+}
+
 static bool requireAssignType(Instruction *I) {
   IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(I);
   if (Intr) {
@@ -560,6 +568,7 @@ void SPIRVEmitIntrinsics::preprocessUndefs(IRBuilder<> &B) {
 
   while (!Worklist.empty()) {
     Instruction *I = Worklist.front();
+    bool BPrepared = false;
     Worklist.pop();
 
     for (auto &Op : I->operands()) {
@@ -567,7 +576,10 @@ void SPIRVEmitIntrinsics::preprocessUndefs(IRBuilder<> &B) {
       if (!AggrUndef || !Op->getType()->isAggregateType())
         continue;
 
-      B.SetInsertPoint(I);
+      if (!BPrepared) {
+        setInsertPointSkippingPhis(B, I);
+        BPrepared = true;
+      }
       auto *IntrUndef = B.CreateIntrinsic(Intrinsic::spv_undef, {}, {});
       Worklist.push(IntrUndef);
       I->replaceUsesOfWith(Op, IntrUndef);
@@ -584,6 +596,7 @@ void SPIRVEmitIntrinsics::preprocessCompositeConstants(IRBuilder<> &B) {
 
   while (!Worklist.empty()) {
     auto *I = Worklist.front();
+    bool IsPhi = isa<PHINode>(I), BPrepared = false;
     assert(I);
     bool KeepInst = false;
     for (const auto &Op : I->operands()) {
@@ -615,7 +628,11 @@ void SPIRVEmitIntrinsics::preprocessCompositeConstants(IRBuilder<> &B) {
         else
           for (auto &COp : AggrConst->operands())
             Args.push_back(COp);
-        B.SetInsertPoint(I);
+        if (!BPrepared) {
+          IsPhi ? B.SetInsertPointPastAllocas(I->getParent()->getParent())
+                : B.SetInsertPoint(I);
+          BPrepared = true;
+        }
         auto *CI =
             B.CreateIntrinsic(Intrinsic::spv_const_composite, {ResTy}, {Args});
         Worklist.push(CI);
@@ -1111,8 +1128,7 @@ void SPIRVEmitIntrinsics::insertAssignPtrTypeIntrs(Instruction *I,
       isa<BitCastInst>(I))
     return;
 
-  setInsertPointSkippingPhis(B, I->getNextNode());
-
+  setInsertPointAfterDef(B, I);
   Type *ElemTy = deduceElementType(I);
   Constant *EltTyConst = UndefValue::get(ElemTy);
   unsigned AddressSpace = getPointerAddressSpace(I->getType());
@@ -1127,7 +1143,7 @@ void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I,
   reportFatalOnTokenType(I);
   Type *Ty = I->getType();
   if (!Ty->isVoidTy() && !isPointerTy(Ty) && requireAssignType(I)) {
-    setInsertPointSkippingPhis(B, I->getNextNode());
+    setInsertPointAfterDef(B, I);
     Type *TypeToAssign = Ty;
     if (auto *II = dyn_cast<IntrinsicInst>(I)) {
       if (II->getIntrinsicID() == Intrinsic::spv_const_composite ||
@@ -1149,7 +1165,7 @@ void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I,
       if (isa<UndefValue>(Op) && Op->getType()->isAggregateType())
         buildIntrWithMD(Intrinsic::spv_assign_type, {B.getInt32Ty()}, Op,
                         UndefValue::get(B.getInt32Ty()), {}, B);
-      else if (!isa<Instruction>(Op)) // TODO: This case could be removed
+      else if (!isa<Instruction>(Op))
         buildIntrWithMD(Intrinsic::spv_assign_type, {Op->getType()}, Op, Op, {},
                         B);
     }
@@ -1159,7 +1175,7 @@ void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I,
 void SPIRVEmitIntrinsics::insertSpirvDecorations(Instruction *I,
                                                  IRBuilder<> &B) {
   if (MDNode *MD = I->getMetadata("spirv.Decorations")) {
-    B.SetInsertPoint(I->getNextNode());
+    setInsertPointAfterDef(B, I);
     B.CreateIntrinsic(Intrinsic::spv_assign_decoration, {I->getType()},
                       {I, MetadataAsValue::get(I->getContext(), MD)});
   }
@@ -1170,7 +1186,7 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I,
   auto *II = dyn_cast<IntrinsicInst>(I);
   if (II && II->getIntrinsicID() == Intrinsic::spv_const_composite &&
       TrackConstants) {
-    B.SetInsertPoint(I->getNextNode());
+    setInsertPointAfterDef(B, I);
     auto t = AggrConsts.find(I);
     assert(t != AggrConsts.end());
     auto *NewOp =
@@ -1179,6 +1195,7 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I,
     I->replaceAllUsesWith(NewOp);
     NewOp->setArgOperand(0, I);
   }
+  bool IsPhi = isa<PHINode>(I), BPrepared = false;
   for (const auto &Op : I->operands()) {
     if ((isa<ConstantAggregateZero>(Op) && Op->getType()->isVectorTy()) ||
         isa<PHINode>(I) || isa<SwitchInst>(I))
@@ -1188,7 +1205,11 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I,
       if (II && ((II->getIntrinsicID() == Intrinsic::spv_gep && OpNo == 0) ||
                  (II->paramHasAttr(OpNo, Attribute::ImmArg))))
         continue;
-      B.SetInsertPoint(I);
+      if (!BPrepared) {
+        IsPhi ? B.SetInsertPointPastAllocas(I->getParent()->getParent())
+              : B.SetInsertPoint(I);
+        BPrepared = true;
+      }
       Value *OpTyVal = Op;
       if (Op->getType()->isTargetExtTy())
         OpTyVal = Constant::getNullValue(
@@ -1201,7 +1222,7 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I,
   }
   if (I->hasName()) {
     reportFatalOnTokenType(I);
-    setInsertPointSkippingPhis(B, I->getNextNode());
+    setInsertPointAfterDef(B, I);
     std::vector<Value *> Args = {I};
     addStringImm(I->getName(), B, Args);
     B.CreateIntrinsic(Intrinsic::spv_assign_name, {I->getType()}, Args);
@@ -1345,7 +1366,7 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) {
   for (auto *I : Worklist) {
     TrackConstants = true;
     if (!I->getType()->isVoidTy() || isa<StoreInst>(I))
-      B.SetInsertPoint(I->getNextNode());
+      setInsertPointAfterDef(B, I);
     // Visitors return either the original/newly created instruction for further
     // processing, nullptr otherwise.
     I = visit(*I);
diff --git a/llvm/test/CodeGen/SPIRV/phi-spvintrinsic-dominate.ll b/llvm/test/CodeGen/SPIRV/phi-spvintrinsic-dominate.ll
new file mode 100644
index 00000000000000..471ab03ed89f65
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/phi-spvintrinsic-dominate.ll
@@ -0,0 +1,39 @@
+; The goal of the test is to check that newly inserted internal (spv)
+; intrinsic functions for PHI operands are inserted at the correct
+; positions and break neither the rules of instruction domination nor the
+; grouping of PHI nodes at the top of a basic block.
+
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: OpFunction
+; CHECK: OpBranch
+; CHECK: OpLabel
+; CHECK: OpPhi
+; CHECK: OpPhi
+; CHECK: OpPhi
+
+define spir_kernel void @foo(ptr addrspace(1) %_arg1) {
+entry:
+  br label %l1
+
+l1:
+  %sw = phi <4 x double> [ %vec, %l2 ], [ <double 0.0, double 0.0, double 0.0, double poison>, %entry ]
+  %in = phi <3 x double> [ %ins, %l2 ], [ zeroinitializer, %entry ]
+  %r1 = phi i32 [ %r2, %l2 ], [ 0, %entry ]
+  %c1 = icmp ult i32 %r1, 3
+  br i1 %c1, label %l2, label %exit
+
+l2:
+  %r3 = zext nneg i32 %r1 to i64
+  %r4 = getelementptr inbounds double, ptr addrspace(1) %_arg1, i64 %r3
+  %r5 = load double, ptr addrspace(1) %r4, align 8
+  %ins = insertelement <3 x double> %in, double %r5, i32 %r1
+  %exp = shufflevector <3 x double> %ins, <3 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+  %vec = shufflevector <4 x double> %exp, <4 x double> %sw, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  %r2 = add nuw nsw i32 %r1, 1
+  br label %l1
+
+exit:
+  ret void
+}

From f63adf3b51008970cc7c3794c68c7a6e33e8d5dd Mon Sep 17 00:00:00 2001
From: Vyacheslav Levytskyy <vyacheslav.levytskyy@intel.com>
Date: Wed, 29 May 2024 12:53:08 +0200
Subject: [PATCH 106/230] [SPIR-V] Introduce support of llvm.ptr.annotation to
 SPIR-V Backend and implement extensions which make use of spirv.Decorations
 (#93561)

This PR introduces support for llvm.ptr.annotation in the SPIR-V Backend and
implements several extensions which make use of spirv.Decorations and
llvm.ptr.annotation to annotate global variables and pointers:

- SPV_INTEL_cache_controls
- SPV_INTEL_global_variable_host_access
- SPV_INTEL_global_variable_fpga_decorations
---
 llvm/docs/SPIRVUsage.rst                      |   6 +
 .../SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp   |   7 +
 llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp    |   7 +
 llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp |   9 ++
 .../Target/SPIRV/SPIRVPrepareFunctions.cpp    | 133 ++++++++++++++++++
 .../lib/Target/SPIRV/SPIRVSymbolicOperands.td |  12 ++
 .../basic-load-store.ll                       |  53 +++++++
 .../decorate-prefetch-w-cache-controls.ll     |  44 ++++++
 .../global-var-decorations.ll                 |  33 +++++
 .../global-var-host-access.ll                 |  34 +++++
 .../SPIRV/llvm-intrinsics/ptr-annotation.ll   |  41 ++++++
 11 files changed, 379 insertions(+)
 create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/basic-load-store.ll
 create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/decorate-prefetch-w-cache-controls.ll
 create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_fpga_decorations/global-var-decorations.ll
 create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_host_access/global-var-host-access.ll
 create mode 100644 llvm/test/CodeGen/SPIRV/llvm-intrinsics/ptr-annotation.ll

diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst
index 657b0fb9b6724c..de27f6b2372db6 100644
--- a/llvm/docs/SPIRVUsage.rst
+++ b/llvm/docs/SPIRVUsage.rst
@@ -141,10 +141,16 @@ list of supported SPIR-V extensions, sorted alphabetically by their extension na
      - Allows generating arbitrary width integer types.
    * - ``SPV_INTEL_bfloat16_conversion``
      - Adds instructions to convert between single-precision 32-bit floating-point values and 16-bit bfloat16 values.
+   * - ``SPV_INTEL_cache_controls``
+     - Allows cache control information to be applied to memory access instructions.
    * - ``SPV_INTEL_function_pointers``
      - Allows translation of function pointers.
    * - ``SPV_INTEL_inline_assembly``
      - Allows to use inline assembly.
+   * - ``SPV_INTEL_global_variable_host_access``
+     - Adds decorations that can be applied to global (module scope) variables.
+   * - ``SPV_INTEL_global_variable_fpga_decorations``
+     - Adds decorations that can be applied to global (module scope) variables to help code generation for FPGA devices.
    * - ``SPV_INTEL_optnone``
      - Adds OptNoneINTEL value for Function Control mask that indicates a request to not optimize the function.
    * - ``SPV_INTEL_subgroups``
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
index 5c286acdcc9b39..ff8759755e5176 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
@@ -272,6 +272,13 @@ void SPIRVInstPrinter::printOpDecorate(const MCInst *MI, raw_ostream &O) {
     case Decoration::UserSemantic:
       printStringImm(MI, NumFixedOps, O);
       break;
+    case Decoration::HostAccessINTEL:
+      printOperand(MI, NumFixedOps, O);
+      if (NumFixedOps + 1 < MI->getNumOperands()) {
+        O << ' ';
+        printStringImm(MI, NumFixedOps + 1, O);
+      }
+      break;
     default:
       printRemainingVariableOps(MI, NumFixedOps, O, true);
       break;
diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
index 7f531542544ab6..75aa1823b11f2a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
@@ -30,6 +30,13 @@ static const std::map<std::string, SPIRV::Extension::Extension>
          SPIRV::Extension::Extension::SPV_EXT_shader_atomic_float_min_max},
         {"SPV_INTEL_arbitrary_precision_integers",
          SPIRV::Extension::Extension::SPV_INTEL_arbitrary_precision_integers},
+        {"SPV_INTEL_cache_controls",
+         SPIRV::Extension::Extension::SPV_INTEL_cache_controls},
+        {"SPV_INTEL_global_variable_fpga_decorations",
+         SPIRV::Extension::Extension::
+             SPV_INTEL_global_variable_fpga_decorations},
+        {"SPV_INTEL_global_variable_host_access",
+         SPIRV::Extension::Extension::SPV_INTEL_global_variable_host_access},
         {"SPV_INTEL_optnone", SPIRV::Extension::Extension::SPV_INTEL_optnone},
         {"SPV_INTEL_usm_storage_classes",
          SPIRV::Extension::Extension::SPV_INTEL_usm_storage_classes},
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index c86ab285f354fd..61f99f8d852695 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -703,6 +703,15 @@ static void addOpDecorateReqs(const MachineInstr &MI, unsigned DecIndex,
         static_cast<SPIRV::LinkageType::LinkageType>(LinkageOp);
     if (LnkType == SPIRV::LinkageType::LinkOnceODR)
       Reqs.addExtension(SPIRV::Extension::SPV_KHR_linkonce_odr);
+  } else if (Dec == SPIRV::Decoration::CacheControlLoadINTEL ||
+             Dec == SPIRV::Decoration::CacheControlStoreINTEL) {
+    Reqs.addExtension(SPIRV::Extension::SPV_INTEL_cache_controls);
+  } else if (Dec == SPIRV::Decoration::HostAccessINTEL) {
+    Reqs.addExtension(SPIRV::Extension::SPV_INTEL_global_variable_host_access);
+  } else if (Dec == SPIRV::Decoration::InitModeINTEL ||
+             Dec == SPIRV::Decoration::ImplementInRegisterMapINTEL) {
+    Reqs.addExtension(
+        SPIRV::Extension::SPV_INTEL_global_variable_fpga_decorations);
   }
 }
 
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
index a8a0577f60564c..7bee87d7204ede 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@@ -22,6 +22,7 @@
 #include "SPIRVSubtarget.h"
 #include "SPIRVTargetMachine.h"
 #include "SPIRVUtils.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/IntrinsicLowering.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -29,6 +30,8 @@
 #include "llvm/IR/IntrinsicsSPIRV.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
+#include <charconv>
+#include <regex>
 
 using namespace llvm;
 
@@ -152,6 +155,132 @@ static bool lowerIntrinsicToFunction(IntrinsicInst *Intrinsic) {
   return true;
 }
 
+static std::string getAnnotation(Value *AnnoVal, Value *OptAnnoVal) {
+  if (auto *Ref = dyn_cast_or_null<GetElementPtrInst>(AnnoVal))
+    AnnoVal = Ref->getOperand(0);
+  if (auto *Ref = dyn_cast_or_null<BitCastInst>(OptAnnoVal))
+    OptAnnoVal = Ref->getOperand(0);
+
+  std::string Anno;
+  if (auto *C = dyn_cast_or_null<Constant>(AnnoVal)) {
+    StringRef Str;
+    if (getConstantStringInfo(C, Str))
+      Anno = Str;
+  }
+  // Handle the optional annotation parameter the same way the Khronos
+  // Translator does (collect integers wrapped in a struct).
+  if (auto *C = dyn_cast_or_null<Constant>(OptAnnoVal);
+      C && C->getNumOperands()) {
+    Value *MaybeStruct = C->getOperand(0);
+    if (auto *Struct = dyn_cast<ConstantStruct>(MaybeStruct)) {
+      for (unsigned I = 0, E = Struct->getNumOperands(); I != E; ++I) {
+        if (auto *CInt = dyn_cast<ConstantInt>(Struct->getOperand(I)))
+          Anno += (I == 0 ? ": " : ", ") +
+                  std::to_string(CInt->getType()->getIntegerBitWidth() == 1
+                                     ? CInt->getZExtValue()
+                                     : CInt->getSExtValue());
+      }
+    } else if (auto *Struct = dyn_cast<ConstantAggregateZero>(MaybeStruct)) {
+      // { i32 i32 ... } zeroinitializer
+      for (unsigned I = 0, E = Struct->getType()->getStructNumElements();
+           I != E; ++I)
+        Anno += I == 0 ? ": 0" : ", 0";
+    }
+  }
+  return Anno;
+}
+
+static SmallVector<Metadata *> parseAnnotation(Value *I,
+                                               const std::string &Anno,
+                                               LLVMContext &Ctx,
+                                               Type *Int32Ty) {
+  // Try to parse the annotation string according to the following rules:
+  // annotation := ({kind} | {kind:value,value,...})+
+  // kind := number
+  // value := number | string
+  static const std::regex R(
+      "\\{(\\d+)(?:[:,](\\d+|\"[^\"]*\")(?:,(\\d+|\"[^\"]*\"))*)?\\}");
+  SmallVector<Metadata *> MDs;
+  int Pos = 0;
+  for (std::sregex_iterator
+           It = std::sregex_iterator(Anno.begin(), Anno.end(), R),
+           ItEnd = std::sregex_iterator();
+       It != ItEnd; ++It) {
+    if (It->position() != Pos)
+      return SmallVector<Metadata *>{};
+    Pos = It->position() + It->length();
+    std::smatch Match = *It;
+    SmallVector<Metadata *> MDsItem;
+    for (std::size_t i = 1; i < Match.size(); ++i) {
+      std::ssub_match SMatch = Match[i];
+      std::string Item = SMatch.str();
+      if (Item.length() == 0)
+        break;
+      if (Item[0] == '"') {
+        Item = Item.substr(1, Item.length() - 2);
+        // Acceptable format of the string snippet is:
+        static const std::regex RStr("^(\\d+)(?:,(\\d+))*$");
+        if (std::smatch MatchStr; std::regex_match(Item, MatchStr, RStr)) {
+          for (std::size_t SubIdx = 1; SubIdx < MatchStr.size(); ++SubIdx)
+            if (std::string SubStr = MatchStr[SubIdx].str(); SubStr.length())
+              MDsItem.push_back(ConstantAsMetadata::get(
+                  ConstantInt::get(Int32Ty, std::stoi(SubStr))));
+        } else {
+          MDsItem.push_back(MDString::get(Ctx, Item));
+        }
+      } else if (int32_t Num;
+                 std::from_chars(Item.data(), Item.data() + Item.size(), Num)
+                     .ec == std::errc{}) {
+        MDsItem.push_back(
+            ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Num)));
+      } else {
+        MDsItem.push_back(MDString::get(Ctx, Item));
+      }
+    }
+    if (MDsItem.size() == 0)
+      return SmallVector<Metadata *>{};
+    MDs.push_back(MDNode::get(Ctx, MDsItem));
+  }
+  return Pos == static_cast<int>(Anno.length()) ? MDs
+                                                : SmallVector<Metadata *>{};
+}
+
+static void lowerPtrAnnotation(IntrinsicInst *II) {
+  LLVMContext &Ctx = II->getContext();
+  Type *Int32Ty = Type::getInt32Ty(Ctx);
+
+  // Retrieve an annotation string from arguments.
+  Value *PtrArg = nullptr;
+  if (auto *BI = dyn_cast<BitCastInst>(II->getArgOperand(0)))
+    PtrArg = BI->getOperand(0);
+  else
+    PtrArg = II->getOperand(0);
+  std::string Anno =
+      getAnnotation(II->getArgOperand(1),
+                    4 < II->arg_size() ? II->getArgOperand(4) : nullptr);
+
+  // Parse the annotation.
+  SmallVector<Metadata *> MDs = parseAnnotation(II, Anno, Ctx, Int32Ty);
+
+  // If the annotation string is not parsed successfully we don't know the
+  // format used and output it as a general UserSemantic decoration.
+  // Otherwise MDs is a Metadata tuple (a decoration list) in the format
+  // expected by `spirv.Decorations`.
+  if (MDs.size() == 0) {
+    auto UserSemantic = ConstantAsMetadata::get(ConstantInt::get(
+        Int32Ty, static_cast<uint32_t>(SPIRV::Decoration::UserSemantic)));
+    MDs.push_back(MDNode::get(Ctx, {UserSemantic, MDString::get(Ctx, Anno)}));
+  }
+
+  // Build the internal intrinsic function.
+  IRBuilder<> IRB(II->getParent());
+  IRB.SetInsertPoint(II);
+  IRB.CreateIntrinsic(
+      Intrinsic::spv_assign_decoration, {PtrArg->getType()},
+      {PtrArg, MetadataAsValue::get(Ctx, MDNode::get(Ctx, MDs))});
+  II->replaceAllUsesWith(II->getOperand(0));
+}
+
 static void lowerFunnelShifts(IntrinsicInst *FSHIntrinsic) {
   // Get a separate function - otherwise, we'd have to rework the CFG of the
   // current one. Then simply replace the intrinsic uses with a call to the new
@@ -334,6 +463,10 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
         Changed |= toSpvOverloadedIntrinsic(
             II, Intrinsic::SPVIntrinsics::spv_lifetime_end, {1});
         break;
+      case Intrinsic::ptr_annotation:
+        lowerPtrAnnotation(II);
+        Changed = true;
+        break;
       }
     }
   }
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index 98cbd9d2c1f2e4..65b48c8acf6ab7 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -299,6 +299,9 @@ defm SPV_INTEL_function_pointers : ExtensionOperand<104>;
 defm SPV_INTEL_variable_length_array : ExtensionOperand<105>;
 defm SPV_INTEL_bfloat16_conversion : ExtensionOperand<106>;
 defm SPV_INTEL_inline_assembly : ExtensionOperand<107>;
+defm SPV_INTEL_cache_controls : ExtensionOperand<108>;
+defm SPV_INTEL_global_variable_host_access : ExtensionOperand<109>;
+defm SPV_INTEL_global_variable_fpga_decorations : ExtensionOperand<110>;
 
 //===----------------------------------------------------------------------===//
 // Multiclass used to define Capabilities enum values and at the same time
@@ -471,6 +474,10 @@ defm VariableLengthArrayINTEL : CapabilityOperand<5817, 0, 0, [SPV_INTEL_variabl
 defm GroupUniformArithmeticKHR : CapabilityOperand<6400, 0, 0, [SPV_KHR_uniform_group_instructions], []>;
 defm USMStorageClassesINTEL : CapabilityOperand<5935, 0, 0, [SPV_INTEL_usm_storage_classes], [Kernel]>;
 defm BFloat16ConversionINTEL : CapabilityOperand<6115, 0, 0, [SPV_INTEL_bfloat16_conversion], []>;
+defm GlobalVariableHostAccessINTEL : CapabilityOperand<6187, 0, 0, [SPV_INTEL_global_variable_host_access], []>;
+defm HostAccessINTEL : CapabilityOperand<6188, 0, 0, [SPV_INTEL_global_variable_host_access], []>;
+defm GlobalVariableFPGADecorationsINTEL : CapabilityOperand<6189, 0, 0, [SPV_INTEL_global_variable_fpga_decorations], []>;
+defm CacheControlsINTEL : CapabilityOperand<6441, 0, 0, [SPV_INTEL_cache_controls], []>;
 
 //===----------------------------------------------------------------------===//
 // Multiclass used to define SourceLanguage enum values and at the same time
@@ -1206,6 +1213,11 @@ defm ReferencedIndirectlyINTEL : DecorationOperand<5602, 0, 0, [], [IndirectRefe
 defm ClobberINTEL : DecorationOperand<5607, 0, 0, [SPV_INTEL_inline_assembly], [AsmINTEL]>;
 defm SideEffectsINTEL : DecorationOperand<5608, 0, 0, [SPV_INTEL_inline_assembly], [AsmINTEL]>;
 defm ArgumentAttributeINTEL : DecorationOperand<6409, 0, 0, [], [FunctionPointersINTEL]>;
+defm CacheControlLoadINTEL : DecorationOperand<6442, 0, 0, [], [CacheControlsINTEL]>;
+defm CacheControlStoreINTEL : DecorationOperand<6443, 0, 0, [], [CacheControlsINTEL]>;
+defm HostAccessINTEL : DecorationOperand<6188, 0, 0, [], [GlobalVariableHostAccessINTEL]>;
+defm InitModeINTEL : DecorationOperand<6190, 0, 0, [], [GlobalVariableFPGADecorationsINTEL]>;
+defm ImplementInRegisterMapINTEL : DecorationOperand<6191, 0, 0, [], [GlobalVariableFPGADecorationsINTEL]>;
 
 //===----------------------------------------------------------------------===//
 // Multiclass used to define BuiltIn enum values and at the same time
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/basic-load-store.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/basic-load-store.ll
new file mode 100644
index 00000000000000..359f6d1c0f8e53
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/basic-load-store.ll
@@ -0,0 +1,53 @@
+; Adapted from https://github.com/KhronosGroup/SPIRV-LLVM-Translator/tree/main/test/extensions/INTEL/SPV_INTEL_cache_controls
+
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_cache_controls %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_cache_controls %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-SPIRV: Capability CacheControlsINTEL
+; CHECK-SPIRV: Extension "SPV_INTEL_cache_controls"
+; CHECK-SPIRV-DAG: OpName %[[#GVar:]] "G"
+; CHECK-SPIRV-DAG: OpName %[[#Arg:]] "buffer"
+; CHECK-SPIRV-DAG: OpDecorate %[[#GVar]] CacheControlStoreINTEL 0 1
+; CHECK-SPIRV-DAG: OpDecorate %[[#GVar]] CacheControlStoreINTEL 1 3
+; CHECK-SPIRV-DAG: OpDecorate %[[#Arg]] CacheControlLoadINTEL 0 0
+; CHECK-SPIRV-DAG: OpDecorate %[[#Arg]] CacheControlStoreINTEL 0 1
+; CHECK-SPIRV-DAG: OpDecorate %[[#LoadPtr:]] CacheControlLoadINTEL 0 1
+; CHECK-SPIRV-DAG: OpDecorate %[[#LoadPtr]] CacheControlLoadINTEL 1 1
+; CHECK-SPIRV-DAG: OpDecorate %[[#StorePtr:]] CacheControlStoreINTEL 0 1
+; CHECK-SPIRV-DAG: OpDecorate %[[#StorePtr]] CacheControlStoreINTEL 1 2
+; CHECK-SPIRV: OpLoad %[[#]] %[[#LoadPtr]]
+; CHECK-SPIRV: OpStore %[[#StorePtr]] %[[#]]
+
+@G = common addrspace(1) global i32 0, align 4, !spirv.Decorations !9
+
+define spir_kernel void @test(ptr addrspace(1) %dummy, ptr addrspace(1) %buffer) !spirv.ParameterDecorations !12 {
+entry:
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %buffer, i64 1, !spirv.Decorations !3
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, ptr addrspace(1) %buffer, i64 0, !spirv.Decorations !6
+  store i32 %0, ptr addrspace(1) %arrayidx1, align 4
+  ret void
+}
+
+!spirv.MemoryModel = !{!0}
+!spirv.Source = !{!1}
+!opencl.spir.version = !{!2}
+!opencl.ocl.version = !{!2}
+
+!0 = !{i32 2, i32 2}
+!1 = !{i32 3, i32 102000}
+!2 = !{i32 1, i32 2}
+!3 = !{!4, !5}
+!4 = !{i32 6442, i32 0, i32 1}  ; {CacheControlLoadINTEL, CacheLevel=0, Cached}
+!5 = !{i32 6442, i32 1, i32 1}  ; {CacheControlLoadINTEL, CacheLevel=1, Cached}
+!6 = !{!7, !8}
+!7 = !{i32 6443, i32 0, i32 1}  ; {CacheControlStoreINTEL, CacheLevel=0, WriteThrough}
+!8 = !{i32 6443, i32 1, i32 2}  ; {CacheControlStoreINTEL, CacheLevel=1, WriteBack}
+!9 = !{!10, !11}
+!10 = !{i32 6443, i32 0, i32 1}  ; {CacheControlStoreINTEL, CacheLevel=0, WriteThrough}
+!11 = !{i32 6443, i32 1, i32 3}  ; {CacheControlStoreINTEL, CacheLevel=1, Streaming}
+!12 = !{!13, !14}
+!13 = !{}
+!14 = !{!15, !16}
+!15 = !{i32 6442, i32 0, i32 0}  ; {CacheControlLoadINTEL,   CacheLevel=0, Uncached}
+!16 = !{i32 6443, i32 0, i32 1}  ; {CacheControlStoreINTEL,  CacheLevel=0, WriteThrough}
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/decorate-prefetch-w-cache-controls.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/decorate-prefetch-w-cache-controls.ll
new file mode 100644
index 00000000000000..9a13b720f61f74
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/decorate-prefetch-w-cache-controls.ll
@@ -0,0 +1,44 @@
+; Adapted from https://github.com/KhronosGroup/SPIRV-LLVM-Translator/tree/main/test/extensions/INTEL/SPV_INTEL_cache_controls
+
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_cache_controls %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_cache_controls %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-SPIRV: Capability CacheControlsINTEL
+; CHECK-SPIRV: Extension "SPV_INTEL_cache_controls"
+
+; CHECK-SPIRV-DAG: OpName %[[#Ptr1:]] "ptr1"
+; CHECK-SPIRV-DAG: OpName %[[#Ptr2:]] "ptr2"
+; CHECK-SPIRV-DAG: OpName %[[#Ptr3:]] "ptr3"
+; CHECK-SPIRV-DAG: OpDecorate %[[#Ptr1]] CacheControlLoadINTEL 0 1
+; CHECK-SPIRV-DAG: OpDecorate %[[#Ptr2]] CacheControlLoadINTEL 1 1
+; CHECK-SPIRV-DAG: OpDecorate %[[#Ptr3]] CacheControlStoreINTEL 2 3
+; CHECK-SPIRV: OpExtInst %[[#]] %[[#]] prefetch %[[#Ptr1]] %[[#]]
+; CHECK-SPIRV: OpExtInst %[[#]] %[[#]] prefetch %[[#Ptr2]] %[[#]]
+; CHECK-SPIRV: OpExtInst %[[#]] %[[#]] prefetch %[[#Ptr3]] %[[#]]
+
+; 6442 and 6443 stand for the CacheControlLoadINTEL and CacheControlStoreINTEL tokens
+@.str.1 = private unnamed_addr addrspace(1) constant [16 x i8] c"../prefetch.hpp\00", section "llvm.metadata"
+@.str.9 = private unnamed_addr addrspace(1) constant [13 x i8] c"{6442:\220,1\22}\00", section "llvm.metadata"
+@.str.10 = private unnamed_addr addrspace(1) constant [13 x i8] c"{6442:\221,1\22}\00", section "llvm.metadata"
+@.str.11 = private unnamed_addr addrspace(1) constant [13 x i8] c"{6443:\222,3\22}\00", section "llvm.metadata"
+
+define weak_odr dso_local spir_kernel void @foo(ptr addrspace(1) noundef align 1 %_arg_dataPtr) {
+entry:
+  %r0 = addrspacecast ptr addrspace(1) %_arg_dataPtr to ptr addrspace(4)
+  %ptr1 = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef %r0, i32 noundef 5)
+  %r1 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %ptr1, ptr addrspace(1) @.str.9, ptr addrspace(1) @.str.1, i32 76, ptr addrspace(1) null)
+  tail call spir_func void @_Z20__spirv_ocl_prefetchPU3AS1Kcm(ptr addrspace(1) noundef %r1, i64 noundef 1)
+  %arrayidx3.i = getelementptr inbounds i8, ptr addrspace(4) %r0, i64 1
+  %ptr2 = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef %arrayidx3.i, i32 noundef 5)
+  %r2 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %ptr2, ptr addrspace(1) @.str.10, ptr addrspace(1) @.str.1, i32 80, ptr addrspace(1) null)
+  tail call spir_func void @_Z20__spirv_ocl_prefetchPU3AS1Kcm(ptr addrspace(1) noundef %r2, i64 noundef 1)
+  %arrayidx7.i = getelementptr inbounds i8, ptr addrspace(4) %r0, i64 2
+  %ptr3 = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef %arrayidx7.i, i32 noundef 5)
+  %r3 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %ptr3, ptr addrspace(1) @.str.11, ptr addrspace(1) @.str.1, i32 80, ptr addrspace(1) null)
+  tail call spir_func void @_Z20__spirv_ocl_prefetchPU3AS1Kcm(ptr addrspace(1) noundef %r3, i64 noundef 2)
+  ret void
+}
+
+declare ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1), ptr addrspace(1), ptr addrspace(1), i32, ptr addrspace(1))
+declare dso_local spir_func void @_Z20__spirv_ocl_prefetchPU3AS1Kcm(ptr addrspace(1) noundef, i64 noundef)
+declare dso_local spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef, i32 noundef)
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_fpga_decorations/global-var-decorations.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_fpga_decorations/global-var-decorations.ll
new file mode 100644
index 00000000000000..40008873bf19bf
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_fpga_decorations/global-var-decorations.ll
@@ -0,0 +1,33 @@
+; Adapted from https://github.com/KhronosGroup/SPIRV-LLVM-Translator/tree/main/test/extensions/INTEL/SPV_INTEL_global_variable_fpga_decorations
+
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_global_variable_fpga_decorations %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_global_variable_fpga_decorations %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-SPIRV: Capability GlobalVariableFPGADecorationsINTEL
+; CHECK-SPIRV: Extension "SPV_INTEL_global_variable_fpga_decorations"
+; CHECK-SPIRV-DAG: OpName %[[#G1:]] "int_var"
+; CHECK-SPIRV-DAG: OpName %[[#G2:]] "float_var"
+; CHECK-SPIRV-DAG: OpName %[[#G3:]] "bool_var"
+; CHECK-SPIRV-DAG: OpDecorate %[[#G1]] ImplementInRegisterMapINTEL 1
+; CHECK-SPIRV-DAG: OpDecorate %[[#G1]] InitModeINTEL 0
+; CHECK-SPIRV-DAG: OpDecorate %[[#G2]] ImplementInRegisterMapINTEL 1
+; CHECK-SPIRV-DAG: OpDecorate %[[#G2]] InitModeINTEL 1
+; CHECK-SPIRV-DAG: OpDecorate %[[#G3]] ImplementInRegisterMapINTEL 0
+; CHECK-SPIRV-DAG: OpDecorate %[[#G3]] InitModeINTEL 0
+
+@int_var = addrspace(1) global i32 42, !spirv.Decorations !1
+@float_var = addrspace(1) global float 1.0, !spirv.Decorations !5
+@bool_var = addrspace(1) global i1 0, !spirv.Decorations !7
+
+define spir_kernel void @test() {
+entry:
+  ret void
+}
+
+!1 = !{!2, !3}
+!2 = !{i32 6191, i1 true} ; ImplementInRegisterMapINTEL = true
+!3 = !{i32 6190, i32 0} ; InitModeINTEL = 0
+!4 = !{i32 6190, i32 1} ; InitModeINTEL = 1
+!5 = !{!2, !4}
+!6 = !{i32 6191, i1 false} ; ImplementInRegisterMapINTEL = false
+!7 = !{!6, !3}
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_host_access/global-var-host-access.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_host_access/global-var-host-access.ll
new file mode 100644
index 00000000000000..1397435efb2d4f
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_host_access/global-var-host-access.ll
@@ -0,0 +1,34 @@
+; Adapted from https://github.com/KhronosGroup/SPIRV-LLVM-Translator/tree/main/test/extensions/INTEL/SPV_INTEL_global_variable_host_access
+
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_global_variable_host_access,+SPV_INTEL_global_variable_fpga_decorations %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_global_variable_host_access,+SPV_INTEL_global_variable_fpga_decorations %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-SPIRV-DAG: Capability GlobalVariableHostAccessINTEL
+; CHECK-SPIRV-DAG: Capability GlobalVariableFPGADecorationsINTEL
+; CHECK-SPIRV-DAG: Extension "SPV_INTEL_global_variable_host_access"
+; CHECK-SPIRV-DAG: Extension "SPV_INTEL_global_variable_fpga_decorations"
+
+; CHECK-SPIRV-DAG: OpName %[[#G1:]] "int_var"
+; CHECK-SPIRV-DAG: OpName %[[#G2:]] "bool_var"
+; CHECK-SPIRV-DAG: OpName %[[#G3:]] "float_var"
+; CHECK-SPIRV-DAG: OpDecorate %[[#G1]] HostAccessINTEL 1 "IntVarName"
+; CHECK-SPIRV-DAG: OpDecorate %[[#G2]] HostAccessINTEL 3 "BoolVarName"
+; CHECK-SPIRV-DAG: OpDecorate %[[#G3]] ImplementInRegisterMapINTEL 1
+; CHECK-SPIRV-DAG: OpDecorate %[[#G3]] InitModeINTEL 1
+
+@int_var = addrspace(1) global i32 42, !spirv.Decorations !1
+@bool_var = addrspace(1) global i1 0, !spirv.Decorations !4
+@float_var = addrspace(1) global float 1.0, !spirv.Decorations !5
+
+define spir_kernel void @test() {
+entry:
+  ret void
+}
+
+!1 = !{!2}
+!2 = !{i32 6188, i32 1, !"IntVarName"} ; HostAccessINTEL 1 "IntVarName"
+!3 = !{i32 6188, i32 3, !"BoolVarName"} ; HostAccessINTEL 3 "BoolVarName"
+!4 = !{!3}
+!5 = !{!6, !7}
+!6 = !{i32 6191, i1 true} ; ImplementInRegisterMapINTEL = true
+!7 = !{i32 6190, i32 1} ; InitModeINTEL = 1
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ptr-annotation.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ptr-annotation.ll
new file mode 100644
index 00000000000000..06f1d0bf7fd37c
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ptr-annotation.ll
@@ -0,0 +1,41 @@
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: OpName %[[#Foo:]] "foo"
+; CHECK-DAG: OpName %[[#Ptr1:]] "_arg1"
+; CHECK-DAG: OpName %[[#Ptr2:]] "_arg2"
+; CHECK-DAG: OpName %[[#Ptr3:]] "_arg3"
+; CHECK-DAG: OpName %[[#Ptr4:]] "_arg4"
+; CHECK-DAG: OpName %[[#Ptr5:]] "_arg5"
+; CHECK-DAG: OpDecorate %[[#Ptr1]] NonReadable
+; CHECK-DAG: OpDecorate %[[#Ptr2]] Alignment 128
+; CHECK-DAG: OpDecorate %[[#Ptr2]] NonReadable
+; CHECK-DAG: OpDecorate %[[#Ptr3]] Alignment 128
+; CHECK-DAG: OpDecorate %[[#Ptr3]] NonReadable
+; CHECK-DAG: OpDecorate %[[#Ptr4]] Alignment 128
+; CHECK-DAG: OpDecorate %[[#Ptr4]] NonReadable
+; CHECK-DAG: OpDecorate %[[#Ptr5]] UserSemantic "Unknown format"
+; CHECK: %[[#Foo]] = OpFunction
+; CHECK-NEXT: %[[#Ptr1]] = OpFunctionParameter
+; CHECK-NEXT: %[[#Ptr2]] = OpFunctionParameter
+; CHECK-NEXT: %[[#Ptr3]] = OpFunctionParameter
+; CHECK-NEXT: %[[#Ptr4]] = OpFunctionParameter
+; CHECK-NEXT: %[[#Ptr5]] = OpFunctionParameter
+; CHECK: OpFunctionEnd
+
+@.str.0 = private unnamed_addr addrspace(1) constant [16 x i8] c"../prefetch.hpp\00", section "llvm.metadata"
+@.str.1 = private unnamed_addr addrspace(1) constant [5 x i8] c"{25}\00", section "llvm.metadata"
+@.str.2 = private unnamed_addr addrspace(1) constant [13 x i8] c"{44:128}{25}\00", section "llvm.metadata"
+@.str.3 = private unnamed_addr addrspace(1) constant [15 x i8] c"{44:\22128\22}{25}\00", section "llvm.metadata"
+@.str.4 = private unnamed_addr addrspace(1) constant [13 x i8] c"{44,128}{25}\00", section "llvm.metadata"
+@.str.5 = private unnamed_addr addrspace(1) constant [15 x i8] c"Unknown format\00", section "llvm.metadata"
+
+define spir_kernel void @foo(ptr addrspace(1) %_arg1, ptr addrspace(1) %_arg2, ptr addrspace(1) %_arg3, ptr addrspace(1) %_arg4, ptr addrspace(1) %_arg5) {
+entry:
+  %r1 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %_arg1, ptr addrspace(1) @.str.1, ptr addrspace(1) @.str.0, i32 80, ptr addrspace(1) null)
+  %r2 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %_arg2, ptr addrspace(1) @.str.2, ptr addrspace(1) @.str.0, i32 80, ptr addrspace(1) null)
+  %r3 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %_arg3, ptr addrspace(1) @.str.3, ptr addrspace(1) @.str.0, i32 80, ptr addrspace(1) null)
+  %r4 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %_arg4, ptr addrspace(1) @.str.4, ptr addrspace(1) @.str.0, i32 80, ptr addrspace(1) null)
+  %r5 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %_arg5, ptr addrspace(1) @.str.5, ptr addrspace(1) @.str.0, i32 80, ptr addrspace(1) null)
+  ret void
+}

From 7c917e8268225735bf6fe0f7d8491fc944358e47 Mon Sep 17 00:00:00 2001
From: Vyacheslav Levytskyy <vyacheslav.levytskyy@intel.com>
Date: Wed, 29 May 2024 12:53:37 +0200
Subject: [PATCH 107/230] [SPIR-V] Implement correct zeroinitializer for
 extension types in SPIR-V Backend (#93607)

This PR implements a correct zeroinitializer for extension types in the
SPIR-V Backend.

The previous version just created a 0 of a 32/64-bit integer type
(depending on the target machine word size). This caused the corresponding
integer constant 0 to be re-used and its type re-written, with a potential
crash on wrong usage of the constant (i.e., a 0 of integer type expected
but an extension type found). E.g., the following code would crash without
the PR:

```
  %r1 = icmp ne i64 %_arg_i, 0
  %e1 = tail call spir_func target("spirv.Event") @__spirv_GroupAsyncCopy(i32 2, ptr addrspace(3) %_arg_local, ptr addrspace(1) %_arg_ptr, i64 1, i64 1, target("spirv.Event") zeroinitializer)
```

because the 0 in the icmp would eventually be of `Event` type.
---
 llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp |  3 +-
 llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp   | 29 +++++++++++++------
 llvm/test/CodeGen/SPIRV/event-zero-const.ll   | 23 +++++++++++++++
 3 files changed, 44 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/CodeGen/SPIRV/event-zero-const.ll

diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index e4bbeb53d16913..ffbd1e17bad5e7 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -1212,8 +1212,7 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I,
       }
       Value *OpTyVal = Op;
       if (Op->getType()->isTargetExtTy())
-        OpTyVal = Constant::getNullValue(
-            IntegerType::get(I->getContext(), GR->getPointerSize()));
+        OpTyVal = PoisonValue::get(Op->getType());
       auto *NewOp = buildIntrWithMD(Intrinsic::spv_track_constant,
                                     {Op->getType(), OpTyVal->getType()}, Op,
                                     OpTyVal, {}, B);
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index 85299a49a6b94d..624899600693ac 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -40,6 +40,7 @@ class SPIRVPreLegalizer : public MachineFunctionPass {
 
 static void
 addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR,
+                    const SPIRVSubtarget &STI,
                     DenseMap<MachineInstr *, Type *> &TargetExtConstTypes) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
   DenseMap<MachineInstr *, Register> RegsAlreadyAddedToDT;
@@ -82,8 +83,17 @@ addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR,
           if (Const->getType()->isTargetExtTy()) {
             // remember association so that we can restore it when assign types
             MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
-            if (SrcMI && SrcMI->getOpcode() == TargetOpcode::G_CONSTANT)
+            if (SrcMI && (SrcMI->getOpcode() == TargetOpcode::G_CONSTANT ||
+                          SrcMI->getOpcode() == TargetOpcode::G_IMPLICIT_DEF))
               TargetExtConstTypes[SrcMI] = Const->getType();
+            if (Const->isNullValue()) {
+              MachineIRBuilder MIB(MF);
+              SPIRVType *ExtType =
+                  GR->getOrCreateSPIRVType(Const->getType(), MIB);
+              SrcMI->setDesc(STI.getInstrInfo()->get(SPIRV::OpConstantNull));
+              SrcMI->addOperand(MachineOperand::CreateReg(
+                  GR->getSPIRVTypeID(ExtType), false));
+            }
           }
         } else {
           RegsAlreadyAddedToDT[&MI] = Reg;
@@ -394,6 +404,7 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
     for (auto MII = std::prev(MBB->end()), Begin = MBB->begin();
          !ReachedBegin;) {
       MachineInstr &MI = *MII;
+      unsigned MIOp = MI.getOpcode();
 
       if (isSpvIntrinsic(MI, Intrinsic::spv_assign_ptr_type)) {
         Register Reg = MI.getOperand(1).getReg();
@@ -419,9 +430,9 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
         if (Def->getOpcode() != TargetOpcode::G_GLOBAL_VALUE)
           insertAssignInstr(Reg, Ty, nullptr, GR, MIB, MF.getRegInfo());
         ToErase.push_back(&MI);
-      } else if (MI.getOpcode() == TargetOpcode::G_CONSTANT ||
-                 MI.getOpcode() == TargetOpcode::G_FCONSTANT ||
-                 MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR) {
+      } else if (MIOp == TargetOpcode::G_CONSTANT ||
+                 MIOp == TargetOpcode::G_FCONSTANT ||
+                 MIOp == TargetOpcode::G_BUILD_VECTOR) {
         // %rc = G_CONSTANT ty Val
         // ===>
         // %cty = OpType* ty
@@ -435,15 +446,15 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
             continue;
         }
         Type *Ty = nullptr;
-        if (MI.getOpcode() == TargetOpcode::G_CONSTANT) {
+        if (MIOp == TargetOpcode::G_CONSTANT) {
           auto TargetExtIt = TargetExtConstTypes.find(&MI);
           Ty = TargetExtIt == TargetExtConstTypes.end()
                    ? MI.getOperand(1).getCImm()->getType()
                    : TargetExtIt->second;
-        } else if (MI.getOpcode() == TargetOpcode::G_FCONSTANT) {
+        } else if (MIOp == TargetOpcode::G_FCONSTANT) {
           Ty = MI.getOperand(1).getFPImm()->getType();
         } else {
-          assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
+          assert(MIOp == TargetOpcode::G_BUILD_VECTOR);
           Type *ElemTy = nullptr;
           MachineInstr *ElemMI = MRI.getVRegDef(MI.getOperand(1).getReg());
           assert(ElemMI);
@@ -459,7 +470,7 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
           Ty = VectorType::get(ElemTy, NumElts, false);
         }
         insertAssignInstr(Reg, Ty, nullptr, GR, MIB, MRI);
-      } else if (MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE) {
+      } else if (MIOp == TargetOpcode::G_GLOBAL_VALUE) {
         propagateSPIRVType(&MI, GR, MRI, MIB);
       }
 
@@ -802,7 +813,7 @@ bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) {
   MachineIRBuilder MIB(MF);
   // a registry of target extension constants
   DenseMap<MachineInstr *, Type *> TargetExtConstTypes;
-  addConstantsToTrack(MF, GR, TargetExtConstTypes);
+  addConstantsToTrack(MF, GR, ST, TargetExtConstTypes);
   foldConstantsIntoIntrinsics(MF);
   insertBitcasts(MF, GR, MIB);
   generateAssignInstrs(MF, GR, MIB, TargetExtConstTypes);
diff --git a/llvm/test/CodeGen/SPIRV/event-zero-const.ll b/llvm/test/CodeGen/SPIRV/event-zero-const.ll
new file mode 100644
index 00000000000000..b40456d233f12f
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/event-zero-const.ll
@@ -0,0 +1,23 @@
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: %[[#LongTy:]] = OpTypeInt 64 0
+; CHECK: %[[#EventTy:]] = OpTypeEvent
+; CHECK: %[[#LongNull:]] = OpConstantNull %[[#LongTy]]
+; CHECK: %[[#EventNull:]] = OpConstantNull %[[#EventTy]]
+; CHECK: OpFunction
+; CHECK: OpINotEqual %[[#]] %[[#]] %[[#LongNull]]
+; CHECK: OpGroupAsyncCopy %[[#EventTy]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#EventNull]]
+
+
+define weak_odr dso_local spir_kernel void @foo(i64 %_arg_i, ptr addrspace(1) %_arg_ptr, ptr addrspace(3) %_arg_local) {
+entry:
+  %r1 = icmp ne i64 %_arg_i, 0
+  %e1 = tail call spir_func target("spirv.Event") @__spirv_GroupAsyncCopy(i32 2, ptr addrspace(3) %_arg_local, ptr addrspace(1) %_arg_ptr, i64 1, i64 1, target("spirv.Event") zeroinitializer)
+  ret void
+}
+
+declare dso_local spir_func target("spirv.Event") @__spirv_GroupAsyncCopy(i32, ptr addrspace(3), ptr addrspace(1), i64, i64, target("spirv.Event"))

From 42a0fb2333344077dc8aafd65b50d0ece886cf4e Mon Sep 17 00:00:00 2001
From: zjgarvey <47986913+zjgarvey@users.noreply.github.com>
Date: Wed, 29 May 2024 05:55:05 -0500
Subject: [PATCH 108/230] [mlir][linalg] Add linalg.conv_2d_ngchw_gfchw_q to
 named ops (#92136)

Adds a named op: linalg.conv_2d_ngchw_gfchw_q. This op is similar to
linalg.conv_2d_ngchw_gfchw, but additionally incorporates zero point
offset corrections.
---
 .../Linalg/IR/LinalgNamedStructuredOps.yaml   | 138 ++++++++++++++++++
 .../linalg/opdsl/ops/core_named_ops.py        |  35 +++++
 .../Dialect/Linalg/generalize-named-ops.mlir  |  31 ++++
 mlir/test/Dialect/Linalg/named-ops.mlir       |  15 ++
 4 files changed, 219 insertions(+)

diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
index eb7dd37010a672..fad234a9dcae9c 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
@@ -3478,6 +3478,144 @@ structured_op: !LinalgStructuredOpConfig
                 - !ScalarExpression
                   scalar_arg: K
 --- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: conv_2d_ngchw_gfchw_q
+  cpp_class_name: Conv2DNgchwGfchwQOp
+  doc: |-
+    Performs 2-D grouped convolution with zero-point offsets.
+
+    Layout:
+      * Input: NGCHW.
+      * Kernel: GFCHW.
+
+    Numeric casting is performed on the operands to the inner multiply, promoting
+    them to the same data type as the accumulator/output. This includes the zero
+    point offsets common to quantized operations.
+  implements:
+  - LinalgConvolutionOpInterface
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: I
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] ->
+      (s0, s1, s2, s3 * s4 + s5 * s6, s7 * s8 + s9 * s10)>
+  - !LinalgOperandDefConfig
+    name: K
+    kind: input_tensor
+    type_var: T2
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] ->
+      (s1, s11, s2, s5, s9)>
+  - !LinalgOperandDefConfig
+    name: IZp
+    kind: scalar
+    type_var: I32
+  - !LinalgOperandDefConfig
+    name: KZp
+    kind: scalar
+    type_var: I32
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: U
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] ->
+      (s0, s1, s11, s3, s7)>
+  - !LinalgOperandDefConfig
+    name: strides
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11]
+      -> (s4, s8)>
+    default_indices:
+    - 1
+    - 1
+  - !LinalgOperandDefConfig
+    name: dilations
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11]
+      -> (s6, s10)>
+    default_indices:
+    - 1
+    - 1
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> (d0, d1, d5, d3 * s4 + d6 * s6, d4 * s8 + d7 * s10)>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> (d1, d2, d5, d6, d7)>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> ()>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> ()>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> (d0, d1, d2, d3, d4)>
+  iterator_types:
+  - parallel
+  - parallel
+  - parallel
+  - parallel
+  - parallel
+  - reduction
+  - reduction
+  - reduction
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: binary
+        fn_name: add
+        operands:
+        - !ScalarExpression
+          scalar_arg: O
+        - !ScalarExpression
+          scalar_fn:
+            kind: binary
+            fn_name: mul
+            operands:
+            - !ScalarExpression
+              scalar_fn:
+                kind: binary
+                fn_name: sub
+                operands:
+                - !ScalarExpression
+                  scalar_fn:
+                    kind: type
+                    fn_name: cast_signed
+                    type_var: U
+                    operands:
+                    - !ScalarExpression
+                      scalar_arg: I
+                - !ScalarExpression
+                  scalar_fn:
+                    kind: type
+                    fn_name: cast_signed
+                    type_var: U
+                    operands:
+                    - !ScalarExpression
+                      scalar_arg: IZp
+            - !ScalarExpression
+              scalar_fn:
+                kind: binary
+                fn_name: sub
+                operands:
+                - !ScalarExpression
+                  scalar_fn:
+                    kind: type
+                    fn_name: cast_signed
+                    type_var: U
+                    operands:
+                    - !ScalarExpression
+                      scalar_arg: K
+                - !ScalarExpression
+                  scalar_fn:
+                    kind: type
+                    fn_name: cast_signed
+                    type_var: U
+                    operands:
+                    - !ScalarExpression
+                      scalar_arg: KZp
+--- !LinalgOpConfig
 metadata: !LinalgOpMetadata
   name: conv_3d_ndhwc_dhwcf
   cpp_class_name: Conv3DNdhwcDhwcfOp
diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
index d73428a0f4df3b..43410aaa6af1be 100644
--- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
+++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
@@ -958,6 +958,41 @@ def conv_2d_ngchw_gfchw(
     ) * TypeFn.cast_signed(U, K[D.g, D.fg, D.c, D.kh, D.kw])
 
 
+@linalg_structured_op
+def conv_2d_ngchw_gfchw_q(
+    I=TensorDef(
+        T1, S.N, S.G, S.C, S.OH * S.SH + S.KH * S.DH, S.OW * S.SW + S.KW * S.DW
+    ),
+    K=TensorDef(T2, S.G, S.FG, S.C, S.KH, S.KW),
+    IZp=ScalarDef(I32),
+    KZp=ScalarDef(I32),
+    O=TensorDef(U, S.N, S.G, S.FG, S.OH, S.OW, output=True),
+    strides=IndexAttrDef(S.SH, S.SW, default=[1, 1]),
+    dilations=IndexAttrDef(S.DH, S.DW, default=[1, 1]),
+):
+    """Performs 2-D grouped convolution with zero-point offsets.
+
+    Layout:
+      * Input: NGCHW.
+      * Kernel: GFCHW.
+
+    Numeric casting is performed on the operands to the inner multiply, promoting
+    them to the same data type as the accumulator/output. This includes the zero
+    point offsets common to quantized operations.
+    """
+    implements(ConvolutionOpInterface)
+    domain(D.n, D.g, D.fg, D.oh, D.ow, D.c, D.kh, D.kw)
+    O[D.n, D.g, D.fg, D.oh, D.ow] += (
+        TypeFn.cast_signed(
+            U, I[D.n, D.g, D.c, D.oh * S.SH + D.kh * S.DH, D.ow * S.SW + D.kw * S.DW]
+        )
+        - TypeFn.cast_signed(U, IZp)
+    ) * (
+        TypeFn.cast_signed(U, K[D.g, D.fg, D.c, D.kh, D.kw])
+        - TypeFn.cast_signed(U, KZp)
+    )
+
+
 @linalg_structured_op
 def conv_3d_ndhwc_dhwcf(
     I=TensorDef(
diff --git a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
index 4f43ec2c9e1cee..31fac9b4b41659 100644
--- a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
@@ -204,6 +204,37 @@ func.func @conv_1d_ncw_fcw(%input: memref<?x?x?xf32>, %filter: memref<?x?x?xf32>
 
 // -----
 
+func.func @conv_2d_ngchw_gfchw_q(%input: memref<?x?x?x?x?xi8>, %filter: memref<?x?x?x?x?xi8>, %inputzp: i32, %filterzp: i32, %output: memref<?x?x?x?x?xi32>) {
+  linalg.conv_2d_ngchw_gfchw_q {dilations = dense<1> : tensor<2xi64>,
+                                       strides = dense<1> : tensor<2xi64>}
+     ins (%input, %filter, %inputzp, %filterzp: memref<?x?x?x?x?xi8>, memref<?x?x?x?x?xi8>, i32, i32)
+    outs (%output: memref<?x?x?x?x?xi32>)
+  return
+}
+// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d5, d3 + d6, d4 + d7)>
+// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d2, d5, d6, d7)>
+// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> ()>
+// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4)>
+
+// CHECK: func @conv_2d_ngchw_gfchw_q
+
+// CHECK: linalg.generic
+// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]], #[[MAP2]], #[[MAP3]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]}
+// CHECK-SAME: ins(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}} : memref<?x?x?x?x?xi8>, memref<?x?x?x?x?xi8>, i32, i32)
+// CHECK-SAME: outs(%{{.+}} : memref<?x?x?x?x?xi32>)
+
+// CHECK:         ^{{.+}}(%[[BBARG0:.+]]: i8, %[[BBARG1:.+]]: i8, %[[BBARG2:.+]]: i32, %[[BBARG3:.+]]: i32, %[[BBARG4:.+]]: i32)
+// CHECK-NEXT:      %[[EXTSI0:.+]] = arith.extsi %[[BBARG0]] : i8 to i32
+// CHECK-NEXT:      %[[SUB0:.+]] = arith.subi %[[EXTSI0]], %[[BBARG2]] : i32
+// CHECK-NEXT:      %[[EXTSI1:.+]] = arith.extsi %[[BBARG1]] : i8 to i32
+// CHECK-NEXT:      %[[SUB1:.+]] = arith.subi %[[EXTSI1]], %[[BBARG3]] : i32
+// CHECK-NEXT:      %[[MUL:.+]] = arith.muli %[[SUB0]], %[[SUB1]] : i32
+// CHECK-NEXT:      %[[ADD:.+]] = arith.addi %[[BBARG4]], %[[MUL]] : i32
+// CHECK-NEXT:      linalg.yield %[[ADD]] : i32
+
+// -----
+
 func.func @generalize_fill(%output: memref<?x?xf32>, %value : f32) {
   linalg.fill ins(%value : f32) outs(%output : memref<?x?xf32>)
   return
diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir
index 051054e67edf09..02ecbed232c8b5 100644
--- a/mlir/test/Dialect/Linalg/named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/named-ops.mlir
@@ -441,6 +441,21 @@ func.func @conv_2d_ngchw_gfchw(%input: tensor<1x5x3x32x32xf32>, %filter: tensor<
 
 // -----
 
+// CHECK-LABEL: func @conv_2d_ngchw_gfchw_q
+func.func @conv_2d_ngchw_gfchw_q(%input: tensor<1x5x3x32x32xi8>, %filter: tensor<5x2x3x3x3xi8>, %inputzp: i32, %filterzp: i32, %init: tensor<1x5x2x30x30xi32>) -> tensor<1x5x2x30x30xi32> {
+  // CHECK:      linalg.conv_2d_ngchw_gfchw_q
+  // CHECK-SAME:   dilations = dense<1> : tensor<2xi64>
+  // CHECK-SAME:   strides = dense<1> : tensor<2xi64>
+  // CHECK-SAME:   ins(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}} : tensor<1x5x3x32x32xi8>, tensor<5x2x3x3x3xi8>, i32, i32)
+  // CHECK-SAME:   outs(%{{.+}} : tensor<1x5x2x30x30xi32>) -> tensor<1x5x2x30x30xi32>
+  %0 = linalg.conv_2d_ngchw_gfchw_q {dilations = dense<1> : tensor<2xi64>,
+                                         strides = dense<1> : tensor<2xi64>}
+     ins (%input, %filter, %inputzp, %filterzp: tensor<1x5x3x32x32xi8>, tensor<5x2x3x3x3xi8>, i32, i32)
+    outs (%init: tensor<1x5x2x30x30xi32>) -> tensor<1x5x2x30x30xi32>
+  return %0 : tensor<1x5x2x30x30xi32>
+}
+// -----
+
 // CHECK-LABEL: func @conv_3d_ndhwc_dhwcf
 func.func @conv_3d_ndhwc_dhwcf(%input: tensor<?x?x?x?x?xf32>, %filter: tensor<?x?x?x?x?xf32>, %init: tensor<?x?x?x?x?xf32>) -> tensor<?x?x?x?x?xf32> {
   // CHECK:      %{{.+}} = linalg.conv_3d_ndhwc_dhwcf

From a860e89028a004bc5b46ce0952b75d4f85a5927d Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Wed, 29 May 2024 11:58:09 +0100
Subject: [PATCH 109/230] [RISCV] Don't recompute getDemanded in
 RISCVInsertVSETVLI::needVSETVLI. NFC

This also makes the function a bit easier to reason about since we can
remove the assert.  Eventually we might be able to replace needVSETVLI
with VSETVLIInfo::isCompatible.
---
 llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index c0b2a695b8ea45..2c0a807e446856 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -882,7 +882,7 @@ class RISCVInsertVSETVLI : public MachineFunctionPass {
   StringRef getPassName() const override { return RISCV_INSERT_VSETVLI_NAME; }
 
 private:
-  bool needVSETVLI(const MachineInstr &MI, const VSETVLIInfo &Require,
+  bool needVSETVLI(const DemandedFields &Used, const VSETVLIInfo &Require,
                    const VSETVLIInfo &CurInfo) const;
   bool needVSETVLIPHI(const VSETVLIInfo &Require,
                       const MachineBasicBlock &MBB) const;
@@ -1175,17 +1175,13 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
 }
 
 /// Return true if a VSETVLI is required to transition from CurInfo to Require
-/// before MI.
-bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
+/// given a set of DemandedFields \p Used.
+bool RISCVInsertVSETVLI::needVSETVLI(const DemandedFields &Used,
                                      const VSETVLIInfo &Require,
                                      const VSETVLIInfo &CurInfo) const {
-  assert(Require == computeInfoForInstr(MI, MI.getDesc().TSFlags, *ST, LIS));
-
   if (!CurInfo.isValid() || CurInfo.isUnknown() || CurInfo.hasSEWLMULRatioOnly())
     return true;
 
-  DemandedFields Used = getDemanded(MI, ST);
-
   if (CurInfo.isCompatible(Used, Require, LIS))
     return false;
 
@@ -1232,16 +1228,17 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
   if (!RISCVII::hasSEWOp(TSFlags))
     return;
 
+  DemandedFields Demanded = getDemanded(MI, ST);
+
   const VSETVLIInfo NewInfo = computeInfoForInstr(MI, TSFlags, *ST, LIS);
   assert(NewInfo.isValid() && !NewInfo.isUnknown());
-  if (Info.isValid() && !needVSETVLI(MI, NewInfo, Info))
+  if (Info.isValid() && !needVSETVLI(Demanded, NewInfo, Info))
     return;
 
   const VSETVLIInfo PrevInfo = Info;
   if (!Info.isValid() || Info.isUnknown())
     Info = NewInfo;
 
-  DemandedFields Demanded = getDemanded(MI, ST);
   const VSETVLIInfo IncomingInfo = adjustIncoming(PrevInfo, NewInfo, Demanded);
 
   // If MI only demands that VL has the same zeroness, we only need to set the

From 7ee511217b0d1cfd3269e9d2a89acf335ca9a9ea Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis@chromium.org>
Date: Wed, 29 May 2024 07:40:52 -0400
Subject: [PATCH 110/230] [gn build] Port 04f01a2b9ced

---
 llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 6bd56dd4117b03..8a5f6d1908784b 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -289,7 +289,6 @@ if (current_toolchain == default_toolchain) {
       "__atomic/kill_dependency.h",
       "__atomic/memory_order.h",
       "__atomic/to_gcc_order.h",
-      "__availability",
       "__bit/bit_cast.h",
       "__bit/bit_ceil.h",
       "__bit/bit_floor.h",
@@ -385,7 +384,9 @@ if (current_toolchain == default_toolchain) {
       "__condition_variable/condition_variable.h",
       "__config",
       "__configuration/abi.h",
+      "__configuration/availability.h",
       "__configuration/compiler.h",
+      "__configuration/language.h",
       "__configuration/platform.h",
       "__coroutine/coroutine_handle.h",
       "__coroutine/coroutine_traits.h",

From 9c4bae7c7c5be754f98bc495d51dd122609cd649 Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan@intel.com>
Date: Wed, 29 May 2024 19:40:45 +0800
Subject: [PATCH 111/230] [X86][CodeGen] Disable NDD2NonNDD compression for
 CFCMOV

---
 llvm/lib/Target/X86/X86CompressEVEX.cpp       |  3 ++-
 llvm/test/CodeGen/X86/apx/compress-evex.mir   | 19 +++++++++++++++++++
 .../TableGen/X86ManualCompressEVEXTables.def  |  8 ++++++++
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index cadfda93d4b196..11b2155e3f985d 100644
--- a/llvm/lib/Target/X86/X86CompressEVEX.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -181,7 +181,8 @@ static bool isRedundantNewDataDest(MachineInstr &MI, const X86Subtarget &ST) {
   const MCInstrDesc &Desc = MI.getDesc();
   Register Reg0 = MI.getOperand(0).getReg();
   const MachineOperand &Op1 = MI.getOperand(1);
-  if (!Op1.isReg() || X86::getFirstAddrOperandIdx(MI) == 1)
+  if (!Op1.isReg() || X86::getFirstAddrOperandIdx(MI) == 1 ||
+      X86::isCFCMOVCC(MI.getOpcode()))
     return false;
   Register Reg1 = Op1.getReg();
   if (Reg1 == Reg0)
diff --git a/llvm/test/CodeGen/X86/apx/compress-evex.mir b/llvm/test/CodeGen/X86/apx/compress-evex.mir
index 626904a7a692c1..5a59ab0f8a9d0b 100644
--- a/llvm/test/CodeGen/X86/apx/compress-evex.mir
+++ b/llvm/test/CodeGen/X86/apx/compress-evex.mir
@@ -108,3 +108,22 @@ body:             |
     $rax = ADC64rr_ND $r16, $rdi, implicit-def dead $eflags, implicit $eflags
     RET64 $rax
 ...
+---
+name:            cfcmov_no_convert
+body:             |
+  bb.0.entry:
+    liveins: $eflags, $rax, $rbx
+    ; CHECK: cfcmovew %bx, %ax, %ax                  # encoding: [0x62,0xf4,0x7d,0x1c,0x44,0xc3]
+    ; CHECK: cfcmovsw 24(%rax), %bx, %bx             # encoding: [0x62,0xf4,0x65,0x1c,0x48,0x58,0x18]
+    ; CHECK: cfcmovel %ebx, %eax, %eax               # encoding: [0x62,0xf4,0x7c,0x1c,0x44,0xc3]
+    ; CHECK: cfcmovsl 24(%rax), %ebx, %ebx           # encoding: [0x62,0xf4,0x64,0x1c,0x48,0x58,0x18]
+    ; CHECK: cfcmoveq %rbx, %rax, %rax               # encoding: [0x62,0xf4,0xfc,0x1c,0x44,0xc3]
+    ; CHECK: cfcmovsq 24(%rax), %rbx, %rbx           # encoding: [0x62,0xf4,0xe4,0x1c,0x48,0x58,0x18]
+    $ax = CFCMOV16rr_ND $ax, $bx, 4, implicit $eflags
+    $bx = CFCMOV16rm_ND $bx, $rax, 1, $noreg, 24, $noreg, 8, implicit $eflags
+    $eax = CFCMOV32rr_ND $eax, $ebx, 4, implicit $eflags
+    $ebx = CFCMOV32rm_ND $ebx, $rax, 1, $noreg, 24, $noreg, 8, implicit $eflags
+    $rax = CFCMOV64rr_ND $rax, $rbx, 4, implicit $eflags
+    $rbx = CFCMOV64rm_ND $rbx, $rax, 1, $noreg, 24, $noreg, 8, implicit $eflags
+    RET64 $rax
+...
diff --git a/llvm/utils/TableGen/X86ManualCompressEVEXTables.def b/llvm/utils/TableGen/X86ManualCompressEVEXTables.def
index 665a394f57a6af..cab601bf8131f6 100644
--- a/llvm/utils/TableGen/X86ManualCompressEVEXTables.def
+++ b/llvm/utils/TableGen/X86ManualCompressEVEXTables.def
@@ -48,6 +48,14 @@ NOCOMP(VPSRAQZ256ri)
 NOCOMP(VPSRAQZ256rm)
 NOCOMP(VPSRAQZ256rr)
 NOCOMP(VSCALEFPSZ256rm)
+// When the condition evaluates to false, the destination register is zeroed for
+// non-NDD CFCMOV but not for NDD CFCMOV.
+NOCOMP(CFCMOV16rm_ND)
+NOCOMP(CFCMOV16rr_ND)
+NOCOMP(CFCMOV32rm_ND)
+NOCOMP(CFCMOV32rr_ND)
+NOCOMP(CFCMOV64rm_ND)
+NOCOMP(CFCMOV64rr_ND)
 #undef NOCOMP
 
 #ifndef ENTRY

From 35f2caf713489049cc1b31aa3fe0a054968f80e3 Mon Sep 17 00:00:00 2001
From: chuongg3 <chuong.goh@arm.com>
Date: Wed, 29 May 2024 13:14:03 +0100
Subject: [PATCH 112/230] [AArch64][GlobalISel] Select TBL/TBX Intrinsics
 (#92914)

---
 .../GISel/AArch64InstructionSelector.cpp      |   45 +
 llvm/test/CodeGen/AArch64/arm64-tbl.ll        | 1373 ++++++++++++-----
 2 files changed, 1069 insertions(+), 349 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 3b3c1fc8b27bf1..4a7c82b393c10e 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -227,6 +227,8 @@ class AArch64InstructionSelector : public InstructionSelector {
   bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);
+  void SelectTable(MachineInstr &I, MachineRegisterInfo &MRI, unsigned NumVecs,
+                   unsigned Opc1, unsigned Opc2, bool isExt);
 
   bool selectIndexedExtLoad(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectIndexedLoad(MachineInstr &I, MachineRegisterInfo &MRI);
@@ -6537,6 +6539,25 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
     I.eraseFromParent();
     return true;
   }
+  case Intrinsic::aarch64_neon_tbl2:
+    SelectTable(I, MRI, 2, AArch64::TBLv8i8Two, AArch64::TBLv16i8Two, false);
+    return true;
+  case Intrinsic::aarch64_neon_tbl3:
+    SelectTable(I, MRI, 3, AArch64::TBLv8i8Three, AArch64::TBLv16i8Three,
+                false);
+    return true;
+  case Intrinsic::aarch64_neon_tbl4:
+    SelectTable(I, MRI, 4, AArch64::TBLv8i8Four, AArch64::TBLv16i8Four, false);
+    return true;
+  case Intrinsic::aarch64_neon_tbx2:
+    SelectTable(I, MRI, 2, AArch64::TBXv8i8Two, AArch64::TBXv16i8Two, true);
+    return true;
+  case Intrinsic::aarch64_neon_tbx3:
+    SelectTable(I, MRI, 3, AArch64::TBXv8i8Three, AArch64::TBXv16i8Three, true);
+    return true;
+  case Intrinsic::aarch64_neon_tbx4:
+    SelectTable(I, MRI, 4, AArch64::TBXv8i8Four, AArch64::TBXv16i8Four, true);
+    return true;
   case Intrinsic::swift_async_context_addr:
     auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()},
                               {Register(AArch64::FP)})
@@ -6552,6 +6573,30 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
   return false;
 }
 
+void AArch64InstructionSelector::SelectTable(MachineInstr &I,
+                                             MachineRegisterInfo &MRI,
+                                             unsigned NumVec, unsigned Opc1,
+                                             unsigned Opc2, bool isExt) {
+  Register DstReg = I.getOperand(0).getReg();
+  unsigned Opc = MRI.getType(DstReg) == LLT::fixed_vector(8, 8) ? Opc1 : Opc2;
+  // Above: Opc1 serves an <8 x i8> result, Opc2 any other result type.
+  // Create the REG_SEQUENCE of NumVec table registers (one later if isExt).
+  SmallVector<Register, 4> Regs;
+  for (unsigned i = 0; i < NumVec; i++)
+    Regs.push_back(I.getOperand(i + 2 + isExt).getReg());
+  Register RegSeq = createQTuple(Regs, MIB);
+  // The index vector is the operand right after the last table register.
+  Register IdxReg = I.getOperand(2 + NumVec + isExt).getReg();
+  MachineInstrBuilder Instr;
+  if (isExt) {
+    Register Reg = I.getOperand(2).getReg(); // Extra leading input when isExt.
+    Instr = MIB.buildInstr(Opc, {DstReg}, {Reg, RegSeq, IdxReg});
+  } else
+    Instr = MIB.buildInstr(Opc, {DstReg}, {RegSeq, IdxReg});
+  constrainSelectedInstRegOperands(*Instr, TII, TRI, RBI);
+  I.eraseFromParent();
+}
+
 InstructionSelector::ComplexRendererFns
 AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
   auto MaybeImmed = getImmedFromMO(Root);
diff --git a/llvm/test/CodeGen/AArch64/arm64-tbl.ll b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
index 96b2af7274b5bf..44b92e6ccd088f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
@@ -1,28 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-
-; CHECK-GI:         warning: Instruction selection used fallback path for tbl2_8b
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbl2_16b
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbl3_8b
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbl3_16b
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbl4_8b
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbl4_16b
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_v8i8
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_nonconst_first_mask
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_nonconst_first_mask2
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_nonconst_second_mask
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_nonconst_second_mask2
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_mixed_shuffle
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_mixed_tbl2_mask1
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_mixed_tbl2_mask2
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbx2_8b
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbx2_16b
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbx3_8b
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbx3_16b
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbx4_8b
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for tbx4_16b
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind {
 ; CHECK-LABEL: tbl1_8b:
@@ -43,175 +21,378 @@ define <16 x i8> @tbl1_16b(<16 x i8> %A, <16 x i8> %B) nounwind {
 }
 
 define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) {
-; CHECK-LABEL: tbl2_8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    tbl.8b v0, { v0, v1 }, v2
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbl2_8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    tbl.8b v0, { v0, v1 }, v2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbl2_8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    tbl.8b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C)
   ret <8 x i8> %tmp3
 }
 
 define <16 x i8> @tbl2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
-; CHECK-LABEL: tbl2_16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    tbl.16b v0, { v0, v1 }, v2
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbl2_16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbl2_16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
   ret <16 x i8> %tmp3
 }
 
 define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
-; CHECK-LABEL: tbl3_8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT:    tbl.8b v0, { v0, v1, v2 }, v3
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbl3_8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    tbl.8b v0, { v0, v1, v2 }, v3
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbl3_8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    tbl.8b v0, { v0, v1, v2 }, v3
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
   ret <8 x i8> %tmp3
 }
 
 define <16 x i8> @tbl3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
-; CHECK-LABEL: tbl3_16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2 }, v3
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbl3_16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2 }, v3
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbl3_16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1, v2 }, v3
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
   ret <16 x i8> %tmp3
 }
 
 define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
-; CHECK-LABEL: tbl4_8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    tbl.8b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbl4_8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    tbl.8b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbl4_8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    tbl.8b v0, { v0, v1, v2, v3 }, v4
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
   ret <8 x i8> %tmp3
 }
 
 define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
-; CHECK-LABEL: tbl4_16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbl4_16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbl4_16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
   ret <16 x i8> %tmp3
 }
 
-; CHECK-LABEL: .LCPI8_0:
-; CHECK-NEXT:     .byte    0                               // 0x0
-; CHECK-NEXT:     .byte    4                               // 0x4
-; CHECK-NEXT:     .byte    8                               // 0x8
-; CHECK-NEXT:     .byte    12                              // 0xc
-; CHECK-NEXT:     .byte    255                             // 0xff
-; CHECK-NEXT:     .byte    255                             // 0xff
-; CHECK-NEXT:     .byte    255                             // 0xff
-; CHECK-NEXT:     .byte    255                             // 0xff
+; CHECK-SD-LABEL: .LCPI8_0:
+; CHECK-SD:              .byte   0                               // 0x0
+; CHECK-SD-NEXT:         .byte   4                               // 0x4
+; CHECK-SD-NEXT:         .byte   8                               // 0x8
+; CHECK-SD-NEXT:         .byte   12                              // 0xc
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+
+; CHECK-GI-LABEL: .LCPI8_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   2                               // 0x2
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   13                              // 0xd
+; CHECK-GI-NEXT:         .byte   14                              // 0xe
+; CHECK-GI-NEXT:         .byte   15                              // 0xf
+; CHECK-GI-LABEL: .LCPI8_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
 
 define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_v8i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI8_0
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-NEXT:    ldr d4, [x8, :lo12:.LCPI8_0]
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
-; CHECK-NEXT:    tbl.8b v0, { v0, v1 }, v4
-; CHECK-NEXT:    tbl.8b v1, { v2, v3 }, v4
-; CHECK-NEXT:    mov.s v0[1], v1[1]
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_v8i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    adrp x8, .LCPI8_0
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-SD-NEXT:    ldr d4, [x8, :lo12:.LCPI8_0]
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-SD-NEXT:    tbl.8b v0, { v0, v1 }, v4
+; CHECK-SD-NEXT:    tbl.8b v1, { v2, v3 }, v4
+; CHECK-SD-NEXT:    mov.s v0[1], v1[1]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_v8i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI8_1
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    ldr d4, [x8, :lo12:.LCPI8_1]
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    adrp x8, .LCPI8_0
+; CHECK-GI-NEXT:    tbl.8b v0, { v0, v1 }, v4
+; CHECK-GI-NEXT:    tbl.8b v1, { v2, v3 }, v4
+; CHECK-GI-NEXT:    mov.d v0[1], v1[0]
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI8_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0 }, v1
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
   %t1 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %a, <16 x i8> %b, <8 x i8> <i8 0, i8 4, i8 8, i8 12, i8 -1, i8 -1, i8 -1, i8 -1>)
   %t2 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %c, <16 x i8> %d, <8 x i8> <i8 0, i8 4, i8 8, i8 12, i8 -1, i8 -1, i8 -1, i8 -1>)
   %s = shufflevector <8 x i8> %t1, <8 x i8> %t2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
   ret <8 x i8> %s
 }
 
-; CHECK-LABEL: .LCPI9_0:
-; CHECK-NEXT:     .byte    0                               // 0x0
-; CHECK-NEXT:     .byte    4                               // 0x4
-; CHECK-NEXT:     .byte    8                               // 0x8
-; CHECK-NEXT:     .byte    12                              // 0xc
-; CHECK-NEXT:     .byte    16                              // 0x10
-; CHECK-NEXT:     .byte    20                              // 0x14
-; CHECK-NEXT:     .byte    24                              // 0x18
-; CHECK-NEXT:     .byte    28                              // 0x1c
-; CHECK-NEXT:     .byte   32                              // 0x20
-; CHECK-NEXT:     .byte   36                              // 0x24
-; CHECK-NEXT:     .byte   40                              // 0x28
-; CHECK-NEXT:     .byte   44                              // 0x2c
-; CHECK-NEXT:     .byte   48                              // 0x30
-; CHECK-NEXT:     .byte   52                              // 0x34
-; CHECK-NEXT:     .byte   56                              // 0x38
-; CHECK-NEXT:     .byte   60                              // 0x3c
+; CHECK-SD-LABEL: .LCPI9_0:
+; CHECK-SD-NEXT:     .byte    0                               // 0x0
+; CHECK-SD-NEXT:     .byte    4                               // 0x4
+; CHECK-SD-NEXT:     .byte    8                               // 0x8
+; CHECK-SD-NEXT:     .byte    12                              // 0xc
+; CHECK-SD-NEXT:     .byte    16                              // 0x10
+; CHECK-SD-NEXT:     .byte    20                              // 0x14
+; CHECK-SD-NEXT:     .byte    24                              // 0x18
+; CHECK-SD-NEXT:     .byte    28                              // 0x1c
+; CHECK-SD-NEXT:     .byte   32                              // 0x20
+; CHECK-SD-NEXT:     .byte   36                              // 0x24
+; CHECK-SD-NEXT:     .byte   40                              // 0x28
+; CHECK-SD-NEXT:     .byte   44                              // 0x2c
+; CHECK-SD-NEXT:     .byte   48                              // 0x30
+; CHECK-SD-NEXT:     .byte   52                              // 0x34
+; CHECK-SD-NEXT:     .byte   56                              // 0x38
+; CHECK-SD-NEXT:     .byte   60                              // 0x3c
+
+; CHECK-GI-LABEL: .LCPI9_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   2                               // 0x2
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   5                               // 0x5
+; CHECK-GI-NEXT:         .byte   6                               // 0x6
+; CHECK-GI-NEXT:         .byte   7                               // 0x7
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   17                              // 0x11
+; CHECK-GI-NEXT:         .byte   18                              // 0x12
+; CHECK-GI-NEXT:         .byte   19                              // 0x13
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   22                              // 0x16
+; CHECK-GI-NEXT:         .byte   23                              // 0x17
+; CHECK-GI-LABEL: .LCPI9_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
 
 define <16 x i8> @shuffled_tbl2_to_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    adrp x8, .LCPI9_0
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI9_0]
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    adrp x8, .LCPI9_0
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    ldr q4, [x8, :lo12:.LCPI9_0]
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI9_1
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI9_1]
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    adrp x8, .LCPI9_0
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v4
+; CHECK-GI-NEXT:    tbl.16b v1, { v2, v3 }, v4
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI9_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
   ret <16 x i8> %s
 }
 
+; CHECK-GI-LABEL: .LCPI10_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   2                               // 0x2
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   5                               // 0x5
+; CHECK-GI-NEXT:         .byte   6                               // 0x6
+; CHECK-GI-NEXT:         .byte   7                               // 0x7
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   17                              // 0x11
+; CHECK-GI-NEXT:         .byte   18                              // 0x12
+; CHECK-GI-NEXT:         .byte   19                              // 0x13
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   22                              // 0x16
+; CHECK-GI-NEXT:         .byte   23                              // 0x17
+; CHECK-GI-LABEL: .LCPI10_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+
 define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov s4, w0
-; CHECK-NEXT:    mov w8, #32 // =0x20
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    mov.b v4[1], w0
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    mov.b v4[2], w0
-; CHECK-NEXT:    mov.b v4[3], w0
-; CHECK-NEXT:    mov.b v4[4], w0
-; CHECK-NEXT:    mov.b v4[5], w0
-; CHECK-NEXT:    mov.b v4[6], w0
-; CHECK-NEXT:    mov.b v4[7], w0
-; CHECK-NEXT:    mov.b v4[8], w8
-; CHECK-NEXT:    mov w8, #36 // =0x24
-; CHECK-NEXT:    mov.b v4[9], w8
-; CHECK-NEXT:    mov w8, #40 // =0x28
-; CHECK-NEXT:    mov.b v4[10], w8
-; CHECK-NEXT:    mov w8, #44 // =0x2c
-; CHECK-NEXT:    mov.b v4[11], w8
-; CHECK-NEXT:    mov w8, #48 // =0x30
-; CHECK-NEXT:    mov.b v4[12], w8
-; CHECK-NEXT:    mov w8, #52 // =0x34
-; CHECK-NEXT:    mov.b v4[13], w8
-; CHECK-NEXT:    mov w8, #56 // =0x38
-; CHECK-NEXT:    mov.b v4[14], w8
-; CHECK-NEXT:    mov w8, #60 // =0x3c
-; CHECK-NEXT:    mov.b v4[15], w8
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fmov s4, w0
+; CHECK-SD-NEXT:    mov w8, #32 // =0x20
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    mov.b v4[1], w0
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    mov.b v4[2], w0
+; CHECK-SD-NEXT:    mov.b v4[3], w0
+; CHECK-SD-NEXT:    mov.b v4[4], w0
+; CHECK-SD-NEXT:    mov.b v4[5], w0
+; CHECK-SD-NEXT:    mov.b v4[6], w0
+; CHECK-SD-NEXT:    mov.b v4[7], w0
+; CHECK-SD-NEXT:    mov.b v4[8], w8
+; CHECK-SD-NEXT:    mov w8, #36 // =0x24
+; CHECK-SD-NEXT:    mov.b v4[9], w8
+; CHECK-SD-NEXT:    mov w8, #40 // =0x28
+; CHECK-SD-NEXT:    mov.b v4[10], w8
+; CHECK-SD-NEXT:    mov w8, #44 // =0x2c
+; CHECK-SD-NEXT:    mov.b v4[11], w8
+; CHECK-SD-NEXT:    mov w8, #48 // =0x30
+; CHECK-SD-NEXT:    mov.b v4[12], w8
+; CHECK-SD-NEXT:    mov w8, #52 // =0x34
+; CHECK-SD-NEXT:    mov.b v4[13], w8
+; CHECK-SD-NEXT:    mov w8, #56 // =0x38
+; CHECK-SD-NEXT:    mov.b v4[14], w8
+; CHECK-SD-NEXT:    mov w8, #60 // =0x3c
+; CHECK-SD-NEXT:    mov.b v4[15], w8
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov s4, w0
+; CHECK-GI-NEXT:    mov w8, #255 // =0xff
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    mov.16b v5, v4
+; CHECK-GI-NEXT:    mov.b v5[1], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[2], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[3], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[4], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[5], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[6], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[7], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    adrp x8, .LCPI10_1
+; CHECK-GI-NEXT:    mov.b v5[8], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[9], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[10], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[11], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[12], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[13], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[14], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[15], v4[0]
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI10_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI10_0
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v5
+; CHECK-GI-NEXT:    tbl.16b v1, { v2, v3 }, v4
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI10_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
   %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1
   %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2
@@ -234,40 +415,111 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x
   ret <16 x i8> %s
 }
 
+; CHECK-GI-LABEL: .LCPI11_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   2                               // 0x2
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   5                               // 0x5
+; CHECK-GI-NEXT:         .byte   6                               // 0x6
+; CHECK-GI-NEXT:         .byte   15                              // 0xf
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   17                              // 0x11
+; CHECK-GI-NEXT:         .byte   18                              // 0x12
+; CHECK-GI-NEXT:         .byte   19                              // 0x13
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   22                              // 0x16
+; CHECK-GI-NEXT:         .byte   31                              // 0x1f
+; CHECK-GI-LABEL: .LCPI11_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+
 define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    mov.b v4[1], w8
-; CHECK-NEXT:    mov.b v4[2], w8
-; CHECK-NEXT:    mov.b v4[3], w8
-; CHECK-NEXT:    mov.b v4[4], w8
-; CHECK-NEXT:    mov.b v4[5], w8
-; CHECK-NEXT:    mov.b v4[6], w8
-; CHECK-NEXT:    mov w8, #32 // =0x20
-; CHECK-NEXT:    mov.b v4[7], w0
-; CHECK-NEXT:    mov.b v4[8], w8
-; CHECK-NEXT:    mov w8, #36 // =0x24
-; CHECK-NEXT:    mov.b v4[9], w8
-; CHECK-NEXT:    mov w8, #40 // =0x28
-; CHECK-NEXT:    mov.b v4[10], w8
-; CHECK-NEXT:    mov w8, #44 // =0x2c
-; CHECK-NEXT:    mov.b v4[11], w8
-; CHECK-NEXT:    mov w8, #48 // =0x30
-; CHECK-NEXT:    mov.b v4[12], w8
-; CHECK-NEXT:    mov w8, #52 // =0x34
-; CHECK-NEXT:    mov.b v4[13], w8
-; CHECK-NEXT:    mov w8, #56 // =0x38
-; CHECK-NEXT:    mov.b v4[14], w8
-; CHECK-NEXT:    mov w8, #31 // =0x1f
-; CHECK-NEXT:    mov.b v4[15], w8
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask2:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #1 // =0x1
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    fmov s4, w8
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    mov.b v4[1], w8
+; CHECK-SD-NEXT:    mov.b v4[2], w8
+; CHECK-SD-NEXT:    mov.b v4[3], w8
+; CHECK-SD-NEXT:    mov.b v4[4], w8
+; CHECK-SD-NEXT:    mov.b v4[5], w8
+; CHECK-SD-NEXT:    mov.b v4[6], w8
+; CHECK-SD-NEXT:    mov w8, #32 // =0x20
+; CHECK-SD-NEXT:    mov.b v4[7], w0
+; CHECK-SD-NEXT:    mov.b v4[8], w8
+; CHECK-SD-NEXT:    mov w8, #36 // =0x24
+; CHECK-SD-NEXT:    mov.b v4[9], w8
+; CHECK-SD-NEXT:    mov w8, #40 // =0x28
+; CHECK-SD-NEXT:    mov.b v4[10], w8
+; CHECK-SD-NEXT:    mov w8, #44 // =0x2c
+; CHECK-SD-NEXT:    mov.b v4[11], w8
+; CHECK-SD-NEXT:    mov w8, #48 // =0x30
+; CHECK-SD-NEXT:    mov.b v4[12], w8
+; CHECK-SD-NEXT:    mov w8, #52 // =0x34
+; CHECK-SD-NEXT:    mov.b v4[13], w8
+; CHECK-SD-NEXT:    mov w8, #56 // =0x38
+; CHECK-SD-NEXT:    mov.b v4[14], w8
+; CHECK-SD-NEXT:    mov w8, #31 // =0x1f
+; CHECK-SD-NEXT:    mov.b v4[15], w8
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask2:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    fmov s6, w0
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov w8, #255 // =0xff
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    mov.16b v5, v4
+; CHECK-GI-NEXT:    mov.b v5[1], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[2], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[3], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[4], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[5], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[6], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[7], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    adrp x8, .LCPI11_1
+; CHECK-GI-NEXT:    mov.b v5[8], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[9], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[10], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[11], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[12], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[13], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[14], v4[0]
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI11_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI11_0
+; CHECK-GI-NEXT:    mov.b v5[15], v6[0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v5
+; CHECK-GI-NEXT:    tbl.16b v1, { v2, v3 }, v4
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI11_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %ins.0 = insertelement <16 x i8> poison, i8 1, i32 0
   %ins.1 = insertelement <16 x i8> %ins.0, i8 1, i32 1
   %ins.2 = insertelement <16 x i8> %ins.1, i8 1, i32 2
@@ -290,29 +542,116 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x
   ret <16 x i8> %s
 }
 
+; CHECK-SD-LABEL: .LCPI12_0:
+; CHECK-SD:              .byte   0                               // 0x0
+; CHECK-SD-NEXT:         .byte   4                               // 0x4
+; CHECK-SD-NEXT:         .byte   8                               // 0x8
+; CHECK-SD-NEXT:         .byte   12                              // 0xc
+; CHECK-SD-NEXT:         .byte   16                              // 0x10
+; CHECK-SD-NEXT:         .byte   20                              // 0x14
+; CHECK-SD-NEXT:         .byte   24                              // 0x18
+; CHECK-SD-NEXT:         .byte   28                              // 0x1c
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+
+; CHECK-GI-LABEL: .LCPI12_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   2                               // 0x2
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   5                               // 0x5
+; CHECK-GI-NEXT:         .byte   6                               // 0x6
+; CHECK-GI-NEXT:         .byte   7                               // 0x7
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   17                              // 0x11
+; CHECK-GI-NEXT:         .byte   18                              // 0x12
+; CHECK-GI-NEXT:         .byte   19                              // 0x13
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   22                              // 0x16
+; CHECK-GI-NEXT:         .byte   23                              // 0x17
+; CHECK-GI-LABEL: .LCPI12_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+
 define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi.2d v4, #0xffffffffffffffff
-; CHECK-NEXT:    adrp x8, .LCPI12_0
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    ldr q5, [x8, :lo12:.LCPI12_0]
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    tbl.16b v2, { v2, v3 }, v5
-; CHECK-NEXT:    mov.b v4[0], w0
-; CHECK-NEXT:    mov.b v4[1], w0
-; CHECK-NEXT:    mov.b v4[2], w0
-; CHECK-NEXT:    mov.b v4[3], w0
-; CHECK-NEXT:    mov.b v4[4], w0
-; CHECK-NEXT:    mov.b v4[5], w0
-; CHECK-NEXT:    mov.b v4[6], w0
-; CHECK-NEXT:    mov.b v4[7], w0
-; CHECK-NEXT:    tbl.16b v0, { v0, v1 }, v4
-; CHECK-NEXT:    mov.d v2[1], v0[0]
-; CHECK-NEXT:    mov.16b v0, v2
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi.2d v4, #0xffffffffffffffff
+; CHECK-SD-NEXT:    adrp x8, .LCPI12_0
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    ldr q5, [x8, :lo12:.LCPI12_0]
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    tbl.16b v2, { v2, v3 }, v5
+; CHECK-SD-NEXT:    mov.b v4[0], w0
+; CHECK-SD-NEXT:    mov.b v4[1], w0
+; CHECK-SD-NEXT:    mov.b v4[2], w0
+; CHECK-SD-NEXT:    mov.b v4[3], w0
+; CHECK-SD-NEXT:    mov.b v4[4], w0
+; CHECK-SD-NEXT:    mov.b v4[5], w0
+; CHECK-SD-NEXT:    mov.b v4[6], w0
+; CHECK-SD-NEXT:    mov.b v4[7], w0
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1 }, v4
+; CHECK-SD-NEXT:    mov.d v2[1], v0[0]
+; CHECK-SD-NEXT:    mov.16b v0, v2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov s4, w0
+; CHECK-GI-NEXT:    mov w8, #255 // =0xff
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    mov.16b v5, v4
+; CHECK-GI-NEXT:    mov.b v5[1], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[2], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[3], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[4], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[5], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[6], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[7], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    adrp x8, .LCPI12_1
+; CHECK-GI-NEXT:    mov.b v5[8], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[9], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[10], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[11], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[12], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[13], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[14], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[15], v4[0]
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI12_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI12_0
+; CHECK-GI-NEXT:    tbl.16b v2, { v2, v3 }, v4
+; CHECK-GI-NEXT:    tbl.16b v3, { v0, v1 }, v5
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI12_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v2, v3 }, v0
+; CHECK-GI-NEXT:    ret
   %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
   %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1
   %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2
@@ -335,29 +674,133 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x
   ret <16 x i8> %s
 }
 
+; CHECK-SD-LABEL: .LCPI13_0:
+; CHECK-SD:              .byte   0                               // 0x0
+; CHECK-SD-NEXT:         .byte   4                               // 0x4
+; CHECK-SD-NEXT:         .byte   8                               // 0x8
+; CHECK-SD-NEXT:         .byte   12                              // 0xc
+; CHECK-SD-NEXT:         .byte   16                              // 0x10
+; CHECK-SD-NEXT:         .byte   20                              // 0x14
+; CHECK-SD-NEXT:         .byte   24                              // 0x18
+; CHECK-SD-NEXT:         .byte   28                              // 0x1c
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-LABEL: .LCPI13_1:
+; CHECK-SD:              .byte   0                               // 0x0
+; CHECK-SD-NEXT:         .byte   1                               // 0x1
+; CHECK-SD-NEXT:         .byte   2                               // 0x2
+; CHECK-SD-NEXT:         .byte   3                               // 0x3
+; CHECK-SD-NEXT:         .byte   4                               // 0x4
+; CHECK-SD-NEXT:         .byte   5                               // 0x5
+; CHECK-SD-NEXT:         .byte   6                               // 0x6
+; CHECK-SD-NEXT:         .byte   7                               // 0x7
+; CHECK-SD-NEXT:         .byte   16                              // 0x10
+; CHECK-SD-NEXT:         .byte   17                              // 0x11
+; CHECK-SD-NEXT:         .byte   18                              // 0x12
+; CHECK-SD-NEXT:         .byte   19                              // 0x13
+; CHECK-SD-NEXT:         .byte   20                              // 0x14
+; CHECK-SD-NEXT:         .byte   21                              // 0x15
+; CHECK-SD-NEXT:         .byte   30                              // 0x1e
+; CHECK-SD-NEXT:         .byte   31                              // 0x1f
+
+; CHECK-GI-LABEL: .LCPI13_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   2                               // 0x2
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   5                               // 0x5
+; CHECK-GI-NEXT:         .byte   6                               // 0x6
+; CHECK-GI-NEXT:         .byte   7                               // 0x7
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   17                              // 0x11
+; CHECK-GI-NEXT:         .byte   18                              // 0x12
+; CHECK-GI-NEXT:         .byte   19                              // 0x13
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   30                              // 0x1e
+; CHECK-GI-NEXT:         .byte   31                              // 0x1f
+; CHECK-GI-LABEL: .LCPI13_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+
 define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup.16b v4, w0
-; CHECK-NEXT:    mov w8, #255 // =0xff
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    mov.b v4[8], w8
-; CHECK-NEXT:    mov.b v4[9], w8
-; CHECK-NEXT:    mov.b v4[10], w8
-; CHECK-NEXT:    mov.b v4[11], w8
-; CHECK-NEXT:    mov.b v4[12], w8
-; CHECK-NEXT:    mov.b v4[13], w8
-; CHECK-NEXT:    adrp x8, .LCPI13_0
-; CHECK-NEXT:    ldr q5, [x8, :lo12:.LCPI13_0]
-; CHECK-NEXT:    adrp x8, .LCPI13_1
-; CHECK-NEXT:    tbl.16b v2, { v2, v3 }, v5
-; CHECK-NEXT:    tbl.16b v3, { v0, v1 }, v4
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI13_1]
-; CHECK-NEXT:    tbl.16b v0, { v2, v3 }, v0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask2:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup.16b v4, w0
+; CHECK-SD-NEXT:    mov w8, #255 // =0xff
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    mov.b v4[8], w8
+; CHECK-SD-NEXT:    mov.b v4[9], w8
+; CHECK-SD-NEXT:    mov.b v4[10], w8
+; CHECK-SD-NEXT:    mov.b v4[11], w8
+; CHECK-SD-NEXT:    mov.b v4[12], w8
+; CHECK-SD-NEXT:    mov.b v4[13], w8
+; CHECK-SD-NEXT:    adrp x8, .LCPI13_0
+; CHECK-SD-NEXT:    ldr q5, [x8, :lo12:.LCPI13_0]
+; CHECK-SD-NEXT:    adrp x8, .LCPI13_1
+; CHECK-SD-NEXT:    tbl.16b v2, { v2, v3 }, v5
+; CHECK-SD-NEXT:    tbl.16b v3, { v0, v1 }, v4
+; CHECK-SD-NEXT:    ldr q0, [x8, :lo12:.LCPI13_1]
+; CHECK-SD-NEXT:    tbl.16b v0, { v2, v3 }, v0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask2:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov s4, w0
+; CHECK-GI-NEXT:    mov w8, #255 // =0xff
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    fmov s6, w8
+; CHECK-GI-NEXT:    adrp x8, .LCPI13_1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    mov.16b v5, v4
+; CHECK-GI-NEXT:    mov.b v5[1], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[2], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[3], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[4], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[5], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[6], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[7], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[8], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[9], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[10], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[11], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[12], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[13], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[14], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[15], v4[0]
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI13_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI13_0
+; CHECK-GI-NEXT:    tbl.16b v2, { v2, v3 }, v4
+; CHECK-GI-NEXT:    tbl.16b v3, { v0, v1 }, v5
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI13_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v2, v3 }, v0
+; CHECK-GI-NEXT:    ret
   %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
   %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1
   %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2
@@ -380,106 +823,293 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16
   ret <16 x i8> %s
 }
 
+; CHECK-SD-LABEL: .LCPI14_0:
+; CHECK-SD:              .byte   0                               // 0x0
+; CHECK-SD-NEXT:         .byte   4                               // 0x4
+; CHECK-SD-NEXT:         .byte   52                              // 0x34
+; CHECK-SD-NEXT:         .byte   12                              // 0xc
+; CHECK-SD-NEXT:         .byte   16                              // 0x10
+; CHECK-SD-NEXT:         .byte   20                              // 0x14
+; CHECK-SD-NEXT:         .byte   24                              // 0x18
+; CHECK-SD-NEXT:         .byte   28                              // 0x1c
+; CHECK-SD-NEXT:         .byte   32                              // 0x20
+; CHECK-SD-NEXT:         .byte   36                              // 0x24
+; CHECK-SD-NEXT:         .byte   40                              // 0x28
+; CHECK-SD-NEXT:         .byte   44                              // 0x2c
+; CHECK-SD-NEXT:         .byte   48                              // 0x30
+; CHECK-SD-NEXT:         .byte   52                              // 0x34
+; CHECK-SD-NEXT:         .byte   56                              // 0x38
+; CHECK-SD-NEXT:         .byte   60                              // 0x3c
 
-; CHECK-LABEL: .LCPI14_0:
-; CHECK-NEXT:	.byte	0                               // 0x0
-; CHECK-NEXT:	.byte	4                               // 0x4
-; CHECK-NEXT:	.byte	52                              // 0x34
-; CHECK-NEXT:	.byte	12                              // 0xc
-; CHECK-NEXT:	.byte	16                              // 0x10
-; CHECK-NEXT:	.byte	20                              // 0x14
-; CHECK-NEXT:	.byte	24                              // 0x18
-; CHECK-NEXT:	.byte	28                              // 0x1c
-; CHECK-NEXT:	.byte	32                              // 0x20
-; CHECK-NEXT:	.byte	36                              // 0x24
-; CHECK-NEXT:	.byte	40                              // 0x28
-; CHECK-NEXT:	.byte	44                              // 0x2c
-; CHECK-NEXT:	.byte	48                              // 0x30
-; CHECK-NEXT:	.byte	52                              // 0x34
-; CHECK-NEXT:	.byte	56                              // 0x38
-; CHECK-NEXT:	.byte	60                              // 0x3c
+; CHECK-GI-LABEL: .LCPI14_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   5                               // 0x5
+; CHECK-GI-NEXT:         .byte   6                               // 0x6
+; CHECK-GI-NEXT:         .byte   7                               // 0x7
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   17                              // 0x11
+; CHECK-GI-NEXT:         .byte   18                              // 0x12
+; CHECK-GI-NEXT:         .byte   19                              // 0x13
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   22                              // 0x16
+; CHECK-GI-NEXT:         .byte   23                              // 0x17
+; CHECK-GI-LABEL: .LCPI14_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
 
 define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_shuffle:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    adrp x8, .LCPI14_0
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI14_0]
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_mixed_shuffle:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    adrp x8, .LCPI14_0
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    ldr q4, [x8, :lo12:.LCPI14_0]
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_mixed_shuffle:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI14_1
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI14_1]
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    adrp x8, .LCPI14_0
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v4
+; CHECK-GI-NEXT:    tbl.16b v1, { v2, v3 }, v4
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI14_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
   ret <16 x i8> %s
 }
 
-; CHECK-LABEL: .LCPI15_0:
-; CHECK-NEXT:	.byte	0                               // 0x0
-; CHECK-NEXT:	.byte	4                               // 0x4
-; CHECK-NEXT:	.byte	52                              // 0x34
-; CHECK-NEXT:	.byte	12                              // 0xc
-; CHECK-NEXT:	.byte	16                              // 0x10
-; CHECK-NEXT:	.byte	20                              // 0x14
-; CHECK-NEXT:	.byte	24                              // 0x18
-; CHECK-NEXT:	.byte	28                              // 0x1c
-; CHECK-NEXT:	.byte	32                              // 0x20
-; CHECK-NEXT:	.byte	36                              // 0x24
-; CHECK-NEXT:	.byte	40                              // 0x28
-; CHECK-NEXT:	.byte	44                              // 0x2c
-; CHECK-NEXT:	.byte	48                              // 0x30
-; CHECK-NEXT:	.byte	52                              // 0x34
-; CHECK-NEXT:	.byte	56                              // 0x38
-; CHECK-NEXT:	.byte	60                              // 0x3c
+; CHECK-SD-LABEL: .LCPI15_0:
+; CHECK-SD:              .byte   0                               // 0x0
+; CHECK-SD-NEXT:         .byte   4                               // 0x4
+; CHECK-SD-NEXT:         .byte   52                              // 0x34
+; CHECK-SD-NEXT:         .byte   12                              // 0xc
+; CHECK-SD-NEXT:         .byte   16                              // 0x10
+; CHECK-SD-NEXT:         .byte   20                              // 0x14
+; CHECK-SD-NEXT:         .byte   24                              // 0x18
+; CHECK-SD-NEXT:         .byte   28                              // 0x1c
+; CHECK-SD-NEXT:         .byte   32                              // 0x20
+; CHECK-SD-NEXT:         .byte   36                              // 0x24
+; CHECK-SD-NEXT:         .byte   40                              // 0x28
+; CHECK-SD-NEXT:         .byte   44                              // 0x2c
+; CHECK-SD-NEXT:         .byte   48                              // 0x30
+; CHECK-SD-NEXT:         .byte   52                              // 0x34
+; CHECK-SD-NEXT:         .byte   56                              // 0x38
+; CHECK-SD-NEXT:         .byte   60                              // 0x3c
+
+; CHECK-GI-LABEL: .LCPI15_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   5                               // 0x5
+; CHECK-GI-NEXT:         .byte   6                               // 0x6
+; CHECK-GI-NEXT:         .byte   7                               // 0x7
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   17                              // 0x11
+; CHECK-GI-NEXT:         .byte   18                              // 0x12
+; CHECK-GI-NEXT:         .byte   19                              // 0x13
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   22                              // 0x16
+; CHECK-GI-NEXT:         .byte   23                              // 0x17
+; CHECK-GI-LABEL: .LCPI15_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-LABEL: .LCPI15_2:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
 
 define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask1:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    adrp x8, .LCPI15_0
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI15_0]
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask1:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    adrp x8, .LCPI15_0
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    ldr q4, [x8, :lo12:.LCPI15_0]
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask1:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI15_2
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI15_2]
+; CHECK-GI-NEXT:    adrp x8, .LCPI15_1
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    ldr q5, [x8, :lo12:.LCPI15_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI15_0
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v4
+; CHECK-GI-NEXT:    tbl.16b v1, { v2, v3 }, v5
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI15_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
   ret <16 x i8> %s
 }
 
-; CHECK-LABEL: .LCPI16_0:
-; CHECK-NEXT: 	.byte	0                               // 0x0
-; CHECK-NEXT: 	.byte	4                               // 0x4
-; CHECK-NEXT: 	.byte	52                              // 0x34
-; CHECK-NEXT: 	.byte	12                              // 0xc
-; CHECK-NEXT: 	.byte	16                              // 0x10
-; CHECK-NEXT: 	.byte	20                              // 0x14
-; CHECK-NEXT: 	.byte	24                              // 0x18
-; CHECK-NEXT: 	.byte	28                              // 0x1c
-; CHECK-NEXT: 	.byte	32                              // 0x20
-; CHECK-NEXT: 	.byte	36                              // 0x24
-; CHECK-NEXT: 	.byte	40                              // 0x28
-; CHECK-NEXT: 	.byte	44                              // 0x2c
-; CHECK-NEXT: 	.byte	48                              // 0x30
-; CHECK-NEXT: 	.byte	52                              // 0x34
-; CHECK-NEXT: 	.byte	56                              // 0x38
-; CHECK-NEXT: 	.byte	60                              // 0x3c
+; CHECK-SD-LABEL: .LCPI16_0:
+; CHECK-SD:              .byte   0                               // 0x0
+; CHECK-SD-NEXT:         .byte   4                               // 0x4
+; CHECK-SD-NEXT:         .byte   52                              // 0x34
+; CHECK-SD-NEXT:         .byte   12                              // 0xc
+; CHECK-SD-NEXT:         .byte   16                              // 0x10
+; CHECK-SD-NEXT:         .byte   20                              // 0x14
+; CHECK-SD-NEXT:         .byte   24                              // 0x18
+; CHECK-SD-NEXT:         .byte   28                              // 0x1c
+; CHECK-SD-NEXT:         .byte   32                              // 0x20
+; CHECK-SD-NEXT:         .byte   36                              // 0x24
+; CHECK-SD-NEXT:         .byte   40                              // 0x28
+; CHECK-SD-NEXT:         .byte   44                              // 0x2c
+; CHECK-SD-NEXT:         .byte   48                              // 0x30
+; CHECK-SD-NEXT:         .byte   52                              // 0x34
+; CHECK-SD-NEXT:         .byte   56                              // 0x38
+; CHECK-SD-NEXT:         .byte   60                              // 0x3c
+
+; CHECK-GI-LABEL: .LCPI16_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   5                               // 0x5
+; CHECK-GI-NEXT:         .byte   6                               // 0x6
+; CHECK-GI-NEXT:         .byte   7                               // 0x7
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   17                              // 0x11
+; CHECK-GI-NEXT:         .byte   18                              // 0x12
+; CHECK-GI-NEXT:         .byte   19                              // 0x13
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   22                              // 0x16
+; CHECK-GI-NEXT:         .byte   23                              // 0x17
+; CHECK-GI-LABEL: .LCPI16_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-LABEL: .LCPI16_2:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
 
 define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    adrp x8, .LCPI16_0
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI16_0]
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask2:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    adrp x8, .LCPI16_0
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    ldr q4, [x8, :lo12:.LCPI16_0]
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask2:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI16_2
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI16_2]
+; CHECK-GI-NEXT:    adrp x8, .LCPI16_1
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    ldr q5, [x8, :lo12:.LCPI16_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI16_0
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v4
+; CHECK-GI-NEXT:    tbl.16b v1, { v2, v3 }, v5
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI16_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
@@ -514,73 +1144,121 @@ define <16 x i8> @tbx1_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind {
 }
 
 define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
-; CHECK-LABEL: tbx2_8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2
-; CHECK-NEXT:    tbx.8b v0, { v1, v2 }, v3
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbx2_8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2
+; CHECK-SD-NEXT:    tbx.8b v0, { v1, v2 }, v3
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbx2_8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2
+; CHECK-GI-NEXT:    tbx.8b v0, { v1, v2 }, v3
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
   ret <8 x i8> %tmp3
 }
 
 define <16 x i8> @tbx2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
-; CHECK-LABEL: tbx2_16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2
-; CHECK-NEXT:    tbx.16b v0, { v1, v2 }, v3
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbx2_16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2
+; CHECK-SD-NEXT:    tbx.16b v0, { v1, v2 }, v3
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbx2_16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2
+; CHECK-GI-NEXT:    tbx.16b v0, { v1, v2 }, v3
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
   ret <16 x i8> %tmp3
 }
 
 define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
-; CHECK-LABEL: tbx3_8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3
-; CHECK-NEXT:    tbx.8b v0, { v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbx3_8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-SD-NEXT:    tbx.8b v0, { v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbx3_8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-GI-NEXT:    tbx.8b v0, { v1, v2, v3 }, v4
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(< 8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
   ret <8 x i8> %tmp3
 }
 
 define <16 x i8> @tbx3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
-; CHECK-LABEL: tbx3_16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3
-; CHECK-NEXT:    tbx.16b v0, { v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbx3_16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-SD-NEXT:    tbx.16b v0, { v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbx3_16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-GI-NEXT:    tbx.16b v0, { v1, v2, v3 }, v4
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
   ret <16 x i8> %tmp3
 }
 
 define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) {
-; CHECK-LABEL: tbx4_8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    tbx.8b v0, { v1, v2, v3, v4 }, v5
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbx4_8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    tbx.8b v0, { v1, v2, v3, v4 }, v5
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbx4_8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    tbx.8b v0, { v1, v2, v3, v4 }, v5
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F)
   ret <8 x i8> %tmp3
 }
 
 define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) {
-; CHECK-LABEL: tbx4_16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    tbx.16b v0, { v1, v2, v3, v4 }, v5
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbx4_16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    tbx.16b v0, { v1, v2, v3, v4 }, v5
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbx4_16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    tbx.16b v0, { v1, v2, v3, v4 }, v5
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F)
   ret <16 x i8> %tmp3
 }
@@ -594,6 +1272,3 @@ declare <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>,
 declare <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
 declare <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
 
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-GI: {{.*}}
-; CHECK-SD: {{.*}}

From 8e1290432adf33a7aeca65a53d1faa7577ed0e66 Mon Sep 17 00:00:00 2001
From: Pavel Labath <pavel@labath.sk>
Date: Wed, 29 May 2024 14:19:49 +0200
Subject: [PATCH 113/230] [lldb/DWARF] Refactor
 DWARFDIE::Get{Decl,TypeLookup}Context (#93291)

After a bug (the bug is that the functions don't handle DW_AT_signature,
aka type units) led me to one of these similar-but-different functions,
I started to realize that most of the differences between these two
functions are actually bugs.

As a first step towards merging them, this patch rewrites both of them
to follow the same pattern, while preserving all of their differences.
The main change is that GetTypeLookupContext now also uses a `seen` list
to avoid reference loops (currently that's not necessary because the
function strictly follows parent links, but that will change with
DW_AT_signatures).

I've also optimized both functions to avoid recursion by starting construction
with the deepest scope first (and then reversing it).
---
 lldb/include/lldb/Symbol/Type.h               |   2 +
 .../Plugins/SymbolFile/DWARF/DWARFDIE.cpp     | 197 +++++++++---------
 lldb/source/Symbol/Type.cpp                   |   7 +
 .../SymbolFile/DWARF/DWARFDIETest.cpp         |  71 +++++++
 4 files changed, 184 insertions(+), 93 deletions(-)

diff --git a/lldb/include/lldb/Symbol/Type.h b/lldb/include/lldb/Symbol/Type.h
index 7aa0852676e465..c6f30cde818674 100644
--- a/lldb/include/lldb/Symbol/Type.h
+++ b/lldb/include/lldb/Symbol/Type.h
@@ -62,6 +62,8 @@ struct CompilerContext {
   CompilerContextKind kind;
   ConstString name;
 };
+llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
+                              const CompilerContext &rhs);
 
 /// Match \p context_chain against \p pattern, which may contain "Any"
 /// kinds. The \p context_chain should *not* contain any "Any" kinds.
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp
index 4884374ef94729..03e289bbf33005 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp
@@ -13,6 +13,7 @@
 #include "DWARFDebugInfoEntry.h"
 #include "DWARFDeclContext.h"
 #include "DWARFUnit.h"
+#include "lldb/Symbol/Type.h"
 
 #include "llvm/ADT/iterator.h"
 
@@ -379,108 +380,118 @@ std::vector<DWARFDIE> DWARFDIE::GetDeclContextDIEs() const {
   return result;
 }
 
-static std::vector<lldb_private::CompilerContext>
-GetDeclContextImpl(llvm::SmallSet<lldb::user_id_t, 4> &seen, DWARFDIE die) {
-  std::vector<lldb_private::CompilerContext> context;
+static void GetDeclContextImpl(DWARFDIE die,
+                               llvm::SmallSet<lldb::user_id_t, 4> &seen,
+                               std::vector<CompilerContext> &context) {
   // Stop if we hit a cycle.
-  if (!die || !seen.insert(die.GetID()).second)
-    return context;
-
-  // Handle outline member function DIEs by following the specification.
-  if (DWARFDIE spec = die.GetReferencedDIE(DW_AT_specification))
-    return GetDeclContextImpl(seen, spec);
-
-  // Get the parent context chain.
-  context = GetDeclContextImpl(seen, die.GetParent());
+  while (die && seen.insert(die.GetID()).second) {
+    // Handle outline member function DIEs by following the specification.
+    if (DWARFDIE spec = die.GetReferencedDIE(DW_AT_specification)) {
+      die = spec;
+      continue;
+    }
 
-  // Add this DIE's contribution at the end of the chain.
-  auto push_ctx = [&](CompilerContextKind kind, llvm::StringRef name) {
-    context.push_back({kind, ConstString(name)});
-  };
-  switch (die.Tag()) {
-  case DW_TAG_module:
-    push_ctx(CompilerContextKind::Module, die.GetName());
-    break;
-  case DW_TAG_namespace:
-    push_ctx(CompilerContextKind::Namespace, die.GetName());
-    break;
-  case DW_TAG_structure_type:
-    push_ctx(CompilerContextKind::Struct, die.GetName());
-    break;
-  case DW_TAG_union_type:
-    push_ctx(CompilerContextKind::Union, die.GetName());
-    break;
-  case DW_TAG_class_type:
-    push_ctx(CompilerContextKind::Class, die.GetName());
-    break;
-  case DW_TAG_enumeration_type:
-    push_ctx(CompilerContextKind::Enum, die.GetName());
-    break;
-  case DW_TAG_subprogram:
-    push_ctx(CompilerContextKind::Function, die.GetName());
-    break;
-  case DW_TAG_variable:
-    push_ctx(CompilerContextKind::Variable, die.GetPubname());
-    break;
-  case DW_TAG_typedef:
-    push_ctx(CompilerContextKind::Typedef, die.GetName());
-    break;
-  default:
-    break;
+    // Add this DIE's contribution at the end of the chain.
+    auto push_ctx = [&](CompilerContextKind kind, llvm::StringRef name) {
+      context.push_back({kind, ConstString(name)});
+    };
+    switch (die.Tag()) {
+    case DW_TAG_module:
+      push_ctx(CompilerContextKind::Module, die.GetName());
+      break;
+    case DW_TAG_namespace:
+      push_ctx(CompilerContextKind::Namespace, die.GetName());
+      break;
+    case DW_TAG_structure_type:
+      push_ctx(CompilerContextKind::Struct, die.GetName());
+      break;
+    case DW_TAG_union_type:
+      push_ctx(CompilerContextKind::Union, die.GetName());
+      break;
+    case DW_TAG_class_type:
+      push_ctx(CompilerContextKind::Class, die.GetName());
+      break;
+    case DW_TAG_enumeration_type:
+      push_ctx(CompilerContextKind::Enum, die.GetName());
+      break;
+    case DW_TAG_subprogram:
+      push_ctx(CompilerContextKind::Function, die.GetName());
+      break;
+    case DW_TAG_variable:
+      push_ctx(CompilerContextKind::Variable, die.GetPubname());
+      break;
+    case DW_TAG_typedef:
+      push_ctx(CompilerContextKind::Typedef, die.GetName());
+      break;
+    default:
+      break;
+    }
+    // Now process the parent.
+    die = die.GetParent();
   }
-  return context;
 }
 
-std::vector<lldb_private::CompilerContext> DWARFDIE::GetDeclContext() const {
+std::vector<CompilerContext> DWARFDIE::GetDeclContext() const {
   llvm::SmallSet<lldb::user_id_t, 4> seen;
-  return GetDeclContextImpl(seen, *this);
+  std::vector<CompilerContext> context;
+  GetDeclContextImpl(*this, seen, context);
+  std::reverse(context.begin(), context.end());
+  return context;
 }
 
-std::vector<lldb_private::CompilerContext>
-DWARFDIE::GetTypeLookupContext() const {
-  std::vector<lldb_private::CompilerContext> context;
-  // If there is no name, then there is no need to look anything up for this
-  // DIE.
-  const char *name = GetName();
-  if (!name || !name[0])
-    return context;
-  const dw_tag_t tag = Tag();
-  if (tag == DW_TAG_compile_unit || tag == DW_TAG_partial_unit)
-    return context;
-  DWARFDIE parent = GetParent();
-  if (parent)
-    context = parent.GetTypeLookupContext();
-  auto push_ctx = [&](CompilerContextKind kind, llvm::StringRef name) {
-    context.push_back({kind, ConstString(name)});
-  };
-  switch (tag) {
-  case DW_TAG_namespace:
-    push_ctx(CompilerContextKind::Namespace, name);
-    break;
-  case DW_TAG_structure_type:
-    push_ctx(CompilerContextKind::Struct, name);
-    break;
-  case DW_TAG_union_type:
-    push_ctx(CompilerContextKind::Union, name);
-    break;
-  case DW_TAG_class_type:
-    push_ctx(CompilerContextKind::Class, name);
-    break;
-  case DW_TAG_enumeration_type:
-    push_ctx(CompilerContextKind::Enum, name);
-    break;
-  case DW_TAG_variable:
-    push_ctx(CompilerContextKind::Variable, GetPubname());
-    break;
-  case DW_TAG_typedef:
-    push_ctx(CompilerContextKind::Typedef, name);
-    break;
-  case DW_TAG_base_type:
-    push_ctx(CompilerContextKind::Builtin, name);
-    break;
-  default:
-    break;
+static void GetTypeLookupContextImpl(DWARFDIE die,
+                                     llvm::SmallSet<lldb::user_id_t, 4> &seen,
+                                     std::vector<CompilerContext> &context) {
+  // Stop if we hit a cycle.
+  while (die && seen.insert(die.GetID()).second) {
+    // If there is no name, then there is no need to look anything up for this
+    // DIE.
+    const char *name = die.GetName();
+    if (!name || !name[0])
+      return;
+
+    // Add this DIE's contribution at the end of the chain.
+    auto push_ctx = [&](CompilerContextKind kind, llvm::StringRef name) {
+      context.push_back({kind, ConstString(name)});
+    };
+    switch (die.Tag()) {
+    case DW_TAG_namespace:
+      push_ctx(CompilerContextKind::Namespace, die.GetName());
+      break;
+    case DW_TAG_structure_type:
+      push_ctx(CompilerContextKind::Struct, die.GetName());
+      break;
+    case DW_TAG_union_type:
+      push_ctx(CompilerContextKind::Union, die.GetName());
+      break;
+    case DW_TAG_class_type:
+      push_ctx(CompilerContextKind::Class, die.GetName());
+      break;
+    case DW_TAG_enumeration_type:
+      push_ctx(CompilerContextKind::Enum, die.GetName());
+      break;
+    case DW_TAG_variable:
+      push_ctx(CompilerContextKind::Variable, die.GetPubname());
+      break;
+    case DW_TAG_typedef:
+      push_ctx(CompilerContextKind::Typedef, die.GetName());
+      break;
+    case DW_TAG_base_type:
+      push_ctx(CompilerContextKind::Builtin, name);
+      break;
+    default:
+      break;
+    }
+    // Now process the parent.
+    die = die.GetParent();
   }
+}
+
+std::vector<CompilerContext> DWARFDIE::GetTypeLookupContext() const {
+  llvm::SmallSet<lldb::user_id_t, 4> seen;
+  std::vector<CompilerContext> context;
+  GetTypeLookupContextImpl(*this, seen, context);
+  std::reverse(context.begin(), context.end());
   return context;
 }
 
diff --git a/lldb/source/Symbol/Type.cpp b/lldb/source/Symbol/Type.cpp
index 6bf69c2ded2874..585808ace15ce8 100644
--- a/lldb/source/Symbol/Type.cpp
+++ b/lldb/source/Symbol/Type.cpp
@@ -36,6 +36,13 @@
 using namespace lldb;
 using namespace lldb_private;
 
+llvm::raw_ostream &lldb_private::operator<<(llvm::raw_ostream &os,
+                                            const CompilerContext &rhs) {
+  StreamString lldb_stream;
+  rhs.Dump(lldb_stream);
+  return os << lldb_stream.GetString();
+}
+
 bool lldb_private::contextMatches(llvm::ArrayRef<CompilerContext> context_chain,
                                   llvm::ArrayRef<CompilerContext> pattern) {
   auto ctx = context_chain.begin();
diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp
index 20742ea5123091..bea07dfa27cc6a 100644
--- a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp
+++ b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp
@@ -10,6 +10,8 @@
 #include "Plugins/SymbolFile/DWARF/DWARFDebugInfo.h"
 #include "TestingSupport/Symbol/YAMLModuleTester.h"
 #include "lldb/Core/dwarf.h"
+#include "lldb/Symbol/Type.h"
+#include "lldb/lldb-private-enumerations.h"
 #include "llvm/ADT/STLExtras.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
@@ -187,3 +189,72 @@ TEST(DWARFDIETest, PeekName) {
   dw_offset_t fifth_die_offset = 26;
   EXPECT_EQ(unit->PeekDIEName(fifth_die_offset), "NameType2");
 }
+
+TEST(DWARFDIETest, GetContext) {
+  const char *yamldata = R"(
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_386
+DWARF:
+  debug_abbrev:
+    - ID:              0
+      Table:
+        - Code:            0x1
+          Tag:             DW_TAG_compile_unit
+          Children:        DW_CHILDREN_yes
+          Attributes:
+            - Attribute:       DW_AT_language
+              Form:            DW_FORM_data2
+        - Code:            0x2
+          Tag:             DW_TAG_namespace
+          Children:        DW_CHILDREN_yes
+          Attributes:
+            - Attribute:       DW_AT_name
+              Form:            DW_FORM_string
+        - Code:            0x3
+          Tag:             DW_TAG_structure_type
+          Children:        DW_CHILDREN_no
+          Attributes:
+            - Attribute:       DW_AT_name
+              Form:            DW_FORM_string
+  debug_info:
+    - Version:         4
+      AddrSize:        8
+      Entries:
+        - AbbrCode:        0x1
+          Values:
+            - Value:           0x000000000000000C
+        - AbbrCode:        0x2
+          Values:
+            - CStr:            NAMESPACE
+        - AbbrCode:        0x3
+          Values:
+            - CStr:            STRUCT
+        - AbbrCode:        0x0
+        - AbbrCode:        0x0
+)";
+
+  YAMLModuleTester t(yamldata);
+  auto *symbol_file =
+      llvm::cast<SymbolFileDWARF>(t.GetModule()->GetSymbolFile());
+  DWARFUnit *unit = symbol_file->DebugInfo().GetUnitAtIndex(0);
+  ASSERT_TRUE(unit);
+
+  auto make_namespace = [](llvm::StringRef name) {
+    return CompilerContext(CompilerContextKind::Namespace, ConstString(name));
+  };
+  auto make_struct = [](llvm::StringRef name) {
+    return CompilerContext(CompilerContextKind::Struct, ConstString(name));
+  };
+  DWARFDIE struct_die = unit->DIE().GetFirstChild().GetFirstChild();
+  ASSERT_TRUE(struct_die);
+  EXPECT_THAT(
+      struct_die.GetDeclContext(),
+      testing::ElementsAre(make_namespace("NAMESPACE"), make_struct("STRUCT")));
+  EXPECT_THAT(
+      struct_die.GetTypeLookupContext(),
+      testing::ElementsAre(make_namespace("NAMESPACE"), make_struct("STRUCT")));
+}

From 3ce9b86cfd2d88162bc4a551dd7910b8cff3097b Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh@arm.com>
Date: Wed, 29 May 2024 12:23:02 +0000
Subject: [PATCH 114/230] [AArch64][NFC] Pre-commit Test for Combine
 MUL(AND(LSHR)) to CMLTz (#92915)

---
 llvm/test/CodeGen/AArch64/mulcmle.ll | 135 +++++++++++++++++++++------
 1 file changed, 105 insertions(+), 30 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/mulcmle.ll b/llvm/test/CodeGen/AArch64/mulcmle.ll
index 5c216b85500801..b22c75259adf27 100644
--- a/llvm/test/CodeGen/AArch64/mulcmle.ll
+++ b/llvm/test/CodeGen/AArch64/mulcmle.ll
@@ -1,11 +1,22 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 %s -o - -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define <1 x i64> @v1i64(<1 x i64> %a) {
-; CHECK-LABEL: v1i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v1i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v0.2s, v0.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v1i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    lsr x8, x8, #31
+; CHECK-GI-NEXT:    and x8, x8, #0x100000001
+; CHECK-GI-NEXT:    lsl x9, x8, #32
+; CHECK-GI-NEXT:    sub x8, x9, x8
+; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    ret
   %b = lshr <1 x i64> %a, <i64 31>
   %c = and <1 x i64> %b, <i64 4294967297>
   %d = mul nuw <1 x i64> %c, <i64 4294967295>
@@ -13,10 +24,26 @@ define <1 x i64> @v1i64(<1 x i64> %a) {
 }
 
 define <2 x i64> @v2i64(<2 x i64> %a) {
-; CHECK-LABEL: v2i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v2i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v2i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.4s, #1
+; CHECK-GI-NEXT:    ushr v0.2d, v0.2d, #31
+; CHECK-GI-NEXT:    movi v2.2d, #0x000000ffffffff
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    fmov x11, d2
+; CHECK-GI-NEXT:    mov x9, v2.d[1]
+; CHECK-GI-NEXT:    fmov x10, d0
+; CHECK-GI-NEXT:    mov x8, v0.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    fmov d0, x10
+; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    ret
   %b = lshr <2 x i64> %a, <i64 31, i64 31>
   %c = and <2 x i64> %b, <i64 4294967297, i64 4294967297>
   %d = mul nuw <2 x i64> %c, <i64 4294967295, i64 4294967295>
@@ -24,10 +51,19 @@ define <2 x i64> @v2i64(<2 x i64> %a) {
 }
 
 define <2 x i32> @v2i32(<2 x i32> %a) {
-; CHECK-LABEL: v2i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmlt v0.4h, v0.4h, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v2i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v0.4h, v0.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v2i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.4h, #1
+; CHECK-GI-NEXT:    ushr v0.2s, v0.2s, #15
+; CHECK-GI-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    mul v0.2s, v0.2s, v2.2s
+; CHECK-GI-NEXT:    ret
   %b = lshr <2 x i32> %a, <i32 15, i32 15>
   %c = and <2 x i32> %b, <i32 65537, i32 65537>
   %d = mul nuw <2 x i32> %c, <i32 65535, i32 65535>
@@ -35,10 +71,19 @@ define <2 x i32> @v2i32(<2 x i32> %a) {
 }
 
 define <4 x i32> @v4i32(<4 x i32> %a) {
-; CHECK-LABEL: v4i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v4i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v4i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.8h, #1
+; CHECK-GI-NEXT:    ushr v0.4s, v0.4s, #15
+; CHECK-GI-NEXT:    movi v2.2d, #0x00ffff0000ffff
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT:    ret
   %b = lshr <4 x i32> %a, <i32 15, i32 15, i32 15, i32 15>
   %c = and <4 x i32> %b, <i32 65537, i32 65537, i32 65537, i32 65537>
   %d = mul nuw <4 x i32> %c, <i32 65535, i32 65535, i32 65535, i32 65535>
@@ -46,11 +91,23 @@ define <4 x i32> @v4i32(<4 x i32> %a) {
 }
 
 define <8 x i32> @v8i32(<8 x i32> %a) {
-; CHECK-LABEL: v8i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
-; CHECK-NEXT:    cmlt v1.8h, v1.8h, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v8i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-SD-NEXT:    cmlt v1.8h, v1.8h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v8i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v2.8h, #1
+; CHECK-GI-NEXT:    ushr v0.4s, v0.4s, #15
+; CHECK-GI-NEXT:    ushr v1.4s, v1.4s, #15
+; CHECK-GI-NEXT:    movi v3.2d, #0x00ffff0000ffff
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v3.4s
+; CHECK-GI-NEXT:    mul v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    ret
   %b = lshr <8 x i32> %a, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
   %c = and <8 x i32> %b, <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
   %d = mul nuw <8 x i32> %c, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
@@ -58,10 +115,19 @@ define <8 x i32> @v8i32(<8 x i32> %a) {
 }
 
 define <4 x i16> @v4i16(<4 x i16> %a) {
-; CHECK-LABEL: v4i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmlt v0.8b, v0.8b, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v4i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v0.8b, v0.8b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v4i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.8b, #1
+; CHECK-GI-NEXT:    ushr v0.4h, v0.4h, #7
+; CHECK-GI-NEXT:    movi d2, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    mul v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT:    ret
   %b = lshr <4 x i16> %a, <i16 7, i16 7, i16 7, i16 7>
   %c = and <4 x i16> %b, <i16 257, i16 257, i16 257, i16 257>
   %d = mul nuw <4 x i16> %c, <i16 255, i16 255, i16 255, i16 255>
@@ -69,10 +135,19 @@ define <4 x i16> @v4i16(<4 x i16> %a) {
 }
 
 define <8 x i16> @v8i16(<8 x i16> %a) {
-; CHECK-LABEL: v8i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v8i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v0.16b, v0.16b, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v8i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.16b, #1
+; CHECK-GI-NEXT:    ushr v0.8h, v0.8h, #7
+; CHECK-GI-NEXT:    movi v2.2d, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT:    ret
   %b = lshr <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %c = and <8 x i16> %b, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>
   %d = mul nuw <8 x i16> %c, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>

From 3082258d3a29664a66fcd35c104a40b8cf9d6cba Mon Sep 17 00:00:00 2001
From: Jan Patrick Lehr <JanPatrick.Lehr@amd.com>
Date: Wed, 29 May 2024 14:42:48 +0200
Subject: [PATCH 115/230] [CodeGen][X86] Use TargetLowering for TypeInfo of
 PointerTy (#93469)

This uses the TargetLowering getSimpleValueType mechanism to retrieve
the ValueType info inside the X86 cost model.

This resolves a build issue we were seeing for the miniQMC application after
https://github.com/llvm/llvm-project/pull/92671.
---
 .../lib/Target/X86/X86TargetTransformInfo.cpp |  3 +-
 ...dle-iptr-with-data-layout-to-not-assert.ll | 35 +++++++++++++++++++
 2 files changed, 37 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Analysis/CostModel/X86/handle-iptr-with-data-layout-to-not-assert.ll

diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index d935be7669f056..3b18e39d784b22 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -6257,7 +6257,8 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
                                 AddressSpace, CostKind);
 
   unsigned VF = VecTy->getNumElements() / Factor;
-  MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
+  MVT VT =
+      MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF);
 
   InstructionCost MaskCost;
   if (UseMaskedMemOp) {
diff --git a/llvm/test/Analysis/CostModel/X86/handle-iptr-with-data-layout-to-not-assert.ll b/llvm/test/Analysis/CostModel/X86/handle-iptr-with-data-layout-to-not-assert.ll
new file mode 100644
index 00000000000000..d0d414a869636b
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/handle-iptr-with-data-layout-to-not-assert.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9] for VF [0-9] For instruction:\s*store ptr %[0-9], ptr %__last" --filter "LV: Found an estimated cost of [0-9] for VF [0-9] For instruction:\s*store ptr %[0-9]" --version 5
+; REQUIRES: asserts
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+target triple = "x86_64-unknown-linux-gnu"
+
+define ptr @foo(ptr %__first, ptr %__last) #0 {
+; CHECK-LABEL: 'foo'
+; CHECK:  LV: Found an estimated cost of 1 for VF 1 For instruction: store ptr %0, ptr %__last, align 8
+; CHECK:  LV: Found an estimated cost of 2 for VF 2 For instruction: store ptr %0, ptr %__last, align 8
+; CHECK:  LV: Found an estimated cost of 3 for VF 4 For instruction: store ptr %0, ptr %__last, align 8
+; CHECK:  LV: Found an estimated cost of 3 for VF 8 For instruction: store ptr %0, ptr %__last, align 8
+;
+entry:
+  %cmp.not1 = icmp eq ptr %__first, %__last
+  br i1 %cmp.not1, label %for.end, label %for.body.preheader
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %__first.addr.02 = phi ptr [ %incdec.ptr, %for.body ], [ %__first, %for.body.preheader ]
+  %0 = load ptr, ptr %__first.addr.02, align 8
+  store ptr %0, ptr %__last, align 8
+  %incdec.ptr = getelementptr inbounds i8, ptr %__first.addr.02, i64 16
+  %cmp.not = icmp eq ptr %incdec.ptr, %__last
+  br i1 %cmp.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret ptr null
+}
+
+attributes #0 = { "target-cpu"="znver4" }

From 103f6a7606fdc128041bb2e17fb0e992fc6f2225 Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis@chromium.org>
Date: Wed, 29 May 2024 08:58:45 -0400
Subject: [PATCH 116/230] Reland "[gn] port 088aa81a5454 (LLVM_HAS_LOGF128)"

This reverts commit 9ebf2f8a67cce570d0752556fed23ff2803aef33.
088aa81a5454 relanded in 3613b2683107b.
---
 llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn | 1 +
 llvm/utils/gn/secondary/llvm/test/BUILD.gn                | 1 +
 2 files changed, 2 insertions(+)

diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
index e93130eacdc74b..d8266fee05014b 100644
--- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
@@ -318,6 +318,7 @@ write_cmake_config("llvm-config") {
     "LLVM_ENABLE_ZSTD=",
     "LLVM_FORCE_USE_OLD_TOOLCHAIN=",
     "LLVM_HAS_ATOMICS=1",
+    "LLVM_HAS_LOGF128=",
     "LLVM_HAVE_TFLITE=",
     "LLVM_HOST_TRIPLE=$llvm_current_triple",
     "LLVM_NATIVE_ARCH=$native_target",
diff --git a/llvm/utils/gn/secondary/llvm/test/BUILD.gn b/llvm/utils/gn/secondary/llvm/test/BUILD.gn
index 826dcf4e6ee9b1..60d6d7b8c3ce7a 100644
--- a/llvm/utils/gn/secondary/llvm/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/test/BUILD.gn
@@ -64,6 +64,7 @@ write_lit_config("lit_site_cfg") {
     "LLVM_ENABLE_HTTPLIB=0",
     "LLVM_ENABLE_ZSTD=0",
     "LLVM_FORCE_VC_REVISION=",
+    "LLVM_HAS_LOGF128=0",
     "LLVM_HAVE_OPT_VIEWER_MODULES=0",
     "LLVM_HOST_TRIPLE=$llvm_current_triple",
     "LLVM_INCLUDE_DXIL_TESTS=0",

From 9a282724a29899e84adc91bdeaf639010408a80d Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Wed, 29 May 2024 15:00:34 +0200
Subject: [PATCH 117/230] [Reassociate] Update test after recent change

Fix test expectation after 3bcccb6af685c3132a9ee578b9e11b2503c35a5c.
---
 llvm/test/Transforms/Reassociate/reassoc_bool_vec.ll | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/test/Transforms/Reassociate/reassoc_bool_vec.ll b/llvm/test/Transforms/Reassociate/reassoc_bool_vec.ll
index fcedde23ecc7fd..bd0060cc5abbd9 100644
--- a/llvm/test/Transforms/Reassociate/reassoc_bool_vec.ll
+++ b/llvm/test/Transforms/Reassociate/reassoc_bool_vec.ll
@@ -57,13 +57,12 @@ define <8 x i1> @vector2(<8 x i1> %a, <8 x i1> %b0, <8 x i1> %b1, <8 x i1> %b2,
 ; CHECK-NEXT:    [[OR6:%.*]] = or <8 x i1> [[B6]], [[A]]
 ; CHECK-NEXT:    [[OR7:%.*]] = or <8 x i1> [[B7]], [[A]]
 ; CHECK-NEXT:    [[XOR0:%.*]] = xor <8 x i1> [[OR1]], [[OR0]]
-; CHECK-NEXT:    [[XOR1:%.*]] = xor <8 x i1> [[XOR0]], [[OR2]]
-; CHECK-NEXT:    [[XOR2:%.*]] = xor <8 x i1> [[XOR1]], [[OR3]]
-; CHECK-NEXT:    [[XOR3:%.*]] = xor <8 x i1> [[XOR2]], [[OR4]]
+; CHECK-NEXT:    [[XOR2:%.*]] = xor <8 x i1> [[XOR0]], [[OR2]]
+; CHECK-NEXT:    [[OR045:%.*]] = xor <8 x i1> [[XOR2]], [[OR3]]
+; CHECK-NEXT:    [[XOR3:%.*]] = xor <8 x i1> [[OR045]], [[OR4]]
 ; CHECK-NEXT:    [[XOR4:%.*]] = xor <8 x i1> [[XOR3]], [[OR5]]
 ; CHECK-NEXT:    [[XOR5:%.*]] = xor <8 x i1> [[XOR4]], [[OR6]]
 ; CHECK-NEXT:    [[XOR6:%.*]] = xor <8 x i1> [[XOR5]], [[OR7]]
-; CHECK-NEXT:    [[OR045:%.*]] = or <8 x i1> [[XOR1]], [[XOR0]]
 ; CHECK-NEXT:    [[OR4560:%.*]] = or <8 x i1> [[OR045]], [[XOR2]]
 ; CHECK-NEXT:    [[OR023:%.*]] = or <8 x i1> [[OR4560]], [[XOR3]]
 ; CHECK-NEXT:    [[OR001:%.*]] = or <8 x i1> [[OR023]], [[XOR4]]

From 23366d4153e1e521a7e5b88d42afea69fb888be7 Mon Sep 17 00:00:00 2001
From: chuongg3 <chuong.goh@arm.com>
Date: Wed, 29 May 2024 14:15:13 +0100
Subject: [PATCH 118/230]  [AArch64][GlobalISel] Combine MUL(AND(LSHR(X, 15),
 0x10001), 0xffff) to CMLTz (#92915)

This patch mirrors the following SelectionDAG patch for GlobalISel:
https://reviews.llvm.org/D130874
---
 llvm/lib/Target/AArch64/AArch64Combine.td     |  11 +-
 .../GISel/AArch64PostLegalizerCombiner.cpp    |  55 +++++++++
 llvm/test/CodeGen/AArch64/mulcmle.ll          | 114 ++++--------------
 3 files changed, 90 insertions(+), 90 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 1c7f6b870d3904..1ce6cdf1c1e1ed 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -265,6 +265,14 @@ def or_to_bsp: GICombineRule <
   (apply [{ applyOrToBSP(*${root}, MRI, B, ${matchinfo}); }])
 >;
 
+// Combines Mul(And(Srl(X, 15), 0x10001), 0xffff) into CMLTz
+def combine_mul_cmlt : GICombineRule<
+  (defs root:$root, register_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_MUL):$root,
+        [{ return matchCombineMulCMLT(*${root}, MRI, ${matchinfo}); }]),
+  (apply [{ applyCombineMulCMLT(*${root}, MRI, B, ${matchinfo}); }])
+>;
+
 // Post-legalization combines which should happen at all optimization levels.
 // (E.g. ones that facilitate matching for the selector) For example, matching
 // pseudos.
@@ -296,5 +304,6 @@ def AArch64PostLegalizerCombiner
                         split_store_zero_128, undef_combines,
                         select_to_minmax, or_to_bsp, combine_concat_vector,
                         commute_constant_to_rhs,
-                        push_freeze_to_prevent_poison_from_propagating]> {
+                        push_freeze_to_prevent_poison_from_propagating,
+                        combine_mul_cmlt]> {
 }
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
index d8ca5494ba50a4..7f3e0e01ccd25c 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -381,6 +381,61 @@ void applyOrToBSP(MachineInstr &MI, MachineRegisterInfo &MRI,
   MI.eraseFromParent();
 }
 
+// Combines Mul(And(Srl(X, 15), 0x10001), 0xffff) into CMLTz
+bool matchCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI,
+                         Register &SrcReg) {
+  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+
+  if (DstTy != LLT::fixed_vector(2, 64) && DstTy != LLT::fixed_vector(2, 32) &&
+      DstTy != LLT::fixed_vector(4, 32) && DstTy != LLT::fixed_vector(4, 16) &&
+      DstTy != LLT::fixed_vector(8, 16))
+    return false;
+
+  auto AndMI = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
+  if (AndMI->getOpcode() != TargetOpcode::G_AND)
+    return false;
+  auto LShrMI = getDefIgnoringCopies(AndMI->getOperand(1).getReg(), MRI);
+  if (LShrMI->getOpcode() != TargetOpcode::G_LSHR)
+    return false;
+
+  // Check the constant splat values
+  auto V1 = isConstantOrConstantSplatVector(
+      *MRI.getVRegDef(MI.getOperand(2).getReg()), MRI);
+  auto V2 = isConstantOrConstantSplatVector(
+      *MRI.getVRegDef(AndMI->getOperand(2).getReg()), MRI);
+  auto V3 = isConstantOrConstantSplatVector(
+      *MRI.getVRegDef(LShrMI->getOperand(2).getReg()), MRI);
+  if (!V1.has_value() || !V2.has_value() || !V3.has_value())
+    return false;
+  unsigned HalfSize = DstTy.getScalarSizeInBits() / 2;
+  if (!V1.value().isMask(HalfSize) || V2.value() != (1ULL | 1ULL << HalfSize) ||
+      V3 != (HalfSize - 1))
+    return false;
+
+  SrcReg = LShrMI->getOperand(1).getReg();
+
+  return true;
+}
+
+void applyCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI,
+                         MachineIRBuilder &B, Register &SrcReg) {
+  Register DstReg = MI.getOperand(0).getReg();
+  LLT DstTy = MRI.getType(DstReg);
+  LLT HalfTy =
+      DstTy.changeElementCount(DstTy.getElementCount().multiplyCoefficientBy(2))
+          .changeElementSize(DstTy.getScalarSizeInBits() / 2);
+
+  Register ZeroVec = B.buildConstant(HalfTy, 0).getReg(0);
+  Register CastReg =
+      B.buildInstr(TargetOpcode::G_BITCAST, {HalfTy}, {SrcReg}).getReg(0);
+  Register CMLTReg =
+      B.buildICmp(CmpInst::Predicate::ICMP_SLT, HalfTy, CastReg, ZeroVec)
+          .getReg(0);
+
+  B.buildInstr(TargetOpcode::G_BITCAST, {DstReg}, {CMLTReg}).getReg(0);
+  MI.eraseFromParent();
+}
+
 class AArch64PostLegalizerCombinerImpl : public Combiner {
 protected:
   // TODO: Make CombinerHelper methods const.
diff --git a/llvm/test/CodeGen/AArch64/mulcmle.ll b/llvm/test/CodeGen/AArch64/mulcmle.ll
index b22c75259adf27..32bc5c5e63b3e1 100644
--- a/llvm/test/CodeGen/AArch64/mulcmle.ll
+++ b/llvm/test/CodeGen/AArch64/mulcmle.ll
@@ -24,26 +24,10 @@ define <1 x i64> @v1i64(<1 x i64> %a) {
 }
 
 define <2 x i64> @v2i64(<2 x i64> %a) {
-; CHECK-SD-LABEL: v2i64:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    cmlt v0.4s, v0.4s, #0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v2i64:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    movi v1.4s, #1
-; CHECK-GI-NEXT:    ushr v0.2d, v0.2d, #31
-; CHECK-GI-NEXT:    movi v2.2d, #0x000000ffffffff
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    fmov x11, d2
-; CHECK-GI-NEXT:    mov x9, v2.d[1]
-; CHECK-GI-NEXT:    fmov x10, d0
-; CHECK-GI-NEXT:    mov x8, v0.d[1]
-; CHECK-GI-NEXT:    mul x10, x10, x11
-; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    fmov d0, x10
-; CHECK-GI-NEXT:    mov v0.d[1], x8
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    ret
   %b = lshr <2 x i64> %a, <i64 31, i64 31>
   %c = and <2 x i64> %b, <i64 4294967297, i64 4294967297>
   %d = mul nuw <2 x i64> %c, <i64 4294967295, i64 4294967295>
@@ -51,19 +35,10 @@ define <2 x i64> @v2i64(<2 x i64> %a) {
 }
 
 define <2 x i32> @v2i32(<2 x i32> %a) {
-; CHECK-SD-LABEL: v2i32:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    cmlt v0.4h, v0.4h, #0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v2i32:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    movi v1.4h, #1
-; CHECK-GI-NEXT:    ushr v0.2s, v0.2s, #15
-; CHECK-GI-NEXT:    movi d2, #0x00ffff0000ffff
-; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-GI-NEXT:    mul v0.2s, v0.2s, v2.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.4h, v0.4h, #0
+; CHECK-NEXT:    ret
   %b = lshr <2 x i32> %a, <i32 15, i32 15>
   %c = and <2 x i32> %b, <i32 65537, i32 65537>
   %d = mul nuw <2 x i32> %c, <i32 65535, i32 65535>
@@ -71,19 +46,10 @@ define <2 x i32> @v2i32(<2 x i32> %a) {
 }
 
 define <4 x i32> @v4i32(<4 x i32> %a) {
-; CHECK-SD-LABEL: v4i32:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    cmlt v0.8h, v0.8h, #0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v4i32:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    movi v1.8h, #1
-; CHECK-GI-NEXT:    ushr v0.4s, v0.4s, #15
-; CHECK-GI-NEXT:    movi v2.2d, #0x00ffff0000ffff
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v2.4s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    ret
   %b = lshr <4 x i32> %a, <i32 15, i32 15, i32 15, i32 15>
   %c = and <4 x i32> %b, <i32 65537, i32 65537, i32 65537, i32 65537>
   %d = mul nuw <4 x i32> %c, <i32 65535, i32 65535, i32 65535, i32 65535>
@@ -91,23 +57,11 @@ define <4 x i32> @v4i32(<4 x i32> %a) {
 }
 
 define <8 x i32> @v8i32(<8 x i32> %a) {
-; CHECK-SD-LABEL: v8i32:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    cmlt v0.8h, v0.8h, #0
-; CHECK-SD-NEXT:    cmlt v1.8h, v1.8h, #0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v8i32:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    movi v2.8h, #1
-; CHECK-GI-NEXT:    ushr v0.4s, v0.4s, #15
-; CHECK-GI-NEXT:    ushr v1.4s, v1.4s, #15
-; CHECK-GI-NEXT:    movi v3.2d, #0x00ffff0000ffff
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v3.4s
-; CHECK-GI-NEXT:    mul v1.4s, v1.4s, v3.4s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    cmlt v1.8h, v1.8h, #0
+; CHECK-NEXT:    ret
   %b = lshr <8 x i32> %a, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
   %c = and <8 x i32> %b, <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
   %d = mul nuw <8 x i32> %c, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
@@ -115,19 +69,10 @@ define <8 x i32> @v8i32(<8 x i32> %a) {
 }
 
 define <4 x i16> @v4i16(<4 x i16> %a) {
-; CHECK-SD-LABEL: v4i16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    cmlt v0.8b, v0.8b, #0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v4i16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    movi v1.8b, #1
-; CHECK-GI-NEXT:    ushr v0.4h, v0.4h, #7
-; CHECK-GI-NEXT:    movi d2, #0xff00ff00ff00ff
-; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-GI-NEXT:    mul v0.4h, v0.4h, v2.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.8b, v0.8b, #0
+; CHECK-NEXT:    ret
   %b = lshr <4 x i16> %a, <i16 7, i16 7, i16 7, i16 7>
   %c = and <4 x i16> %b, <i16 257, i16 257, i16 257, i16 257>
   %d = mul nuw <4 x i16> %c, <i16 255, i16 255, i16 255, i16 255>
@@ -135,19 +80,10 @@ define <4 x i16> @v4i16(<4 x i16> %a) {
 }
 
 define <8 x i16> @v8i16(<8 x i16> %a) {
-; CHECK-SD-LABEL: v8i16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    cmlt v0.16b, v0.16b, #0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v8i16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    movi v1.16b, #1
-; CHECK-GI-NEXT:    ushr v0.8h, v0.8h, #7
-; CHECK-GI-NEXT:    movi v2.2d, #0xff00ff00ff00ff
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v2.8h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT:    ret
   %b = lshr <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %c = and <8 x i16> %b, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>
   %d = mul nuw <8 x i16> %c, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>

From 4ffe26334e563a3fea70c2a05de0410a2a3856d7 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Wed, 29 May 2024 15:23:59 +0200
Subject: [PATCH 119/230] [InstSimplify] Generate test checks (NFC)

---
 .../InstSimplify/ConstProp/vectorgep-crash.ll | 19 +++--
 .../Transforms/InstSimplify/vector_gep.ll     | 73 +++++++++++--------
 2 files changed, 56 insertions(+), 36 deletions(-)

diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vectorgep-crash.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vectorgep-crash.ll
index 5f554501206206..00ee7f8a92b218 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/vectorgep-crash.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vectorgep-crash.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -passes=instcombine -S -o - %s | FileCheck %s
 ; Tests that we don't crash upon encountering a vector GEP
 
@@ -23,17 +24,21 @@ top:
 %struct.C = type { i64 }
 
 @G = internal global [65 x %struct.A] zeroinitializer, align 16
-; CHECK-LABEL: @test
-; CHECK: ret <16 x ptr> getelementptr ([65 x %struct.A], ptr @G, <16 x i64> zeroinitializer, <16 x i64> <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16>, i32 0)
 define <16 x ptr> @test() {
+; CHECK-LABEL: define <16 x ptr> @test() {
+; CHECK-NEXT:  [[VECTOR_BODY:.*:]]
+; CHECK-NEXT:    ret <16 x ptr> getelementptr ([65 x %struct.A], ptr @G, <16 x i64> zeroinitializer, <16 x i64> <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16>, i32 0)
+;
 vector.body:
   %VectorGep = getelementptr [65 x %struct.A], ptr @G, <16 x i64> zeroinitializer, <16 x i64> <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16>, <16 x i32> zeroinitializer
   ret <16 x ptr> %VectorGep
 }
 
-; CHECK-LABEL: @test2
-; CHECK: ret <16 x ptr> getelementptr ([65 x %struct.A], ptr @G, <16 x i64> zeroinitializer, <16 x i64> <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9,
 define <16 x ptr> @test2() {
+; CHECK-LABEL: define <16 x ptr> @test2() {
+; CHECK-NEXT:  [[VECTOR_BODY:.*:]]
+; CHECK-NEXT:    ret <16 x ptr> getelementptr ([65 x %struct.A], ptr @G, <16 x i64> zeroinitializer, <16 x i64> <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16>, i32 0)
+;
 vector.body:
   %VectorGep = getelementptr [65 x %struct.A], ptr @G, <16 x i32> zeroinitializer, <16 x i64> <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16>, <16 x i32> zeroinitializer
   ret <16 x ptr> %VectorGep
@@ -42,7 +47,7 @@ vector.body:
 @g = external global i8, align 1
 
 define <2 x ptr> @constant_zero_index() {
-; CHECK-LABEL: @constant_zero_index(
+; CHECK-LABEL: define <2 x ptr> @constant_zero_index() {
 ; CHECK-NEXT:    ret <2 x ptr> <ptr @g, ptr @g>
 ;
   %gep = getelementptr i8, ptr @g, <2 x i64> zeroinitializer
@@ -50,7 +55,7 @@ define <2 x ptr> @constant_zero_index() {
 }
 
 define <2 x ptr> @constant_undef_index() {
-; CHECK-LABEL: @constant_undef_index(
+; CHECK-LABEL: define <2 x ptr> @constant_undef_index() {
 ; CHECK-NEXT:    ret <2 x ptr> <ptr @g, ptr @g>
 ;
   %gep = getelementptr i8, ptr @g, <2 x i64> undef
@@ -58,7 +63,7 @@ define <2 x ptr> @constant_undef_index() {
 }
 
 define <2 x ptr> @constant_inbounds() {
-; CHECK-LABEL: @constant_inbounds(
+; CHECK-LABEL: define <2 x ptr> @constant_inbounds() {
 ; CHECK-NEXT:    ret <2 x ptr> getelementptr inbounds (i8, ptr @g, <2 x i64> <i64 1, i64 1>)
 ;
   %gep = getelementptr i8, ptr @g, <2 x i64> <i64 1, i64 1>
diff --git a/llvm/test/Transforms/InstSimplify/vector_gep.ll b/llvm/test/Transforms/InstSimplify/vector_gep.ll
index ba0d978ed5b3cf..79aa9f13d1ea72 100644
--- a/llvm/test/Transforms/InstSimplify/vector_gep.ll
+++ b/llvm/test/Transforms/InstSimplify/vector_gep.ll
@@ -1,105 +1,120 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes=instsimplify < %s | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
 declare void @helper(<2 x ptr>)
 define void @test(<2 x ptr> %a) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: <2 x ptr> [[A:%.*]]) {
+; CHECK-NEXT:    call void @helper(<2 x ptr> [[A]])
+; CHECK-NEXT:    ret void
+;
   %A = getelementptr i8, <2 x ptr> %a, <2 x i32> <i32 0, i32 0>
   call void @helper(<2 x ptr> %A)
   ret void
 }
 
 define <4 x ptr> @test1(<4 x ptr> %a) {
+; CHECK-LABEL: define <4 x ptr> @test1(
+; CHECK-SAME: <4 x ptr> [[A:%.*]]) {
+; CHECK-NEXT:    ret <4 x ptr> [[A]]
+;
   %gep = getelementptr i8, <4 x ptr> %a, <4 x i32> zeroinitializer
   ret <4 x ptr> %gep
-
-; CHECK-LABEL: @test1
-; CHECK-NEXT: ret <4 x ptr> %a
 }
 
 define <4 x ptr> @test2(<4 x ptr> %a) {
+; CHECK-LABEL: define <4 x ptr> @test2(
+; CHECK-SAME: <4 x ptr> [[A:%.*]]) {
+; CHECK-NEXT:    ret <4 x ptr> [[A]]
+;
   %gep = getelementptr i8, <4 x ptr> %a
   ret <4 x ptr> %gep
-
-; CHECK-LABEL: @test2
-; CHECK-NEXT: ret <4 x ptr> %a
 }
 
 %struct = type { double, float }
 
 define <4 x ptr> @test3() {
+; CHECK-LABEL: define <4 x ptr> @test3() {
+; CHECK-NEXT:    ret <4 x ptr> undef
+;
   %gep = getelementptr %struct, <4 x ptr> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   ret <4 x ptr> %gep
-
-; CHECK-LABEL: @test3
-; CHECK-NEXT: ret <4 x ptr> undef
 }
 
 %struct.empty = type { }
 
 define <4 x ptr> @test4(<4 x ptr> %a) {
+; CHECK-LABEL: define <4 x ptr> @test4(
+; CHECK-SAME: <4 x ptr> [[A:%.*]]) {
+; CHECK-NEXT:    ret <4 x ptr> [[A]]
+;
   %gep = getelementptr %struct.empty, <4 x ptr> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   ret <4 x ptr> %gep
-
-; CHECK-LABEL: @test4
-; CHECK-NEXT: ret <4 x ptr> %a
 }
 
 define <4 x ptr> @test5() {
+; CHECK-LABEL: define <4 x ptr> @test5() {
+; CHECK-NEXT:    ret <4 x ptr> getelementptr (i8, <4 x ptr> <ptr inttoptr (i64 1 to ptr), ptr inttoptr (i64 2 to ptr), ptr inttoptr (i64 3 to ptr), ptr inttoptr (i64 4 to ptr)>, <4 x i64> <i64 1, i64 1, i64 1, i64 1>)
+;
   %c = inttoptr <4 x i64> <i64 1, i64 2, i64 3, i64 4> to <4 x ptr>
   %gep = getelementptr i8, <4 x ptr> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   ret <4 x ptr> %gep
-
-; CHECK-LABEL: @test5
-; CHECK-NEXT: ret <4 x ptr> getelementptr (i8, <4 x ptr> <ptr inttoptr (i64 1 to ptr), ptr inttoptr (i64 2 to ptr), ptr inttoptr (i64 3 to ptr), ptr inttoptr (i64 4 to ptr)>, <4 x i64> <i64 1, i64 1, i64 1, i64 1>)
 }
 
 @v = global [24 x [42 x [3 x i32]]] zeroinitializer, align 16
 
 define <16 x ptr> @test6() {
-; CHECK-LABEL: @test6
-; CHECK-NEXT: ret <16 x ptr> getelementptr inbounds ([24 x [42 x [3 x i32]]], ptr @v, <16 x i64> zeroinitializer, <16 x i64> zeroinitializer, <16 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, <16 x i64> zeroinitializer)
+; CHECK-LABEL: define <16 x ptr> @test6() {
+; CHECK-NEXT:    ret <16 x ptr> getelementptr inbounds ([24 x [42 x [3 x i32]]], ptr @v, <16 x i64> zeroinitializer, <16 x i64> zeroinitializer, <16 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, <16 x i64> zeroinitializer)
+;
   %VectorGep = getelementptr [24 x [42 x [3 x i32]]], ptr @v, i64 0, i64 0, <16 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, i64 0
   ret <16 x ptr> %VectorGep
 }
 
 ; PR32697
-; CHECK-LABEL: tinkywinky(
-; CHECK-NEXT: ret <4 x ptr> undef
 define <4 x ptr> @tinkywinky() {
+; CHECK-LABEL: define <4 x ptr> @tinkywinky() {
+; CHECK-NEXT:    ret <4 x ptr> undef
+;
   %patatino = getelementptr i8, ptr undef, <4 x i64> undef
   ret <4 x ptr> %patatino
 }
 
 ; PR32697
-; CHECK-LABEL: dipsy(
-; CHECK-NEXT: ret <4 x ptr> undef
 define <4 x ptr> @dipsy() {
+; CHECK-LABEL: define <4 x ptr> @dipsy() {
+; CHECK-NEXT:    ret <4 x ptr> undef
+;
   %patatino = getelementptr i8, <4 x ptr> undef, <4 x i64> undef
   ret <4 x ptr> %patatino
 }
 
 ; PR32697
-; CHECK-LABEL: laalaa(
-; CHECK-NEXT: ret <4 x ptr> undef
 define <4 x ptr> @laalaa() {
+; CHECK-LABEL: define <4 x ptr> @laalaa() {
+; CHECK-NEXT:    ret <4 x ptr> undef
+;
   %patatino = getelementptr i8, <4 x ptr> undef, i64 undef
   ret <4 x ptr> %patatino
 }
 
 define <2 x ptr> @zero_index(ptr %p) {
-; CHECK-LABEL: @zero_index(
-; CHECK-NEXT:    %gep = getelementptr i8, ptr %p, <2 x i64> zeroinitializer
-; CHECK-NEXT:    ret <2 x ptr> %gep
+; CHECK-LABEL: define <2 x ptr> @zero_index(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[P]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    ret <2 x ptr> [[GEP]]
 ;
   %gep = getelementptr i8, ptr %p, <2 x i64> zeroinitializer
   ret <2 x ptr> %gep
 }
 
 define <2 x ptr> @unsized(ptr %p) {
-; CHECK-LABEL: @unsized(
-; CHECK-NEXT:    %gep = getelementptr {}, ptr %p, <2 x i64> undef
-; CHECK-NEXT:    ret <2 x ptr> %gep
+; CHECK-LABEL: define <2 x ptr> @unsized(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr {}, ptr [[P]], <2 x i64> undef
+; CHECK-NEXT:    ret <2 x ptr> [[GEP]]
 ;
   %gep = getelementptr {}, ptr %p, <2 x i64> undef
   ret <2 x ptr> %gep

From a49b5cad99ff84c2c9c55db1d5d9d4bfe1411777 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Wed, 29 May 2024 15:25:57 +0200
Subject: [PATCH 120/230] [InferAddressSpaces] Generate test checks (NFC)

---
 .../AMDGPU/infer-address-space.ll             | 160 ++++++++++--------
 .../InferAddressSpaces/NVPTX/bug31948.ll      |  18 +-
 2 files changed, 105 insertions(+), 73 deletions(-)

diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll
index 72109d0cff437e..4290e4f705887f 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll
@@ -1,34 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=infer-address-spaces %s | FileCheck %s
 ; Ports of most of test/CodeGen/NVPTX/access-non-generic.ll
 
 @scalar = internal addrspace(3) global float 0.0, align 4
 @array = internal addrspace(3) global [10 x float] zeroinitializer, align 4
 
-; CHECK-LABEL: @load_store_lds_f32(
-; CHECK: %tmp = load float, ptr addrspace(3) @scalar, align 4
-; CHECK: call void @use(float %tmp)
-; CHECK: store float %v, ptr addrspace(3) @scalar, align 4
-; CHECK: call void @llvm.amdgcn.s.barrier()
-; CHECK: %tmp2 = load float, ptr addrspace(3) @scalar, align 4
-; CHECK: call void @use(float %tmp2)
-; CHECK: store float %v, ptr addrspace(3) @scalar, align 4
-; CHECK: call void @llvm.amdgcn.s.barrier()
-; CHECK: %tmp3 = load float, ptr addrspace(3) getelementptr inbounds ([10 x float], ptr addrspace(3) @array, i32 0, i32 5), align 4
-; CHECK: call void @use(float %tmp3)
-; CHECK: store float %v, ptr addrspace(3) getelementptr inbounds ([10 x float], ptr addrspace(3) @array, i32 0, i32 5), align 4
-; CHECK: call void @llvm.amdgcn.s.barrier()
-; CHECK: %tmp4 = getelementptr inbounds [10 x float], ptr addrspace(3) @array, i32 0, i32 5
-; CHECK: %tmp5 = load float, ptr addrspace(3) %tmp4, align 4
-; CHECK: call void @use(float %tmp5)
-; CHECK: store float %v, ptr addrspace(3) %tmp4, align 4
-; CHECK: call void @llvm.amdgcn.s.barrier()
-; CHECK: %tmp7 = getelementptr inbounds [10 x float], ptr addrspace(3) @array, i32 0, i32 %i
-; CHECK: %tmp8 = load float, ptr addrspace(3) %tmp7, align 4
-; CHECK: call void @use(float %tmp8)
-; CHECK: store float %v, ptr addrspace(3) %tmp7, align 4
-; CHECK: call void @llvm.amdgcn.s.barrier()
-; CHECK: ret void
 define amdgpu_kernel void @load_store_lds_f32(i32 %i, float %v) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @load_store_lds_f32(
+; CHECK-SAME: i32 [[I:%.*]], float [[V:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[TMP:%.*]] = load float, ptr addrspace(3) @scalar, align 4
+; CHECK-NEXT:    call void @use(float [[TMP]])
+; CHECK-NEXT:    store float [[V]], ptr addrspace(3) @scalar, align 4
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(3) @scalar, align 4
+; CHECK-NEXT:    call void @use(float [[TMP2]])
+; CHECK-NEXT:    store float [[V]], ptr addrspace(3) @scalar, align 4
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr addrspace(3) getelementptr inbounds ([10 x float], ptr addrspace(3) @array, i32 0, i32 5), align 4
+; CHECK-NEXT:    call void @use(float [[TMP3]])
+; CHECK-NEXT:    store float [[V]], ptr addrspace(3) getelementptr inbounds ([10 x float], ptr addrspace(3) @array, i32 0, i32 5), align 4
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [10 x float], ptr addrspace(3) @array, i32 0, i32 5
+; CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr addrspace(3) [[TMP4]], align 4
+; CHECK-NEXT:    call void @use(float [[TMP5]])
+; CHECK-NEXT:    store float [[V]], ptr addrspace(3) [[TMP4]], align 4
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [10 x float], ptr addrspace(3) @array, i32 0, i32 [[I]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr addrspace(3) [[TMP7]], align 4
+; CHECK-NEXT:    call void @use(float [[TMP8]])
+; CHECK-NEXT:    store float [[V]], ptr addrspace(3) [[TMP7]], align 4
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    ret void
+;
 bb:
   %tmp = load float, ptr addrspacecast (ptr addrspace(3) @scalar to ptr), align 4
   call void @use(float %tmp)
@@ -57,20 +61,27 @@ bb:
   ret void
 }
 
-; CHECK-LABEL: @constexpr_load_int_from_float_lds(
-; CHECK: %tmp = load i32, ptr addrspace(3) @scalar, align 4
 define i32 @constexpr_load_int_from_float_lds() #0 {
+; CHECK-LABEL: define i32 @constexpr_load_int_from_float_lds(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[TMP:%.*]] = load i32, ptr addrspace(3) @scalar, align 4
+; CHECK-NEXT:    ret i32 [[TMP]]
+;
 bb:
   %tmp = load i32, ptr addrspacecast (ptr addrspace(3) @scalar to ptr), align 4
   ret i32 %tmp
 }
 
-; CHECK-LABEL: @load_int_from_global_float(
-; CHECK: %tmp1 = getelementptr float, ptr addrspace(1) %input, i32 %i
-; CHECK: %tmp2 = getelementptr float, ptr addrspace(1) %tmp1, i32 %j
-; CHECK: %tmp4 = load i32, ptr addrspace(1) %tmp2
-; CHECK: ret i32 %tmp4
 define i32 @load_int_from_global_float(ptr addrspace(1) %input, i32 %i, i32 %j) #0 {
+; CHECK-LABEL: define i32 @load_int_from_global_float(
+; CHECK-SAME: ptr addrspace(1) [[INPUT:%.*]], i32 [[I:%.*]], i32 [[J:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i32 [[I]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr float, ptr addrspace(1) [[TMP1]], i32 [[J]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT:    ret i32 [[TMP4]]
+;
 bb:
   %tmp = addrspacecast ptr addrspace(1) %input to ptr
   %tmp1 = getelementptr float, ptr %tmp, i32 %i
@@ -79,20 +90,26 @@ bb:
   ret i32 %tmp4
 }
 
-; CHECK-LABEL: @nested_const_expr(
-; CHECK: store i32 1, ptr addrspace(3) getelementptr inbounds ([10 x float], ptr addrspace(3) @array, i64 0, i64 1), align 4
 define amdgpu_kernel void @nested_const_expr() #0 {
+; CHECK-LABEL: define amdgpu_kernel void @nested_const_expr(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    store i32 1, ptr addrspace(3) getelementptr inbounds ([10 x float], ptr addrspace(3) @array, i64 0, i64 1), align 4
+; CHECK-NEXT:    ret void
+;
   store i32 1, ptr bitcast (ptr getelementptr ([10 x float], ptr addrspacecast (ptr addrspace(3) @array to ptr), i64 0, i64 1) to ptr), align 4
 
   ret void
 }
 
-; CHECK-LABEL: @rauw(
-; CHECK: %addr = getelementptr float, ptr addrspace(1) %input, i64 10
-; CHECK-NEXT: %v = load float, ptr addrspace(1) %addr
-; CHECK-NEXT: store float %v, ptr addrspace(1) %addr
-; CHECK-NEXT: ret void
 define amdgpu_kernel void @rauw(ptr addrspace(1) %input) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @rauw(
+; CHECK-SAME: ptr addrspace(1) [[INPUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i64 10
+; CHECK-NEXT:    [[V:%.*]] = load float, ptr addrspace(1) [[ADDR]], align 4
+; CHECK-NEXT:    store float [[V]], ptr addrspace(1) [[ADDR]], align 4
+; CHECK-NEXT:    ret void
+;
 bb:
   %generic_input = addrspacecast ptr addrspace(1) %input to ptr
   %addr = getelementptr float, ptr %generic_input, i64 10
@@ -102,20 +119,22 @@ bb:
 }
 
 ; FIXME: Should be able to eliminate the cast inside the loop
-; CHECK-LABEL: @loop(
-
-; CHECK: %end = getelementptr float, ptr addrspace(3) @array, i64 10
-; CHECK: br label %loop
-
-; CHECK: loop:                                             ; preds = %loop, %entry
-; CHECK: %i = phi ptr addrspace(3) [ @array, %entry ], [ %i2, %loop ]
-; CHECK: %v = load float, ptr addrspace(3) %i
-; CHECK: call void @use(float %v)
-; CHECK: %i2 = getelementptr float, ptr addrspace(3) %i, i64 1
-; CHECK: %exit_cond = icmp eq ptr addrspace(3) %i2, %end
-
-; CHECK: br i1 %exit_cond, label %exit, label %loop
 define amdgpu_kernel void @loop() #0 {
+; CHECK-LABEL: define amdgpu_kernel void @loop(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[END:%.*]] = getelementptr float, ptr addrspace(3) @array, i64 10
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[I:%.*]] = phi ptr addrspace(3) [ @array, %[[ENTRY]] ], [ [[I2:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[V:%.*]] = load float, ptr addrspace(3) [[I]], align 4
+; CHECK-NEXT:    call void @use(float [[V]])
+; CHECK-NEXT:    [[I2]] = getelementptr float, ptr addrspace(3) [[I]], i64 1
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = icmp eq ptr addrspace(3) [[I2]], [[END]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   %p = addrspacecast ptr addrspace(3) @array to ptr
   %end = getelementptr float, ptr %p, i64 10
@@ -135,19 +154,23 @@ exit:                                             ; preds = %loop
 
 @generic_end = external addrspace(1) global ptr
 
-; CHECK-LABEL: @loop_with_generic_bound(
-; CHECK: %end = load ptr, ptr addrspace(1) @generic_end
-; CHECK: br label %loop
-
-; CHECK: loop:
-; CHECK: %i = phi ptr addrspace(3) [ @array, %entry ], [ %i2, %loop ]
-; CHECK: %v = load float, ptr addrspace(3) %i
-; CHECK: call void @use(float %v)
-; CHECK: %i2 = getelementptr float, ptr addrspace(3) %i, i64 1
-; CHECK: %0 = addrspacecast ptr addrspace(3) %i2 to ptr
-; CHECK: %exit_cond = icmp eq ptr %0, %end
-; CHECK: br i1 %exit_cond, label %exit, label %loop
 define amdgpu_kernel void @loop_with_generic_bound() #0 {
+; CHECK-LABEL: define amdgpu_kernel void @loop_with_generic_bound(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[END:%.*]] = load ptr, ptr addrspace(1) @generic_end, align 8
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[I:%.*]] = phi ptr addrspace(3) [ @array, %[[ENTRY]] ], [ [[I2:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[V:%.*]] = load float, ptr addrspace(3) [[I]], align 4
+; CHECK-NEXT:    call void @use(float [[V]])
+; CHECK-NEXT:    [[I2]] = getelementptr float, ptr addrspace(3) [[I]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(3) [[I2]] to ptr
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = icmp eq ptr [[TMP0]], [[END]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   %p = addrspacecast ptr addrspace(3) @array to ptr
   %end = load ptr, ptr addrspace(1) @generic_end
@@ -165,11 +188,14 @@ exit:                                             ; preds = %loop
   ret void
 }
 
-; CHECK-LABEL: @select_bug(
-; CHECK: %sel = select i1 icmp ne (ptr inttoptr (i64 4873 to ptr), ptr null), i64 73, i64 93
-; CHECK: %add.ptr157 = getelementptr inbounds i64, ptr undef, i64 %sel
-; CHECK: %cmp169 = icmp uge ptr undef, %add.ptr157
 define void @select_bug() #0 {
+; CHECK-LABEL: define void @select_bug(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 icmp ne (ptr inttoptr (i64 4873 to ptr), ptr null), i64 73, i64 93
+; CHECK-NEXT:    [[ADD_PTR157:%.*]] = getelementptr inbounds i64, ptr undef, i64 [[SEL]]
+; CHECK-NEXT:    [[CMP169:%.*]] = icmp uge ptr undef, [[ADD_PTR157]]
+; CHECK-NEXT:    unreachable
+;
   %sel = select i1 icmp ne (ptr inttoptr (i64 4873 to ptr), ptr null), i64 73, i64 93
   %add.ptr157 = getelementptr inbounds i64, ptr undef, i64 %sel
   %cmp169 = icmp uge ptr undef, %add.ptr157
diff --git a/llvm/test/Transforms/InferAddressSpaces/NVPTX/bug31948.ll b/llvm/test/Transforms/InferAddressSpaces/NVPTX/bug31948.ll
index e6b517a73fa463..23c5f99e5d0865 100644
--- a/llvm/test/Transforms/InferAddressSpaces/NVPTX/bug31948.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/NVPTX/bug31948.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -mtriple=nvptx64-nvidia-cuda -passes=infer-address-spaces %s | FileCheck %s
 
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
@@ -6,18 +7,23 @@ target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 
 @var1 = local_unnamed_addr addrspace(3) externally_initialized global %struct.bar undef, align 8
 
-; CHECK-LABEL: @bug31948(
-; CHECK: %tmp = load ptr, ptr addrspace(3) getelementptr inbounds (%struct.bar, ptr addrspace(3) @var1, i64 0, i32 1), align 8
-; CHECK: %tmp1 = load float, ptr %tmp, align 4
-; CHECK: store float %conv1, ptr %tmp, align 4
-; CHECK: store i32 32, ptr addrspace(3) getelementptr inbounds (%struct.bar, ptr addrspace(3) @var1, i64 0, i32 1), align 4
 define void @bug31948(float %a, ptr nocapture readnone %x, ptr nocapture readnone %y) local_unnamed_addr #0 {
+; CHECK-LABEL: define void @bug31948(
+; CHECK-SAME: float [[A:%.*]], ptr nocapture readnone [[X:%.*]], ptr nocapture readnone [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP:%.*]] = load ptr, ptr addrspace(3) getelementptr inbounds ([[STRUCT_BAR:%.*]], ptr addrspace(3) @var1, i64 0, i32 1), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[TMP]], align 4
+; CHECK-NEXT:    [[CONV1:%.*]] = fadd float [[TMP1]], 1.000000e+00
+; CHECK-NEXT:    store float [[CONV1]], ptr [[TMP]], align 4
+; CHECK-NEXT:    store i32 32, ptr addrspace(3) getelementptr inbounds ([[STRUCT_BAR]], ptr addrspace(3) @var1, i64 0, i32 1), align 4
+; CHECK-NEXT:    ret void
+;
 entry:
   %tmp = load ptr, ptr getelementptr (%struct.bar, ptr addrspacecast (ptr addrspace(3) @var1 to ptr), i64 0, i32 1), align 8
   %tmp1 = load float, ptr %tmp, align 4
   %conv1 = fadd float %tmp1, 1.000000e+00
   store float %conv1, ptr %tmp, align 4
-  store i32 32, ptr bitcast (ptr getelementptr (%struct.bar, ptr addrspacecast (ptr addrspace(3) @var1 to ptr), i64 0, i32 1) to ptr), align 4
+  store i32 32, ptr getelementptr (%struct.bar, ptr addrspacecast (ptr addrspace(3) @var1 to ptr), i64 0, i32 1), align 4
   ret void
 }
 

From 9377412c5a8bbfbee93029ef22b4b74949cbe1b5 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
Date: Wed, 29 May 2024 08:18:57 -0500
Subject: [PATCH 121/230] [clang][OpenMP] Remove unused include of
 UniqueVector.h, NFC

---
 clang/lib/Parse/ParseOpenMP.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index e959dd6378f46b..cd8df3332724f0 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -25,7 +25,6 @@
 #include "clang/Sema/SemaOpenMP.h"
 #include "llvm/ADT/PointerIntPair.h"
 #include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/UniqueVector.h"
 #include "llvm/Frontend/OpenMP/OMPAssume.h"
 #include "llvm/Frontend/OpenMP/OMPContext.h"
 #include <optional>

From 1ea8caeada6efa991f7221f95fc6df581845895d Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov@arm.com>
Date: Wed, 29 May 2024 14:32:22 +0100
Subject: [PATCH 122/230] [AArch64] Add patterns for conversions using
 fixed-point scvtf (#92922)

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |   2 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  25 +++++
 .../AArch64/fixed-point-conv-vec-pat.ll       | 104 ++++++++++++++++++
 3 files changed, 130 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 814bbe27049820..3e2a5bfbc2321c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14365,7 +14365,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
       unsigned Opc =
           (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
       return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
-                         DAG.getConstant(Cnt, DL, MVT::i32));
+                         DAG.getConstant(Cnt, DL, MVT::i32), Op->getFlags());
     }
 
     // Right shift register.  Note, there is not a shift right register
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 4830033b23527c..dd54520c8ddadd 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -733,6 +733,12 @@ def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>;
 def AArch64ext : SDNode<"AArch64ISD::EXT", SDT_AArch64ExtVec>;
 
 def AArch64vashr : SDNode<"AArch64ISD::VASHR", SDT_AArch64vshift>;
+
+def AArch64vashr_exact : PatFrag<(ops          node:$lhs, node:$rhs),
+                                 (AArch64vashr node:$lhs, node:$rhs), [{
+  return N->getFlags().hasExact();
+}]>;
+
 def AArch64vlshr : SDNode<"AArch64ISD::VLSHR", SDT_AArch64vshift>;
 def AArch64vshl : SDNode<"AArch64ISD::VSHL", SDT_AArch64vshift>;
 def AArch64sqshli : SDNode<"AArch64ISD::SQSHL_I", SDT_AArch64vshift>;
@@ -7710,6 +7716,25 @@ defm SCVTF: SIMDVectorRShiftToFP<0, 0b11100, "scvtf",
 defm RSHRN   : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", AArch64rshrn>;
 defm SHL     : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>;
 
+let Predicates = [HasNEON] in {
+def : Pat<(v2f32 (sint_to_fp (v2i32 (AArch64vashr_exact v2i32:$Vn, i32:$shift)))),
+          (SCVTFv2i32_shift $Vn, vecshiftR32:$shift)>;
+
+def : Pat<(v4f32 (sint_to_fp (v4i32 (AArch64vashr_exact v4i32:$Vn, i32:$shift)))),
+          (SCVTFv4i32_shift $Vn, vecshiftR32:$shift)>;
+
+def : Pat<(v2f64 (sint_to_fp (v2i64 (AArch64vashr_exact v2i64:$Vn, i32:$shift)))),
+          (SCVTFv2i64_shift $Vn, vecshiftR64:$shift)>;
+}
+
+let Predicates = [HasNEON, HasFullFP16] in {
+def : Pat<(v4f16 (sint_to_fp (v4i16 (AArch64vashr_exact v4i16:$Vn, i32:$shift)))),
+          (SCVTFv4i16_shift $Vn, vecshiftR16:$shift)>;
+
+def : Pat<(v8f16 (sint_to_fp (v8i16 (AArch64vashr_exact v8i16:$Vn, i32:$shift)))),
+          (SCVTFv8i16_shift $Vn, vecshiftR16:$shift)>;
+}
+
 // X << 1 ==> X + X
 class SHLToADDPat<ValueType ty, RegisterClass regtype>
   : Pat<(ty (AArch64vshl (ty regtype:$Rn), (i32 1))),
diff --git a/llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll b/llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll
new file mode 100644
index 00000000000000..dff216192a6c3c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64"
+
+; First some corner cases
+define <4 x float> @f_v4_s0(<4 x i32> %u) {
+; CHECK-LABEL: f_v4_s0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-NEXT:    ret
+  %s = ashr exact <4 x i32> %u, <i32 0, i32 0, i32 0, i32 0>
+  %v = sitofp <4 x i32> %s to <4 x float>
+  ret <4 x float> %v
+}
+
+define <4 x float> @f_v4_s1(<4 x i32> %u) {
+; CHECK-LABEL: f_v4_s1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf v0.4s, v0.4s, #1
+; CHECK-NEXT:    ret
+  %s = ashr exact <4 x i32> %u, <i32 1, i32 1, i32 1, i32 1>
+  %v = sitofp <4 x i32> %s to <4 x float>
+  ret <4 x float> %v
+}
+
+define <4 x float> @f_v4_s24_inexact(<4 x i32> %u) {
+; CHECK-LABEL: f_v4_s24_inexact:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-NEXT:    ret
+  %s = ashr <4 x i32> %u, <i32 24, i32 24, i32 24, i32 24>
+  %v = sitofp <4 x i32> %s to <4 x float>
+  ret <4 x float> %v
+}
+
+define <4 x float> @f_v4_s31(<4 x i32> %u) {
+; CHECK-LABEL: f_v4_s31:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-NEXT:    ret
+  %s = ashr <4 x i32> %u, <i32 31, i32 31, i32 31, i32 31>
+  %v = sitofp <4 x i32> %s to <4 x float>
+  ret <4 x float> %v
+}
+
+; Common cases for conversion from signed integer to floating point types
+define <2 x float> @f_v2_s24(<2 x i32> %u) {
+; CHECK-LABEL: f_v2_s24:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf v0.2s, v0.2s, #24
+; CHECK-NEXT:    ret
+  %s = ashr exact <2 x i32> %u, <i32 24, i32 24>
+  %v = sitofp <2 x i32> %s to <2 x float>
+  ret <2 x float> %v
+}
+
+define <4 x float> @f_v4_s24(<4 x i32> %u) {
+; CHECK-LABEL: f_v4_s24:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf v0.4s, v0.4s, #24
+; CHECK-NEXT:    ret
+  %s = ashr exact <4 x i32> %u, <i32 24, i32 24, i32 24, i32 24>
+  %v = sitofp <4 x i32> %s to <4 x float>
+  ret <4 x float> %v
+}
+
+; Check legalisation to <2 x f64> does not get in the way
+define <8 x double> @d_v8_s64(<8 x i64> %u) {
+; CHECK-LABEL: d_v8_s64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf v0.2d, v0.2d, #56
+; CHECK-NEXT:    scvtf v1.2d, v1.2d, #56
+; CHECK-NEXT:    scvtf v2.2d, v2.2d, #56
+; CHECK-NEXT:    scvtf v3.2d, v3.2d, #56
+; CHECK-NEXT:    ret
+  %s = ashr exact <8 x i64> %u, <i64 56, i64 56, i64 56, i64 56, i64 56, i64 56, i64 56, i64 56>
+  %v = sitofp <8 x i64> %s to <8 x double>
+  ret <8 x double> %v
+}
+
+define <4 x half> @h_v4_s8(<4 x i16> %u) #0 {
+; CHECK-LABEL: h_v4_s8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf v0.4h, v0.4h, #8
+; CHECK-NEXT:    ret
+  %s = ashr exact <4 x i16> %u, <i16 8, i16 8, i16 8, i16 8>
+  %v = sitofp <4 x i16> %s to <4 x half>
+  ret <4 x half> %v
+}
+
+define <8 x half> @h_v8_s8(<8 x i16> %u) #0 {
+; CHECK-LABEL: h_v8_s8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf v0.8h, v0.8h, #8
+; CHECK-NEXT:    ret
+  %s = ashr exact <8 x i16> %u, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %v = sitofp <8 x i16> %s to <8 x half>
+  ret <8 x half> %v
+}
+
+attributes #0 = { "target-features"="+fullfp16"}

From 0dfd2bf4dfd3fc8c0733678186ceb37776597d35 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Wed, 29 May 2024 15:36:08 +0200
Subject: [PATCH 123/230] [LTT] Directly create inbounds gep (NFCI)

We know that this gep is inbounds. Constant expression construction
already infers this fact, but make it explicit.
---
 llvm/lib/Transforms/IPO/LowerTypeTests.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index 633fcb3314c42f..f86f217bca5886 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -879,7 +879,7 @@ void LowerTypeTestsModule::buildBitSetsFromGlobalVariables(
     // Multiply by 2 to account for padding elements.
     Constant *CombinedGlobalIdxs[] = {ConstantInt::get(Int32Ty, 0),
                                       ConstantInt::get(Int32Ty, I * 2)};
-    Constant *CombinedGlobalElemPtr = ConstantExpr::getGetElementPtr(
+    Constant *CombinedGlobalElemPtr = ConstantExpr::getInBoundsGetElementPtr(
         NewInit->getType(), CombinedGlobal, CombinedGlobalIdxs);
     assert(GV->getType()->getAddressSpace() == 0);
     GlobalAlias *GAlias =

From 180448b13c2bfc94f4eef64d2352ad4cf94f01c7 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Wed, 29 May 2024 14:40:08 +0100
Subject: [PATCH 124/230] [AMDGPU] Reduce use of continue in SIWholeQuadMode.
 NFC. (#93659)

---
 llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 92 +++++++++-------------
 1 file changed, 36 insertions(+), 56 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index ea8109bbee9aed..09dc1c781e2f30 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -278,11 +278,10 @@ LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
 
     for (const MachineInstr &MI : *BII.first) {
       auto III = Instructions.find(&MI);
-      if (III == Instructions.end())
-        continue;
-
-      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
-             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
+      if (III != Instructions.end()) {
+        dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
+               << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
+      }
     }
   }
 }
@@ -455,10 +454,8 @@ void SIWholeQuadMode::markOperand(const MachineInstr &MI,
     for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
       LiveRange &LR = LIS->getRegUnit(Unit);
       const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
-      if (!Value)
-        continue;
-
-      markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
+      if (Value)
+        markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
     }
   }
 }
@@ -499,19 +496,16 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
 
       if (TII->isWQM(Opcode)) {
         // If LOD is not supported WQM is not needed.
-        if (!ST->hasExtendedImageInsts())
-          continue;
         // Only generate implicit WQM if implicit derivatives are required.
         // This avoids inserting unintended WQM if a shader type without
         // implicit derivatives uses an image sampling instruction.
-        if (!HasImplicitDerivatives)
-          continue;
-        // Sampling instructions don't need to produce results for all pixels
-        // in a quad, they just require all inputs of a quad to have been
-        // computed for derivatives.
-        markInstructionUses(MI, StateWQM, Worklist);
-        GlobalFlags |= StateWQM;
-        continue;
+        if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
+          // Sampling instructions don't need to produce results for all pixels
+          // in a quad, they just require all inputs of a quad to have been
+          // computed for derivatives.
+          markInstructionUses(MI, StateWQM, Worklist);
+          GlobalFlags |= StateWQM;
+        }
       } else if (Opcode == AMDGPU::WQM) {
         // The WQM intrinsic requires its output to have all the helper lanes
         // correct, so we need it to be in WQM.
@@ -520,7 +514,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
       } else if (Opcode == AMDGPU::SOFT_WQM) {
         LowerToCopyInstrs.push_back(&MI);
         SoftWQMInstrs.push_back(&MI);
-        continue;
       } else if (Opcode == AMDGPU::STRICT_WWM) {
         // The STRICT_WWM intrinsic doesn't make the same guarantee, and plus
         // it needs to be executed in WQM or Exact so that its copy doesn't
@@ -528,7 +521,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         markInstructionUses(MI, StateStrictWWM, Worklist);
         GlobalFlags |= StateStrictWWM;
         LowerToMovInstrs.push_back(&MI);
-        continue;
       } else if (Opcode == AMDGPU::STRICT_WQM ||
                  TII->isDualSourceBlendEXP(MI)) {
         // STRICT_WQM is similar to STRICTWWM, but instead of enabling all
@@ -551,7 +543,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
           GlobalFlags |= StateExact;
           III.Disabled = StateWQM | StateStrict;
         }
-        continue;
       } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
                  Opcode == AMDGPU::DS_PARAM_LOAD ||
                  Opcode == AMDGPU::LDS_DIRECT_LOAD ||
@@ -561,7 +552,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         InstrInfo &II = Instructions[&MI];
         II.Needs |= StateStrictWQM;
         GlobalFlags |= StateStrictWQM;
-        continue;
       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                  Opcode == AMDGPU::V_SET_INACTIVE_B64) {
         III.Disabled = StateStrict;
@@ -574,7 +564,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
           }
         }
         SetInactiveInstrs.push_back(&MI);
-        continue;
       } else if (TII->isDisableWQM(MI)) {
         BBI.Needs |= StateExact;
         if (!(BBI.InNeeds & StateExact)) {
@@ -583,40 +572,33 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         }
         GlobalFlags |= StateExact;
         III.Disabled = StateWQM | StateStrict;
-        continue;
-      } else {
-        if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
-          LiveMaskQueries.push_back(&MI);
-        } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
-                   Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
-                   Opcode == AMDGPU::SI_DEMOTE_I1) {
-          KillInstrs.push_back(&MI);
-          BBI.NeedsLowering = true;
-        } else if (WQMOutputs) {
-          // The function is in machine SSA form, which means that physical
-          // VGPRs correspond to shader inputs and outputs. Inputs are
-          // only used, outputs are only defined.
-          // FIXME: is this still valid?
-          for (const MachineOperand &MO : MI.defs()) {
-            if (!MO.isReg())
-              continue;
-
-            Register Reg = MO.getReg();
-
-            if (!Reg.isVirtual() &&
-                TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
-              Flags = StateWQM;
-              break;
-            }
+      } else if (Opcode == AMDGPU::SI_PS_LIVE ||
+                 Opcode == AMDGPU::SI_LIVE_MASK) {
+        LiveMaskQueries.push_back(&MI);
+      } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
+                 Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
+                 Opcode == AMDGPU::SI_DEMOTE_I1) {
+        KillInstrs.push_back(&MI);
+        BBI.NeedsLowering = true;
+      } else if (WQMOutputs) {
+        // The function is in machine SSA form, which means that physical
+        // VGPRs correspond to shader inputs and outputs. Inputs are
+        // only used, outputs are only defined.
+        // FIXME: is this still valid?
+        for (const MachineOperand &MO : MI.defs()) {
+          Register Reg = MO.getReg();
+          if (Reg.isPhysical() &&
+              TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
+            Flags = StateWQM;
+            break;
           }
         }
-
-        if (!Flags)
-          continue;
       }
 
-      markInstruction(MI, Flags, Worklist);
-      GlobalFlags |= Flags;
+      if (Flags) {
+        markInstruction(MI, Flags, Worklist);
+        GlobalFlags |= Flags;
+      }
     }
   }
 
@@ -1568,8 +1550,6 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
       SplitPoint = lowerKillF32(*MBB, *MI);
       break;
-    default:
-      continue;
     }
     if (SplitPoint)
       splitBlock(MBB, SplitPoint);

From 24ddce62c8bb92a19ba3677629c77a2e6f137b1a Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Wed, 29 May 2024 21:42:08 +0800
Subject: [PATCH 125/230] [GISel] Legalize bitreverse with types smaller than 8
 bits (#92998)

This patch adds support for lowering `bitreverse` with types smaller
than 8 bits. It also fixes an existing assertion failure in
`llvm::APInt::getSplat`: https://godbolt.org/z/7crs8xrcG

The lowering logic is copied from SDAG:

https://github.com/llvm/llvm-project/blob/2034f2fc8729bd4645ef7caa3c5c6efa284d2d3f/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp#L9384-L9398
---
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |  64 ++--
 .../test/CodeGen/RISCV/GlobalISel/bitmanip.ll | 207 ++++++++++++
 .../legalizer/legalize-bitreverse-rv32.mir    | 276 +++++++++++++++-
 .../legalizer/legalize-bitreverse-rv64.mir    | 303 +++++++++++++++++-
 4 files changed, 828 insertions(+), 22 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index d8b0f52ecf9e32..9208b096affad9 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -7977,27 +7977,51 @@ LegalizerHelper::LegalizeResult
 LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
   auto [Dst, Src] = MI.getFirst2Regs();
   const LLT Ty = MRI.getType(Src);
-  unsigned Size = Ty.getSizeInBits();
+  unsigned Size = Ty.getScalarSizeInBits();
+
+  if (Size >= 8) {
+    MachineInstrBuilder BSWAP =
+        MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});
+
+    // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
+    //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
+    // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
+    MachineInstrBuilder Swap4 =
+        SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));
+
+    // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
+    //    [(val & 0xCCCCCCCC) >> 2] | [(val & 0x33333333) << 2]
+    // -> [(val & 0xCCCCCCCC) >> 2] | [(val << 2) & 0xCCCCCCCC]
+    MachineInstrBuilder Swap2 =
+        SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));
+
+    // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5
+    // 6|7
+    //    [(val & 0xAAAAAAAA) >> 1] & [(val & 0x55555555) << 1]
+    //    [(val & 0xAAAAAAAA) >> 1] | [(val & 0x55555555) << 1]
+    // -> [(val & 0xAAAAAAAA) >> 1] | [(val << 1) & 0xAAAAAAAA]
+  } else {
+    // Expand bitreverse for types smaller than 8 bits.
+    MachineInstrBuilder Tmp;
+    for (unsigned I = 0, J = Size - 1; I < Size; ++I, --J) {
+      MachineInstrBuilder Tmp2;
+      if (I < J) {
+        auto ShAmt = MIRBuilder.buildConstant(Ty, J - I);
+        Tmp2 = MIRBuilder.buildShl(Ty, Src, ShAmt);
+      } else {
+        auto ShAmt = MIRBuilder.buildConstant(Ty, I - J);
+        Tmp2 = MIRBuilder.buildLShr(Ty, Src, ShAmt);
+      }
 
-  MachineInstrBuilder BSWAP =
-      MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});
-
-  // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
-  //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
-  // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
-  MachineInstrBuilder Swap4 =
-      SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));
-
-  // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
-  //    [(val & 0xCCCCCCCC) >> 2] & [(val & 0x33333333) << 2]
-  // -> [(val & 0xCCCCCCCC) >> 2] & [(val << 2) & 0xCCCCCCCC]
-  MachineInstrBuilder Swap2 =
-      SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));
-
-  // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7
-  //    [(val & 0xAAAAAAAA) >> 1] & [(val & 0x55555555) << 1]
-  // -> [(val & 0xAAAAAAAA) >> 1] & [(val << 1) & 0xAAAAAAAA]
-  SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));
+      auto Mask = MIRBuilder.buildConstant(Ty, 1U << J);
+      Tmp2 = MIRBuilder.buildAnd(Ty, Tmp2, Mask);
+      if (I == 0)
+        Tmp = Tmp2;
+      else
+        Tmp = MIRBuilder.buildOr(Ty, Tmp, Tmp2);
+    }
+    MIRBuilder.buildCopy(Dst, Tmp);
+  }
 
   MI.eraseFromParent();
   return Legalized;
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll b/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll
new file mode 100644
index 00000000000000..5c42fefb95b39f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -global-isel -global-isel-abort=1 < %s 2>&1 | FileCheck %s --check-prefixes=RV32
+; RUN: llc -mtriple=riscv64 -global-isel -global-isel-abort=1 < %s 2>&1 | FileCheck %s --check-prefixes=RV64
+
+define i2 @bitreverse_i2(i2 %x) {
+; RV32-LABEL: bitreverse_i2:
+; RV32:       # %bb.0:
+; RV32-NEXT:    slli a1, a0, 1
+; RV32-NEXT:    andi a1, a1, 2
+; RV32-NEXT:    andi a0, a0, 3
+; RV32-NEXT:    srli a0, a0, 1
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: bitreverse_i2:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a1, a0, 1
+; RV64-NEXT:    andi a1, a1, 2
+; RV64-NEXT:    andi a0, a0, 3
+; RV64-NEXT:    srliw a0, a0, 1
+; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    ret
+  %rev = call i2 @llvm.bitreverse.i2(i2 %x)
+  ret i2 %rev
+}
+
+define i3 @bitreverse_i3(i3 %x) {
+; RV32-LABEL: bitreverse_i3:
+; RV32:       # %bb.0:
+; RV32-NEXT:    slli a1, a0, 2
+; RV32-NEXT:    andi a1, a1, 4
+; RV32-NEXT:    andi a0, a0, 7
+; RV32-NEXT:    andi a2, a0, 2
+; RV32-NEXT:    or a1, a1, a2
+; RV32-NEXT:    srli a0, a0, 2
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: bitreverse_i3:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a1, a0, 2
+; RV64-NEXT:    andi a1, a1, 4
+; RV64-NEXT:    andi a0, a0, 7
+; RV64-NEXT:    andi a2, a0, 2
+; RV64-NEXT:    or a1, a1, a2
+; RV64-NEXT:    srliw a0, a0, 2
+; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    ret
+  %rev = call i3 @llvm.bitreverse.i3(i3 %x)
+  ret i3 %rev
+}
+
+define i4 @bitreverse_i4(i4 %x) {
+; RV32-LABEL: bitreverse_i4:
+; RV32:       # %bb.0:
+; RV32-NEXT:    slli a1, a0, 3
+; RV32-NEXT:    andi a1, a1, 8
+; RV32-NEXT:    slli a2, a0, 1
+; RV32-NEXT:    andi a2, a2, 4
+; RV32-NEXT:    or a1, a1, a2
+; RV32-NEXT:    andi a0, a0, 15
+; RV32-NEXT:    srli a2, a0, 1
+; RV32-NEXT:    andi a2, a2, 2
+; RV32-NEXT:    or a1, a1, a2
+; RV32-NEXT:    srli a0, a0, 3
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: bitreverse_i4:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a1, a0, 3
+; RV64-NEXT:    andi a1, a1, 8
+; RV64-NEXT:    slli a2, a0, 1
+; RV64-NEXT:    andi a2, a2, 4
+; RV64-NEXT:    or a1, a1, a2
+; RV64-NEXT:    andi a0, a0, 15
+; RV64-NEXT:    srliw a2, a0, 1
+; RV64-NEXT:    andi a2, a2, 2
+; RV64-NEXT:    or a1, a1, a2
+; RV64-NEXT:    srliw a0, a0, 3
+; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    ret
+  %rev = call i4 @llvm.bitreverse.i4(i4 %x)
+  ret i4 %rev
+}
+
+define i7 @bitreverse_i7(i7 %x) {
+; RV32-LABEL: bitreverse_i7:
+; RV32:       # %bb.0:
+; RV32-NEXT:    slli a1, a0, 6
+; RV32-NEXT:    andi a1, a1, 64
+; RV32-NEXT:    slli a2, a0, 4
+; RV32-NEXT:    andi a2, a2, 32
+; RV32-NEXT:    or a1, a1, a2
+; RV32-NEXT:    slli a2, a0, 2
+; RV32-NEXT:    andi a2, a2, 16
+; RV32-NEXT:    andi a0, a0, 127
+; RV32-NEXT:    andi a3, a0, 8
+; RV32-NEXT:    or a2, a2, a3
+; RV32-NEXT:    or a1, a1, a2
+; RV32-NEXT:    srli a2, a0, 2
+; RV32-NEXT:    andi a2, a2, 4
+; RV32-NEXT:    srli a3, a0, 4
+; RV32-NEXT:    andi a3, a3, 2
+; RV32-NEXT:    or a2, a2, a3
+; RV32-NEXT:    or a1, a1, a2
+; RV32-NEXT:    srli a0, a0, 6
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: bitreverse_i7:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a1, a0, 6
+; RV64-NEXT:    andi a1, a1, 64
+; RV64-NEXT:    slli a2, a0, 4
+; RV64-NEXT:    andi a2, a2, 32
+; RV64-NEXT:    or a1, a1, a2
+; RV64-NEXT:    slli a2, a0, 2
+; RV64-NEXT:    andi a2, a2, 16
+; RV64-NEXT:    andi a0, a0, 127
+; RV64-NEXT:    andi a3, a0, 8
+; RV64-NEXT:    or a2, a2, a3
+; RV64-NEXT:    or a1, a1, a2
+; RV64-NEXT:    srliw a2, a0, 2
+; RV64-NEXT:    andi a2, a2, 4
+; RV64-NEXT:    srliw a3, a0, 4
+; RV64-NEXT:    andi a3, a3, 2
+; RV64-NEXT:    or a2, a2, a3
+; RV64-NEXT:    or a1, a1, a2
+; RV64-NEXT:    srliw a0, a0, 6
+; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    ret
+  %rev = call i7 @llvm.bitreverse.i7(i7 %x)
+  ret i7 %rev
+}
+
+define i24 @bitreverse_i24(i24 %x) {
+; RV32-LABEL: bitreverse_i24:
+; RV32:       # %bb.0:
+; RV32-NEXT:    slli a1, a0, 16
+; RV32-NEXT:    lui a2, 4096
+; RV32-NEXT:    addi a2, a2, -1
+; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    srli a0, a0, 16
+; RV32-NEXT:    or a0, a0, a1
+; RV32-NEXT:    lui a1, 1048335
+; RV32-NEXT:    addi a1, a1, 240
+; RV32-NEXT:    and a3, a1, a2
+; RV32-NEXT:    and a3, a0, a3
+; RV32-NEXT:    srli a3, a3, 4
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    or a0, a3, a0
+; RV32-NEXT:    lui a1, 1047757
+; RV32-NEXT:    addi a1, a1, -820
+; RV32-NEXT:    and a3, a1, a2
+; RV32-NEXT:    and a3, a0, a3
+; RV32-NEXT:    srli a3, a3, 2
+; RV32-NEXT:    slli a0, a0, 2
+; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    or a0, a3, a0
+; RV32-NEXT:    lui a1, 1047211
+; RV32-NEXT:    addi a1, a1, -1366
+; RV32-NEXT:    and a2, a1, a2
+; RV32-NEXT:    and a2, a0, a2
+; RV32-NEXT:    srli a2, a2, 1
+; RV32-NEXT:    slli a0, a0, 1
+; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    or a0, a2, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: bitreverse_i24:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a1, a0, 16
+; RV64-NEXT:    lui a2, 4096
+; RV64-NEXT:    addi a2, a2, -1
+; RV64-NEXT:    and a0, a0, a2
+; RV64-NEXT:    srliw a0, a0, 16
+; RV64-NEXT:    or a0, a0, a1
+; RV64-NEXT:    lui a1, 1048335
+; RV64-NEXT:    addi a1, a1, 240
+; RV64-NEXT:    and a3, a1, a2
+; RV64-NEXT:    and a3, a0, a3
+; RV64-NEXT:    srliw a3, a3, 4
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    or a0, a3, a0
+; RV64-NEXT:    lui a1, 1047757
+; RV64-NEXT:    addi a1, a1, -820
+; RV64-NEXT:    and a3, a1, a2
+; RV64-NEXT:    and a3, a0, a3
+; RV64-NEXT:    srliw a3, a3, 2
+; RV64-NEXT:    slli a0, a0, 2
+; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    or a0, a3, a0
+; RV64-NEXT:    lui a1, 1047211
+; RV64-NEXT:    addiw a1, a1, -1366
+; RV64-NEXT:    and a2, a1, a2
+; RV64-NEXT:    and a2, a0, a2
+; RV64-NEXT:    srliw a2, a2, 1
+; RV64-NEXT:    slliw a0, a0, 1
+; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    or a0, a2, a0
+; RV64-NEXT:    ret
+  %rev = call i24 @llvm.bitreverse.i24(i24 %x)
+  ret i24 %rev
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv32.mir
index 5044514babe54a..7625a5c2d568a9 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv32.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv32.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
-# RUN: llc -mtriple=riscv32 -run-pass=legalizer %s -o - | FileCheck %s
+# RUN: llc -mtriple=riscv32 -mattr=+v -global-isel-abort=0 -run-pass=legalizer %s -o - | FileCheck %s
 
 ---
 name:            bitreverse_i8
@@ -248,3 +248,277 @@ body:             |
     PseudoRET implicit $x10, implicit $x11
 
 ...
+---
+name:            bitreverse_i2
+body:             |
+  bb.1.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: bitreverse_i2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C3]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[C2]](s32)
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C4]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND2]]
+    ; CHECK-NEXT: $x10 = COPY [[OR]](s32)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s32) = COPY $x10
+    %0:_(s2) = G_TRUNC %1(s32)
+    %2:_(s2) = G_BITREVERSE %0
+    %3:_(s32) = G_ANYEXT %2(s2)
+    $x10 = COPY %3(s32)
+    PseudoRET implicit $x10
+
+...
+---
+name:            bitreverse_i3
+body:             |
+  bb.1.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: bitreverse_i3
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C3]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[C2]](s32)
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C4]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND2]]
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C6]]
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[C5]](s32)
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C7]]
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[AND4]]
+    ; CHECK-NEXT: $x10 = COPY [[OR1]](s32)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s32) = COPY $x10
+    %0:_(s3) = G_TRUNC %1(s32)
+    %2:_(s3) = G_BITREVERSE %0
+    %3:_(s32) = G_ANYEXT %2(s3)
+    $x10 = COPY %3(s32)
+    PseudoRET implicit $x10
+
+...
+---
+name:            bitreverse_i4
+body:             |
+  bb.1.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: bitreverse_i4
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C2]](s32)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SHL1]], [[C3]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND1]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C5]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C4]](s32)
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C6]]
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[AND3]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C8]]
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND4]], [[C7]](s32)
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C9]]
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[AND5]]
+    ; CHECK-NEXT: $x10 = COPY [[OR2]](s32)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s32) = COPY $x10
+    %0:_(s4) = G_TRUNC %1(s32)
+    %2:_(s4) = G_BITREVERSE %0
+    %3:_(s32) = G_ANYEXT %2(s4)
+    $x10 = COPY %3(s32)
+    PseudoRET implicit $x10
+
+...
+---
+name:            bitreverse_i7
+body:             |
+  bb.1.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: bitreverse_i7
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 64
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C2]](s32)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SHL1]], [[C3]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND1]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C4]](s32)
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SHL2]], [[C5]]
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[AND2]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C7]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[C6]](s32)
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C8]]
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[AND4]]
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C10]]
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[C9]](s32)
+    ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C11]]
+    ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR2]], [[AND6]]
+    ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
+    ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C13]]
+    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[C12]](s32)
+    ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[LSHR2]], [[C14]]
+    ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[AND8]]
+    ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+    ; CHECK-NEXT: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
+    ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C16]]
+    ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND9]], [[C15]](s32)
+    ; CHECK-NEXT: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[LSHR3]], [[C17]]
+    ; CHECK-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[AND10]]
+    ; CHECK-NEXT: $x10 = COPY [[OR5]](s32)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s32) = COPY $x10
+    %0:_(s7) = G_TRUNC %1(s32)
+    %2:_(s7) = G_BITREVERSE %0
+    %3:_(s32) = G_ANYEXT %2(s7)
+    $x10 = COPY %3(s32)
+    PseudoRET implicit $x10
+
+...
+---
+name:            bitreverse_i24
+body:             |
+  bb.1.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: bitreverse_i24
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C]](s32)
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[LSHR]], [[SHL]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -986896
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[OR]], [[C3]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[AND1]], [[C4]]
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C2]](s32)
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[OR]], [[C2]](s32)
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SHL1]], [[C3]]
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[LSHR1]], [[AND3]]
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 -3355444
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[OR1]], [[C6]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[AND4]], [[C7]]
+    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[C5]](s32)
+    ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C5]](s32)
+    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[SHL2]], [[C6]]
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[LSHR2]], [[AND6]]
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 -5592406
+    ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[OR2]], [[C9]]
+    ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215
+    ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[AND7]], [[C10]]
+    ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND8]], [[C8]](s32)
+    ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[OR2]], [[C8]](s32)
+    ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[SHL3]], [[C9]]
+    ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[LSHR3]], [[AND9]]
+    ; CHECK-NEXT: $x10 = COPY [[OR3]](s32)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s32) = COPY $x10
+    %0:_(s24) = G_TRUNC %1(s32)
+    %2:_(s24) = G_BITREVERSE %0
+    %3:_(s32) = G_ANYEXT %2(s24)
+    $x10 = COPY %3(s32)
+    PseudoRET implicit $x10
+
+...
+---
+name:            bitreverse_v2i4
+body:             |
+  bb.1.entry:
+
+    ; CHECK-LABEL: name: bitreverse_v2i4
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $v8
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s4>) = G_TRUNC [[COPY]](<2 x s32>)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s4) = G_CONSTANT i4 3
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C]](s4), [[C]](s4)
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(<2 x s4>) = G_SHL [[TRUNC]], [[BUILD_VECTOR]](<2 x s4>)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s4) = G_CONSTANT i4 -8
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C1]](s4), [[C1]](s4)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<2 x s4>) = G_AND [[SHL]], [[BUILD_VECTOR1]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s4) = G_CONSTANT i4 1
+    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C2]](s4), [[C2]](s4)
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s4>) = G_SHL [[TRUNC]], [[BUILD_VECTOR2]](<2 x s4>)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s4) = G_CONSTANT i4 4
+    ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C3]](s4), [[C3]](s4)
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<2 x s4>) = G_AND [[SHL1]], [[BUILD_VECTOR3]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<2 x s4>) = G_OR [[AND]], [[AND1]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s4) = G_CONSTANT i4 1
+    ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C4]](s4), [[C4]](s4)
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<2 x s4>) = G_LSHR [[TRUNC]], [[BUILD_VECTOR4]](<2 x s4>)
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s4) = G_CONSTANT i4 2
+    ; CHECK-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C5]](s4), [[C5]](s4)
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(<2 x s4>) = G_AND [[LSHR]], [[BUILD_VECTOR5]]
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(<2 x s4>) = G_OR [[OR]], [[AND2]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s4) = G_CONSTANT i4 3
+    ; CHECK-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C6]](s4), [[C6]](s4)
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<2 x s4>) = G_LSHR [[TRUNC]], [[BUILD_VECTOR6]](<2 x s4>)
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s4) = G_CONSTANT i4 1
+    ; CHECK-NEXT: [[BUILD_VECTOR7:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C7]](s4), [[C7]](s4)
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(<2 x s4>) = G_AND [[LSHR1]], [[BUILD_VECTOR7]]
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(<2 x s4>) = G_OR [[OR1]], [[AND3]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s4>) = COPY [[OR2]](<2 x s4>)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<2 x s32>) = G_ANYEXT [[COPY1]](<2 x s4>)
+    ; CHECK-NEXT: $v8 = COPY [[ANYEXT]](<2 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<2 x s32>) = COPY $v8
+    %0:_(<2 x s4>) = G_TRUNC %1(<2 x s32>)
+    %2:_(<2 x s4>) = G_BITREVERSE %0
+    %3:_(<2 x s32>) = G_ANYEXT %2(<2 x s4>)
+    $v8 = COPY %3(<2 x s32>)
+    PseudoRET implicit $v8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir
index d1473504651668..71583f15cd5cd1 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
-# RUN: llc -mtriple=riscv64 -run-pass=legalizer %s -o - | FileCheck %s
+# RUN: llc -mtriple=riscv64 -mattr=+v -global-isel-abort=0 -run-pass=legalizer %s -o - | FileCheck %s
 
 ---
 name:            bitreverse_i8
@@ -251,3 +251,304 @@ body:             |
     PseudoRET implicit $x10
 
 ...
+---
+name:            bitreverse_i2
+body:             |
+  bb.1.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: bitreverse_i2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]]
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C2]]
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[C3]](s64)
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C4]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND2]]
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR]](s32)
+    ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s64) = COPY $x10
+    %0:_(s2) = G_TRUNC %1(s64)
+    %2:_(s2) = G_BITREVERSE %0
+    %3:_(s64) = G_ANYEXT %2(s2)
+    $x10 = COPY %3(s64)
+    PseudoRET implicit $x10
+
+...
+---
+name:            bitreverse_i3
+body:             |
+  bb.1.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: bitreverse_i3
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]]
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C2]]
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[C3]](s64)
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C4]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND2]]
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[TRUNC2]], [[C5]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[C6]](s64)
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C7]]
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[AND4]]
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
+    ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s64) = COPY $x10
+    %0:_(s3) = G_TRUNC %1(s64)
+    %2:_(s3) = G_BITREVERSE %0
+    %3:_(s64) = G_ANYEXT %2(s3)
+    $x10 = COPY %3(s64)
+    PseudoRET implicit $x10
+
+...
+---
+name:            bitreverse_i4
+body:             |
+  bb.1.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: bitreverse_i4
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]]
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[C2]](s64)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SHL1]], [[C3]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND1]]
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[TRUNC2]], [[C4]]
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C5]](s64)
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C6]]
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[AND3]]
+    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[TRUNC3]], [[C7]]
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND4]], [[C8]](s64)
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C9]]
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[AND5]]
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR2]](s32)
+    ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s64) = COPY $x10
+    %0:_(s4) = G_TRUNC %1(s64)
+    %2:_(s4) = G_BITREVERSE %0
+    %3:_(s64) = G_ANYEXT %2(s4)
+    $x10 = COPY %3(s64)
+    PseudoRET implicit $x10
+
+...
+---
+name:            bitreverse_i7
+body:             |
+  bb.1.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: bitreverse_i7
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 64
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]]
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[C2]](s64)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SHL1]], [[C3]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND1]]
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[TRUNC2]], [[C4]](s64)
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SHL2]], [[C5]]
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[AND2]]
+    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[TRUNC3]], [[C6]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[C7]](s64)
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C8]]
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[AND4]]
+    ; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[TRUNC4]], [[C9]]
+    ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[C10]](s64)
+    ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C11]]
+    ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR2]], [[AND6]]
+    ; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
+    ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[TRUNC5]], [[C12]]
+    ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[C13]](s64)
+    ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[LSHR2]], [[C14]]
+    ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[AND8]]
+    ; CHECK-NEXT: [[TRUNC6:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
+    ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[TRUNC6]], [[C15]]
+    ; CHECK-NEXT: [[C16:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+    ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND9]], [[C16]](s64)
+    ; CHECK-NEXT: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[LSHR3]], [[C17]]
+    ; CHECK-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[AND10]]
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s64) = COPY $x10
+    %0:_(s7) = G_TRUNC %1(s64)
+    %2:_(s7) = G_BITREVERSE %0
+    %3:_(s64) = G_ANYEXT %2(s7)
+    $x10 = COPY %3(s64)
+    PseudoRET implicit $x10
+
+...
+---
+name:            bitreverse_i24
+body:             |
+  bb.1.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: bitreverse_i24
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s64)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C1]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C2]](s64)
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[LSHR]], [[SHL]]
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -986896
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[OR]], [[C3]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[AND1]], [[C4]]
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C5]](s64)
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[OR]], [[C6]](s64)
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SHL1]], [[C3]]
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[LSHR1]], [[AND3]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 -3355444
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[OR1]], [[C7]]
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[AND4]], [[C8]]
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[C9]](s64)
+    ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C10]](s64)
+    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[SHL2]], [[C7]]
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[LSHR2]], [[AND6]]
+    ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 -5592406
+    ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[OR2]], [[C11]]
+    ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215
+    ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[AND7]], [[C12]]
+    ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND8]], [[C13]](s64)
+    ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[OR2]], [[C14]](s64)
+    ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[SHL3]], [[C11]]
+    ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[LSHR3]], [[AND9]]
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR3]](s32)
+    ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s64) = COPY $x10
+    %0:_(s24) = G_TRUNC %1(s64)
+    %2:_(s24) = G_BITREVERSE %0
+    %3:_(s64) = G_ANYEXT %2(s24)
+    $x10 = COPY %3(s64)
+    PseudoRET implicit $x10
+
+...
+---
+name:            bitreverse_v2i4
+body:             |
+  bb.1.entry:
+
+    ; CHECK-LABEL: name: bitreverse_v2i4
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $v8
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s4>) = G_TRUNC [[COPY]](<2 x s32>)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s4) = G_CONSTANT i4 3
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C]](s4), [[C]](s4)
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(<2 x s4>) = G_SHL [[TRUNC]], [[BUILD_VECTOR]](<2 x s4>)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s4) = G_CONSTANT i4 -8
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C1]](s4), [[C1]](s4)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<2 x s4>) = G_AND [[SHL]], [[BUILD_VECTOR1]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s4) = G_CONSTANT i4 1
+    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C2]](s4), [[C2]](s4)
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s4>) = G_SHL [[TRUNC]], [[BUILD_VECTOR2]](<2 x s4>)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s4) = G_CONSTANT i4 4
+    ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C3]](s4), [[C3]](s4)
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<2 x s4>) = G_AND [[SHL1]], [[BUILD_VECTOR3]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<2 x s4>) = G_OR [[AND]], [[AND1]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s4) = G_CONSTANT i4 1
+    ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C4]](s4), [[C4]](s4)
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<2 x s4>) = G_LSHR [[TRUNC]], [[BUILD_VECTOR4]](<2 x s4>)
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s4) = G_CONSTANT i4 2
+    ; CHECK-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C5]](s4), [[C5]](s4)
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(<2 x s4>) = G_AND [[LSHR]], [[BUILD_VECTOR5]]
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(<2 x s4>) = G_OR [[OR]], [[AND2]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s4) = G_CONSTANT i4 3
+    ; CHECK-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C6]](s4), [[C6]](s4)
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<2 x s4>) = G_LSHR [[TRUNC]], [[BUILD_VECTOR6]](<2 x s4>)
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s4) = G_CONSTANT i4 1
+    ; CHECK-NEXT: [[BUILD_VECTOR7:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C7]](s4), [[C7]](s4)
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(<2 x s4>) = G_AND [[LSHR1]], [[BUILD_VECTOR7]]
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(<2 x s4>) = G_OR [[OR1]], [[AND3]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s4>) = COPY [[OR2]](<2 x s4>)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<2 x s32>) = G_ANYEXT [[COPY1]](<2 x s4>)
+    ; CHECK-NEXT: $v8 = COPY [[ANYEXT]](<2 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<2 x s32>) = COPY $v8
+    %0:_(<2 x s4>) = G_TRUNC %1(<2 x s32>)
+    %2:_(<2 x s4>) = G_BITREVERSE %0
+    %3:_(<2 x s32>) = G_ANYEXT %2(<2 x s4>)
+    $v8 = COPY %3(<2 x s32>)
+    PseudoRET implicit $v8
+
+...

From fba84ecc158ec4a9e0eae91d923d4a8f15e7ed6f Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Wed, 29 May 2024 15:42:09 +0200
Subject: [PATCH 126/230] [WPD] Directly create getelementptr inbounds (NFCI)

We know that this GEP is inbounds, so make it explicit. NFCI
because constant expression construction already infers this.
---
 llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index e7a188e9431db5..9929ebb96dcafe 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -1927,7 +1927,7 @@ void DevirtModule::rebuildGlobal(VTableBits &B) {
   // element (the original initializer).
   auto Alias = GlobalAlias::create(
       B.GV->getInitializer()->getType(), 0, B.GV->getLinkage(), "",
-      ConstantExpr::getGetElementPtr(
+      ConstantExpr::getInBoundsGetElementPtr(
           NewInit->getType(), NewGV,
           ArrayRef<Constant *>{ConstantInt::get(Int32Ty, 0),
                                ConstantInt::get(Int32Ty, 1)}),

From 886d31675dbb6fe8cf97fd9083870bd043ce9f02 Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh@arm.com>
Date: Wed, 29 May 2024 13:18:01 +0000
Subject: [PATCH 127/230] [AArch64][NFC] Pre-commit test for Push ADD/SUB
 through {S|Z}EXT (#90964)

---
 .../AArch64/GlobalISel/combine-add.mir        |  119 ++
 llvm/test/CodeGen/AArch64/neon-extadd.ll      | 1785 +++++++++++++----
 2 files changed, 1514 insertions(+), 390 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir
index fad3655da9d013..78411f34bebd31 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir
@@ -207,3 +207,122 @@ body:             |
     %3:_(<4 x s32>) = G_FADD %0, %2(<4 x s32>)
     $q0 = COPY %3(<4 x s32>)
 ...
+---
+name:            saddl_v8i8_v8i32
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $d0, $d1
+
+    ; CHECK-LABEL: name: saddl_v8i8_v8i32
+    ; CHECK: liveins: $d0, $d1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY]](<8 x s8>)
+    ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY1]](<8 x s8>)
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s32>) = G_ADD [[SEXT]], [[SEXT1]]
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[ADD]](<8 x s32>)
+    ; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>)
+    ; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
+    %0:_(<8 x s8>) = COPY $d0
+    %1:_(<8 x s8>) = COPY $d1
+    %2:_(<8 x s32>) = G_SEXT %0(<8 x s8>)
+    %3:_(<8 x s32>) = G_SEXT %1(<8 x s8>)
+    %4:_(<8 x s32>) = G_ADD %2, %3
+    %5:_(<4 x s32>), %6:_(<4 x s32>) = G_UNMERGE_VALUES %4(<8 x s32>)
+    $q0 = COPY %5(<4 x s32>)
+    $q1 = COPY %6(<4 x s32>)
+    RET_ReallyLR implicit $q0, implicit $q1
+...
+
+---
+name:            uaddl_v8i8_v8i32
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $d0, $d1
+
+    ; CHECK-LABEL: name: uaddl_v8i8_v8i32
+    ; CHECK: liveins: $d0, $d1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[COPY]](<8 x s8>)
+    ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[COPY1]](<8 x s8>)
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s32>) = G_ADD [[ZEXT]], [[ZEXT1]]
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[ADD]](<8 x s32>)
+    ; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>)
+    ; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
+    %0:_(<8 x s8>) = COPY $d0
+    %1:_(<8 x s8>) = COPY $d1
+    %2:_(<8 x s32>) = G_ZEXT %0(<8 x s8>)
+    %3:_(<8 x s32>) = G_ZEXT %1(<8 x s8>)
+    %4:_(<8 x s32>) = G_ADD %2, %3
+    %5:_(<4 x s32>), %6:_(<4 x s32>) = G_UNMERGE_VALUES %4(<8 x s32>)
+    $q0 = COPY %5(<4 x s32>)
+    $q1 = COPY %6(<4 x s32>)
+    RET_ReallyLR implicit $q0, implicit $q1
+...
+
+---
+name:            ssubl_v8i8_v8i32
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $d0, $d1
+
+    ; CHECK-LABEL: name: ssubl_v8i8_v8i32
+    ; CHECK: liveins: $d0, $d1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY]](<8 x s8>)
+    ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY1]](<8 x s8>)
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s32>) = G_SUB [[SEXT]], [[SEXT1]]
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SUB]](<8 x s32>)
+    ; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>)
+    ; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
+    %0:_(<8 x s8>) = COPY $d0
+    %1:_(<8 x s8>) = COPY $d1
+    %2:_(<8 x s32>) = G_SEXT %0(<8 x s8>)
+    %3:_(<8 x s32>) = G_SEXT %1(<8 x s8>)
+    %4:_(<8 x s32>) = G_SUB %2, %3
+    %5:_(<4 x s32>), %6:_(<4 x s32>) = G_UNMERGE_VALUES %4(<8 x s32>)
+    $q0 = COPY %5(<4 x s32>)
+    $q1 = COPY %6(<4 x s32>)
+    RET_ReallyLR implicit $q0, implicit $q1
+...
+
+---
+name:            usubl_v8i8_v8i32
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $d0, $d1
+
+    ; CHECK-LABEL: name: usubl_v8i8_v8i32
+    ; CHECK: liveins: $d0, $d1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[COPY]](<8 x s8>)
+    ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[COPY1]](<8 x s8>)
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s32>) = G_SUB [[ZEXT]], [[ZEXT1]]
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SUB]](<8 x s32>)
+    ; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>)
+    ; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
+    %0:_(<8 x s8>) = COPY $d0
+    %1:_(<8 x s8>) = COPY $d1
+    %2:_(<8 x s32>) = G_ZEXT %0(<8 x s8>)
+    %3:_(<8 x s32>) = G_ZEXT %1(<8 x s8>)
+    %4:_(<8 x s32>) = G_SUB %2, %3
+    %5:_(<4 x s32>), %6:_(<4 x s32>) = G_UNMERGE_VALUES %4(<8 x s32>)
+    $q0 = COPY %5(<4 x s32>)
+    $q1 = COPY %6(<4 x s32>)
+    RET_ReallyLR implicit $q0, implicit $q1
+...
diff --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll
index 16200435c5c31d..6aa9c394a8fd1f 100644
--- a/llvm/test/CodeGen/AArch64/neon-extadd.ll
+++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple aarch64 -o - | FileCheck %s
+; RUN: llc < %s -mtriple aarch64 -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple aarch64 -o - -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define <8 x i16> @extadds_v8i8_i16(<8 x i8> %s0, <8 x i8> %s1) {
 ; CHECK-LABEL: extadds_v8i8_i16:
@@ -26,12 +27,19 @@ entry:
 }
 
 define <16 x i16> @extadds_v16i8_i16(<16 x i8> %s0, <16 x i8> %s1) {
-; CHECK-LABEL: extadds_v16i8_i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl2 v2.8h, v0.16b, v1.16b
-; CHECK-NEXT:    saddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    mov v1.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extadds_v16i8_i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl2 v2.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    saddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    mov v1.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v16i8_i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    saddl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    saddl2 v1.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <16 x i8> %s0 to <16 x i16>
   %s1s = sext <16 x i8> %s1 to <16 x i16>
@@ -40,12 +48,19 @@ entry:
 }
 
 define <16 x i16> @extaddu_v16i8_i16(<16 x i8> %s0, <16 x i8> %s1) {
-; CHECK-LABEL: extaddu_v16i8_i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl2 v2.8h, v0.16b, v1.16b
-; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    mov v1.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extaddu_v16i8_i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl2 v2.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    mov v1.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v16i8_i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uaddl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    uaddl2 v1.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <16 x i8> %s0 to <16 x i16>
   %s1s = zext <16 x i8> %s1 to <16 x i16>
@@ -54,16 +69,26 @@ entry:
 }
 
 define <32 x i16> @extadds_v32i8_i16(<32 x i8> %s0, <32 x i8> %s1) {
-; CHECK-LABEL: extadds_v32i8_i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl2 v4.8h, v1.16b, v3.16b
-; CHECK-NEXT:    saddl v5.8h, v0.8b, v2.8b
-; CHECK-NEXT:    saddl2 v6.8h, v0.16b, v2.16b
-; CHECK-NEXT:    saddl v2.8h, v1.8b, v3.8b
-; CHECK-NEXT:    mov v0.16b, v5.16b
-; CHECK-NEXT:    mov v1.16b, v6.16b
-; CHECK-NEXT:    mov v3.16b, v4.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extadds_v32i8_i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl2 v4.8h, v1.16b, v3.16b
+; CHECK-SD-NEXT:    saddl v5.8h, v0.8b, v2.8b
+; CHECK-SD-NEXT:    saddl2 v6.8h, v0.16b, v2.16b
+; CHECK-SD-NEXT:    saddl v2.8h, v1.8b, v3.8b
+; CHECK-SD-NEXT:    mov v0.16b, v5.16b
+; CHECK-SD-NEXT:    mov v1.16b, v6.16b
+; CHECK-SD-NEXT:    mov v3.16b, v4.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v32i8_i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    saddl v4.8h, v0.8b, v2.8b
+; CHECK-GI-NEXT:    saddl2 v5.8h, v0.16b, v2.16b
+; CHECK-GI-NEXT:    saddl v2.8h, v1.8b, v3.8b
+; CHECK-GI-NEXT:    saddl2 v3.8h, v1.16b, v3.16b
+; CHECK-GI-NEXT:    mov v0.16b, v4.16b
+; CHECK-GI-NEXT:    mov v1.16b, v5.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <32 x i8> %s0 to <32 x i16>
   %s1s = sext <32 x i8> %s1 to <32 x i16>
@@ -72,16 +97,26 @@ entry:
 }
 
 define <32 x i16> @extaddu_v32i8_i16(<32 x i8> %s0, <32 x i8> %s1) {
-; CHECK-LABEL: extaddu_v32i8_i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl2 v4.8h, v1.16b, v3.16b
-; CHECK-NEXT:    uaddl v5.8h, v0.8b, v2.8b
-; CHECK-NEXT:    uaddl2 v6.8h, v0.16b, v2.16b
-; CHECK-NEXT:    uaddl v2.8h, v1.8b, v3.8b
-; CHECK-NEXT:    mov v0.16b, v5.16b
-; CHECK-NEXT:    mov v1.16b, v6.16b
-; CHECK-NEXT:    mov v3.16b, v4.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extaddu_v32i8_i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl2 v4.8h, v1.16b, v3.16b
+; CHECK-SD-NEXT:    uaddl v5.8h, v0.8b, v2.8b
+; CHECK-SD-NEXT:    uaddl2 v6.8h, v0.16b, v2.16b
+; CHECK-SD-NEXT:    uaddl v2.8h, v1.8b, v3.8b
+; CHECK-SD-NEXT:    mov v0.16b, v5.16b
+; CHECK-SD-NEXT:    mov v1.16b, v6.16b
+; CHECK-SD-NEXT:    mov v3.16b, v4.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v32i8_i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uaddl v4.8h, v0.8b, v2.8b
+; CHECK-GI-NEXT:    uaddl2 v5.8h, v0.16b, v2.16b
+; CHECK-GI-NEXT:    uaddl v2.8h, v1.8b, v3.8b
+; CHECK-GI-NEXT:    uaddl2 v3.8h, v1.16b, v3.16b
+; CHECK-GI-NEXT:    mov v0.16b, v4.16b
+; CHECK-GI-NEXT:    mov v1.16b, v5.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <32 x i8> %s0 to <32 x i16>
   %s1s = zext <32 x i8> %s1 to <32 x i16>
@@ -90,12 +125,20 @@ entry:
 }
 
 define <8 x i32> @extadds_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) {
-; CHECK-LABEL: extadds_v8i8_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extadds_v8i8_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v8i8_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    saddl v0.4s, v2.4h, v1.4h
+; CHECK-GI-NEXT:    saddl2 v1.4s, v2.8h, v1.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <8 x i8> %s0 to <8 x i32>
   %s1s = sext <8 x i8> %s1 to <8 x i32>
@@ -104,12 +147,20 @@ entry:
 }
 
 define <8 x i32> @extaddu_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) {
-; CHECK-LABEL: extaddu_v8i8_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extaddu_v8i8_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v8i8_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    uaddl v0.4s, v2.4h, v1.4h
+; CHECK-GI-NEXT:    uaddl2 v1.4s, v2.8h, v1.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <8 x i8> %s0 to <8 x i32>
   %s1s = zext <8 x i8> %s1 to <8 x i32>
@@ -117,16 +168,72 @@ entry:
   ret <8 x i32> %m
 }
 
+define <8 x i32> @extsubs_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) {
+; CHECK-SD-LABEL: extsubs_v8i8_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ssubl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubs_v8i8_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ssubl v0.4s, v2.4h, v1.4h
+; CHECK-GI-NEXT:    ssubl2 v1.4s, v2.8h, v1.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s0s = sext <8 x i8> %s0 to <8 x i32>
+  %s1s = sext <8 x i8> %s1 to <8 x i32>
+  %m = sub <8 x i32> %s0s, %s1s
+  ret <8 x i32> %m
+}
+
+define <8 x i32> @extsubu_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) {
+; CHECK-SD-LABEL: extsubu_v8i8_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    usubl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubu_v8i8_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    usubl v0.4s, v2.4h, v1.4h
+; CHECK-GI-NEXT:    usubl2 v1.4s, v2.8h, v1.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s0s = zext <8 x i8> %s0 to <8 x i32>
+  %s1s = zext <8 x i8> %s1 to <8 x i32>
+  %m = sub <8 x i32> %s0s, %s1s
+  ret <8 x i32> %m
+}
+
 define <16 x i32> @extadds_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) {
-; CHECK-LABEL: extadds_v16i8_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl v2.8h, v0.8b, v1.8b
-; CHECK-NEXT:    saddl2 v4.8h, v0.16b, v1.16b
-; CHECK-NEXT:    sshll v0.4s, v2.4h, #0
-; CHECK-NEXT:    sshll2 v3.4s, v4.8h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v2.8h, #0
-; CHECK-NEXT:    sshll v2.4s, v4.4h, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extadds_v16i8_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    saddl2 v4.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-SD-NEXT:    sshll2 v3.4s, v4.8h, #0
+; CHECK-SD-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-SD-NEXT:    sshll v2.4s, v4.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v16i8_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll2 v4.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll2 v5.8h, v1.16b, #0
+; CHECK-GI-NEXT:    saddl v0.4s, v2.4h, v3.4h
+; CHECK-GI-NEXT:    saddl2 v1.4s, v2.8h, v3.8h
+; CHECK-GI-NEXT:    saddl v2.4s, v4.4h, v5.4h
+; CHECK-GI-NEXT:    saddl2 v3.4s, v4.8h, v5.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <16 x i8> %s0 to <16 x i32>
   %s1s = sext <16 x i8> %s1 to <16 x i32>
@@ -135,15 +242,27 @@ entry:
 }
 
 define <16 x i32> @extaddu_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) {
-; CHECK-LABEL: extaddu_v16i8_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl v2.8h, v0.8b, v1.8b
-; CHECK-NEXT:    uaddl2 v4.8h, v0.16b, v1.16b
-; CHECK-NEXT:    ushll v0.4s, v2.4h, #0
-; CHECK-NEXT:    ushll2 v3.4s, v4.8h, #0
-; CHECK-NEXT:    ushll2 v1.4s, v2.8h, #0
-; CHECK-NEXT:    ushll v2.4s, v4.4h, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extaddu_v16i8_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    uaddl2 v4.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ushll v0.4s, v2.4h, #0
+; CHECK-SD-NEXT:    ushll2 v3.4s, v4.8h, #0
+; CHECK-SD-NEXT:    ushll2 v1.4s, v2.8h, #0
+; CHECK-SD-NEXT:    ushll v2.4s, v4.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v16i8_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll2 v4.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll2 v5.8h, v1.16b, #0
+; CHECK-GI-NEXT:    uaddl v0.4s, v2.4h, v3.4h
+; CHECK-GI-NEXT:    uaddl2 v1.4s, v2.8h, v3.8h
+; CHECK-GI-NEXT:    uaddl v2.4s, v4.4h, v5.4h
+; CHECK-GI-NEXT:    uaddl2 v3.4s, v4.8h, v5.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <16 x i8> %s0 to <16 x i32>
   %s1s = zext <16 x i8> %s1 to <16 x i32>
@@ -151,17 +270,89 @@ entry:
   ret <16 x i32> %m
 }
 
+define <16 x i32> @extsubs_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) {
+; CHECK-SD-LABEL: extsubs_v16i8_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ssubl v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ssubl2 v4.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-SD-NEXT:    sshll2 v3.4s, v4.8h, #0
+; CHECK-SD-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-SD-NEXT:    sshll v2.4s, v4.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubs_v16i8_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll2 v4.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll2 v5.8h, v1.16b, #0
+; CHECK-GI-NEXT:    ssubl v0.4s, v2.4h, v3.4h
+; CHECK-GI-NEXT:    ssubl2 v1.4s, v2.8h, v3.8h
+; CHECK-GI-NEXT:    ssubl v2.4s, v4.4h, v5.4h
+; CHECK-GI-NEXT:    ssubl2 v3.4s, v4.8h, v5.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s0s = sext <16 x i8> %s0 to <16 x i32>
+  %s1s = sext <16 x i8> %s1 to <16 x i32>
+  %m = sub <16 x i32> %s0s, %s1s
+  ret <16 x i32> %m
+}
+
+define <16 x i32> @extsubu_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) {
+; CHECK-SD-LABEL: extsubu_v16i8_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    usubl v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    usubl2 v4.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-SD-NEXT:    sshll2 v3.4s, v4.8h, #0
+; CHECK-SD-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-SD-NEXT:    sshll v2.4s, v4.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubu_v16i8_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll2 v4.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll2 v5.8h, v1.16b, #0
+; CHECK-GI-NEXT:    usubl v0.4s, v2.4h, v3.4h
+; CHECK-GI-NEXT:    usubl2 v1.4s, v2.8h, v3.8h
+; CHECK-GI-NEXT:    usubl v2.4s, v4.4h, v5.4h
+; CHECK-GI-NEXT:    usubl2 v3.4s, v4.8h, v5.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s0s = zext <16 x i8> %s0 to <16 x i32>
+  %s1s = zext <16 x i8> %s1 to <16 x i32>
+  %m = sub <16 x i32> %s0s, %s1s
+  ret <16 x i32> %m
+}
+
 define <8 x i64> @extadds_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) {
-; CHECK-LABEL: extadds_v8i8_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-NEXT:    sshll2 v2.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.2d, v1.2s, #0
-; CHECK-NEXT:    sshll2 v3.2d, v2.4s, #0
-; CHECK-NEXT:    sshll2 v1.2d, v1.4s, #0
-; CHECK-NEXT:    sshll v2.2d, v2.2s, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extadds_v8i8_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-SD-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v1.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll2 v1.2d, v1.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v8i8_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v4.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll2 v5.4s, v1.8h, #0
+; CHECK-GI-NEXT:    saddl v0.2d, v2.2s, v3.2s
+; CHECK-GI-NEXT:    saddl2 v1.2d, v2.4s, v3.4s
+; CHECK-GI-NEXT:    saddl v2.2d, v4.2s, v5.2s
+; CHECK-GI-NEXT:    saddl2 v3.2d, v4.4s, v5.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <8 x i8> %s0 to <8 x i64>
   %s1s = sext <8 x i8> %s1 to <8 x i64>
@@ -170,16 +361,30 @@ entry:
 }
 
 define <8 x i64> @extaddu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) {
-; CHECK-LABEL: extaddu_v8i8_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-NEXT:    ushll2 v2.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.2d, v1.2s, #0
-; CHECK-NEXT:    ushll2 v3.2d, v2.4s, #0
-; CHECK-NEXT:    ushll2 v1.2d, v1.4s, #0
-; CHECK-NEXT:    ushll v2.2d, v2.2s, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extaddu_v8i8_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ushll2 v2.4s, v0.8h, #0
+; CHECK-SD-NEXT:    ushll v0.2d, v1.2s, #0
+; CHECK-SD-NEXT:    ushll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    ushll2 v1.2d, v1.4s, #0
+; CHECK-SD-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v8i8_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v4.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll2 v5.4s, v1.8h, #0
+; CHECK-GI-NEXT:    uaddl v0.2d, v2.2s, v3.2s
+; CHECK-GI-NEXT:    uaddl2 v1.2d, v2.4s, v3.4s
+; CHECK-GI-NEXT:    uaddl v2.2d, v4.2s, v5.2s
+; CHECK-GI-NEXT:    uaddl2 v3.2d, v4.4s, v5.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <8 x i8> %s0 to <8 x i64>
   %s1s = zext <8 x i8> %s1 to <8 x i64>
@@ -187,6 +392,430 @@ entry:
   ret <8 x i64> %m
 }
 
+define <8 x i64> @extsubs_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) {
+; CHECK-SD-LABEL: extsubs_v8i8_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ssubl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-SD-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v1.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll2 v1.2d, v1.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubs_v8i8_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v4.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll2 v5.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ssubl v0.2d, v2.2s, v3.2s
+; CHECK-GI-NEXT:    ssubl2 v1.2d, v2.4s, v3.4s
+; CHECK-GI-NEXT:    ssubl v2.2d, v4.2s, v5.2s
+; CHECK-GI-NEXT:    ssubl2 v3.2d, v4.4s, v5.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s0s = sext <8 x i8> %s0 to <8 x i64>
+  %s1s = sext <8 x i8> %s1 to <8 x i64>
+  %m = sub <8 x i64> %s0s, %s1s
+  ret <8 x i64> %m
+}
+
+define <8 x i64> @extsubu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) {
+; CHECK-SD-LABEL: extsubu_v8i8_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    usubl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-SD-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v1.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll2 v1.2d, v1.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubu_v8i8_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v4.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll2 v5.4s, v1.8h, #0
+; CHECK-GI-NEXT:    usubl v0.2d, v2.2s, v3.2s
+; CHECK-GI-NEXT:    usubl2 v1.2d, v2.4s, v3.4s
+; CHECK-GI-NEXT:    usubl v2.2d, v4.2s, v5.2s
+; CHECK-GI-NEXT:    usubl2 v3.2d, v4.4s, v5.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s0s = zext <8 x i8> %s0 to <8 x i64>
+  %s1s = zext <8 x i8> %s1 to <8 x i64>
+  %m = sub <8 x i64> %s0s, %s1s
+  ret <8 x i64> %m
+}
+
+define <16 x i64> @extaddu_v16i8_i64(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SD-LABEL: extaddu_v16i8_i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uaddl v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    uaddl2 v0.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ushll v3.4s, v2.4h, #0
+; CHECK-SD-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-SD-NEXT:    ushll v5.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ushll2 v6.4s, v0.8h, #0
+; CHECK-SD-NEXT:    ushll2 v1.2d, v3.4s, #0
+; CHECK-SD-NEXT:    ushll v0.2d, v3.2s, #0
+; CHECK-SD-NEXT:    ushll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    ushll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT:    ushll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT:    ushll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT:    ushll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v16i8_i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll2 v5.4s, v2.8h, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll v6.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    ushll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v16.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll2 v17.4s, v1.8h, #0
+; CHECK-GI-NEXT:    uaddl v0.2d, v4.2s, v2.2s
+; CHECK-GI-NEXT:    uaddl2 v1.2d, v4.4s, v2.4s
+; CHECK-GI-NEXT:    uaddl v2.2d, v5.2s, v3.2s
+; CHECK-GI-NEXT:    uaddl2 v3.2d, v5.4s, v3.4s
+; CHECK-GI-NEXT:    uaddl v4.2d, v6.2s, v7.2s
+; CHECK-GI-NEXT:    uaddl2 v5.2d, v6.4s, v7.4s
+; CHECK-GI-NEXT:    uaddl v6.2d, v16.2s, v17.2s
+; CHECK-GI-NEXT:    uaddl2 v7.2d, v16.4s, v17.4s
+; CHECK-GI-NEXT:    ret
+    %c = zext <16 x i8> %a to <16 x i64>
+    %d = zext <16 x i8> %b to <16 x i64>
+    %e = add <16 x i64> %c, %d
+    ret <16 x i64> %e
+}
+
+define <16 x i64> @extadds_v16i8_i64(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SD-LABEL: extadds_v16i8_i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    saddl v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    saddl2 v0.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    sshll v3.4s, v2.4h, #0
+; CHECK-SD-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-SD-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-SD-NEXT:    sshll2 v6.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll2 v1.2d, v3.4s, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v3.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    sshll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT:    sshll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT:    sshll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT:    sshll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v16i8_i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v5.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll v6.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v16.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll2 v17.4s, v1.8h, #0
+; CHECK-GI-NEXT:    saddl v0.2d, v4.2s, v2.2s
+; CHECK-GI-NEXT:    saddl2 v1.2d, v4.4s, v2.4s
+; CHECK-GI-NEXT:    saddl v2.2d, v5.2s, v3.2s
+; CHECK-GI-NEXT:    saddl2 v3.2d, v5.4s, v3.4s
+; CHECK-GI-NEXT:    saddl v4.2d, v6.2s, v7.2s
+; CHECK-GI-NEXT:    saddl2 v5.2d, v6.4s, v7.4s
+; CHECK-GI-NEXT:    saddl v6.2d, v16.2s, v17.2s
+; CHECK-GI-NEXT:    saddl2 v7.2d, v16.4s, v17.4s
+; CHECK-GI-NEXT:    ret
+    %c = sext <16 x i8> %a to <16 x i64>
+    %d = sext <16 x i8> %b to <16 x i64>
+    %e = add <16 x i64> %c, %d
+    ret <16 x i64> %e
+}
+
+define <16 x i64> @extsubu_v16i8_i64(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SD-LABEL: extsubu_v16i8_i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    usubl v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    usubl2 v0.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    sshll v3.4s, v2.4h, #0
+; CHECK-SD-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-SD-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-SD-NEXT:    sshll2 v6.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll2 v1.2d, v3.4s, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v3.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    sshll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT:    sshll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT:    sshll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT:    sshll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubu_v16i8_i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll2 v5.4s, v2.8h, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll v6.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    ushll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v16.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll2 v17.4s, v1.8h, #0
+; CHECK-GI-NEXT:    usubl v0.2d, v4.2s, v2.2s
+; CHECK-GI-NEXT:    usubl2 v1.2d, v4.4s, v2.4s
+; CHECK-GI-NEXT:    usubl v2.2d, v5.2s, v3.2s
+; CHECK-GI-NEXT:    usubl2 v3.2d, v5.4s, v3.4s
+; CHECK-GI-NEXT:    usubl v4.2d, v6.2s, v7.2s
+; CHECK-GI-NEXT:    usubl2 v5.2d, v6.4s, v7.4s
+; CHECK-GI-NEXT:    usubl v6.2d, v16.2s, v17.2s
+; CHECK-GI-NEXT:    usubl2 v7.2d, v16.4s, v17.4s
+; CHECK-GI-NEXT:    ret
+    %c = zext <16 x i8> %a to <16 x i64>
+    %d = zext <16 x i8> %b to <16 x i64>
+    %e = sub <16 x i64> %c, %d
+    ret <16 x i64> %e
+}
+
+define <16 x i64> @extsubs_v16i8_i64(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SD-LABEL: extsubs_v16i8_i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ssubl v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ssubl2 v0.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    sshll v3.4s, v2.4h, #0
+; CHECK-SD-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-SD-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-SD-NEXT:    sshll2 v6.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll2 v1.2d, v3.4s, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v3.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    sshll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT:    sshll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT:    sshll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT:    sshll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubs_v16i8_i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v5.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll v6.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v16.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll2 v17.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ssubl v0.2d, v4.2s, v2.2s
+; CHECK-GI-NEXT:    ssubl2 v1.2d, v4.4s, v2.4s
+; CHECK-GI-NEXT:    ssubl v2.2d, v5.2s, v3.2s
+; CHECK-GI-NEXT:    ssubl2 v3.2d, v5.4s, v3.4s
+; CHECK-GI-NEXT:    ssubl v4.2d, v6.2s, v7.2s
+; CHECK-GI-NEXT:    ssubl2 v5.2d, v6.4s, v7.4s
+; CHECK-GI-NEXT:    ssubl v6.2d, v16.2s, v17.2s
+; CHECK-GI-NEXT:    ssubl2 v7.2d, v16.4s, v17.4s
+; CHECK-GI-NEXT:    ret
+    %c = sext <16 x i8> %a to <16 x i64>
+    %d = sext <16 x i8> %b to <16 x i64>
+    %e = sub <16 x i64> %c, %d
+    ret <16 x i64> %e
+}
+
+define <16 x i64> @extaddu_v16i16_i64(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-SD-LABEL: extaddu_v16i16_i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uaddl v5.4s, v1.4h, v3.4h
+; CHECK-SD-NEXT:    uaddl v4.4s, v0.4h, v2.4h
+; CHECK-SD-NEXT:    uaddl2 v2.4s, v0.8h, v2.8h
+; CHECK-SD-NEXT:    uaddl2 v6.4s, v1.8h, v3.8h
+; CHECK-SD-NEXT:    ushll2 v1.2d, v4.4s, #0
+; CHECK-SD-NEXT:    ushll v0.2d, v4.2s, #0
+; CHECK-SD-NEXT:    ushll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    ushll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT:    ushll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT:    ushll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT:    ushll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v16i16_i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ushll v4.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v5.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v6.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v16.4s, v2.8h, #0
+; CHECK-GI-NEXT:    ushll v17.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll2 v18.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll2 v19.4s, v3.8h, #0
+; CHECK-GI-NEXT:    uaddl v0.2d, v4.2s, v6.2s
+; CHECK-GI-NEXT:    uaddl2 v1.2d, v4.4s, v6.4s
+; CHECK-GI-NEXT:    uaddl v2.2d, v5.2s, v16.2s
+; CHECK-GI-NEXT:    uaddl2 v3.2d, v5.4s, v16.4s
+; CHECK-GI-NEXT:    uaddl v4.2d, v7.2s, v17.2s
+; CHECK-GI-NEXT:    uaddl2 v5.2d, v7.4s, v17.4s
+; CHECK-GI-NEXT:    uaddl v6.2d, v18.2s, v19.2s
+; CHECK-GI-NEXT:    uaddl2 v7.2d, v18.4s, v19.4s
+; CHECK-GI-NEXT:    ret
+    %c = zext <16 x i16> %a to <16 x i64>
+    %d = zext <16 x i16> %b to <16 x i64>
+    %e = add <16 x i64> %c, %d
+    ret <16 x i64> %e
+}
+
+define <16 x i64> @extadds_v16i16_i64(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-SD-LABEL: extadds_v16i16_i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    saddl v5.4s, v1.4h, v3.4h
+; CHECK-SD-NEXT:    saddl v4.4s, v0.4h, v2.4h
+; CHECK-SD-NEXT:    saddl2 v2.4s, v0.8h, v2.8h
+; CHECK-SD-NEXT:    saddl2 v6.4s, v1.8h, v3.8h
+; CHECK-SD-NEXT:    sshll2 v1.2d, v4.4s, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v4.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    sshll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT:    sshll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT:    sshll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT:    sshll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v16i16_i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sshll v4.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v5.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v6.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v16.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v17.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v18.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll2 v19.4s, v3.8h, #0
+; CHECK-GI-NEXT:    saddl v0.2d, v4.2s, v6.2s
+; CHECK-GI-NEXT:    saddl2 v1.2d, v4.4s, v6.4s
+; CHECK-GI-NEXT:    saddl v2.2d, v5.2s, v16.2s
+; CHECK-GI-NEXT:    saddl2 v3.2d, v5.4s, v16.4s
+; CHECK-GI-NEXT:    saddl v4.2d, v7.2s, v17.2s
+; CHECK-GI-NEXT:    saddl2 v5.2d, v7.4s, v17.4s
+; CHECK-GI-NEXT:    saddl v6.2d, v18.2s, v19.2s
+; CHECK-GI-NEXT:    saddl2 v7.2d, v18.4s, v19.4s
+; CHECK-GI-NEXT:    ret
+    %c = sext <16 x i16> %a to <16 x i64>
+    %d = sext <16 x i16> %b to <16 x i64>
+    %e = add <16 x i64> %c, %d
+    ret <16 x i64> %e
+}
+
+define <16 x i64> @extsubu_v16i16_i64(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-SD-LABEL: extsubu_v16i16_i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    usubl v5.4s, v1.4h, v3.4h
+; CHECK-SD-NEXT:    usubl v4.4s, v0.4h, v2.4h
+; CHECK-SD-NEXT:    usubl2 v2.4s, v0.8h, v2.8h
+; CHECK-SD-NEXT:    usubl2 v6.4s, v1.8h, v3.8h
+; CHECK-SD-NEXT:    sshll2 v1.2d, v4.4s, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v4.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    sshll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT:    sshll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT:    sshll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT:    sshll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubu_v16i16_i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ushll v4.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v5.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v6.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v16.4s, v2.8h, #0
+; CHECK-GI-NEXT:    ushll v17.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll2 v18.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll2 v19.4s, v3.8h, #0
+; CHECK-GI-NEXT:    usubl v0.2d, v4.2s, v6.2s
+; CHECK-GI-NEXT:    usubl2 v1.2d, v4.4s, v6.4s
+; CHECK-GI-NEXT:    usubl v2.2d, v5.2s, v16.2s
+; CHECK-GI-NEXT:    usubl2 v3.2d, v5.4s, v16.4s
+; CHECK-GI-NEXT:    usubl v4.2d, v7.2s, v17.2s
+; CHECK-GI-NEXT:    usubl2 v5.2d, v7.4s, v17.4s
+; CHECK-GI-NEXT:    usubl v6.2d, v18.2s, v19.2s
+; CHECK-GI-NEXT:    usubl2 v7.2d, v18.4s, v19.4s
+; CHECK-GI-NEXT:    ret
+    %c = zext <16 x i16> %a to <16 x i64>
+    %d = zext <16 x i16> %b to <16 x i64>
+    %e = sub <16 x i64> %c, %d
+    ret <16 x i64> %e
+}
+
+define <16 x i64> @extsubs_v16i16_i64(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-SD-LABEL: extsubs_v16i16_i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ssubl v5.4s, v1.4h, v3.4h
+; CHECK-SD-NEXT:    ssubl v4.4s, v0.4h, v2.4h
+; CHECK-SD-NEXT:    ssubl2 v2.4s, v0.8h, v2.8h
+; CHECK-SD-NEXT:    ssubl2 v6.4s, v1.8h, v3.8h
+; CHECK-SD-NEXT:    sshll2 v1.2d, v4.4s, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v4.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    sshll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT:    sshll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT:    sshll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT:    sshll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubs_v16i16_i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sshll v4.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v5.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v6.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v16.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v17.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v18.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll2 v19.4s, v3.8h, #0
+; CHECK-GI-NEXT:    ssubl v0.2d, v4.2s, v6.2s
+; CHECK-GI-NEXT:    ssubl2 v1.2d, v4.4s, v6.4s
+; CHECK-GI-NEXT:    ssubl v2.2d, v5.2s, v16.2s
+; CHECK-GI-NEXT:    ssubl2 v3.2d, v5.4s, v16.4s
+; CHECK-GI-NEXT:    ssubl v4.2d, v7.2s, v17.2s
+; CHECK-GI-NEXT:    ssubl2 v5.2d, v7.4s, v17.4s
+; CHECK-GI-NEXT:    ssubl v6.2d, v18.2s, v19.2s
+; CHECK-GI-NEXT:    ssubl2 v7.2d, v18.4s, v19.4s
+; CHECK-GI-NEXT:    ret
+    %c = sext <16 x i16> %a to <16 x i64>
+    %d = sext <16 x i16> %b to <16 x i64>
+    %e = sub <16 x i64> %c, %d
+    ret <16 x i64> %e
+}
+
 define <4 x i32> @extadds_v4i16_i32(<4 x i16> %s0, <4 x i16> %s1) {
 ; CHECK-LABEL: extadds_v4i16_i32:
 ; CHECK:       // %bb.0: // %entry
@@ -212,12 +841,19 @@ entry:
 }
 
 define <8 x i32> @extadds_v8i16_i32(<8 x i16> %s0, <8 x i16> %s1) {
-; CHECK-LABEL: extadds_v8i16_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl2 v2.4s, v0.8h, v1.8h
-; CHECK-NEXT:    saddl v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    mov v1.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extadds_v8i16_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl2 v2.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    saddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    mov v1.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v8i16_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    saddl v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    saddl2 v1.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <8 x i16> %s0 to <8 x i32>
   %s1s = sext <8 x i16> %s1 to <8 x i32>
@@ -226,12 +862,19 @@ entry:
 }
 
 define <8 x i32> @extaddu_v8i16_i32(<8 x i16> %s0, <8 x i16> %s1) {
-; CHECK-LABEL: extaddu_v8i16_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl2 v2.4s, v0.8h, v1.8h
-; CHECK-NEXT:    uaddl v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    mov v1.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extaddu_v8i16_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl2 v2.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    mov v1.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v8i16_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uaddl v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    uaddl2 v1.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <8 x i16> %s0 to <8 x i32>
   %s1s = zext <8 x i16> %s1 to <8 x i32>
@@ -240,16 +883,26 @@ entry:
 }
 
 define <16 x i32> @extadds_v16i16_i32(<16 x i16> %s0, <16 x i16> %s1) {
-; CHECK-LABEL: extadds_v16i16_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl2 v4.4s, v1.8h, v3.8h
-; CHECK-NEXT:    saddl v5.4s, v0.4h, v2.4h
-; CHECK-NEXT:    saddl2 v6.4s, v0.8h, v2.8h
-; CHECK-NEXT:    saddl v2.4s, v1.4h, v3.4h
-; CHECK-NEXT:    mov v0.16b, v5.16b
-; CHECK-NEXT:    mov v1.16b, v6.16b
-; CHECK-NEXT:    mov v3.16b, v4.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extadds_v16i16_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl2 v4.4s, v1.8h, v3.8h
+; CHECK-SD-NEXT:    saddl v5.4s, v0.4h, v2.4h
+; CHECK-SD-NEXT:    saddl2 v6.4s, v0.8h, v2.8h
+; CHECK-SD-NEXT:    saddl v2.4s, v1.4h, v3.4h
+; CHECK-SD-NEXT:    mov v0.16b, v5.16b
+; CHECK-SD-NEXT:    mov v1.16b, v6.16b
+; CHECK-SD-NEXT:    mov v3.16b, v4.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v16i16_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    saddl v4.4s, v0.4h, v2.4h
+; CHECK-GI-NEXT:    saddl2 v5.4s, v0.8h, v2.8h
+; CHECK-GI-NEXT:    saddl v2.4s, v1.4h, v3.4h
+; CHECK-GI-NEXT:    saddl2 v3.4s, v1.8h, v3.8h
+; CHECK-GI-NEXT:    mov v0.16b, v4.16b
+; CHECK-GI-NEXT:    mov v1.16b, v5.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <16 x i16> %s0 to <16 x i32>
   %s1s = sext <16 x i16> %s1 to <16 x i32>
@@ -258,16 +911,26 @@ entry:
 }
 
 define <16 x i32> @extaddu_v16i16_i32(<16 x i16> %s0, <16 x i16> %s1) {
-; CHECK-LABEL: extaddu_v16i16_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl2 v4.4s, v1.8h, v3.8h
-; CHECK-NEXT:    uaddl v5.4s, v0.4h, v2.4h
-; CHECK-NEXT:    uaddl2 v6.4s, v0.8h, v2.8h
-; CHECK-NEXT:    uaddl v2.4s, v1.4h, v3.4h
-; CHECK-NEXT:    mov v0.16b, v5.16b
-; CHECK-NEXT:    mov v1.16b, v6.16b
-; CHECK-NEXT:    mov v3.16b, v4.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extaddu_v16i16_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl2 v4.4s, v1.8h, v3.8h
+; CHECK-SD-NEXT:    uaddl v5.4s, v0.4h, v2.4h
+; CHECK-SD-NEXT:    uaddl2 v6.4s, v0.8h, v2.8h
+; CHECK-SD-NEXT:    uaddl v2.4s, v1.4h, v3.4h
+; CHECK-SD-NEXT:    mov v0.16b, v5.16b
+; CHECK-SD-NEXT:    mov v1.16b, v6.16b
+; CHECK-SD-NEXT:    mov v3.16b, v4.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v16i16_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uaddl v4.4s, v0.4h, v2.4h
+; CHECK-GI-NEXT:    uaddl2 v5.4s, v0.8h, v2.8h
+; CHECK-GI-NEXT:    uaddl v2.4s, v1.4h, v3.4h
+; CHECK-GI-NEXT:    uaddl2 v3.4s, v1.8h, v3.8h
+; CHECK-GI-NEXT:    mov v0.16b, v4.16b
+; CHECK-GI-NEXT:    mov v1.16b, v5.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <16 x i16> %s0 to <16 x i32>
   %s1s = zext <16 x i16> %s1 to <16 x i32>
@@ -276,12 +939,20 @@ entry:
 }
 
 define <4 x i64> @extadds_v4i16_i64(<4 x i16> %s0, <4 x i16> %s1) {
-; CHECK-LABEL: extadds_v4i16_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    sshll2 v1.2d, v0.4s, #0
-; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extadds_v4i16_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    sshll2 v1.2d, v0.4s, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v4i16_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    saddl v0.2d, v2.2s, v1.2s
+; CHECK-GI-NEXT:    saddl2 v1.2d, v2.4s, v1.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <4 x i16> %s0 to <4 x i64>
   %s1s = sext <4 x i16> %s1 to <4 x i64>
@@ -290,12 +961,20 @@ entry:
 }
 
 define <4 x i64> @extaddu_v4i16_i64(<4 x i16> %s0, <4 x i16> %s1) {
-; CHECK-LABEL: extaddu_v4i16_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    ushll2 v1.2d, v0.4s, #0
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extaddu_v4i16_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    ushll2 v1.2d, v0.4s, #0
+; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v4i16_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    uaddl v0.2d, v2.2s, v1.2s
+; CHECK-GI-NEXT:    uaddl2 v1.2d, v2.4s, v1.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <4 x i16> %s0 to <4 x i64>
   %s1s = zext <4 x i16> %s1 to <4 x i64>
@@ -304,15 +983,27 @@ entry:
 }
 
 define <8 x i64> @extadds_v8i16_i64(<8 x i16> %s0, <8 x i16> %s1) {
-; CHECK-LABEL: extadds_v8i16_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl v2.4s, v0.4h, v1.4h
-; CHECK-NEXT:    saddl2 v4.4s, v0.8h, v1.8h
-; CHECK-NEXT:    sshll v0.2d, v2.2s, #0
-; CHECK-NEXT:    sshll2 v3.2d, v4.4s, #0
-; CHECK-NEXT:    sshll2 v1.2d, v2.4s, #0
-; CHECK-NEXT:    sshll v2.2d, v4.2s, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extadds_v8i16_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl v2.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    saddl2 v4.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    sshll v0.2d, v2.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v4.4s, #0
+; CHECK-SD-NEXT:    sshll2 v1.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v4.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v8i16_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v4.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll2 v5.4s, v1.8h, #0
+; CHECK-GI-NEXT:    saddl v0.2d, v2.2s, v3.2s
+; CHECK-GI-NEXT:    saddl2 v1.2d, v2.4s, v3.4s
+; CHECK-GI-NEXT:    saddl v2.2d, v4.2s, v5.2s
+; CHECK-GI-NEXT:    saddl2 v3.2d, v4.4s, v5.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <8 x i16> %s0 to <8 x i64>
   %s1s = sext <8 x i16> %s1 to <8 x i64>
@@ -321,15 +1012,27 @@ entry:
 }
 
 define <8 x i64> @extaddu_v8i16_i64(<8 x i16> %s0, <8 x i16> %s1) {
-; CHECK-LABEL: extaddu_v8i16_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl v2.4s, v0.4h, v1.4h
-; CHECK-NEXT:    uaddl2 v4.4s, v0.8h, v1.8h
-; CHECK-NEXT:    ushll v0.2d, v2.2s, #0
-; CHECK-NEXT:    ushll2 v3.2d, v4.4s, #0
-; CHECK-NEXT:    ushll2 v1.2d, v2.4s, #0
-; CHECK-NEXT:    ushll v2.2d, v4.2s, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extaddu_v8i16_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl v2.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uaddl2 v4.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    ushll v0.2d, v2.2s, #0
+; CHECK-SD-NEXT:    ushll2 v3.2d, v4.4s, #0
+; CHECK-SD-NEXT:    ushll2 v1.2d, v2.4s, #0
+; CHECK-SD-NEXT:    ushll v2.2d, v4.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v8i16_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v4.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll2 v5.4s, v1.8h, #0
+; CHECK-GI-NEXT:    uaddl v0.2d, v2.2s, v3.2s
+; CHECK-GI-NEXT:    uaddl2 v1.2d, v2.4s, v3.4s
+; CHECK-GI-NEXT:    uaddl v2.2d, v4.2s, v5.2s
+; CHECK-GI-NEXT:    uaddl2 v3.2d, v4.4s, v5.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <8 x i16> %s0 to <8 x i64>
   %s1s = zext <8 x i16> %s1 to <8 x i64>
@@ -337,6 +1040,64 @@ entry:
   ret <8 x i64> %m
 }
 
+define <8 x i64> @extsubs_v8i16_i64(<8 x i16> %s0, <8 x i16> %s1) {
+; CHECK-SD-LABEL: extsubs_v8i16_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ssubl v2.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    ssubl2 v4.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    sshll v0.2d, v2.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v4.4s, #0
+; CHECK-SD-NEXT:    sshll2 v1.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v4.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubs_v8i16_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v4.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll2 v5.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ssubl v0.2d, v2.2s, v3.2s
+; CHECK-GI-NEXT:    ssubl2 v1.2d, v2.4s, v3.4s
+; CHECK-GI-NEXT:    ssubl v2.2d, v4.2s, v5.2s
+; CHECK-GI-NEXT:    ssubl2 v3.2d, v4.4s, v5.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s0s = sext <8 x i16> %s0 to <8 x i64>
+  %s1s = sext <8 x i16> %s1 to <8 x i64>
+  %m = sub <8 x i64> %s0s, %s1s
+  ret <8 x i64> %m
+}
+
+define <8 x i64> @extsubu_v8i16_i64(<8 x i16> %s0, <8 x i16> %s1) {
+; CHECK-SD-LABEL: extsubu_v8i16_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    usubl v2.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    usubl2 v4.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    sshll v0.2d, v2.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v4.4s, #0
+; CHECK-SD-NEXT:    sshll2 v1.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v4.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubu_v8i16_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v4.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll2 v5.4s, v1.8h, #0
+; CHECK-GI-NEXT:    usubl v0.2d, v2.2s, v3.2s
+; CHECK-GI-NEXT:    usubl2 v1.2d, v2.4s, v3.4s
+; CHECK-GI-NEXT:    usubl v2.2d, v4.2s, v5.2s
+; CHECK-GI-NEXT:    usubl2 v3.2d, v4.4s, v5.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s0s = zext <8 x i16> %s0 to <8 x i64>
+  %s1s = zext <8 x i16> %s1 to <8 x i64>
+  %m = sub <8 x i64> %s0s, %s1s
+  ret <8 x i64> %m
+}
+
 define <2 x i64> @extadds_v2i32_i64(<2 x i32> %s0, <2 x i32> %s1) {
 ; CHECK-LABEL: extadds_v2i32_i64:
 ; CHECK:       // %bb.0: // %entry
@@ -362,12 +1123,19 @@ entry:
 }
 
 define <4 x i64> @extadds_v4i32_i64(<4 x i32> %s0, <4 x i32> %s1) {
-; CHECK-LABEL: extadds_v4i32_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    mov v1.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extadds_v4i32_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    mov v1.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v4i32_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    saddl v2.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    saddl2 v1.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <4 x i32> %s0 to <4 x i64>
   %s1s = sext <4 x i32> %s1 to <4 x i64>
@@ -376,12 +1144,19 @@ entry:
 }
 
 define <4 x i64> @extaddu_v4i32_i64(<4 x i32> %s0, <4 x i32> %s1) {
-; CHECK-LABEL: extaddu_v4i32_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    mov v1.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extaddu_v4i32_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    mov v1.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v4i32_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uaddl v2.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    uaddl2 v1.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <4 x i32> %s0 to <4 x i64>
   %s1s = zext <4 x i32> %s1 to <4 x i64>
@@ -390,16 +1165,26 @@ entry:
 }
 
 define <8 x i64> @extadds_v8i32_i64(<8 x i32> %s0, <8 x i32> %s1) {
-; CHECK-LABEL: extadds_v8i32_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl2 v4.2d, v1.4s, v3.4s
-; CHECK-NEXT:    saddl v5.2d, v0.2s, v2.2s
-; CHECK-NEXT:    saddl2 v6.2d, v0.4s, v2.4s
-; CHECK-NEXT:    saddl v2.2d, v1.2s, v3.2s
-; CHECK-NEXT:    mov v0.16b, v5.16b
-; CHECK-NEXT:    mov v1.16b, v6.16b
-; CHECK-NEXT:    mov v3.16b, v4.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extadds_v8i32_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl2 v4.2d, v1.4s, v3.4s
+; CHECK-SD-NEXT:    saddl v5.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    saddl2 v6.2d, v0.4s, v2.4s
+; CHECK-SD-NEXT:    saddl v2.2d, v1.2s, v3.2s
+; CHECK-SD-NEXT:    mov v0.16b, v5.16b
+; CHECK-SD-NEXT:    mov v1.16b, v6.16b
+; CHECK-SD-NEXT:    mov v3.16b, v4.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v8i32_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    saddl v4.2d, v0.2s, v2.2s
+; CHECK-GI-NEXT:    saddl2 v5.2d, v0.4s, v2.4s
+; CHECK-GI-NEXT:    saddl v2.2d, v1.2s, v3.2s
+; CHECK-GI-NEXT:    saddl2 v3.2d, v1.4s, v3.4s
+; CHECK-GI-NEXT:    mov v0.16b, v4.16b
+; CHECK-GI-NEXT:    mov v1.16b, v5.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <8 x i32> %s0 to <8 x i64>
   %s1s = sext <8 x i32> %s1 to <8 x i64>
@@ -408,16 +1193,26 @@ entry:
 }
 
 define <8 x i64> @extaddu_v8i32_i64(<8 x i32> %s0, <8 x i32> %s1) {
-; CHECK-LABEL: extaddu_v8i32_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl2 v4.2d, v1.4s, v3.4s
-; CHECK-NEXT:    uaddl v5.2d, v0.2s, v2.2s
-; CHECK-NEXT:    uaddl2 v6.2d, v0.4s, v2.4s
-; CHECK-NEXT:    uaddl v2.2d, v1.2s, v3.2s
-; CHECK-NEXT:    mov v0.16b, v5.16b
-; CHECK-NEXT:    mov v1.16b, v6.16b
-; CHECK-NEXT:    mov v3.16b, v4.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extaddu_v8i32_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl2 v4.2d, v1.4s, v3.4s
+; CHECK-SD-NEXT:    uaddl v5.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    uaddl2 v6.2d, v0.4s, v2.4s
+; CHECK-SD-NEXT:    uaddl v2.2d, v1.2s, v3.2s
+; CHECK-SD-NEXT:    mov v0.16b, v5.16b
+; CHECK-SD-NEXT:    mov v1.16b, v6.16b
+; CHECK-SD-NEXT:    mov v3.16b, v4.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v8i32_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uaddl v4.2d, v0.2s, v2.2s
+; CHECK-GI-NEXT:    uaddl2 v5.2d, v0.4s, v2.4s
+; CHECK-GI-NEXT:    uaddl v2.2d, v1.2s, v3.2s
+; CHECK-GI-NEXT:    uaddl2 v3.2d, v1.4s, v3.4s
+; CHECK-GI-NEXT:    mov v0.16b, v4.16b
+; CHECK-GI-NEXT:    mov v1.16b, v5.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <8 x i32> %s0 to <8 x i64>
   %s1s = zext <8 x i32> %s1 to <8 x i64>
@@ -426,17 +1221,33 @@ entry:
 }
 
 define <16 x i32> @add_zs(<16 x i8> %s0, <16 x i8> %s1) {
-; CHECK-LABEL: add_zs:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-NEXT:    saddw v2.8h, v2.8h, v1.8b
-; CHECK-NEXT:    saddw2 v4.8h, v0.8h, v1.16b
-; CHECK-NEXT:    sshll v0.4s, v2.4h, #0
-; CHECK-NEXT:    sshll2 v3.4s, v4.8h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v2.8h, #0
-; CHECK-NEXT:    sshll v2.4s, v4.4h, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_zs:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-SD-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-SD-NEXT:    saddw v2.8h, v2.8h, v1.8b
+; CHECK-SD-NEXT:    saddw2 v4.8h, v0.8h, v1.16b
+; CHECK-SD-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-SD-NEXT:    sshll2 v3.4s, v4.8h, #0
+; CHECK-SD-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-SD-NEXT:    sshll v2.4s, v4.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_zs:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    ushll v3.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v4.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v5.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v6.4s, v1.8h, #0
+; CHECK-GI-NEXT:    uaddw v0.4s, v0.4s, v3.4h
+; CHECK-GI-NEXT:    uaddw2 v1.4s, v2.4s, v3.8h
+; CHECK-GI-NEXT:    uaddw v2.4s, v5.4s, v4.4h
+; CHECK-GI-NEXT:    uaddw2 v3.4s, v6.4s, v4.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <16 x i8> %s0 to <16 x i32>
   %s1s = sext <16 x i8> %s1 to <16 x i32>
@@ -445,87 +1256,174 @@ entry:
 }
 
 define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) {
-; CHECK-LABEL: v20:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s0, w0
-; CHECK-NEXT:    ldr b2, [sp, #160]
-; CHECK-NEXT:    add x10, sp, #168
-; CHECK-NEXT:    ldr b3, [sp]
-; CHECK-NEXT:    add x11, sp, #8
-; CHECK-NEXT:    ldr b1, [sp, #96]
-; CHECK-NEXT:    ld1 { v2.b }[1], [x10]
-; CHECK-NEXT:    add x9, sp, #104
-; CHECK-NEXT:    add x10, sp, #176
-; CHECK-NEXT:    mov v0.b[1], w1
-; CHECK-NEXT:    ld1 { v3.b }[1], [x11]
-; CHECK-NEXT:    ld1 { v1.b }[1], [x9]
-; CHECK-NEXT:    add x11, sp, #16
-; CHECK-NEXT:    add x9, sp, #112
-; CHECK-NEXT:    add x13, sp, #184
-; CHECK-NEXT:    ld1 { v2.b }[2], [x10]
-; CHECK-NEXT:    add x12, sp, #120
-; CHECK-NEXT:    add x14, sp, #32
-; CHECK-NEXT:    ld1 { v3.b }[2], [x11]
-; CHECK-NEXT:    ld1 { v1.b }[2], [x9]
-; CHECK-NEXT:    ldr b5, [sp, #64]
-; CHECK-NEXT:    mov v0.b[2], w2
-; CHECK-NEXT:    ldr b4, [sp, #224]
-; CHECK-NEXT:    add x11, sp, #128
-; CHECK-NEXT:    ld1 { v2.b }[3], [x13]
-; CHECK-NEXT:    add x13, sp, #24
-; CHECK-NEXT:    add x10, sp, #136
-; CHECK-NEXT:    ld1 { v3.b }[3], [x13]
-; CHECK-NEXT:    ld1 { v1.b }[3], [x12]
-; CHECK-NEXT:    add x12, sp, #192
-; CHECK-NEXT:    add x13, sp, #200
-; CHECK-NEXT:    add x15, sp, #80
-; CHECK-NEXT:    add x9, sp, #144
-; CHECK-NEXT:    mov v0.b[3], w3
-; CHECK-NEXT:    ld1 { v2.b }[4], [x12]
-; CHECK-NEXT:    add x12, sp, #232
-; CHECK-NEXT:    ld1 { v3.b }[4], [x14]
-; CHECK-NEXT:    add x14, sp, #72
-; CHECK-NEXT:    ld1 { v4.b }[1], [x12]
-; CHECK-NEXT:    ld1 { v5.b }[1], [x14]
-; CHECK-NEXT:    add x14, sp, #40
-; CHECK-NEXT:    ld1 { v1.b }[4], [x11]
-; CHECK-NEXT:    ld1 { v2.b }[5], [x13]
-; CHECK-NEXT:    add x12, sp, #208
-; CHECK-NEXT:    add x13, sp, #48
-; CHECK-NEXT:    mov v0.b[4], w4
-; CHECK-NEXT:    ld1 { v3.b }[5], [x14]
-; CHECK-NEXT:    add x14, sp, #240
-; CHECK-NEXT:    ld1 { v4.b }[2], [x14]
-; CHECK-NEXT:    ld1 { v5.b }[2], [x15]
-; CHECK-NEXT:    ld1 { v1.b }[5], [x10]
-; CHECK-NEXT:    ld1 { v2.b }[6], [x12]
-; CHECK-NEXT:    add x11, sp, #216
-; CHECK-NEXT:    add x10, sp, #56
-; CHECK-NEXT:    ld1 { v3.b }[6], [x13]
-; CHECK-NEXT:    add x12, sp, #248
-; CHECK-NEXT:    add x13, sp, #88
-; CHECK-NEXT:    mov v0.b[5], w5
-; CHECK-NEXT:    ld1 { v4.b }[3], [x12]
-; CHECK-NEXT:    ld1 { v5.b }[3], [x13]
-; CHECK-NEXT:    ld1 { v1.b }[6], [x9]
-; CHECK-NEXT:    ld1 { v2.b }[7], [x11]
-; CHECK-NEXT:    add x9, sp, #152
-; CHECK-NEXT:    ld1 { v3.b }[7], [x10]
-; CHECK-NEXT:    uaddl v4.8h, v5.8b, v4.8b
-; CHECK-NEXT:    mov v0.b[6], w6
-; CHECK-NEXT:    ld1 { v1.b }[7], [x9]
-; CHECK-NEXT:    uaddl v2.8h, v3.8b, v2.8b
-; CHECK-NEXT:    ushll v3.4s, v4.4h, #0
-; CHECK-NEXT:    mov v0.b[7], w7
-; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    ushll2 v1.4s, v2.8h, #0
-; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
-; CHECK-NEXT:    stp q1, q3, [x8, #48]
-; CHECK-NEXT:    ushll2 v3.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    stp q3, q2, [x8, #16]
-; CHECK-NEXT:    str q0, [x8]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v20:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov s0, w0
+; CHECK-SD-NEXT:    ldr b2, [sp, #160]
+; CHECK-SD-NEXT:    add x10, sp, #168
+; CHECK-SD-NEXT:    ldr b3, [sp]
+; CHECK-SD-NEXT:    add x11, sp, #8
+; CHECK-SD-NEXT:    ldr b1, [sp, #96]
+; CHECK-SD-NEXT:    ld1 { v2.b }[1], [x10]
+; CHECK-SD-NEXT:    add x9, sp, #104
+; CHECK-SD-NEXT:    add x10, sp, #176
+; CHECK-SD-NEXT:    mov v0.b[1], w1
+; CHECK-SD-NEXT:    ld1 { v3.b }[1], [x11]
+; CHECK-SD-NEXT:    ld1 { v1.b }[1], [x9]
+; CHECK-SD-NEXT:    add x11, sp, #16
+; CHECK-SD-NEXT:    add x9, sp, #112
+; CHECK-SD-NEXT:    add x13, sp, #184
+; CHECK-SD-NEXT:    ld1 { v2.b }[2], [x10]
+; CHECK-SD-NEXT:    add x12, sp, #120
+; CHECK-SD-NEXT:    add x14, sp, #32
+; CHECK-SD-NEXT:    ld1 { v3.b }[2], [x11]
+; CHECK-SD-NEXT:    ld1 { v1.b }[2], [x9]
+; CHECK-SD-NEXT:    ldr b5, [sp, #64]
+; CHECK-SD-NEXT:    mov v0.b[2], w2
+; CHECK-SD-NEXT:    ldr b4, [sp, #224]
+; CHECK-SD-NEXT:    add x11, sp, #128
+; CHECK-SD-NEXT:    ld1 { v2.b }[3], [x13]
+; CHECK-SD-NEXT:    add x13, sp, #24
+; CHECK-SD-NEXT:    add x10, sp, #136
+; CHECK-SD-NEXT:    ld1 { v3.b }[3], [x13]
+; CHECK-SD-NEXT:    ld1 { v1.b }[3], [x12]
+; CHECK-SD-NEXT:    add x12, sp, #192
+; CHECK-SD-NEXT:    add x13, sp, #200
+; CHECK-SD-NEXT:    add x15, sp, #80
+; CHECK-SD-NEXT:    add x9, sp, #144
+; CHECK-SD-NEXT:    mov v0.b[3], w3
+; CHECK-SD-NEXT:    ld1 { v2.b }[4], [x12]
+; CHECK-SD-NEXT:    add x12, sp, #232
+; CHECK-SD-NEXT:    ld1 { v3.b }[4], [x14]
+; CHECK-SD-NEXT:    add x14, sp, #72
+; CHECK-SD-NEXT:    ld1 { v4.b }[1], [x12]
+; CHECK-SD-NEXT:    ld1 { v5.b }[1], [x14]
+; CHECK-SD-NEXT:    add x14, sp, #40
+; CHECK-SD-NEXT:    ld1 { v1.b }[4], [x11]
+; CHECK-SD-NEXT:    ld1 { v2.b }[5], [x13]
+; CHECK-SD-NEXT:    add x12, sp, #208
+; CHECK-SD-NEXT:    add x13, sp, #48
+; CHECK-SD-NEXT:    mov v0.b[4], w4
+; CHECK-SD-NEXT:    ld1 { v3.b }[5], [x14]
+; CHECK-SD-NEXT:    add x14, sp, #240
+; CHECK-SD-NEXT:    ld1 { v4.b }[2], [x14]
+; CHECK-SD-NEXT:    ld1 { v5.b }[2], [x15]
+; CHECK-SD-NEXT:    ld1 { v1.b }[5], [x10]
+; CHECK-SD-NEXT:    ld1 { v2.b }[6], [x12]
+; CHECK-SD-NEXT:    add x11, sp, #216
+; CHECK-SD-NEXT:    add x10, sp, #56
+; CHECK-SD-NEXT:    ld1 { v3.b }[6], [x13]
+; CHECK-SD-NEXT:    add x12, sp, #248
+; CHECK-SD-NEXT:    add x13, sp, #88
+; CHECK-SD-NEXT:    mov v0.b[5], w5
+; CHECK-SD-NEXT:    ld1 { v4.b }[3], [x12]
+; CHECK-SD-NEXT:    ld1 { v5.b }[3], [x13]
+; CHECK-SD-NEXT:    ld1 { v1.b }[6], [x9]
+; CHECK-SD-NEXT:    ld1 { v2.b }[7], [x11]
+; CHECK-SD-NEXT:    add x9, sp, #152
+; CHECK-SD-NEXT:    ld1 { v3.b }[7], [x10]
+; CHECK-SD-NEXT:    uaddl v4.8h, v5.8b, v4.8b
+; CHECK-SD-NEXT:    mov v0.b[6], w6
+; CHECK-SD-NEXT:    ld1 { v1.b }[7], [x9]
+; CHECK-SD-NEXT:    uaddl v2.8h, v3.8b, v2.8b
+; CHECK-SD-NEXT:    ushll v3.4s, v4.4h, #0
+; CHECK-SD-NEXT:    mov v0.b[7], w7
+; CHECK-SD-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ushll2 v1.4s, v2.8h, #0
+; CHECK-SD-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-SD-NEXT:    stp q1, q3, [x8, #48]
+; CHECK-SD-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    stp q3, q2, [x8, #16]
+; CHECK-SD-NEXT:    str q0, [x8]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v20:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr s0, [sp]
+; CHECK-GI-NEXT:    ldr s4, [sp, #8]
+; CHECK-GI-NEXT:    fmov s1, w0
+; CHECK-GI-NEXT:    ldr s18, [sp, #16]
+; CHECK-GI-NEXT:    ldr s2, [sp, #32]
+; CHECK-GI-NEXT:    fmov s3, w4
+; CHECK-GI-NEXT:    mov v0.s[1], v4.s[0]
+; CHECK-GI-NEXT:    ldr s16, [sp, #40]
+; CHECK-GI-NEXT:    ldr s4, [sp, #64]
+; CHECK-GI-NEXT:    ldr s19, [sp, #72]
+; CHECK-GI-NEXT:    ldr s21, [sp, #104]
+; CHECK-GI-NEXT:    mov v1.s[1], w1
+; CHECK-GI-NEXT:    mov v2.s[1], v16.s[0]
+; CHECK-GI-NEXT:    ldr s16, [sp, #96]
+; CHECK-GI-NEXT:    ldr s22, [sp, #136]
+; CHECK-GI-NEXT:    mov v3.s[1], w5
+; CHECK-GI-NEXT:    ldr s20, [sp, #48]
+; CHECK-GI-NEXT:    mov v4.s[1], v19.s[0]
+; CHECK-GI-NEXT:    mov v0.s[2], v18.s[0]
+; CHECK-GI-NEXT:    ldr s18, [sp, #128]
+; CHECK-GI-NEXT:    ldr s19, [sp, #160]
+; CHECK-GI-NEXT:    ldr s24, [sp, #168]
+; CHECK-GI-NEXT:    mov v16.s[1], v21.s[0]
+; CHECK-GI-NEXT:    ldr s21, [sp, #192]
+; CHECK-GI-NEXT:    mov v18.s[1], v22.s[0]
+; CHECK-GI-NEXT:    ldr s25, [sp, #200]
+; CHECK-GI-NEXT:    ldr s22, [sp, #224]
+; CHECK-GI-NEXT:    ldr s26, [sp, #232]
+; CHECK-GI-NEXT:    ldr s23, [sp, #112]
+; CHECK-GI-NEXT:    mov v19.s[1], v24.s[0]
+; CHECK-GI-NEXT:    mov v2.s[2], v20.s[0]
+; CHECK-GI-NEXT:    ldr s20, [sp, #144]
+; CHECK-GI-NEXT:    ldr s17, [sp, #80]
+; CHECK-GI-NEXT:    mov v21.s[1], v25.s[0]
+; CHECK-GI-NEXT:    mov v22.s[1], v26.s[0]
+; CHECK-GI-NEXT:    mov v1.s[2], w2
+; CHECK-GI-NEXT:    mov v3.s[2], w6
+; CHECK-GI-NEXT:    ldr s24, [sp, #176]
+; CHECK-GI-NEXT:    mov v16.s[2], v23.s[0]
+; CHECK-GI-NEXT:    mov v18.s[2], v20.s[0]
+; CHECK-GI-NEXT:    mov v4.s[2], v17.s[0]
+; CHECK-GI-NEXT:    ldr s17, [sp, #208]
+; CHECK-GI-NEXT:    ldr s23, [sp, #240]
+; CHECK-GI-NEXT:    ldr s20, [sp, #120]
+; CHECK-GI-NEXT:    mov v19.s[2], v24.s[0]
+; CHECK-GI-NEXT:    ldr s24, [sp, #152]
+; CHECK-GI-NEXT:    ldr s5, [sp, #24]
+; CHECK-GI-NEXT:    mov v21.s[2], v17.s[0]
+; CHECK-GI-NEXT:    mov v22.s[2], v23.s[0]
+; CHECK-GI-NEXT:    mov v1.s[3], w3
+; CHECK-GI-NEXT:    mov v16.s[3], v20.s[0]
+; CHECK-GI-NEXT:    movi v17.2d, #0x0000ff000000ff
+; CHECK-GI-NEXT:    mov v3.s[3], w7
+; CHECK-GI-NEXT:    mov v18.s[3], v24.s[0]
+; CHECK-GI-NEXT:    ldr s6, [sp, #56]
+; CHECK-GI-NEXT:    ldr s7, [sp, #88]
+; CHECK-GI-NEXT:    ldr s25, [sp, #184]
+; CHECK-GI-NEXT:    ldr s20, [sp, #216]
+; CHECK-GI-NEXT:    mov v0.s[3], v5.s[0]
+; CHECK-GI-NEXT:    ldr s5, [sp, #248]
+; CHECK-GI-NEXT:    mov v19.s[3], v25.s[0]
+; CHECK-GI-NEXT:    mov v2.s[3], v6.s[0]
+; CHECK-GI-NEXT:    mov v4.s[3], v7.s[0]
+; CHECK-GI-NEXT:    mov v21.s[3], v20.s[0]
+; CHECK-GI-NEXT:    mov v22.s[3], v5.s[0]
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v17.16b
+; CHECK-GI-NEXT:    and v5.16b, v16.16b, v17.16b
+; CHECK-GI-NEXT:    and v3.16b, v3.16b, v17.16b
+; CHECK-GI-NEXT:    and v6.16b, v18.16b, v17.16b
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v17.16b
+; CHECK-GI-NEXT:    and v7.16b, v19.16b, v17.16b
+; CHECK-GI-NEXT:    and v2.16b, v2.16b, v17.16b
+; CHECK-GI-NEXT:    and v4.16b, v4.16b, v17.16b
+; CHECK-GI-NEXT:    and v16.16b, v21.16b, v17.16b
+; CHECK-GI-NEXT:    add v1.4s, v1.4s, v5.4s
+; CHECK-GI-NEXT:    and v5.16b, v22.16b, v17.16b
+; CHECK-GI-NEXT:    add v3.4s, v3.4s, v6.4s
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v7.4s
+; CHECK-GI-NEXT:    add v2.4s, v2.4s, v16.4s
+; CHECK-GI-NEXT:    stp q1, q3, [x8]
+; CHECK-GI-NEXT:    add v1.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT:    stp q0, q2, [x8, #32]
+; CHECK-GI-NEXT:    str q1, [x8, #64]
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <20 x i8> %s0 to <20 x i32>
   %s1s = zext <20 x i8> %s1 to <20 x i32>
@@ -534,98 +1432,165 @@ entry:
 }
 
 define <16 x i32> @i12(<16 x i12> %s0, <16 x i12> %s1) {
-; CHECK-LABEL: i12:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x23, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w21, -24
-; CHECK-NEXT:    .cfi_offset w22, -32
-; CHECK-NEXT:    .cfi_offset w23, -48
-; CHECK-NEXT:    ldr w13, [sp, #112]
-; CHECK-NEXT:    ldr w14, [sp, #144]
-; CHECK-NEXT:    fmov s2, w4
-; CHECK-NEXT:    ldr w17, [sp, #176]
-; CHECK-NEXT:    ldr w19, [sp, #208]
-; CHECK-NEXT:    fmov s3, w0
-; CHECK-NEXT:    ldr w20, [sp, #80]
-; CHECK-NEXT:    ldr w21, [sp, #48]
-; CHECK-NEXT:    fmov s5, w13
-; CHECK-NEXT:    fmov s4, w19
-; CHECK-NEXT:    fmov s6, w17
-; CHECK-NEXT:    fmov s7, w14
-; CHECK-NEXT:    fmov s0, w20
-; CHECK-NEXT:    fmov s1, w21
-; CHECK-NEXT:    ldr w10, [sp, #120]
-; CHECK-NEXT:    ldr w11, [sp, #152]
-; CHECK-NEXT:    ldr w12, [sp, #184]
-; CHECK-NEXT:    ldr w15, [sp, #216]
-; CHECK-NEXT:    ldr w22, [sp, #88]
-; CHECK-NEXT:    ldr w23, [sp, #56]
-; CHECK-NEXT:    mov v2.h[1], w5
-; CHECK-NEXT:    mov v3.h[1], w1
-; CHECK-NEXT:    mov v5.h[1], w10
-; CHECK-NEXT:    mov v4.h[1], w15
-; CHECK-NEXT:    mov v0.h[1], w22
-; CHECK-NEXT:    mov v1.h[1], w23
-; CHECK-NEXT:    mov v6.h[1], w12
-; CHECK-NEXT:    mov v7.h[1], w11
-; CHECK-NEXT:    ldr w8, [sp, #128]
-; CHECK-NEXT:    ldr w9, [sp, #160]
-; CHECK-NEXT:    ldr w16, [sp, #64]
-; CHECK-NEXT:    ldr w18, [sp, #96]
-; CHECK-NEXT:    ldr w10, [sp, #192]
-; CHECK-NEXT:    ldr w11, [sp, #224]
-; CHECK-NEXT:    mov v2.h[2], w6
-; CHECK-NEXT:    mov v3.h[2], w2
-; CHECK-NEXT:    mov v0.h[2], w18
-; CHECK-NEXT:    mov v1.h[2], w16
-; CHECK-NEXT:    mov v5.h[2], w8
-; CHECK-NEXT:    mov v4.h[2], w11
-; CHECK-NEXT:    mov v6.h[2], w10
-; CHECK-NEXT:    mov v7.h[2], w9
-; CHECK-NEXT:    ldr w12, [sp, #72]
-; CHECK-NEXT:    ldr w13, [sp, #104]
-; CHECK-NEXT:    ldr w8, [sp, #136]
-; CHECK-NEXT:    ldr w9, [sp, #168]
-; CHECK-NEXT:    ldr w10, [sp, #200]
-; CHECK-NEXT:    ldr w11, [sp, #232]
-; CHECK-NEXT:    mov v0.h[3], w13
-; CHECK-NEXT:    mov v1.h[3], w12
-; CHECK-NEXT:    mov v2.h[3], w7
-; CHECK-NEXT:    mov v3.h[3], w3
-; CHECK-NEXT:    mov v5.h[3], w8
-; CHECK-NEXT:    mov v4.h[3], w11
-; CHECK-NEXT:    mov v6.h[3], w10
-; CHECK-NEXT:    mov v7.h[3], w9
-; CHECK-NEXT:    movi v16.4s, #15, msl #8
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
-; CHECK-NEXT:    ushll v3.4s, v3.4h, #0
-; CHECK-NEXT:    ushll v5.4s, v5.4h, #0
-; CHECK-NEXT:    ushll v4.4s, v4.4h, #0
-; CHECK-NEXT:    ushll v6.4s, v6.4h, #0
-; CHECK-NEXT:    ushll v7.4s, v7.4h, #0
-; CHECK-NEXT:    and v17.16b, v0.16b, v16.16b
-; CHECK-NEXT:    and v18.16b, v1.16b, v16.16b
-; CHECK-NEXT:    and v1.16b, v2.16b, v16.16b
-; CHECK-NEXT:    and v0.16b, v3.16b, v16.16b
-; CHECK-NEXT:    and v2.16b, v5.16b, v16.16b
-; CHECK-NEXT:    and v3.16b, v4.16b, v16.16b
-; CHECK-NEXT:    and v4.16b, v6.16b, v16.16b
-; CHECK-NEXT:    and v5.16b, v7.16b, v16.16b
-; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    add v3.4s, v17.4s, v3.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v5.4s
-; CHECK-NEXT:    add v2.4s, v18.4s, v4.4s
-; CHECK-NEXT:    ldr x23, [sp], #48 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: i12:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x23, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w23, -48
+; CHECK-SD-NEXT:    ldr w13, [sp, #112]
+; CHECK-SD-NEXT:    ldr w14, [sp, #144]
+; CHECK-SD-NEXT:    fmov s2, w4
+; CHECK-SD-NEXT:    ldr w17, [sp, #176]
+; CHECK-SD-NEXT:    ldr w19, [sp, #208]
+; CHECK-SD-NEXT:    fmov s3, w0
+; CHECK-SD-NEXT:    ldr w20, [sp, #80]
+; CHECK-SD-NEXT:    ldr w21, [sp, #48]
+; CHECK-SD-NEXT:    fmov s5, w13
+; CHECK-SD-NEXT:    fmov s4, w19
+; CHECK-SD-NEXT:    fmov s6, w17
+; CHECK-SD-NEXT:    fmov s7, w14
+; CHECK-SD-NEXT:    fmov s0, w20
+; CHECK-SD-NEXT:    fmov s1, w21
+; CHECK-SD-NEXT:    ldr w10, [sp, #120]
+; CHECK-SD-NEXT:    ldr w11, [sp, #152]
+; CHECK-SD-NEXT:    ldr w12, [sp, #184]
+; CHECK-SD-NEXT:    ldr w15, [sp, #216]
+; CHECK-SD-NEXT:    ldr w22, [sp, #88]
+; CHECK-SD-NEXT:    ldr w23, [sp, #56]
+; CHECK-SD-NEXT:    mov v2.h[1], w5
+; CHECK-SD-NEXT:    mov v3.h[1], w1
+; CHECK-SD-NEXT:    mov v5.h[1], w10
+; CHECK-SD-NEXT:    mov v4.h[1], w15
+; CHECK-SD-NEXT:    mov v0.h[1], w22
+; CHECK-SD-NEXT:    mov v1.h[1], w23
+; CHECK-SD-NEXT:    mov v6.h[1], w12
+; CHECK-SD-NEXT:    mov v7.h[1], w11
+; CHECK-SD-NEXT:    ldr w8, [sp, #128]
+; CHECK-SD-NEXT:    ldr w9, [sp, #160]
+; CHECK-SD-NEXT:    ldr w16, [sp, #64]
+; CHECK-SD-NEXT:    ldr w18, [sp, #96]
+; CHECK-SD-NEXT:    ldr w10, [sp, #192]
+; CHECK-SD-NEXT:    ldr w11, [sp, #224]
+; CHECK-SD-NEXT:    mov v2.h[2], w6
+; CHECK-SD-NEXT:    mov v3.h[2], w2
+; CHECK-SD-NEXT:    mov v0.h[2], w18
+; CHECK-SD-NEXT:    mov v1.h[2], w16
+; CHECK-SD-NEXT:    mov v5.h[2], w8
+; CHECK-SD-NEXT:    mov v4.h[2], w11
+; CHECK-SD-NEXT:    mov v6.h[2], w10
+; CHECK-SD-NEXT:    mov v7.h[2], w9
+; CHECK-SD-NEXT:    ldr w12, [sp, #72]
+; CHECK-SD-NEXT:    ldr w13, [sp, #104]
+; CHECK-SD-NEXT:    ldr w8, [sp, #136]
+; CHECK-SD-NEXT:    ldr w9, [sp, #168]
+; CHECK-SD-NEXT:    ldr w10, [sp, #200]
+; CHECK-SD-NEXT:    ldr w11, [sp, #232]
+; CHECK-SD-NEXT:    mov v0.h[3], w13
+; CHECK-SD-NEXT:    mov v1.h[3], w12
+; CHECK-SD-NEXT:    mov v2.h[3], w7
+; CHECK-SD-NEXT:    mov v3.h[3], w3
+; CHECK-SD-NEXT:    mov v5.h[3], w8
+; CHECK-SD-NEXT:    mov v4.h[3], w11
+; CHECK-SD-NEXT:    mov v6.h[3], w10
+; CHECK-SD-NEXT:    mov v7.h[3], w9
+; CHECK-SD-NEXT:    movi v16.4s, #15, msl #8
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-SD-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-SD-NEXT:    ushll v5.4s, v5.4h, #0
+; CHECK-SD-NEXT:    ushll v4.4s, v4.4h, #0
+; CHECK-SD-NEXT:    ushll v6.4s, v6.4h, #0
+; CHECK-SD-NEXT:    ushll v7.4s, v7.4h, #0
+; CHECK-SD-NEXT:    and v17.16b, v0.16b, v16.16b
+; CHECK-SD-NEXT:    and v18.16b, v1.16b, v16.16b
+; CHECK-SD-NEXT:    and v1.16b, v2.16b, v16.16b
+; CHECK-SD-NEXT:    and v0.16b, v3.16b, v16.16b
+; CHECK-SD-NEXT:    and v2.16b, v5.16b, v16.16b
+; CHECK-SD-NEXT:    and v3.16b, v4.16b, v16.16b
+; CHECK-SD-NEXT:    and v4.16b, v6.16b, v16.16b
+; CHECK-SD-NEXT:    and v5.16b, v7.16b, v16.16b
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT:    add v3.4s, v17.4s, v3.4s
+; CHECK-SD-NEXT:    add v1.4s, v1.4s, v5.4s
+; CHECK-SD-NEXT:    add v2.4s, v18.4s, v4.4s
+; CHECK-SD-NEXT:    ldr x23, [sp], #48 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: i12:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov s1, w0
+; CHECK-GI-NEXT:    fmov s4, w4
+; CHECK-GI-NEXT:    ldr s0, [sp]
+; CHECK-GI-NEXT:    ldr s20, [sp, #8]
+; CHECK-GI-NEXT:    ldr s2, [sp, #32]
+; CHECK-GI-NEXT:    ldr s21, [sp, #40]
+; CHECK-GI-NEXT:    ldr s16, [sp, #64]
+; CHECK-GI-NEXT:    ldr s22, [sp, #72]
+; CHECK-GI-NEXT:    ldr s17, [sp, #96]
+; CHECK-GI-NEXT:    ldr s23, [sp, #104]
+; CHECK-GI-NEXT:    mov v1.s[1], w1
+; CHECK-GI-NEXT:    mov v4.s[1], w5
+; CHECK-GI-NEXT:    ldr s18, [sp, #128]
+; CHECK-GI-NEXT:    ldr s24, [sp, #136]
+; CHECK-GI-NEXT:    mov v0.s[1], v20.s[0]
+; CHECK-GI-NEXT:    ldr s19, [sp, #160]
+; CHECK-GI-NEXT:    ldr s25, [sp, #168]
+; CHECK-GI-NEXT:    mov v2.s[1], v21.s[0]
+; CHECK-GI-NEXT:    mov v16.s[1], v22.s[0]
+; CHECK-GI-NEXT:    mov v17.s[1], v23.s[0]
+; CHECK-GI-NEXT:    mov v18.s[1], v24.s[0]
+; CHECK-GI-NEXT:    mov v19.s[1], v25.s[0]
+; CHECK-GI-NEXT:    ldr s6, [sp, #16]
+; CHECK-GI-NEXT:    ldr s7, [sp, #48]
+; CHECK-GI-NEXT:    ldr s20, [sp, #80]
+; CHECK-GI-NEXT:    ldr s21, [sp, #112]
+; CHECK-GI-NEXT:    ldr s22, [sp, #144]
+; CHECK-GI-NEXT:    ldr s23, [sp, #176]
+; CHECK-GI-NEXT:    mov v1.s[2], w2
+; CHECK-GI-NEXT:    mov v4.s[2], w6
+; CHECK-GI-NEXT:    mov v0.s[2], v6.s[0]
+; CHECK-GI-NEXT:    mov v2.s[2], v7.s[0]
+; CHECK-GI-NEXT:    mov v16.s[2], v20.s[0]
+; CHECK-GI-NEXT:    mov v17.s[2], v21.s[0]
+; CHECK-GI-NEXT:    mov v18.s[2], v22.s[0]
+; CHECK-GI-NEXT:    mov v19.s[2], v23.s[0]
+; CHECK-GI-NEXT:    ldr s3, [sp, #24]
+; CHECK-GI-NEXT:    ldr s5, [sp, #56]
+; CHECK-GI-NEXT:    ldr s6, [sp, #88]
+; CHECK-GI-NEXT:    ldr s7, [sp, #120]
+; CHECK-GI-NEXT:    ldr s20, [sp, #152]
+; CHECK-GI-NEXT:    ldr s21, [sp, #184]
+; CHECK-GI-NEXT:    mov v1.s[3], w3
+; CHECK-GI-NEXT:    mov v4.s[3], w7
+; CHECK-GI-NEXT:    movi v22.4s, #15, msl #8
+; CHECK-GI-NEXT:    mov v0.s[3], v3.s[0]
+; CHECK-GI-NEXT:    mov v2.s[3], v5.s[0]
+; CHECK-GI-NEXT:    mov v16.s[3], v6.s[0]
+; CHECK-GI-NEXT:    mov v17.s[3], v7.s[0]
+; CHECK-GI-NEXT:    mov v18.s[3], v20.s[0]
+; CHECK-GI-NEXT:    mov v19.s[3], v21.s[0]
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v22.16b
+; CHECK-GI-NEXT:    and v3.16b, v4.16b, v22.16b
+; CHECK-GI-NEXT:    and v4.16b, v0.16b, v22.16b
+; CHECK-GI-NEXT:    and v5.16b, v2.16b, v22.16b
+; CHECK-GI-NEXT:    and v0.16b, v16.16b, v22.16b
+; CHECK-GI-NEXT:    and v2.16b, v17.16b, v22.16b
+; CHECK-GI-NEXT:    and v6.16b, v18.16b, v22.16b
+; CHECK-GI-NEXT:    and v7.16b, v19.16b, v22.16b
+; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    add v1.4s, v3.4s, v2.4s
+; CHECK-GI-NEXT:    add v2.4s, v4.4s, v6.4s
+; CHECK-GI-NEXT:    add v3.4s, v5.4s, v7.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <16 x i12> %s0 to <16 x i32>
   %s1s = zext <16 x i12> %s1 to <16 x i32>
@@ -634,15 +1599,27 @@ entry:
 }
 
 define <16 x i32> @sub_zz(<16 x i8> %s0, <16 x i8> %s1) {
-; CHECK-LABEL: sub_zz:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    usubl v2.8h, v0.8b, v1.8b
-; CHECK-NEXT:    usubl2 v4.8h, v0.16b, v1.16b
-; CHECK-NEXT:    sshll v0.4s, v2.4h, #0
-; CHECK-NEXT:    sshll2 v3.4s, v4.8h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v2.8h, #0
-; CHECK-NEXT:    sshll v2.4s, v4.4h, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sub_zz:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    usubl v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    usubl2 v4.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-SD-NEXT:    sshll2 v3.4s, v4.8h, #0
+; CHECK-SD-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-SD-NEXT:    sshll v2.4s, v4.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sub_zz:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll2 v4.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll2 v5.8h, v1.16b, #0
+; CHECK-GI-NEXT:    usubl v0.4s, v2.4h, v3.4h
+; CHECK-GI-NEXT:    usubl2 v1.4s, v2.8h, v3.8h
+; CHECK-GI-NEXT:    usubl v2.4s, v4.4h, v5.4h
+; CHECK-GI-NEXT:    usubl2 v3.4s, v4.8h, v5.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <16 x i8> %s0 to <16 x i32>
   %s1s = zext <16 x i8> %s1 to <16 x i32>
@@ -651,15 +1628,27 @@ entry:
 }
 
 define <16 x i32> @sub_ss(<16 x i8> %s0, <16 x i8> %s1) {
-; CHECK-LABEL: sub_ss:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ssubl v2.8h, v0.8b, v1.8b
-; CHECK-NEXT:    ssubl2 v4.8h, v0.16b, v1.16b
-; CHECK-NEXT:    sshll v0.4s, v2.4h, #0
-; CHECK-NEXT:    sshll2 v3.4s, v4.8h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v2.8h, #0
-; CHECK-NEXT:    sshll v2.4s, v4.4h, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sub_ss:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ssubl v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ssubl2 v4.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-SD-NEXT:    sshll2 v3.4s, v4.8h, #0
+; CHECK-SD-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-SD-NEXT:    sshll v2.4s, v4.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sub_ss:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll2 v4.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll2 v5.8h, v1.16b, #0
+; CHECK-GI-NEXT:    ssubl v0.4s, v2.4h, v3.4h
+; CHECK-GI-NEXT:    ssubl2 v1.4s, v2.8h, v3.8h
+; CHECK-GI-NEXT:    ssubl v2.4s, v4.4h, v5.4h
+; CHECK-GI-NEXT:    ssubl2 v3.4s, v4.8h, v5.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <16 x i8> %s0 to <16 x i32>
   %s1s = sext <16 x i8> %s1 to <16 x i32>
@@ -668,17 +1657,33 @@ entry:
 }
 
 define <16 x i32> @sub_zs(<16 x i8> %s0, <16 x i8> %s1) {
-; CHECK-LABEL: sub_zs:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-NEXT:    ssubw v2.8h, v2.8h, v1.8b
-; CHECK-NEXT:    ssubw2 v4.8h, v0.8h, v1.16b
-; CHECK-NEXT:    sshll v0.4s, v2.4h, #0
-; CHECK-NEXT:    sshll2 v3.4s, v4.8h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v2.8h, #0
-; CHECK-NEXT:    sshll v2.4s, v4.4h, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sub_zs:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-SD-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-SD-NEXT:    ssubw v2.8h, v2.8h, v1.8b
+; CHECK-SD-NEXT:    ssubw2 v4.8h, v0.8h, v1.16b
+; CHECK-SD-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-SD-NEXT:    sshll2 v3.4s, v4.8h, #0
+; CHECK-SD-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-SD-NEXT:    sshll v2.4s, v4.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sub_zs:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll2 v4.8h, v1.16b, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    ushll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v6.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ssubw v0.4s, v1.4s, v3.4h
+; CHECK-GI-NEXT:    ssubw2 v1.4s, v2.4s, v3.8h
+; CHECK-GI-NEXT:    ssubw v2.4s, v5.4s, v4.4h
+; CHECK-GI-NEXT:    ssubw2 v3.4s, v6.4s, v4.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <16 x i8> %s0 to <16 x i32>
   %s1s = sext <16 x i8> %s1 to <16 x i32>

From 753ac4786e250604224701616f0962e41e163a02 Mon Sep 17 00:00:00 2001
From: Sergei Barannikov <barannikov88@gmail.com>
Date: Wed, 29 May 2024 16:54:14 +0300
Subject: [PATCH 128/230] [RISCV][test] Add missing check-prefix to a test
 (NFC) (#93683)

---
 .../RISCV/GlobalISel/legalizer/legalize-abs-rv32.mir   | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv32.mir
index 7d05edd3f34132..f96d6597821788 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv32.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv32.mir
@@ -1,8 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=riscv32 -run-pass=legalizer %s -o - \
-# RUN:   | FileCheck %s --check-prefix=RV32I
+# RUN:   | FileCheck %s --check-prefixes=CHECK,RV32I
 # RUN: llc -mtriple=riscv32 -mattr=+zbb -run-pass=legalizer %s -o -\
-# RUN:   | FileCheck %s --check-prefix=RV32ZBB
+# RUN:   | FileCheck %s --check-prefixes=CHECK,RV32ZBB
 
 ---
 name:            abs_i8
@@ -124,10 +124,12 @@ body:             |
     ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[COPY1]], [[C1]](s32)
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[ASHR]]
     ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), [[ASHR]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
     ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[ASHR1]]
     ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[ICMP]]
-    ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[ADD]], [[ASHR]]
-    ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[ADD2]], [[ASHR1]]
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[ADD2]](s32)
+    ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[ASHR]]
+    ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[COPY3]], [[ASHR1]]
     ; CHECK-NEXT: $x10 = COPY [[XOR]](s32)
     ; CHECK-NEXT: $x11 = COPY [[XOR1]](s32)
     ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11

From df9701bfee2b13282a9c1bf981d37b965cb22bf7 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Wed, 29 May 2024 08:57:16 -0500
Subject: [PATCH 129/230] [OpenMP] Fix multiply installing `libomp.so` (#93685)

Summary:
The `add_llvm_library` interface handles installing the LLVM libraries;
however, we want to do our own handling. Otherwise, this will install
into the `./lib` location instead of the `./lib/<target>` one.
---
 openmp/runtime/src/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt
index 612d784be8a55c..62c35c19e6b456 100644
--- a/openmp/runtime/src/CMakeLists.txt
+++ b/openmp/runtime/src/CMakeLists.txt
@@ -177,6 +177,7 @@ else()
   add_llvm_library(omp ${LIBOMP_LIBRARY_KIND} ${LIBOMP_SOURCE_FILES} PARTIAL_SOURCES_INTENDED
     LINK_LIBS ${LIBOMP_CONFIGURED_LIBFLAGS} ${LIBOMP_DL_LIBS}
     LINK_COMPONENTS Support
+    BUILDTREE_ONLY
     )
   # libomp must be a C++ library such that it can link libLLVMSupport
   set(LIBOMP_LINKER_LANGUAGE CXX)

From 7af5b68a03bb7f5090a96b3f9f9a34f0e196e466 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Wed, 29 May 2024 16:00:41 +0200
Subject: [PATCH 130/230] [DFSan] Directly create gep inbounds for arg origin
 tls (NFCI)

The calling code explicitly checks that ArgNo is inbounds. This is NFCI
because constant expression creation already infers inbounds; this
change just makes it explicit.
---
 llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 20d11e0ab55f2b..f0b0917a25938c 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -1804,8 +1804,8 @@ Value *DFSanFunction::getRetvalTLS(Type *T, IRBuilder<> &IRB) {
 Value *DFSanFunction::getRetvalOriginTLS() { return DFS.RetvalOriginTLS; }
 
 Value *DFSanFunction::getArgOriginTLS(unsigned ArgNo, IRBuilder<> &IRB) {
-  return IRB.CreateConstGEP2_64(DFS.ArgOriginTLSTy, DFS.ArgOriginTLS, 0, ArgNo,
-                                "_dfsarg_o");
+  return IRB.CreateConstInBoundsGEP2_64(DFS.ArgOriginTLSTy, DFS.ArgOriginTLS, 0,
+                                        ArgNo, "_dfsarg_o");
 }
 
 Value *DFSanFunction::getOrigin(Value *V) {

From fbe98da623c014a3e935b1e683aecdacee17f5bd Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Wed, 29 May 2024 14:57:22 +0100
Subject: [PATCH 131/230] [AMDGPU] Fix filecheck annotation typos

Co-authored-by: klensy <nightouser@gmail.com>
---
 .../AMDGPU/irreducible/diverged-entry-headers.ll     |  2 +-
 llvm/test/CodeGen/AMDGPU/addrspacecast.ll            |  4 ++--
 llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir       | 10 +++++-----
 llvm/test/MC/AMDGPU/hsa-diag-v4.s                    |  2 +-
 llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt    | 12 ++++++------
 llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt   |  2 +-
 llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll        |  2 +-
 7 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers.ll
index 335026dc9b62bd..efad77b684a75a 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers.ll
@@ -90,7 +90,7 @@ S:
   br i1 %cond.uni, label %exit, label %T
 
 T:
-; CHECK-NIT:   DIVERGENT:   %tt.phi = phi i32
+; CHECK-NOT:   DIVERGENT:   %tt.phi = phi i32
   %tt.phi = phi i32 [ %ss, %S ], [ %a, %entry ]
   %tt = add i32 %b, 1
   br label %P
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index 50423c59eabe94..526d5c946ec7f6 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -108,7 +108,7 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %pt
 }
 
 ; no-op
-; HSA-LABEl: {{^}}use_constant_to_flat_addrspacecast:
+; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast:
 ; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]]
 ; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
 ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
@@ -119,7 +119,7 @@ define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %
   ret void
 }
 
-; HSA-LABEl: {{^}}use_constant_to_global_addrspacecast:
+; HSA-LABEL: {{^}}use_constant_to_global_addrspacecast:
 ; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]]
 ; CI-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
 ; CI-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
index 29621a0477418d..1151bde02ef62c 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
@@ -4,7 +4,7 @@
 
 ---
 
-# GCN-label: name: vop3
+# GCN-LABEL: name: vop3
 # GCN: %6:vgpr_32, %7:sreg_32_xm0_xexec = V_SUBBREV_U32_e64_dpp %3, %0, %1, %5, 1, 1, 15, 15, 1, implicit $exec
 # GCN: %8:vgpr_32 = V_CVT_PK_U8_F32_e64_dpp %3, 4, %0, 2, %2, 2, %1, 1, 1, 15, 15, 1, implicit $mode, implicit $exec
 # GCN: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec
@@ -37,7 +37,7 @@ body:             |
 ...
 ---
 
-# GCN-label: name: vop3_sgpr_src1
+# GCN-LABEL: name: vop3_sgpr_src1
 # GCN: %6:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %1, 0, %2, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
 # GFX1100: %8:vgpr_32 = V_MED3_F32_e64 0, %7, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec
 # GFX1150: %8:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %1, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
@@ -81,7 +81,7 @@ body:             |
 ---
 
 # Regression test for src_modifiers on base u16 opcode
-# GCN-label: name: vop3_u16
+# GCN-LABEL: name: vop3_u16
 # GCN: %5:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 0, %1, 0, %3, 0, 0, 1, 15, 15, 1, implicit $exec
 # GCN: %7:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 1, %5, 2, %5, 0, 0, 1, 15, 15, 1, implicit $exec
 # GCN: %9:vgpr_32 = V_ADD_NC_U16_e64 4, %8, 8, %7, 0, 0, implicit $exec
@@ -205,7 +205,7 @@ body:             |
 ...
 
 # do not combine, dpp arg used twice
-# GCN-label: name: dpp_arg_twice
+# GCN-LABEL: name: dpp_arg_twice
 # GCN: %4:vgpr_32 = V_FMA_F32_e64 1, %1, 2, %3, 2, %3, 1, 2, implicit $mode, implicit $exec
 # GCN: %6:vgpr_32 = V_FMA_F32_e64 2, %5, 2, %1, 2, %5, 1, 2, implicit $mode, implicit $exec
 # GCN: %8:vgpr_32 = V_FMA_F32_e64 2, %7, 2, %7, 2, %1, 1, 2, implicit $mode, implicit $exec
@@ -231,7 +231,7 @@ body:             |
 ...
 
 # when the dpp source isn't a src0 operand the operation should be commuted if possible
-# GCN-label: name: dpp_commute_e64
+# GCN-LABEL: name: dpp_commute_e64
 # GCN: %4:vgpr_32  = V_MUL_U32_U24_e64_dpp %1, %0, %1, 1, 1, 14, 15, 0, implicit $exec
 # GCN: %7:vgpr_32 = V_FMA_F32_e64_dpp %5, 2, %0, 1, %1, 2, %1, 1, 2, 1, 15, 15, 1, implicit $mode, implicit $exec
 # GCN: %10:vgpr_32 = V_SUBREV_U32_e64_dpp %1, %0, %1, 1, 1, 14, 15, 0, implicit $exec
diff --git a/llvm/test/MC/AMDGPU/hsa-diag-v4.s b/llvm/test/MC/AMDGPU/hsa-diag-v4.s
index 069b71b7229cdd..cc10d3400e9b1a 100644
--- a/llvm/test/MC/AMDGPU/hsa-diag-v4.s
+++ b/llvm/test/MC/AMDGPU/hsa-diag-v4.s
@@ -54,7 +54,7 @@
 
 // GCN-LABEL: warning: test_amdhsa_group_segment_fixed_size_repeated
 // AMDHSA: error: .amdhsa_ directives cannot be repeated
-// NONAMDHSA-: error: unknown directive
+// NONAMDHSA: error: unknown directive
 .warning "test_amdhsa_group_segment_fixed_size_repeated"
 .amdhsa_kernel test_amdhsa_group_segment_fixed_size_repeated
   .amdhsa_group_segment_fixed_size 1
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt
index 7d15f041bd770e..78ca1bbdacf295 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt
@@ -91,20 +91,20 @@
 
 # FIXME: Results in invalid v_subrev_u16_dpp which apparently has the same encoding but does not exist in GFX10
 
-# gfx1032: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-# gfx1064: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+# COM: GFX1032: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+# COM: GFX1064: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
 # 0xfa,0x04,0x0a,0x50,0x01,0xe4,0x00,0x00
 
 # FIXME: Results in v_mul_lo_u16_dpp
 
-# gfx1032: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-# gfx1064: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+# COM: GFX1032: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+# COM: GFX1064: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
 # 0xfa,0x04,0x0a,0x52,0x01,0xe4,0x00,0x00
 
 # FIXME: gives v_lshlrev_b16_dpp
 
-# gfx1032: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-# gfx1064: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+# COM: GFX1032: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+# COM: GFX1064: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
 # 0xfa,0x04,0x0a,0x54,0x01,0xe4,0x00,0x00
 
 # GFX1032: v_add_co_u32 v0, s0, v0, v2
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt
index 36c58d4c673263..473ede00603a78 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt
@@ -1674,7 +1674,7 @@
 # GFX12: ds_pk_add_f16 v0, v0 offset:4660        ; encoding: [0x34,0x12,0x68,0xda,0x00,0x00,0x00,0x00]
 0x34,0x12,0x68,0xda,0x00,0x00,0x00,0x00
 
-# gfx12: ds_pk_add_bf16 v2, v1                   ; encoding: [0x00,0x00,0x6c,0xda,0x02,0x01,0x00,0x00]
+# GFX12: ds_pk_add_bf16 v2, v1                   ; encoding: [0x00,0x00,0x6c,0xda,0x02,0x01,0x00,0x00]
 0x00,0x00,0x6c,0xda,0x02,0x01,0x00,0x00
 
 # GFX12: ds_pk_add_f16 v0, v0 offset:4660        ; encoding: [0x34,0x12,0x68,0xda,0x00,0x00,0x00,0x00]
diff --git a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
index bb370a6d1dfeb0..7f7790cecb0eb8 100644
--- a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
+++ b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
@@ -670,7 +670,7 @@ declare void @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1)
 define amdgpu_cs void @test_buffer_atomic_fadd(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %offset, i1 %slc) {
   ; CHECK: immarg operand has non-immediate parameter
   ; CHECK-NEXT: i1 %slc
-  ; CHECK-ENXT: call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %offset, i1 %slc)
+  ; CHECK-NEXT: call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %offset, i1 %slc)
   call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %offset, i1 %slc)
   ret void
 }

From e8e5ba00db1b6a8ed5c988b1a252c86487d1bce7 Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas@arm.com>
Date: Wed, 29 May 2024 15:06:41 +0100
Subject: [PATCH 132/230] [AArch64][TargetParser] Move ExtensionDependencies
 into tablegen [NFC] (#93614)

This patch generates ExtensionDependency pairs {Earlier, Later} inferred
by the 'Implies' field of every Extension defined in tablegen. Implied
Subtarget Features that are not Extensions are skipped.
---
 .../llvm/TargetParser/AArch64TargetParser.h   | 51 +------------------
 llvm/lib/TargetParser/AArch64TargetParser.cpp |  6 ---
 llvm/utils/TableGen/ARMTargetDefEmitter.cpp   | 18 +++++++
 3 files changed, 20 insertions(+), 55 deletions(-)

diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index b3fff3c99025a5..5025ab2491de8f 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -183,55 +183,8 @@ struct ExtensionDependency {
   ArchExtKind Later;
 };
 
-// clang-format off
-// Each entry here is a link in the dependency chain starting from the
-// extension that was added to the architecture first.
-inline constexpr ExtensionDependency ExtensionDependencies[] = {
-  {AEK_FP, AEK_FP16},
-  {AEK_FP, AEK_SIMD},
-  {AEK_FP, AEK_JSCVT},
-  {AEK_FP, AEK_FP8},
-  {AEK_SIMD, AEK_CRYPTO},
-  {AEK_SIMD, AEK_AES},
-  {AEK_SIMD, AEK_SHA2},
-  {AEK_SIMD, AEK_SHA3},
-  {AEK_SIMD, AEK_SM4},
-  {AEK_SIMD, AEK_RDM},
-  {AEK_SIMD, AEK_DOTPROD},
-  {AEK_SIMD, AEK_FCMA},
-  {AEK_FP16, AEK_FP16FML},
-  {AEK_FP16, AEK_SVE},
-  {AEK_BF16, AEK_SME},
-  {AEK_BF16, AEK_B16B16},
-  {AEK_SVE, AEK_SVE2},
-  {AEK_SVE, AEK_F32MM},
-  {AEK_SVE, AEK_F64MM},
-  {AEK_SVE2, AEK_SVE2P1},
-  {AEK_SVE2, AEK_SVE2BITPERM},
-  {AEK_SVE2, AEK_SVE2AES},
-  {AEK_SVE2, AEK_SVE2SHA3},
-  {AEK_SVE2, AEK_SVE2SM4},
-  {AEK_SVE2, AEK_SMEFA64},
-  {AEK_SVE2, AEK_SMEFA64},
-  {AEK_SME, AEK_SME2},
-  {AEK_SME, AEK_SMEF16F16},
-  {AEK_SME, AEK_SMEF64F64},
-  {AEK_SME, AEK_SMEI16I64},
-  {AEK_SME, AEK_SMEFA64},
-  {AEK_SME2, AEK_SME2P1},
-  {AEK_SME2, AEK_SSVE_FP8FMA},
-  {AEK_SME2, AEK_SSVE_FP8DOT2},
-  {AEK_SME2, AEK_SSVE_FP8DOT4},
-  {AEK_SME2, AEK_SMEF8F16},
-  {AEK_SME2, AEK_SMEF8F32},
-  {AEK_FP8, AEK_SMEF8F16},
-  {AEK_FP8, AEK_SMEF8F32},
-  {AEK_LSE, AEK_LSE128},
-  {AEK_PREDRES, AEK_SPECRES2},
-  {AEK_RAS, AEK_RASV2},
-  {AEK_RCPC, AEK_RCPC3},
-};
-// clang-format on
+#define EMIT_EXTENSION_DEPENDENCIES
+#include "llvm/TargetParser/AArch64TargetParserDef.inc"
 
 enum ArchProfile { AProfile = 'A', RProfile = 'R', InvalidProfile = '?' };
 
diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp
index c10b4be4eded99..ca356ec82bf1f9 100644
--- a/llvm/lib/TargetParser/AArch64TargetParser.cpp
+++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp
@@ -181,12 +181,6 @@ void AArch64::ExtensionSet::enable(ArchExtKind E) {
         !BaseArch->is_superset(ARMV9A))
       enable(AEK_FP16FML);
 
-    // For all architectures, +crypto enables +aes and +sha2.
-    if (E == AEK_CRYPTO) {
-      enable(AEK_AES);
-      enable(AEK_SHA2);
-    }
-
     // For v8.4A+ and v9.0A+, +crypto also enables +sha3 and +sm4.
     if (E == AEK_CRYPTO && BaseArch->is_superset(ARMV8_4A)) {
       enable(AEK_SHA3);
diff --git a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp b/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
index b79458529623f3..5efa7d2722d3f9 100644
--- a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
+++ b/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
@@ -116,6 +116,24 @@ static void EmitARMTargetDef(RecordKeeper &RK, raw_ostream &OS) {
      << "#endif // EMIT_EXTENSIONS\n"
      << "\n";
 
+  // Emit extension dependencies
+  OS << "#ifdef EMIT_EXTENSION_DEPENDENCIES\n"
+     << "inline constexpr ExtensionDependency ExtensionDependencies[] = {\n";
+  for (const Record *Rec : SortedExtensions) {
+    auto LaterAEK = Rec->getValueAsString("ArchExtKindSpelling").upper();
+    for (const Record *I : Rec->getValueAsListOfDefs("Implies"))
+      if (auto EarlierAEK = I->getValueAsOptionalString("ArchExtKindSpelling"))
+        OS << "  {" << EarlierAEK->upper() << ", " << LaterAEK << "},\n";
+  }
+  // FIXME: Tablegen has the Subtarget Feature FeatureRCPC_IMMO which is implied
+  // by FeatureRCPC3 and in turn implies FeatureRCPC. The proper fix is to make
+  // FeatureRCPC_IMMO an Extension but that will expose it to the command line.
+  OS << "  {AEK_RCPC, AEK_RCPC3},\n";
+  OS << "};\n"
+     << "#undef EMIT_EXTENSION_DEPENDENCIES\n"
+     << "#endif // EMIT_EXTENSION_DEPENDENCIES\n"
+     << "\n";
+
   // Emit architecture information
   OS << "#ifdef EMIT_ARCHITECTURES\n";
 

From e20f0fe29f714a22679214b499744735d528fc1a Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Wed, 29 May 2024 16:12:32 +0200
Subject: [PATCH 133/230] [WasmEHPrepare] Explicitly create inbounds GEP (NFCI)

These are known to be inbounds, so create them as such. NFCI because
constant expression construction currently already infers this.

Also drop the unnecessary zero-index GEP: This is equivalent to
the pointer itself nowadays.
---
 llvm/lib/CodeGen/WasmEHPrepare.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/CodeGen/WasmEHPrepare.cpp b/llvm/lib/CodeGen/WasmEHPrepare.cpp
index 1a9e1ba869c310..16c1dcb1e11753 100644
--- a/llvm/lib/CodeGen/WasmEHPrepare.cpp
+++ b/llvm/lib/CodeGen/WasmEHPrepare.cpp
@@ -252,12 +252,11 @@ bool WasmEHPrepareImpl::prepareEHPads(Function &F) {
       M.getOrInsertGlobal("__wasm_lpad_context", LPadContextTy));
   LPadContextGV->setThreadLocalMode(GlobalValue::GeneralDynamicTLSModel);
 
-  LPadIndexField = IRB.CreateConstGEP2_32(LPadContextTy, LPadContextGV, 0, 0,
-                                          "lpad_index_gep");
-  LSDAField =
-      IRB.CreateConstGEP2_32(LPadContextTy, LPadContextGV, 0, 1, "lsda_gep");
-  SelectorField = IRB.CreateConstGEP2_32(LPadContextTy, LPadContextGV, 0, 2,
-                                         "selector_gep");
+  LPadIndexField = LPadContextGV;
+  LSDAField = IRB.CreateConstInBoundsGEP2_32(LPadContextTy, LPadContextGV, 0, 1,
+                                             "lsda_gep");
+  SelectorField = IRB.CreateConstInBoundsGEP2_32(LPadContextTy, LPadContextGV,
+                                                 0, 2, "selector_gep");
 
   // wasm.landingpad.index() intrinsic, which is to specify landingpad index
   LPadIndexF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_landingpad_index);

From 14dc97df5ef3a9178fc4175303f0f86ed4e3f98e Mon Sep 17 00:00:00 2001
From: chuongg3 <chuong.goh@arm.com>
Date: Wed, 29 May 2024 15:15:53 +0100
Subject: [PATCH 134/230] [AArch64][GlobalISel] Push ADD/SUB through Extend
 Instructions (#90964)

The regression in one test is due to a SUB instruction being pushed
through the extend, leaving the abs instruction behind; this prevents
uabdl instructions from being selected. The transformation is shown below:

`i32 abs(i32 sub(i32 ext i8, i32 ext i8))` =>
`i32 abs(i32 ext(i16 sub(i16 ext i8, i16 ext i8)))`

This is intended to be fixed in a follow-up patch.
---
 llvm/lib/Target/AArch64/AArch64Combine.td     |  19 +-
 .../GISel/AArch64PreLegalizerCombiner.cpp     |  51 ++
 .../AArch64/GlobalISel/combine-add.mir        |  36 +-
 llvm/test/CodeGen/AArch64/aarch64-addv.ll     |  25 +-
 llvm/test/CodeGen/AArch64/arm64-vabs.ll       |  82 +--
 llvm/test/CodeGen/AArch64/neon-extadd.ll      | 622 ++++++++----------
 llvm/test/CodeGen/AArch64/vecreduce-add.ll    | 142 ++--
 7 files changed, 494 insertions(+), 483 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 1ce6cdf1c1e1ed..3f717c8a60050f 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -52,6 +52,19 @@ def ext_uaddv_to_uaddlv : GICombineRule<
   (apply [{ applyExtUaddvToUaddlv(*${root}, MRI, B, Observer, ${matchinfo}); }])
 >;
 
+class push_opcode_through_ext<Instruction opcode, Instruction extOpcode> : GICombineRule <
+  (defs root:$root),
+  (match (extOpcode $ext1, $src1):$ExtMI,
+         (extOpcode $ext2, $src2),
+         (opcode $dst, $ext1, $ext2):$root,
+         [{ return matchPushAddSubExt(*${root}, MRI, ${dst}.getReg(), ${src1}.getReg(), ${src2}.getReg()); }]),
+  (apply [{ applyPushAddSubExt(*${root}, MRI, B, ${ExtMI}->getOpcode() == TargetOpcode::G_SEXT, ${dst}.getReg(), ${src1}.getReg(), ${src2}.getReg()); }])>;
+
+def push_sub_through_zext : push_opcode_through_ext<G_SUB, G_ZEXT>;
+def push_add_through_zext : push_opcode_through_ext<G_ADD, G_ZEXT>;
+def push_sub_through_sext : push_opcode_through_ext<G_SUB, G_SEXT>;
+def push_add_through_sext : push_opcode_through_ext<G_ADD, G_SEXT>;
+
 def AArch64PreLegalizerCombiner: GICombiner<
   "AArch64PreLegalizerCombinerImpl", [all_combines,
                                       fconstant_to_constant,
@@ -59,7 +72,11 @@ def AArch64PreLegalizerCombiner: GICombiner<
                                       fold_global_offset,
                                       shuffle_to_extract,
                                       ext_addv_to_udot_addv,
-                                      ext_uaddv_to_uaddlv]> {
+                                      ext_uaddv_to_uaddlv,
+                                      push_sub_through_zext,
+                                      push_add_through_zext,
+                                      push_sub_through_sext,
+                                      push_add_through_sext]> {
   let CombineAllMethodName = "tryCombineAllImpl";
 }
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
index a82d3cd095659b..0f89fa557cd57e 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
@@ -554,6 +554,57 @@ void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
   MI.eraseFromParent();
 }
 
+// Pushes ADD/SUB through extend instructions to decrease the number of extend
+// instruction at the end by allowing selection of {s|u}addl sooner
+
+// i32 add(i32 ext i8, i32 ext i8) => i32 ext(i16 add(i16 ext i8, i16 ext i8))
+bool matchPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
+                        Register DstReg, Register SrcReg1, Register SrcReg2) {
+  assert(MI.getOpcode() == TargetOpcode::G_ADD ||
+         MI.getOpcode() == TargetOpcode::G_SUB &&
+             "Expected a G_ADD or G_SUB instruction\n");
+
+  // Deal with vector types only
+  LLT DstTy = MRI.getType(DstReg);
+  if (!DstTy.isVector())
+    return false;
+
+  // Return true if G_{S|Z}EXT instruction is more than 2* source
+  Register ExtDstReg = MI.getOperand(1).getReg();
+  LLT Ext1SrcTy = MRI.getType(SrcReg1);
+  LLT Ext2SrcTy = MRI.getType(SrcReg2);
+  unsigned ExtDstScal = MRI.getType(ExtDstReg).getScalarSizeInBits();
+  unsigned Ext1SrcScal = Ext1SrcTy.getScalarSizeInBits();
+  if (((Ext1SrcScal == 8 && ExtDstScal == 32) ||
+       ((Ext1SrcScal == 8 || Ext1SrcScal == 16) && ExtDstScal == 64)) &&
+      Ext1SrcTy == Ext2SrcTy)
+    return true;
+
+  return false;
+}
+
+void applyPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
+                        MachineIRBuilder &B, bool isSExt, Register DstReg,
+                        Register SrcReg1, Register SrcReg2) {
+  LLT SrcTy = MRI.getType(SrcReg1);
+  LLT MidTy = SrcTy.changeElementSize(SrcTy.getScalarSizeInBits() * 2);
+  unsigned Opc = isSExt ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
+  Register Ext1Reg = B.buildInstr(Opc, {MidTy}, {SrcReg1}).getReg(0);
+  Register Ext2Reg = B.buildInstr(Opc, {MidTy}, {SrcReg2}).getReg(0);
+  Register AddReg =
+      B.buildInstr(MI.getOpcode(), {MidTy}, {Ext1Reg, Ext2Reg}).getReg(0);
+
+  // G_SUB has to sign-extend the result.
+  // G_ADD needs to sext from sext and can sext or zext from zext, so the
+  // original opcode is used.
+  if (MI.getOpcode() == TargetOpcode::G_ADD)
+    B.buildInstr(Opc, {DstReg}, {AddReg});
+  else
+    B.buildSExt(DstReg, AddReg);
+
+  MI.eraseFromParent();
+}
+
 bool tryToSimplifyUADDO(MachineInstr &MI, MachineIRBuilder &B,
                         CombinerHelper &Helper, GISelChangeObserver &Observer) {
   // Try simplify G_UADDO with 8 or 16 bit operands to wide G_ADD and TBNZ if
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir
index 78411f34bebd31..a0142afd067770 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir
@@ -219,10 +219,11 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1
-    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY]](<8 x s8>)
-    ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY1]](<8 x s8>)
-    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s32>) = G_ADD [[SEXT]], [[SEXT1]]
-    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[ADD]](<8 x s32>)
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s16>) = G_SEXT [[COPY]](<8 x s8>)
+    ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<8 x s16>) = G_SEXT [[COPY1]](<8 x s8>)
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[SEXT]], [[SEXT1]]
+    ; CHECK-NEXT: [[SEXT2:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[ADD]](<8 x s16>)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SEXT2]](<8 x s32>)
     ; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>)
     ; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>)
     ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
@@ -249,10 +250,11 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1
-    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[COPY]](<8 x s8>)
-    ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[COPY1]](<8 x s8>)
-    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s32>) = G_ADD [[ZEXT]], [[ZEXT1]]
-    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[ADD]](<8 x s32>)
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<8 x s16>) = G_ZEXT [[COPY]](<8 x s8>)
+    ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(<8 x s16>) = G_ZEXT [[COPY1]](<8 x s8>)
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[ZEXT]], [[ZEXT1]]
+    ; CHECK-NEXT: [[ZEXT2:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[ADD]](<8 x s16>)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[ZEXT2]](<8 x s32>)
     ; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>)
     ; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>)
     ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
@@ -279,10 +281,11 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1
-    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY]](<8 x s8>)
-    ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[COPY1]](<8 x s8>)
-    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s32>) = G_SUB [[SEXT]], [[SEXT1]]
-    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SUB]](<8 x s32>)
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s16>) = G_SEXT [[COPY]](<8 x s8>)
+    ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<8 x s16>) = G_SEXT [[COPY1]](<8 x s8>)
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[SEXT]], [[SEXT1]]
+    ; CHECK-NEXT: [[SEXT2:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[SUB]](<8 x s16>)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SEXT2]](<8 x s32>)
     ; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>)
     ; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>)
     ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
@@ -309,10 +312,11 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1
-    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[COPY]](<8 x s8>)
-    ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[COPY1]](<8 x s8>)
-    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s32>) = G_SUB [[ZEXT]], [[ZEXT1]]
-    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SUB]](<8 x s32>)
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<8 x s16>) = G_ZEXT [[COPY]](<8 x s8>)
+    ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(<8 x s16>) = G_ZEXT [[COPY1]](<8 x s8>)
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[ZEXT]], [[ZEXT1]]
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[SUB]](<8 x s16>)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SEXT]](<8 x s32>)
     ; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>)
     ; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>)
     ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
index 94b792b887eb47..def4192b0e005d 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
@@ -94,18 +94,19 @@ define i32 @oversized_ADDV_256(ptr noalias nocapture readonly %arg1, ptr noalias
 ;
 ; GISEL-LABEL: oversized_ADDV_256:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    ldr d0, [x0]
-; GISEL-NEXT:    ldr d1, [x1]
-; GISEL-NEXT:    ushll v0.8h, v0.8b, #0
-; GISEL-NEXT:    ushll v1.8h, v1.8b, #0
-; GISEL-NEXT:    usubl v2.4s, v0.4h, v1.4h
-; GISEL-NEXT:    usubl2 v0.4s, v0.8h, v1.8h
-; GISEL-NEXT:    cmlt v1.4s, v2.4s, #0
-; GISEL-NEXT:    cmlt v3.4s, v0.4s, #0
-; GISEL-NEXT:    neg v4.4s, v2.4s
-; GISEL-NEXT:    neg v5.4s, v0.4s
-; GISEL-NEXT:    bsl v1.16b, v4.16b, v2.16b
-; GISEL-NEXT:    bit v0.16b, v5.16b, v3.16b
+; GISEL-NEXT:    ldr d1, [x0]
+; GISEL-NEXT:    ldr d2, [x1]
+; GISEL-NEXT:    movi v0.2d, #0000000000000000
+; GISEL-NEXT:    usubl v1.8h, v1.8b, v2.8b
+; GISEL-NEXT:    sshll v2.4s, v1.4h, #0
+; GISEL-NEXT:    sshll2 v3.4s, v1.8h, #0
+; GISEL-NEXT:    ssubw2 v0.4s, v0.4s, v1.8h
+; GISEL-NEXT:    cmlt v4.4s, v2.4s, #0
+; GISEL-NEXT:    cmlt v5.4s, v3.4s, #0
+; GISEL-NEXT:    neg v6.4s, v2.4s
+; GISEL-NEXT:    mov v1.16b, v4.16b
+; GISEL-NEXT:    bif v0.16b, v3.16b, v5.16b
+; GISEL-NEXT:    bsl v1.16b, v6.16b, v2.16b
 ; GISEL-NEXT:    add v0.4s, v1.4s, v0.4s
 ; GISEL-NEXT:    addv s0, v0.4s
 ; GISEL-NEXT:    fmov w0, s0
diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
index f7d31a214563bc..178c229d04e471 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
@@ -289,26 +289,27 @@ define i32 @uabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
 ;
 ; CHECK-GI-LABEL: uabd16b_rdx_i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ushll.8h v2, v0, #0
-; CHECK-GI-NEXT:    ushll.8h v3, v1, #0
-; CHECK-GI-NEXT:    ushll2.8h v0, v0, #0
-; CHECK-GI-NEXT:    ushll2.8h v1, v1, #0
-; CHECK-GI-NEXT:    usubl.4s v4, v2, v3
-; CHECK-GI-NEXT:    usubl2.4s v2, v2, v3
-; CHECK-GI-NEXT:    usubl.4s v3, v0, v1
-; CHECK-GI-NEXT:    usubl2.4s v0, v0, v1
-; CHECK-GI-NEXT:    cmlt.4s v1, v4, #0
-; CHECK-GI-NEXT:    cmlt.4s v5, v2, #0
-; CHECK-GI-NEXT:    neg.4s v16, v4
-; CHECK-GI-NEXT:    cmlt.4s v6, v3, #0
-; CHECK-GI-NEXT:    cmlt.4s v7, v0, #0
-; CHECK-GI-NEXT:    neg.4s v17, v2
-; CHECK-GI-NEXT:    neg.4s v18, v3
-; CHECK-GI-NEXT:    neg.4s v19, v0
-; CHECK-GI-NEXT:    bsl.16b v1, v16, v4
-; CHECK-GI-NEXT:    bit.16b v2, v17, v5
-; CHECK-GI-NEXT:    bit.16b v3, v18, v6
-; CHECK-GI-NEXT:    bit.16b v0, v19, v7
+; CHECK-GI-NEXT:    usubl.8h v3, v0, v1
+; CHECK-GI-NEXT:    movi.2d v2, #0000000000000000
+; CHECK-GI-NEXT:    usubl2.8h v0, v0, v1
+; CHECK-GI-NEXT:    sshll.4s v1, v3, #0
+; CHECK-GI-NEXT:    sshll2.4s v4, v3, #0
+; CHECK-GI-NEXT:    sshll.4s v5, v0, #0
+; CHECK-GI-NEXT:    sshll2.4s v6, v0, #0
+; CHECK-GI-NEXT:    ssubw2.4s v3, v2, v3
+; CHECK-GI-NEXT:    ssubw2.4s v0, v2, v0
+; CHECK-GI-NEXT:    cmlt.4s v2, v1, #0
+; CHECK-GI-NEXT:    cmlt.4s v7, v4, #0
+; CHECK-GI-NEXT:    neg.4s v16, v1
+; CHECK-GI-NEXT:    cmlt.4s v17, v5, #0
+; CHECK-GI-NEXT:    cmlt.4s v18, v6, #0
+; CHECK-GI-NEXT:    neg.4s v19, v5
+; CHECK-GI-NEXT:    bit.16b v1, v16, v2
+; CHECK-GI-NEXT:    mov.16b v2, v7
+; CHECK-GI-NEXT:    bif.16b v0, v6, v18
+; CHECK-GI-NEXT:    bsl.16b v2, v3, v4
+; CHECK-GI-NEXT:    mov.16b v3, v17
+; CHECK-GI-NEXT:    bsl.16b v3, v19, v5
 ; CHECK-GI-NEXT:    add.4s v1, v1, v2
 ; CHECK-GI-NEXT:    add.4s v0, v3, v0
 ; CHECK-GI-NEXT:    add.4s v0, v1, v0
@@ -336,26 +337,27 @@ define i32 @sabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
 ;
 ; CHECK-GI-LABEL: sabd16b_rdx_i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    sshll.8h v2, v0, #0
-; CHECK-GI-NEXT:    sshll.8h v3, v1, #0
-; CHECK-GI-NEXT:    sshll2.8h v0, v0, #0
-; CHECK-GI-NEXT:    sshll2.8h v1, v1, #0
-; CHECK-GI-NEXT:    ssubl.4s v4, v2, v3
-; CHECK-GI-NEXT:    ssubl2.4s v2, v2, v3
-; CHECK-GI-NEXT:    ssubl.4s v3, v0, v1
-; CHECK-GI-NEXT:    ssubl2.4s v0, v0, v1
-; CHECK-GI-NEXT:    cmlt.4s v1, v4, #0
-; CHECK-GI-NEXT:    cmlt.4s v5, v2, #0
-; CHECK-GI-NEXT:    neg.4s v16, v4
-; CHECK-GI-NEXT:    cmlt.4s v6, v3, #0
-; CHECK-GI-NEXT:    cmlt.4s v7, v0, #0
-; CHECK-GI-NEXT:    neg.4s v17, v2
-; CHECK-GI-NEXT:    neg.4s v18, v3
-; CHECK-GI-NEXT:    neg.4s v19, v0
-; CHECK-GI-NEXT:    bsl.16b v1, v16, v4
-; CHECK-GI-NEXT:    bit.16b v2, v17, v5
-; CHECK-GI-NEXT:    bit.16b v3, v18, v6
-; CHECK-GI-NEXT:    bit.16b v0, v19, v7
+; CHECK-GI-NEXT:    ssubl.8h v3, v0, v1
+; CHECK-GI-NEXT:    movi.2d v2, #0000000000000000
+; CHECK-GI-NEXT:    ssubl2.8h v0, v0, v1
+; CHECK-GI-NEXT:    sshll.4s v1, v3, #0
+; CHECK-GI-NEXT:    sshll2.4s v4, v3, #0
+; CHECK-GI-NEXT:    sshll.4s v5, v0, #0
+; CHECK-GI-NEXT:    sshll2.4s v6, v0, #0
+; CHECK-GI-NEXT:    ssubw2.4s v3, v2, v3
+; CHECK-GI-NEXT:    ssubw2.4s v0, v2, v0
+; CHECK-GI-NEXT:    cmlt.4s v2, v1, #0
+; CHECK-GI-NEXT:    cmlt.4s v7, v4, #0
+; CHECK-GI-NEXT:    neg.4s v16, v1
+; CHECK-GI-NEXT:    cmlt.4s v17, v5, #0
+; CHECK-GI-NEXT:    cmlt.4s v18, v6, #0
+; CHECK-GI-NEXT:    neg.4s v19, v5
+; CHECK-GI-NEXT:    bit.16b v1, v16, v2
+; CHECK-GI-NEXT:    mov.16b v2, v7
+; CHECK-GI-NEXT:    bif.16b v0, v6, v18
+; CHECK-GI-NEXT:    bsl.16b v2, v3, v4
+; CHECK-GI-NEXT:    mov.16b v3, v17
+; CHECK-GI-NEXT:    bsl.16b v3, v19, v5
 ; CHECK-GI-NEXT:    add.4s v1, v1, v2
 ; CHECK-GI-NEXT:    add.4s v0, v3, v0
 ; CHECK-GI-NEXT:    add.4s v0, v1, v0
diff --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll
index 6aa9c394a8fd1f..402682c89124bd 100644
--- a/llvm/test/CodeGen/AArch64/neon-extadd.ll
+++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll
@@ -134,10 +134,9 @@ define <8 x i32> @extadds_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) {
 ;
 ; CHECK-GI-LABEL: extadds_v8i8_i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    saddl v0.4s, v2.4h, v1.4h
-; CHECK-GI-NEXT:    saddl2 v1.4s, v2.8h, v1.8h
+; CHECK-GI-NEXT:    saddl v1.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    sshll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <8 x i8> %s0 to <8 x i32>
@@ -156,10 +155,9 @@ define <8 x i32> @extaddu_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) {
 ;
 ; CHECK-GI-LABEL: extaddu_v8i8_i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    uaddl v0.4s, v2.4h, v1.4h
-; CHECK-GI-NEXT:    uaddl2 v1.4s, v2.8h, v1.8h
+; CHECK-GI-NEXT:    uaddl v1.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ushll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <8 x i8> %s0 to <8 x i32>
@@ -178,10 +176,9 @@ define <8 x i32> @extsubs_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) {
 ;
 ; CHECK-GI-LABEL: extsubs_v8i8_i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ssubl v0.4s, v2.4h, v1.4h
-; CHECK-GI-NEXT:    ssubl2 v1.4s, v2.8h, v1.8h
+; CHECK-GI-NEXT:    ssubl v1.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    sshll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <8 x i8> %s0 to <8 x i32>
@@ -200,10 +197,9 @@ define <8 x i32> @extsubu_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) {
 ;
 ; CHECK-GI-LABEL: extsubu_v8i8_i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    usubl v0.4s, v2.4h, v1.4h
-; CHECK-GI-NEXT:    usubl2 v1.4s, v2.8h, v1.8h
+; CHECK-GI-NEXT:    usubl v1.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    sshll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <8 x i8> %s0 to <8 x i32>
@@ -225,14 +221,12 @@ define <16 x i32> @extadds_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) {
 ;
 ; CHECK-GI-LABEL: extadds_v16i8_i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
-; CHECK-GI-NEXT:    sshll2 v4.8h, v0.16b, #0
-; CHECK-GI-NEXT:    sshll2 v5.8h, v1.16b, #0
-; CHECK-GI-NEXT:    saddl v0.4s, v2.4h, v3.4h
-; CHECK-GI-NEXT:    saddl2 v1.4s, v2.8h, v3.8h
-; CHECK-GI-NEXT:    saddl v2.4s, v4.4h, v5.4h
-; CHECK-GI-NEXT:    saddl2 v3.4s, v4.8h, v5.8h
+; CHECK-GI-NEXT:    saddl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    saddl2 v3.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <16 x i8> %s0 to <16 x i32>
@@ -254,14 +248,12 @@ define <16 x i32> @extaddu_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) {
 ;
 ; CHECK-GI-LABEL: extaddu_v16i8_i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ushll2 v4.8h, v0.16b, #0
-; CHECK-GI-NEXT:    ushll2 v5.8h, v1.16b, #0
-; CHECK-GI-NEXT:    uaddl v0.4s, v2.4h, v3.4h
-; CHECK-GI-NEXT:    uaddl2 v1.4s, v2.8h, v3.8h
-; CHECK-GI-NEXT:    uaddl v2.4s, v4.4h, v5.4h
-; CHECK-GI-NEXT:    uaddl2 v3.4s, v4.8h, v5.8h
+; CHECK-GI-NEXT:    uaddl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    uaddl2 v3.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ushll v0.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v2.8h, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll2 v3.4s, v3.8h, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <16 x i8> %s0 to <16 x i32>
@@ -283,14 +275,12 @@ define <16 x i32> @extsubs_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) {
 ;
 ; CHECK-GI-LABEL: extsubs_v16i8_i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
-; CHECK-GI-NEXT:    sshll2 v4.8h, v0.16b, #0
-; CHECK-GI-NEXT:    sshll2 v5.8h, v1.16b, #0
-; CHECK-GI-NEXT:    ssubl v0.4s, v2.4h, v3.4h
-; CHECK-GI-NEXT:    ssubl2 v1.4s, v2.8h, v3.8h
-; CHECK-GI-NEXT:    ssubl v2.4s, v4.4h, v5.4h
-; CHECK-GI-NEXT:    ssubl2 v3.4s, v4.8h, v5.8h
+; CHECK-GI-NEXT:    ssubl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ssubl2 v3.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <16 x i8> %s0 to <16 x i32>
@@ -312,14 +302,12 @@ define <16 x i32> @extsubu_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) {
 ;
 ; CHECK-GI-LABEL: extsubu_v16i8_i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ushll2 v4.8h, v0.16b, #0
-; CHECK-GI-NEXT:    ushll2 v5.8h, v1.16b, #0
-; CHECK-GI-NEXT:    usubl v0.4s, v2.4h, v3.4h
-; CHECK-GI-NEXT:    usubl2 v1.4s, v2.8h, v3.8h
-; CHECK-GI-NEXT:    usubl v2.4s, v4.4h, v5.4h
-; CHECK-GI-NEXT:    usubl2 v3.4s, v4.8h, v5.8h
+; CHECK-GI-NEXT:    usubl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    usubl2 v3.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <16 x i8> %s0 to <16 x i32>
@@ -342,16 +330,13 @@ define <8 x i64> @extadds_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) {
 ;
 ; CHECK-GI-LABEL: extadds_v8i8_i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll2 v4.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll2 v5.4s, v1.8h, #0
-; CHECK-GI-NEXT:    saddl v0.2d, v2.2s, v3.2s
-; CHECK-GI-NEXT:    saddl2 v1.2d, v2.4s, v3.4s
-; CHECK-GI-NEXT:    saddl v2.2d, v4.2s, v5.2s
-; CHECK-GI-NEXT:    saddl2 v3.2d, v4.4s, v5.4s
+; CHECK-GI-NEXT:    saddl v0.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v3.4s, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <8 x i8> %s0 to <8 x i64>
@@ -374,16 +359,13 @@ define <8 x i64> @extaddu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) {
 ;
 ; CHECK-GI-LABEL: extaddu_v8i8_i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll2 v4.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll2 v5.4s, v1.8h, #0
-; CHECK-GI-NEXT:    uaddl v0.2d, v2.2s, v3.2s
-; CHECK-GI-NEXT:    uaddl2 v1.2d, v2.4s, v3.4s
-; CHECK-GI-NEXT:    uaddl v2.2d, v4.2s, v5.2s
-; CHECK-GI-NEXT:    uaddl2 v3.2d, v4.4s, v5.4s
+; CHECK-GI-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT:    ushll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll2 v3.2d, v3.4s, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <8 x i8> %s0 to <8 x i64>
@@ -406,16 +388,13 @@ define <8 x i64> @extsubs_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) {
 ;
 ; CHECK-GI-LABEL: extsubs_v8i8_i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll2 v4.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll2 v5.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ssubl v0.2d, v2.2s, v3.2s
-; CHECK-GI-NEXT:    ssubl2 v1.2d, v2.4s, v3.4s
-; CHECK-GI-NEXT:    ssubl v2.2d, v4.2s, v5.2s
-; CHECK-GI-NEXT:    ssubl2 v3.2d, v4.4s, v5.4s
+; CHECK-GI-NEXT:    ssubl v0.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v3.4s, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <8 x i8> %s0 to <8 x i64>
@@ -438,16 +417,13 @@ define <8 x i64> @extsubu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) {
 ;
 ; CHECK-GI-LABEL: extsubu_v8i8_i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll2 v4.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll2 v5.4s, v1.8h, #0
-; CHECK-GI-NEXT:    usubl v0.2d, v2.2s, v3.2s
-; CHECK-GI-NEXT:    usubl2 v1.2d, v2.4s, v3.4s
-; CHECK-GI-NEXT:    usubl v2.2d, v4.2s, v5.2s
-; CHECK-GI-NEXT:    usubl2 v3.2d, v4.4s, v5.4s
+; CHECK-GI-NEXT:    usubl v0.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v3.4s, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <8 x i8> %s0 to <8 x i64>
@@ -477,26 +453,20 @@ define <16 x i64> @extaddu_v16i8_i64(<16 x i8> %a, <16 x i8> %b) {
 ;
 ; CHECK-GI-LABEL: extaddu_v16i8_i64:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-GI-NEXT:    ushll v4.4s, v2.4h, #0
-; CHECK-GI-NEXT:    ushll2 v5.4s, v2.8h, #0
-; CHECK-GI-NEXT:    ushll v2.4s, v3.4h, #0
-; CHECK-GI-NEXT:    ushll v6.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v3.4s, v3.8h, #0
-; CHECK-GI-NEXT:    ushll v7.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll2 v16.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll2 v17.4s, v1.8h, #0
-; CHECK-GI-NEXT:    uaddl v0.2d, v4.2s, v2.2s
-; CHECK-GI-NEXT:    uaddl2 v1.2d, v4.4s, v2.4s
-; CHECK-GI-NEXT:    uaddl v2.2d, v5.2s, v3.2s
-; CHECK-GI-NEXT:    uaddl2 v3.2d, v5.4s, v3.4s
-; CHECK-GI-NEXT:    uaddl v4.2d, v6.2s, v7.2s
-; CHECK-GI-NEXT:    uaddl2 v5.2d, v6.4s, v7.4s
-; CHECK-GI-NEXT:    uaddl v6.2d, v16.2s, v17.2s
-; CHECK-GI-NEXT:    uaddl2 v7.2d, v16.4s, v17.4s
+; CHECK-GI-NEXT:    uaddl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    uaddl2 v0.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ushll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll2 v3.4s, v2.8h, #0
+; CHECK-GI-NEXT:    ushll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v7.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT:    ushll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll2 v3.2d, v3.4s, #0
+; CHECK-GI-NEXT:    ushll v4.2d, v5.2s, #0
+; CHECK-GI-NEXT:    ushll2 v5.2d, v5.4s, #0
+; CHECK-GI-NEXT:    ushll v6.2d, v7.2s, #0
+; CHECK-GI-NEXT:    ushll2 v7.2d, v7.4s, #0
 ; CHECK-GI-NEXT:    ret
     %c = zext <16 x i8> %a to <16 x i64>
     %d = zext <16 x i8> %b to <16 x i64>
@@ -525,26 +495,20 @@ define <16 x i64> @extadds_v16i8_i64(<16 x i8> %a, <16 x i8> %b) {
 ;
 ; CHECK-GI-LABEL: extadds_v16i8_i64:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
-; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
-; CHECK-GI-NEXT:    sshll v4.4s, v2.4h, #0
-; CHECK-GI-NEXT:    sshll2 v5.4s, v2.8h, #0
-; CHECK-GI-NEXT:    sshll v2.4s, v3.4h, #0
-; CHECK-GI-NEXT:    sshll v6.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
-; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll2 v16.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll2 v17.4s, v1.8h, #0
-; CHECK-GI-NEXT:    saddl v0.2d, v4.2s, v2.2s
-; CHECK-GI-NEXT:    saddl2 v1.2d, v4.4s, v2.4s
-; CHECK-GI-NEXT:    saddl v2.2d, v5.2s, v3.2s
-; CHECK-GI-NEXT:    saddl2 v3.2d, v5.4s, v3.4s
-; CHECK-GI-NEXT:    saddl v4.2d, v6.2s, v7.2s
-; CHECK-GI-NEXT:    saddl2 v5.2d, v6.4s, v7.4s
-; CHECK-GI-NEXT:    saddl v6.2d, v16.2s, v17.2s
-; CHECK-GI-NEXT:    saddl2 v7.2d, v16.4s, v17.4s
+; CHECK-GI-NEXT:    saddl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    saddl2 v0.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    sshll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v7.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v3.4s, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v5.2s, #0
+; CHECK-GI-NEXT:    sshll2 v5.2d, v5.4s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v7.2s, #0
+; CHECK-GI-NEXT:    sshll2 v7.2d, v7.4s, #0
 ; CHECK-GI-NEXT:    ret
     %c = sext <16 x i8> %a to <16 x i64>
     %d = sext <16 x i8> %b to <16 x i64>
@@ -573,26 +537,20 @@ define <16 x i64> @extsubu_v16i8_i64(<16 x i8> %a, <16 x i8> %b) {
 ;
 ; CHECK-GI-LABEL: extsubu_v16i8_i64:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-GI-NEXT:    ushll v4.4s, v2.4h, #0
-; CHECK-GI-NEXT:    ushll2 v5.4s, v2.8h, #0
-; CHECK-GI-NEXT:    ushll v2.4s, v3.4h, #0
-; CHECK-GI-NEXT:    ushll v6.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v3.4s, v3.8h, #0
-; CHECK-GI-NEXT:    ushll v7.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll2 v16.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll2 v17.4s, v1.8h, #0
-; CHECK-GI-NEXT:    usubl v0.2d, v4.2s, v2.2s
-; CHECK-GI-NEXT:    usubl2 v1.2d, v4.4s, v2.4s
-; CHECK-GI-NEXT:    usubl v2.2d, v5.2s, v3.2s
-; CHECK-GI-NEXT:    usubl2 v3.2d, v5.4s, v3.4s
-; CHECK-GI-NEXT:    usubl v4.2d, v6.2s, v7.2s
-; CHECK-GI-NEXT:    usubl2 v5.2d, v6.4s, v7.4s
-; CHECK-GI-NEXT:    usubl v6.2d, v16.2s, v17.2s
-; CHECK-GI-NEXT:    usubl2 v7.2d, v16.4s, v17.4s
+; CHECK-GI-NEXT:    usubl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    usubl2 v0.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    sshll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v7.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v3.4s, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v5.2s, #0
+; CHECK-GI-NEXT:    sshll2 v5.2d, v5.4s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v7.2s, #0
+; CHECK-GI-NEXT:    sshll2 v7.2d, v7.4s, #0
 ; CHECK-GI-NEXT:    ret
     %c = zext <16 x i8> %a to <16 x i64>
     %d = zext <16 x i8> %b to <16 x i64>
@@ -621,26 +579,20 @@ define <16 x i64> @extsubs_v16i8_i64(<16 x i8> %a, <16 x i8> %b) {
 ;
 ; CHECK-GI-LABEL: extsubs_v16i8_i64:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
-; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
-; CHECK-GI-NEXT:    sshll v4.4s, v2.4h, #0
-; CHECK-GI-NEXT:    sshll2 v5.4s, v2.8h, #0
-; CHECK-GI-NEXT:    sshll v2.4s, v3.4h, #0
-; CHECK-GI-NEXT:    sshll v6.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
-; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll2 v16.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll2 v17.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ssubl v0.2d, v4.2s, v2.2s
-; CHECK-GI-NEXT:    ssubl2 v1.2d, v4.4s, v2.4s
-; CHECK-GI-NEXT:    ssubl v2.2d, v5.2s, v3.2s
-; CHECK-GI-NEXT:    ssubl2 v3.2d, v5.4s, v3.4s
-; CHECK-GI-NEXT:    ssubl v4.2d, v6.2s, v7.2s
-; CHECK-GI-NEXT:    ssubl2 v5.2d, v6.4s, v7.4s
-; CHECK-GI-NEXT:    ssubl v6.2d, v16.2s, v17.2s
-; CHECK-GI-NEXT:    ssubl2 v7.2d, v16.4s, v17.4s
+; CHECK-GI-NEXT:    ssubl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ssubl2 v0.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    sshll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v7.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v3.4s, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v5.2s, #0
+; CHECK-GI-NEXT:    sshll2 v5.2d, v5.4s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v7.2s, #0
+; CHECK-GI-NEXT:    sshll2 v7.2d, v7.4s, #0
 ; CHECK-GI-NEXT:    ret
     %c = sext <16 x i8> %a to <16 x i64>
     %d = sext <16 x i8> %b to <16 x i64>
@@ -667,22 +619,18 @@ define <16 x i64> @extaddu_v16i16_i64(<16 x i16> %a, <16 x i16> %b) {
 ;
 ; CHECK-GI-LABEL: extaddu_v16i16_i64:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ushll v4.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v5.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v6.4s, v2.4h, #0
-; CHECK-GI-NEXT:    ushll v7.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll2 v16.4s, v2.8h, #0
-; CHECK-GI-NEXT:    ushll v17.4s, v3.4h, #0
-; CHECK-GI-NEXT:    ushll2 v18.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ushll2 v19.4s, v3.8h, #0
-; CHECK-GI-NEXT:    uaddl v0.2d, v4.2s, v6.2s
-; CHECK-GI-NEXT:    uaddl2 v1.2d, v4.4s, v6.4s
-; CHECK-GI-NEXT:    uaddl v2.2d, v5.2s, v16.2s
-; CHECK-GI-NEXT:    uaddl2 v3.2d, v5.4s, v16.4s
-; CHECK-GI-NEXT:    uaddl v4.2d, v7.2s, v17.2s
-; CHECK-GI-NEXT:    uaddl2 v5.2d, v7.4s, v17.4s
-; CHECK-GI-NEXT:    uaddl v6.2d, v18.2s, v19.2s
-; CHECK-GI-NEXT:    uaddl2 v7.2d, v18.4s, v19.4s
+; CHECK-GI-NEXT:    uaddl v4.4s, v0.4h, v2.4h
+; CHECK-GI-NEXT:    uaddl2 v5.4s, v0.8h, v2.8h
+; CHECK-GI-NEXT:    uaddl v6.4s, v1.4h, v3.4h
+; CHECK-GI-NEXT:    uaddl2 v7.4s, v1.8h, v3.8h
+; CHECK-GI-NEXT:    ushll v0.2d, v4.2s, #0
+; CHECK-GI-NEXT:    ushll2 v1.2d, v4.4s, #0
+; CHECK-GI-NEXT:    ushll v2.2d, v5.2s, #0
+; CHECK-GI-NEXT:    ushll2 v3.2d, v5.4s, #0
+; CHECK-GI-NEXT:    ushll v4.2d, v6.2s, #0
+; CHECK-GI-NEXT:    ushll2 v5.2d, v6.4s, #0
+; CHECK-GI-NEXT:    ushll v6.2d, v7.2s, #0
+; CHECK-GI-NEXT:    ushll2 v7.2d, v7.4s, #0
 ; CHECK-GI-NEXT:    ret
     %c = zext <16 x i16> %a to <16 x i64>
     %d = zext <16 x i16> %b to <16 x i64>
@@ -709,22 +657,18 @@ define <16 x i64> @extadds_v16i16_i64(<16 x i16> %a, <16 x i16> %b) {
 ;
 ; CHECK-GI-LABEL: extadds_v16i16_i64:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    sshll v4.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v5.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll v6.4s, v2.4h, #0
-; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll2 v16.4s, v2.8h, #0
-; CHECK-GI-NEXT:    sshll v17.4s, v3.4h, #0
-; CHECK-GI-NEXT:    sshll2 v18.4s, v1.8h, #0
-; CHECK-GI-NEXT:    sshll2 v19.4s, v3.8h, #0
-; CHECK-GI-NEXT:    saddl v0.2d, v4.2s, v6.2s
-; CHECK-GI-NEXT:    saddl2 v1.2d, v4.4s, v6.4s
-; CHECK-GI-NEXT:    saddl v2.2d, v5.2s, v16.2s
-; CHECK-GI-NEXT:    saddl2 v3.2d, v5.4s, v16.4s
-; CHECK-GI-NEXT:    saddl v4.2d, v7.2s, v17.2s
-; CHECK-GI-NEXT:    saddl2 v5.2d, v7.4s, v17.4s
-; CHECK-GI-NEXT:    saddl v6.2d, v18.2s, v19.2s
-; CHECK-GI-NEXT:    saddl2 v7.2d, v18.4s, v19.4s
+; CHECK-GI-NEXT:    saddl v4.4s, v0.4h, v2.4h
+; CHECK-GI-NEXT:    saddl2 v5.4s, v0.8h, v2.8h
+; CHECK-GI-NEXT:    saddl v6.4s, v1.4h, v3.4h
+; CHECK-GI-NEXT:    saddl2 v7.4s, v1.8h, v3.8h
+; CHECK-GI-NEXT:    sshll v0.2d, v4.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v4.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v5.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v5.4s, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v6.2s, #0
+; CHECK-GI-NEXT:    sshll2 v5.2d, v6.4s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v7.2s, #0
+; CHECK-GI-NEXT:    sshll2 v7.2d, v7.4s, #0
 ; CHECK-GI-NEXT:    ret
     %c = sext <16 x i16> %a to <16 x i64>
     %d = sext <16 x i16> %b to <16 x i64>
@@ -751,22 +695,18 @@ define <16 x i64> @extsubu_v16i16_i64(<16 x i16> %a, <16 x i16> %b) {
 ;
 ; CHECK-GI-LABEL: extsubu_v16i16_i64:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ushll v4.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v5.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v6.4s, v2.4h, #0
-; CHECK-GI-NEXT:    ushll v7.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll2 v16.4s, v2.8h, #0
-; CHECK-GI-NEXT:    ushll v17.4s, v3.4h, #0
-; CHECK-GI-NEXT:    ushll2 v18.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ushll2 v19.4s, v3.8h, #0
-; CHECK-GI-NEXT:    usubl v0.2d, v4.2s, v6.2s
-; CHECK-GI-NEXT:    usubl2 v1.2d, v4.4s, v6.4s
-; CHECK-GI-NEXT:    usubl v2.2d, v5.2s, v16.2s
-; CHECK-GI-NEXT:    usubl2 v3.2d, v5.4s, v16.4s
-; CHECK-GI-NEXT:    usubl v4.2d, v7.2s, v17.2s
-; CHECK-GI-NEXT:    usubl2 v5.2d, v7.4s, v17.4s
-; CHECK-GI-NEXT:    usubl v6.2d, v18.2s, v19.2s
-; CHECK-GI-NEXT:    usubl2 v7.2d, v18.4s, v19.4s
+; CHECK-GI-NEXT:    usubl v4.4s, v0.4h, v2.4h
+; CHECK-GI-NEXT:    usubl2 v5.4s, v0.8h, v2.8h
+; CHECK-GI-NEXT:    usubl v6.4s, v1.4h, v3.4h
+; CHECK-GI-NEXT:    usubl2 v7.4s, v1.8h, v3.8h
+; CHECK-GI-NEXT:    sshll v0.2d, v4.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v4.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v5.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v5.4s, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v6.2s, #0
+; CHECK-GI-NEXT:    sshll2 v5.2d, v6.4s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v7.2s, #0
+; CHECK-GI-NEXT:    sshll2 v7.2d, v7.4s, #0
 ; CHECK-GI-NEXT:    ret
     %c = zext <16 x i16> %a to <16 x i64>
     %d = zext <16 x i16> %b to <16 x i64>
@@ -793,22 +733,18 @@ define <16 x i64> @extsubs_v16i16_i64(<16 x i16> %a, <16 x i16> %b) {
 ;
 ; CHECK-GI-LABEL: extsubs_v16i16_i64:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    sshll v4.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v5.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll v6.4s, v2.4h, #0
-; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll2 v16.4s, v2.8h, #0
-; CHECK-GI-NEXT:    sshll v17.4s, v3.4h, #0
-; CHECK-GI-NEXT:    sshll2 v18.4s, v1.8h, #0
-; CHECK-GI-NEXT:    sshll2 v19.4s, v3.8h, #0
-; CHECK-GI-NEXT:    ssubl v0.2d, v4.2s, v6.2s
-; CHECK-GI-NEXT:    ssubl2 v1.2d, v4.4s, v6.4s
-; CHECK-GI-NEXT:    ssubl v2.2d, v5.2s, v16.2s
-; CHECK-GI-NEXT:    ssubl2 v3.2d, v5.4s, v16.4s
-; CHECK-GI-NEXT:    ssubl v4.2d, v7.2s, v17.2s
-; CHECK-GI-NEXT:    ssubl2 v5.2d, v7.4s, v17.4s
-; CHECK-GI-NEXT:    ssubl v6.2d, v18.2s, v19.2s
-; CHECK-GI-NEXT:    ssubl2 v7.2d, v18.4s, v19.4s
+; CHECK-GI-NEXT:    ssubl v4.4s, v0.4h, v2.4h
+; CHECK-GI-NEXT:    ssubl2 v5.4s, v0.8h, v2.8h
+; CHECK-GI-NEXT:    ssubl v6.4s, v1.4h, v3.4h
+; CHECK-GI-NEXT:    ssubl2 v7.4s, v1.8h, v3.8h
+; CHECK-GI-NEXT:    sshll v0.2d, v4.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v4.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v5.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v5.4s, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v6.2s, #0
+; CHECK-GI-NEXT:    sshll2 v5.2d, v6.4s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v7.2s, #0
+; CHECK-GI-NEXT:    sshll2 v7.2d, v7.4s, #0
 ; CHECK-GI-NEXT:    ret
     %c = sext <16 x i16> %a to <16 x i64>
     %d = sext <16 x i16> %b to <16 x i64>
@@ -948,10 +884,9 @@ define <4 x i64> @extadds_v4i16_i64(<4 x i16> %s0, <4 x i16> %s1) {
 ;
 ; CHECK-GI-LABEL: extadds_v4i16_i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    saddl v0.2d, v2.2s, v1.2s
-; CHECK-GI-NEXT:    saddl2 v1.2d, v2.4s, v1.4s
+; CHECK-GI-NEXT:    saddl v1.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    sshll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v1.4s, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <4 x i16> %s0 to <4 x i64>
@@ -970,10 +905,9 @@ define <4 x i64> @extaddu_v4i16_i64(<4 x i16> %s0, <4 x i16> %s1) {
 ;
 ; CHECK-GI-LABEL: extaddu_v4i16_i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    uaddl v0.2d, v2.2s, v1.2s
-; CHECK-GI-NEXT:    uaddl2 v1.2d, v2.4s, v1.4s
+; CHECK-GI-NEXT:    uaddl v1.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    ushll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll2 v1.2d, v1.4s, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <4 x i16> %s0 to <4 x i64>
@@ -995,14 +929,12 @@ define <8 x i64> @extadds_v8i16_i64(<8 x i16> %s0, <8 x i16> %s1) {
 ;
 ; CHECK-GI-LABEL: extadds_v8i16_i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll2 v4.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll2 v5.4s, v1.8h, #0
-; CHECK-GI-NEXT:    saddl v0.2d, v2.2s, v3.2s
-; CHECK-GI-NEXT:    saddl2 v1.2d, v2.4s, v3.4s
-; CHECK-GI-NEXT:    saddl v2.2d, v4.2s, v5.2s
-; CHECK-GI-NEXT:    saddl2 v3.2d, v4.4s, v5.4s
+; CHECK-GI-NEXT:    saddl v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    saddl2 v3.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    sshll v0.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v2.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v3.4s, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <8 x i16> %s0 to <8 x i64>
@@ -1024,14 +956,12 @@ define <8 x i64> @extaddu_v8i16_i64(<8 x i16> %s0, <8 x i16> %s1) {
 ;
 ; CHECK-GI-LABEL: extaddu_v8i16_i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll2 v4.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll2 v5.4s, v1.8h, #0
-; CHECK-GI-NEXT:    uaddl v0.2d, v2.2s, v3.2s
-; CHECK-GI-NEXT:    uaddl2 v1.2d, v2.4s, v3.4s
-; CHECK-GI-NEXT:    uaddl v2.2d, v4.2s, v5.2s
-; CHECK-GI-NEXT:    uaddl2 v3.2d, v4.4s, v5.4s
+; CHECK-GI-NEXT:    uaddl v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    uaddl2 v3.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    ushll v0.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll2 v1.2d, v2.4s, #0
+; CHECK-GI-NEXT:    ushll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll2 v3.2d, v3.4s, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <8 x i16> %s0 to <8 x i64>
@@ -1053,14 +983,12 @@ define <8 x i64> @extsubs_v8i16_i64(<8 x i16> %s0, <8 x i16> %s1) {
 ;
 ; CHECK-GI-LABEL: extsubs_v8i16_i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll2 v4.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll2 v5.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ssubl v0.2d, v2.2s, v3.2s
-; CHECK-GI-NEXT:    ssubl2 v1.2d, v2.4s, v3.4s
-; CHECK-GI-NEXT:    ssubl v2.2d, v4.2s, v5.2s
-; CHECK-GI-NEXT:    ssubl2 v3.2d, v4.4s, v5.4s
+; CHECK-GI-NEXT:    ssubl v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    ssubl2 v3.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    sshll v0.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v2.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v3.4s, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <8 x i16> %s0 to <8 x i64>
@@ -1082,14 +1010,12 @@ define <8 x i64> @extsubu_v8i16_i64(<8 x i16> %s0, <8 x i16> %s1) {
 ;
 ; CHECK-GI-LABEL: extsubu_v8i16_i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll2 v4.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll2 v5.4s, v1.8h, #0
-; CHECK-GI-NEXT:    usubl v0.2d, v2.2s, v3.2s
-; CHECK-GI-NEXT:    usubl2 v1.2d, v2.4s, v3.4s
-; CHECK-GI-NEXT:    usubl v2.2d, v4.2s, v5.2s
-; CHECK-GI-NEXT:    usubl2 v3.2d, v4.4s, v5.4s
+; CHECK-GI-NEXT:    usubl v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    usubl2 v3.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    sshll v0.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v2.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v3.4s, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <8 x i16> %s0 to <8 x i64>
@@ -1343,86 +1269,92 @@ define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) {
 ; CHECK-GI-NEXT:    ldr s0, [sp]
 ; CHECK-GI-NEXT:    ldr s4, [sp, #8]
 ; CHECK-GI-NEXT:    fmov s1, w0
-; CHECK-GI-NEXT:    ldr s18, [sp, #16]
 ; CHECK-GI-NEXT:    ldr s2, [sp, #32]
+; CHECK-GI-NEXT:    ldr s19, [sp, #40]
 ; CHECK-GI-NEXT:    fmov s3, w4
 ; CHECK-GI-NEXT:    mov v0.s[1], v4.s[0]
-; CHECK-GI-NEXT:    ldr s16, [sp, #40]
-; CHECK-GI-NEXT:    ldr s4, [sp, #64]
-; CHECK-GI-NEXT:    ldr s19, [sp, #72]
-; CHECK-GI-NEXT:    ldr s21, [sp, #104]
-; CHECK-GI-NEXT:    mov v1.s[1], w1
-; CHECK-GI-NEXT:    mov v2.s[1], v16.s[0]
 ; CHECK-GI-NEXT:    ldr s16, [sp, #96]
-; CHECK-GI-NEXT:    ldr s22, [sp, #136]
+; CHECK-GI-NEXT:    ldr s22, [sp, #104]
+; CHECK-GI-NEXT:    mov v2.s[1], v19.s[0]
+; CHECK-GI-NEXT:    ldr s19, [sp, #128]
+; CHECK-GI-NEXT:    ldr s23, [sp, #136]
+; CHECK-GI-NEXT:    ldr s18, [sp, #16]
+; CHECK-GI-NEXT:    mov v1.s[1], w1
 ; CHECK-GI-NEXT:    mov v3.s[1], w5
-; CHECK-GI-NEXT:    ldr s20, [sp, #48]
-; CHECK-GI-NEXT:    mov v4.s[1], v19.s[0]
+; CHECK-GI-NEXT:    mov v16.s[1], v22.s[0]
+; CHECK-GI-NEXT:    mov v19.s[1], v23.s[0]
+; CHECK-GI-NEXT:    ldr s4, [sp, #64]
+; CHECK-GI-NEXT:    ldr s21, [sp, #72]
 ; CHECK-GI-NEXT:    mov v0.s[2], v18.s[0]
-; CHECK-GI-NEXT:    ldr s18, [sp, #128]
-; CHECK-GI-NEXT:    ldr s19, [sp, #160]
+; CHECK-GI-NEXT:    ldr s18, [sp, #160]
 ; CHECK-GI-NEXT:    ldr s24, [sp, #168]
-; CHECK-GI-NEXT:    mov v16.s[1], v21.s[0]
-; CHECK-GI-NEXT:    ldr s21, [sp, #192]
-; CHECK-GI-NEXT:    mov v18.s[1], v22.s[0]
+; CHECK-GI-NEXT:    ldr s20, [sp, #192]
 ; CHECK-GI-NEXT:    ldr s25, [sp, #200]
 ; CHECK-GI-NEXT:    ldr s22, [sp, #224]
-; CHECK-GI-NEXT:    ldr s26, [sp, #232]
+; CHECK-GI-NEXT:    ldr s27, [sp, #232]
 ; CHECK-GI-NEXT:    ldr s23, [sp, #112]
-; CHECK-GI-NEXT:    mov v19.s[1], v24.s[0]
-; CHECK-GI-NEXT:    mov v2.s[2], v20.s[0]
-; CHECK-GI-NEXT:    ldr s20, [sp, #144]
-; CHECK-GI-NEXT:    ldr s17, [sp, #80]
-; CHECK-GI-NEXT:    mov v21.s[1], v25.s[0]
-; CHECK-GI-NEXT:    mov v22.s[1], v26.s[0]
+; CHECK-GI-NEXT:    ldr s26, [sp, #144]
+; CHECK-GI-NEXT:    mov v18.s[1], v24.s[0]
+; CHECK-GI-NEXT:    mov v20.s[1], v25.s[0]
+; CHECK-GI-NEXT:    mov v4.s[1], v21.s[0]
+; CHECK-GI-NEXT:    mov v22.s[1], v27.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], w2
+; CHECK-GI-NEXT:    ldr s17, [sp, #48]
 ; CHECK-GI-NEXT:    mov v3.s[2], w6
-; CHECK-GI-NEXT:    ldr s24, [sp, #176]
 ; CHECK-GI-NEXT:    mov v16.s[2], v23.s[0]
-; CHECK-GI-NEXT:    mov v18.s[2], v20.s[0]
-; CHECK-GI-NEXT:    mov v4.s[2], v17.s[0]
-; CHECK-GI-NEXT:    ldr s17, [sp, #208]
-; CHECK-GI-NEXT:    ldr s23, [sp, #240]
-; CHECK-GI-NEXT:    ldr s20, [sp, #120]
-; CHECK-GI-NEXT:    mov v19.s[2], v24.s[0]
-; CHECK-GI-NEXT:    ldr s24, [sp, #152]
+; CHECK-GI-NEXT:    mov v19.s[2], v26.s[0]
+; CHECK-GI-NEXT:    ldr s7, [sp, #80]
+; CHECK-GI-NEXT:    ldr s21, [sp, #176]
+; CHECK-GI-NEXT:    ldr s24, [sp, #208]
+; CHECK-GI-NEXT:    ldr s25, [sp, #240]
+; CHECK-GI-NEXT:    mov v2.s[2], v17.s[0]
+; CHECK-GI-NEXT:    ldr s17, [sp, #120]
+; CHECK-GI-NEXT:    ldr s23, [sp, #152]
 ; CHECK-GI-NEXT:    ldr s5, [sp, #24]
-; CHECK-GI-NEXT:    mov v21.s[2], v17.s[0]
-; CHECK-GI-NEXT:    mov v22.s[2], v23.s[0]
+; CHECK-GI-NEXT:    mov v18.s[2], v21.s[0]
+; CHECK-GI-NEXT:    mov v20.s[2], v24.s[0]
+; CHECK-GI-NEXT:    mov v4.s[2], v7.s[0]
+; CHECK-GI-NEXT:    mov v22.s[2], v25.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[3], w3
-; CHECK-GI-NEXT:    mov v16.s[3], v20.s[0]
-; CHECK-GI-NEXT:    movi v17.2d, #0x0000ff000000ff
 ; CHECK-GI-NEXT:    mov v3.s[3], w7
-; CHECK-GI-NEXT:    mov v18.s[3], v24.s[0]
+; CHECK-GI-NEXT:    mov v16.s[3], v17.s[0]
+; CHECK-GI-NEXT:    mov v19.s[3], v23.s[0]
 ; CHECK-GI-NEXT:    ldr s6, [sp, #56]
-; CHECK-GI-NEXT:    ldr s7, [sp, #88]
-; CHECK-GI-NEXT:    ldr s25, [sp, #184]
-; CHECK-GI-NEXT:    ldr s20, [sp, #216]
+; CHECK-GI-NEXT:    ldr s7, [sp, #184]
+; CHECK-GI-NEXT:    ldr s21, [sp, #216]
+; CHECK-GI-NEXT:    ldr s17, [sp, #88]
 ; CHECK-GI-NEXT:    mov v0.s[3], v5.s[0]
 ; CHECK-GI-NEXT:    ldr s5, [sp, #248]
-; CHECK-GI-NEXT:    mov v19.s[3], v25.s[0]
 ; CHECK-GI-NEXT:    mov v2.s[3], v6.s[0]
-; CHECK-GI-NEXT:    mov v4.s[3], v7.s[0]
-; CHECK-GI-NEXT:    mov v21.s[3], v20.s[0]
+; CHECK-GI-NEXT:    mov v18.s[3], v7.s[0]
+; CHECK-GI-NEXT:    mov v20.s[3], v21.s[0]
+; CHECK-GI-NEXT:    mov v4.s[3], v17.s[0]
 ; CHECK-GI-NEXT:    mov v22.s[3], v5.s[0]
-; CHECK-GI-NEXT:    and v1.16b, v1.16b, v17.16b
-; CHECK-GI-NEXT:    and v5.16b, v16.16b, v17.16b
-; CHECK-GI-NEXT:    and v3.16b, v3.16b, v17.16b
-; CHECK-GI-NEXT:    and v6.16b, v18.16b, v17.16b
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v17.16b
-; CHECK-GI-NEXT:    and v7.16b, v19.16b, v17.16b
-; CHECK-GI-NEXT:    and v2.16b, v2.16b, v17.16b
-; CHECK-GI-NEXT:    and v4.16b, v4.16b, v17.16b
-; CHECK-GI-NEXT:    and v16.16b, v21.16b, v17.16b
-; CHECK-GI-NEXT:    add v1.4s, v1.4s, v5.4s
-; CHECK-GI-NEXT:    and v5.16b, v22.16b, v17.16b
-; CHECK-GI-NEXT:    add v3.4s, v3.4s, v6.4s
-; CHECK-GI-NEXT:    add v0.4s, v0.4s, v7.4s
-; CHECK-GI-NEXT:    add v2.4s, v2.4s, v16.4s
-; CHECK-GI-NEXT:    stp q1, q3, [x8]
-; CHECK-GI-NEXT:    add v1.4s, v4.4s, v5.4s
-; CHECK-GI-NEXT:    stp q0, q2, [x8, #32]
-; CHECK-GI-NEXT:    str q1, [x8, #64]
+; CHECK-GI-NEXT:    uzp1 v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT:    movi v3.2d, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    uzp1 v5.8h, v16.8h, v19.8h
+; CHECK-GI-NEXT:    dup v6.4s, w8
+; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT:    uzp1 v2.8h, v18.8h, v20.8h
+; CHECK-GI-NEXT:    uzp1 v4.8h, v4.8h, v6.8h
+; CHECK-GI-NEXT:    uzp1 v6.8h, v22.8h, v6.8h
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT:    and v5.16b, v5.16b, v3.16b
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-GI-NEXT:    and v2.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT:    add v1.8h, v1.8h, v5.8h
+; CHECK-GI-NEXT:    and v4.16b, v4.16b, v3.16b
+; CHECK-GI-NEXT:    and v3.16b, v6.16b, v3.16b
+; CHECK-GI-NEXT:    add v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    add v3.4h, v4.4h, v3.4h
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll v4.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    stp q2, q1, [x8]
+; CHECK-GI-NEXT:    ushll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    stp q4, q0, [x8, #32]
+; CHECK-GI-NEXT:    str q2, [x8, #64]
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <20 x i8> %s0 to <20 x i32>
@@ -1611,14 +1543,12 @@ define <16 x i32> @sub_zz(<16 x i8> %s0, <16 x i8> %s1) {
 ;
 ; CHECK-GI-LABEL: sub_zz:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ushll2 v4.8h, v0.16b, #0
-; CHECK-GI-NEXT:    ushll2 v5.8h, v1.16b, #0
-; CHECK-GI-NEXT:    usubl v0.4s, v2.4h, v3.4h
-; CHECK-GI-NEXT:    usubl2 v1.4s, v2.8h, v3.8h
-; CHECK-GI-NEXT:    usubl v2.4s, v4.4h, v5.4h
-; CHECK-GI-NEXT:    usubl2 v3.4s, v4.8h, v5.8h
+; CHECK-GI-NEXT:    usubl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    usubl2 v3.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <16 x i8> %s0 to <16 x i32>
@@ -1640,14 +1570,12 @@ define <16 x i32> @sub_ss(<16 x i8> %s0, <16 x i8> %s1) {
 ;
 ; CHECK-GI-LABEL: sub_ss:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
-; CHECK-GI-NEXT:    sshll2 v4.8h, v0.16b, #0
-; CHECK-GI-NEXT:    sshll2 v5.8h, v1.16b, #0
-; CHECK-GI-NEXT:    ssubl v0.4s, v2.4h, v3.4h
-; CHECK-GI-NEXT:    ssubl2 v1.4s, v2.8h, v3.8h
-; CHECK-GI-NEXT:    ssubl v2.4s, v4.4h, v5.4h
-; CHECK-GI-NEXT:    ssubl2 v3.4s, v4.8h, v5.8h
+; CHECK-GI-NEXT:    ssubl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ssubl2 v3.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <16 x i8> %s0 to <16 x i32>
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index ab7cea8dfb7789..c9fe89aec8ad9b 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -4725,94 +4725,102 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-GI-NEXT:    // kill: def $w3 killed $w3 def $x3
-; CHECK-GI-NEXT:    sxtw x8, w3
 ; CHECK-GI-NEXT:    sxtw x9, w1
+; CHECK-GI-NEXT:    sxtw x8, w3
 ; CHECK-GI-NEXT:    ldr d0, [x0]
 ; CHECK-GI-NEXT:    ldr d1, [x2]
 ; CHECK-GI-NEXT:    add x10, x0, x9
 ; CHECK-GI-NEXT:    add x11, x2, x8
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ldr d2, [x10]
-; CHECK-GI-NEXT:    add x10, x10, x9
-; CHECK-GI-NEXT:    add x12, x11, x8
-; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ldr d3, [x11]
-; CHECK-GI-NEXT:    ldr d4, [x10]
-; CHECK-GI-NEXT:    ldr d5, [x12]
-; CHECK-GI-NEXT:    add x10, x10, x9
-; CHECK-GI-NEXT:    add x11, x12, x8
-; CHECK-GI-NEXT:    ushll v2.8h, v2.8b, #0
-; CHECK-GI-NEXT:    ushll v3.8h, v3.8b, #0
-; CHECK-GI-NEXT:    ushll v4.8h, v4.8b, #0
-; CHECK-GI-NEXT:    ushll v5.8h, v5.8b, #0
-; CHECK-GI-NEXT:    uabdl v6.4s, v0.4h, v1.4h
-; CHECK-GI-NEXT:    uabdl2 v0.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    usubl v0.8h, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    ldr d1, [x10]
-; CHECK-GI-NEXT:    ldr d7, [x11]
+; CHECK-GI-NEXT:    ldr d2, [x11]
 ; CHECK-GI-NEXT:    add x10, x10, x9
 ; CHECK-GI-NEXT:    add x11, x11, x8
-; CHECK-GI-NEXT:    uabdl v16.4s, v2.4h, v3.4h
-; CHECK-GI-NEXT:    uabdl2 v2.4s, v2.8h, v3.8h
-; CHECK-GI-NEXT:    uabdl v3.4s, v4.4h, v5.4h
-; CHECK-GI-NEXT:    uabdl2 v4.4s, v4.8h, v5.8h
-; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ushll v7.8h, v7.8b, #0
-; CHECK-GI-NEXT:    ldr d5, [x10]
-; CHECK-GI-NEXT:    ldr d17, [x11]
+; CHECK-GI-NEXT:    usubl v1.8h, v1.8b, v2.8b
+; CHECK-GI-NEXT:    ldr d3, [x10]
+; CHECK-GI-NEXT:    ldr d4, [x11]
+; CHECK-GI-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    add x10, x10, x9
+; CHECK-GI-NEXT:    add x11, x11, x8
+; CHECK-GI-NEXT:    ldr d2, [x10]
 ; CHECK-GI-NEXT:    add x10, x10, x9
+; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ldr d6, [x11]
 ; CHECK-GI-NEXT:    add x11, x11, x8
-; CHECK-GI-NEXT:    add v0.4s, v6.4s, v0.4s
-; CHECK-GI-NEXT:    ushll v5.8h, v5.8b, #0
-; CHECK-GI-NEXT:    ushll v17.8h, v17.8b, #0
-; CHECK-GI-NEXT:    add v2.4s, v16.4s, v2.4s
-; CHECK-GI-NEXT:    add v3.4s, v3.4s, v4.4s
-; CHECK-GI-NEXT:    uabdl v4.4s, v1.4h, v7.4h
-; CHECK-GI-NEXT:    uabdl2 v1.4s, v1.8h, v7.8h
-; CHECK-GI-NEXT:    ldr d7, [x10]
+; CHECK-GI-NEXT:    usubl v3.8h, v3.8b, v4.8b
+; CHECK-GI-NEXT:    abs v5.4s, v5.4s
+; CHECK-GI-NEXT:    abs v0.4s, v0.4s
+; CHECK-GI-NEXT:    ldr d4, [x10]
 ; CHECK-GI-NEXT:    ldr d16, [x11]
+; CHECK-GI-NEXT:    abs v7.4s, v7.4s
+; CHECK-GI-NEXT:    abs v1.4s, v1.4s
 ; CHECK-GI-NEXT:    add x10, x10, x9
 ; CHECK-GI-NEXT:    add x11, x11, x8
-; CHECK-GI-NEXT:    ldr d18, [x10]
-; CHECK-GI-NEXT:    ldr d20, [x10, x9]
-; CHECK-GI-NEXT:    ldr d19, [x11]
-; CHECK-GI-NEXT:    ldr d21, [x11, x8]
-; CHECK-GI-NEXT:    uabdl v6.4s, v5.4h, v17.4h
-; CHECK-GI-NEXT:    ushll v7.8h, v7.8b, #0
-; CHECK-GI-NEXT:    ushll v16.8h, v16.8b, #0
-; CHECK-GI-NEXT:    uabdl2 v5.4s, v5.8h, v17.8h
-; CHECK-GI-NEXT:    ushll v17.8h, v18.8b, #0
-; CHECK-GI-NEXT:    ushll v18.8h, v19.8b, #0
-; CHECK-GI-NEXT:    add v1.4s, v4.4s, v1.4s
-; CHECK-GI-NEXT:    ushll v4.8h, v20.8b, #0
-; CHECK-GI-NEXT:    ushll v19.8h, v21.8b, #0
-; CHECK-GI-NEXT:    addv s2, v2.4s
+; CHECK-GI-NEXT:    usubl v2.8h, v2.8b, v6.8b
+; CHECK-GI-NEXT:    ldr d6, [x10]
+; CHECK-GI-NEXT:    ldr d17, [x11]
+; CHECK-GI-NEXT:    add x10, x10, x9
+; CHECK-GI-NEXT:    add x11, x11, x8
+; CHECK-GI-NEXT:    usubl v4.8h, v4.8b, v16.8b
+; CHECK-GI-NEXT:    sshll v16.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    add v0.4s, v5.4s, v0.4s
+; CHECK-GI-NEXT:    add v1.4s, v7.4s, v1.4s
+; CHECK-GI-NEXT:    ldr d5, [x10]
+; CHECK-GI-NEXT:    ldr d7, [x11]
+; CHECK-GI-NEXT:    sshll v18.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    usubl v6.8h, v6.8b, v17.8b
+; CHECK-GI-NEXT:    ldr d17, [x11, x8]
+; CHECK-GI-NEXT:    sshll v19.4s, v4.4h, #0
+; CHECK-GI-NEXT:    usubl v5.8h, v5.8b, v7.8b
+; CHECK-GI-NEXT:    ldr d7, [x10, x9]
+; CHECK-GI-NEXT:    sshll2 v4.4s, v4.8h, #0
+; CHECK-GI-NEXT:    abs v16.4s, v16.4s
+; CHECK-GI-NEXT:    abs v3.4s, v3.4s
+; CHECK-GI-NEXT:    abs v18.4s, v18.4s
+; CHECK-GI-NEXT:    abs v2.4s, v2.4s
+; CHECK-GI-NEXT:    usubl v7.8h, v7.8b, v17.8b
+; CHECK-GI-NEXT:    sshll v17.4s, v6.4h, #0
+; CHECK-GI-NEXT:    sshll2 v6.4s, v6.8h, #0
+; CHECK-GI-NEXT:    abs v19.4s, v19.4s
+; CHECK-GI-NEXT:    abs v4.4s, v4.4s
+; CHECK-GI-NEXT:    add v3.4s, v16.4s, v3.4s
+; CHECK-GI-NEXT:    sshll v16.4s, v5.4h, #0
+; CHECK-GI-NEXT:    sshll2 v5.4s, v5.8h, #0
+; CHECK-GI-NEXT:    add v2.4s, v18.4s, v2.4s
+; CHECK-GI-NEXT:    abs v17.4s, v17.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    abs v6.4s, v6.4s
 ; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    add v4.4s, v19.4s, v4.4s
 ; CHECK-GI-NEXT:    addv s3, v3.4s
-; CHECK-GI-NEXT:    uabdl v20.4s, v7.4h, v16.4h
-; CHECK-GI-NEXT:    uabdl2 v7.4s, v7.8h, v16.8h
-; CHECK-GI-NEXT:    add v5.4s, v6.4s, v5.4s
-; CHECK-GI-NEXT:    uabdl v6.4s, v17.4h, v18.4h
-; CHECK-GI-NEXT:    uabdl2 v16.4s, v17.8h, v18.8h
-; CHECK-GI-NEXT:    uabdl v17.4s, v4.4h, v19.4h
-; CHECK-GI-NEXT:    uabdl2 v4.4s, v4.8h, v19.8h
-; CHECK-GI-NEXT:    fmov w8, s2
-; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    sshll v18.4s, v7.4h, #0
+; CHECK-GI-NEXT:    sshll2 v7.4s, v7.8h, #0
+; CHECK-GI-NEXT:    abs v16.4s, v16.4s
+; CHECK-GI-NEXT:    abs v5.4s, v5.4s
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    add v6.4s, v17.4s, v6.4s
+; CHECK-GI-NEXT:    addv s2, v2.4s
 ; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    addv s4, v4.4s
 ; CHECK-GI-NEXT:    fmov w10, s3
-; CHECK-GI-NEXT:    add v7.4s, v20.4s, v7.4s
-; CHECK-GI-NEXT:    add v0.4s, v17.4s, v4.4s
-; CHECK-GI-NEXT:    addv s4, v5.4s
-; CHECK-GI-NEXT:    add v2.4s, v6.4s, v16.4s
+; CHECK-GI-NEXT:    abs v18.4s, v18.4s
+; CHECK-GI-NEXT:    abs v7.4s, v7.4s
+; CHECK-GI-NEXT:    add v1.4s, v16.4s, v5.4s
 ; CHECK-GI-NEXT:    add w8, w8, w9
-; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    addv s3, v6.4s
+; CHECK-GI-NEXT:    fmov w9, s2
 ; CHECK-GI-NEXT:    add w8, w10, w8
-; CHECK-GI-NEXT:    addv s3, v7.4s
-; CHECK-GI-NEXT:    addv s1, v2.4s
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    add w8, w9, w8
-; CHECK-GI-NEXT:    fmov w9, s4
+; CHECK-GI-NEXT:    fmov w10, s4
+; CHECK-GI-NEXT:    add v0.4s, v18.4s, v7.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
 ; CHECK-GI-NEXT:    add w8, w9, w8
 ; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    add w8, w10, w8
+; CHECK-GI-NEXT:    addv s0, v0.4s
 ; CHECK-GI-NEXT:    add w8, w9, w8
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    add w8, w9, w8

From 0981dca7779d4acfcbb92fbb29a7a1033e283b88 Mon Sep 17 00:00:00 2001
From: donald chen <chenxunyu1993@gmail.com>
Date: Wed, 29 May 2024 22:20:49 +0800
Subject: [PATCH 135/230] [mlir][arith] Add neutral element support to
 arith.maxnumf/arith.minnumf (#93278)

For maxnumf and minnumf, when one operand is NaN the result is the
other operand, so their neutral element is set to NaN.
---
 mlir/lib/Dialect/Arith/IR/ArithOps.cpp        | 14 +++
 .../Linalg/transform-op-split-reduction.mlir  | 92 +++++++++++++++++++
 2 files changed, 106 insertions(+)

diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
index a0b50251c6b670..5797c5681a5fdd 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
+++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
@@ -2467,6 +2467,12 @@ TypedAttr mlir::arith::getIdentityValueAttr(AtomicRMWKind kind, Type resultType,
                            : APFloat::getInf(semantic, /*Negative=*/true);
     return builder.getFloatAttr(resultType, identity);
   }
+  case AtomicRMWKind::maxnumf: {
+    const llvm::fltSemantics &semantic =
+        llvm::cast<FloatType>(resultType).getFloatSemantics();
+    APFloat identity = APFloat::getNaN(semantic, /*Negative=*/true);
+    return builder.getFloatAttr(resultType, identity);
+  }
   case AtomicRMWKind::addf:
   case AtomicRMWKind::addi:
   case AtomicRMWKind::maxu:
@@ -2489,6 +2495,12 @@ TypedAttr mlir::arith::getIdentityValueAttr(AtomicRMWKind kind, Type resultType,
 
     return builder.getFloatAttr(resultType, identity);
   }
+  case AtomicRMWKind::minnumf: {
+    const llvm::fltSemantics &semantic =
+        llvm::cast<FloatType>(resultType).getFloatSemantics();
+    APFloat identity = APFloat::getNaN(semantic, /*Negative=*/false);
+    return builder.getFloatAttr(resultType, identity);
+  }
   case AtomicRMWKind::mins:
     return builder.getIntegerAttr(
         resultType, APInt::getSignedMaxValue(
@@ -2518,6 +2530,8 @@ std::optional<TypedAttr> mlir::arith::getNeutralElement(Operation *op) {
           .Case([](arith::MulFOp op) { return AtomicRMWKind::mulf; })
           .Case([](arith::MaximumFOp op) { return AtomicRMWKind::maximumf; })
           .Case([](arith::MinimumFOp op) { return AtomicRMWKind::minimumf; })
+          .Case([](arith::MaxNumFOp op) { return AtomicRMWKind::maxnumf; })
+          .Case([](arith::MinNumFOp op) { return AtomicRMWKind::minnumf; })
           // Integer operations.
           .Case([](arith::AddIOp op) { return AtomicRMWKind::addi; })
           .Case([](arith::OrIOp op) { return AtomicRMWKind::ori; })
diff --git a/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir b/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir
index 31e9fd00cffa04..9849f36285b160 100644
--- a/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir
@@ -407,3 +407,95 @@ module attributes {transform.with_named_sequence} {
       transform.yield
   }
 }
+
+// -----
+// Checks we use nan as the neutral element for maxnumf op.
+func.func @generic_split_maxnumf(%in: tensor<32xf32>, %out: tensor<f32>) -> tensor<f32> {
+  %r = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>,
+                                        affine_map<(d0) -> ()>],
+        iterator_types = ["reduction"]}
+  ins(%in : tensor<32xf32>)
+  outs(%out : tensor<f32>) {
+  ^bb0(%arg1: f32, %arg2: f32):
+    %y = arith.maxnumf %arg1, %arg2 : f32
+    linalg.yield %y : f32
+  } -> tensor<f32>
+  return %r : tensor<f32>
+}
+
+//  CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d0, d1)>
+//  CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d1)>
+//  CHECK-DAG: #[[$MAP2:.*]] = affine_map<(d0) -> (d0)>
+//  CHECK-DAG: #[[$MAP3:.*]] = affine_map<(d0) -> ()>
+// CHECK-LABEL:  func @generic_split_maxnumf
+//  The float value 0xFFC00000 that is filled into the init tensor represents negative NaN.
+//  CHECK-DAG: %[[ID:.*]] = arith.constant 0xFFC00000 : f32
+//  CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0, 1]] output_shape [8, 4] : tensor<32xf32> into tensor<8x4xf32>
+//  CHECK-DAG: %[[INI:.*]] = tensor.empty() : tensor<4xf32>
+//      CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<4xf32>) -> tensor<4xf32>
+//      CHECK: %[[G:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["reduction", "parallel"]}
+// CHECK-SAME:   ins(%[[I1]] : tensor<8x4xf32>) outs(%[[F]] : tensor<4xf32>) {
+//      CHECK:   arith.maxnumf
+//      CHECK:   linalg.yield
+//      CHECK: } -> tensor<4xf32>
+//      CHECK: %[[R:.*]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP3]]], iterator_types = ["reduction"]}
+// CHECK-SAME:   ins(%[[G]] : tensor<4xf32>) outs(%{{.*}} : tensor<f32>) {
+//      CHECK:   arith.maxnumf {{.*}}
+//      CHECK:   linalg.yield
+//      CHECK:  } -> tensor<f32>
+//      CHECK: return %[[R]] : tensor<f32>
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1:4 = transform.structured.split_reduction %0 { split_factor = 4, insert_split_dimension = 0, inner_parallel}
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+      transform.yield
+  }
+}
+
+// -----
+// Checks we use nan as the neutral element for minnumf op.
+func.func @generic_split_minnumf(%in: tensor<32xf32>, %out: tensor<f32>) -> tensor<f32> {
+  %r = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>,
+                                        affine_map<(d0) -> ()>],
+        iterator_types = ["reduction"]}
+  ins(%in : tensor<32xf32>)
+  outs(%out : tensor<f32>) {
+  ^bb0(%arg1: f32, %arg2: f32):
+    %y = arith.minnumf %arg1, %arg2 : f32
+    linalg.yield %y : f32
+  } -> tensor<f32>
+  return %r : tensor<f32>
+}
+
+//  CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d0, d1)>
+//  CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d1)>
+//  CHECK-DAG: #[[$MAP2:.*]] = affine_map<(d0) -> (d0)>
+//  CHECK-DAG: #[[$MAP3:.*]] = affine_map<(d0) -> ()>
+// CHECK-LABEL:  func @generic_split_minnumf
+//  The float value 0x7FC00000 that is filled into the init tensor represents positive NaN.
+//  CHECK-DAG: %[[ID:.*]] = arith.constant 0x7FC00000 : f32
+//  CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0, 1]] output_shape [8, 4] : tensor<32xf32> into tensor<8x4xf32>
+//  CHECK-DAG: %[[INI:.*]] = tensor.empty() : tensor<4xf32>
+//      CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<4xf32>) -> tensor<4xf32>
+//      CHECK: %[[G:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["reduction", "parallel"]}
+// CHECK-SAME:   ins(%[[I1]] : tensor<8x4xf32>) outs(%[[F]] : tensor<4xf32>) {
+//      CHECK:   arith.minnumf
+//      CHECK:   linalg.yield
+//      CHECK: } -> tensor<4xf32>
+//      CHECK: %[[R:.*]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP3]]], iterator_types = ["reduction"]}
+// CHECK-SAME:   ins(%[[G]] : tensor<4xf32>) outs(%{{.*}} : tensor<f32>) {
+//      CHECK:   arith.minnumf {{.*}}
+//      CHECK:   linalg.yield
+//      CHECK:  } -> tensor<f32>
+//      CHECK: return %[[R]] : tensor<f32>
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1:4 = transform.structured.split_reduction %0 { split_factor = 4, insert_split_dimension = 0, inner_parallel}
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+      transform.yield
+  }
+}

From 799316ff26cc82d60f276dc62c4a69b5bba1aef3 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Wed, 29 May 2024 14:26:00 +0000
Subject: [PATCH 136/230] [lldb][NFC] Pass Stream& to ToXML methods in
 RegisterFlags

As suggested in a review of some new code for this file, Stream
is more general. The code does not need to know that it's backed
by a string.
---
 lldb/include/lldb/Target/RegisterFlags.h | 6 +++---
 lldb/source/Target/RegisterFlags.cpp     | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/lldb/include/lldb/Target/RegisterFlags.h b/lldb/include/lldb/Target/RegisterFlags.h
index 9b343e445678ab..29a47540cd4f5b 100644
--- a/lldb/include/lldb/Target/RegisterFlags.h
+++ b/lldb/include/lldb/Target/RegisterFlags.h
@@ -15,7 +15,7 @@
 
 namespace lldb_private {
 
-class StreamString;
+class Stream;
 class Log;
 
 class RegisterFlags {
@@ -56,7 +56,7 @@ class RegisterFlags {
     /// Output XML that describes this field, to be inserted into a target XML
     /// file. Reserved characters in field names like "<" are replaced with
     /// their XML safe equivalents like "&gt;".
-    void ToXML(StreamString &strm) const;
+    void ToXML(Stream &strm) const;
 
     bool operator<(const Field &rhs) const {
       return GetStart() < rhs.GetStart();
@@ -119,7 +119,7 @@ class RegisterFlags {
   std::string AsTable(uint32_t max_width) const;
 
   // Output XML that describes this set of flags.
-  void ToXML(StreamString &strm) const;
+  void ToXML(Stream &strm) const;
 
 private:
   const std::string m_id;
diff --git a/lldb/source/Target/RegisterFlags.cpp b/lldb/source/Target/RegisterFlags.cpp
index b1669b85fd2fe7..5274960587bf37 100644
--- a/lldb/source/Target/RegisterFlags.cpp
+++ b/lldb/source/Target/RegisterFlags.cpp
@@ -190,7 +190,7 @@ std::string RegisterFlags::AsTable(uint32_t max_width) const {
   return table;
 }
 
-void RegisterFlags::ToXML(StreamString &strm) const {
+void RegisterFlags::ToXML(Stream &strm) const {
   // Example XML:
   // <flags id="cpsr_flags" size="4">
   //   <field name="incorrect" start="0" end="0"/>
@@ -213,7 +213,7 @@ void RegisterFlags::ToXML(StreamString &strm) const {
   strm.Indent("</flags>\n");
 }
 
-void RegisterFlags::Field::ToXML(StreamString &strm) const {
+void RegisterFlags::Field::ToXML(Stream &strm) const {
   // Example XML:
   // <field name="correct" start="0" end="0"/>
   strm.Indent();

From 975477e7f7ee1d8c29975224abb452f73b90db36 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Wed, 29 May 2024 16:36:52 +0200
Subject: [PATCH 137/230] [CGBuiltin] Explicitly use inbounds GEP (NFCI)

All of these are inbounds as they access known offsets in fixed
globals. This is NFCI because constant expression construction
currently already infers this; this patch just makes it explicit.
---
 clang/lib/CodeGen/CGBuiltin.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a3c65105033247..266bf41fd5577c 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -14074,7 +14074,7 @@ Value *CodeGenFunction::EmitX86CpuIs(StringRef CPUStr) {
   // Grab the appropriate field from __cpu_model.
   llvm::Value *Idxs[] = {ConstantInt::get(Int32Ty, 0),
                          ConstantInt::get(Int32Ty, Index)};
-  llvm::Value *CpuValue = Builder.CreateGEP(STy, CpuModel, Idxs);
+  llvm::Value *CpuValue = Builder.CreateInBoundsGEP(STy, CpuModel, Idxs);
   CpuValue = Builder.CreateAlignedLoad(Int32Ty, CpuValue,
                                        CharUnits::fromQuantity(4));
 
@@ -14116,7 +14116,7 @@ CodeGenFunction::EmitX86CpuSupports(std::array<uint32_t, 4> FeatureMask) {
     // global in the struct STy.
     Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(3),
                      Builder.getInt32(0)};
-    Value *CpuFeatures = Builder.CreateGEP(STy, CpuModel, Idxs);
+    Value *CpuFeatures = Builder.CreateInBoundsGEP(STy, CpuModel, Idxs);
     Value *Features = Builder.CreateAlignedLoad(Int32Ty, CpuFeatures,
                                                 CharUnits::fromQuantity(4));
 
@@ -14137,7 +14137,7 @@ CodeGenFunction::EmitX86CpuSupports(std::array<uint32_t, 4> FeatureMask) {
       continue;
     Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(i - 1)};
     Value *Features = Builder.CreateAlignedLoad(
-        Int32Ty, Builder.CreateGEP(ATy, CpuFeatures2, Idxs),
+        Int32Ty, Builder.CreateInBoundsGEP(ATy, CpuFeatures2, Idxs),
         CharUnits::fromQuantity(4));
     // Check the value of the bit corresponding to the feature requested.
     Value *Mask = Builder.getInt32(M);
@@ -16724,7 +16724,7 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
       llvm::Value *Idxs[] = {ConstantInt::get(Int32Ty, 0),
                              ConstantInt::get(Int32Ty, FieldIdx)};
 
-      FieldValue = Builder.CreateGEP(STy, SysConf, Idxs);
+      FieldValue = Builder.CreateInBoundsGEP(STy, SysConf, Idxs);
       FieldValue = Builder.CreateAlignedLoad(Int32Ty, FieldValue,
                                              CharUnits::fromQuantity(4));
     } else if (SupportMethod == SYS_CALL) {

From 6127f15e5b4834411e8f2e700e25c40490deec35 Mon Sep 17 00:00:00 2001
From: zhijian lin <zhijian@ca.ibm.com>
Date: Wed, 29 May 2024 10:53:00 -0400
Subject: [PATCH 138/230] [PowerPC] option `-msoft-float` should not block the
 PC-relative address instruction (#92543)

Prefixed instructions were introduced in PowerPC ISA 3.1.

In this PR:
1. `FeaturePrefixInstrs` no longer implies `FeatureP8Vector` or
`FeatureP9Altivec`.
2. `FeaturePrefixInstrs` implies only `FeatureISA3_1`.
3. The prefixed instructions `paddi` and `pli` have
`Predicates = [PrefixInstrs]`.
4. The prefixed instructions `plfs` and `plfd` have
`Predicates = [PrefixInstrs, HasFPU]`.
5. The prefixed instructions `plxv`, `plxssp` and `plxsd` have
`Predicates = [PrefixInstrs, HasP10Vector]`.

Fixes #62372
---
 llvm/lib/Target/PowerPC/PPC.td              |   3 +-
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp |  19 +--
 llvm/lib/Target/PowerPC/PPCInstrP10.td      | 129 ++++++++++++--------
 llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp |   2 +-
 llvm/test/CodeGen/PowerPC/pr62372.ll        |  13 ++
 5 files changed, 104 insertions(+), 62 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/pr62372.ll

diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
index 639771ab9eabbd..84ef582c029d39 100644
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -296,8 +296,7 @@ def FeatureVectorsUseTwoUnits : SubtargetFeature<"vectors-use-two-units",
 def FeaturePrefixInstrs : SubtargetFeature<"prefix-instrs", "HasPrefixInstrs",
                                            "true",
                                            "Enable prefixed instructions",
-                                           [FeatureISA3_0, FeatureP8Vector,
-                                            FeatureP9Altivec]>;
+                                           [FeatureISA3_1]>;
 def FeaturePCRelativeMemops :
   SubtargetFeature<"pcrelative-memops", "HasPCRelativeMemops", "true",
                    "Enable PC relative Memory Ops",
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 8450ce9e0e3b3b..a0e91f4dc3a4a7 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -9460,7 +9460,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
   // double. This is to exploit the XXSPLTIDP instruction.
   // If we lose precision, we use XXSPLTI32DX.
   if (BVNIsConstantSplat && (SplatBitSize == 64) &&
-      Subtarget.hasPrefixInstrs()) {
+      Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) {
     // Check the type first to short-circuit so we don't modify APSplatBits if
     // this block isn't executed.
     if ((Op->getValueType(0) == MVT::v2f64) &&
@@ -9605,11 +9605,11 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
   // with 4-byte splats. We replicate the SplatBits in case of 2-byte splat to
   // make a 4-byte splat element. For example: 2-byte splat of 0xABAB can be
   // turned into a 4-byte splat of 0xABABABAB.
-  if (Subtarget.hasPrefixInstrs() && SplatSize == 2)
+  if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector() && SplatSize == 2)
     return getCanonicalConstSplat(SplatBits | (SplatBits << 16), SplatSize * 2,
                                   Op.getValueType(), DAG, dl);
 
-  if (Subtarget.hasPrefixInstrs() && SplatSize == 4)
+  if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector() && SplatSize == 4)
     return getCanonicalConstSplat(SplatBits, SplatSize, Op.getValueType(), DAG,
                                   dl);
 
@@ -10242,7 +10242,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
     return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
   }
 
-  if (Subtarget.hasPrefixInstrs()) {
+  if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) {
     SDValue SplatInsertNode;
     if ((SplatInsertNode = lowerToXXSPLTI32DX(SVOp, DAG)))
       return SplatInsertNode;
@@ -17730,7 +17730,7 @@ bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
     return false;
   case MVT::f32:
   case MVT::f64: {
-    if (Subtarget.hasPrefixInstrs()) {
+    if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) {
       // we can materialize all immediatess via XXSPLTI32DX and XXSPLTIDP.
       return true;
     }
@@ -18314,11 +18314,12 @@ unsigned PPCTargetLowering::computeMOFlags(const SDNode *Parent, SDValue N,
   // Compute subtarget flags.
   if (!Subtarget.hasP9Vector())
     FlagSet |= PPC::MOF_SubtargetBeforeP9;
-  else {
+  else
     FlagSet |= PPC::MOF_SubtargetP9;
-    if (Subtarget.hasPrefixInstrs())
-      FlagSet |= PPC::MOF_SubtargetP10;
-  }
+
+  if (Subtarget.hasPrefixInstrs())
+    FlagSet |= PPC::MOF_SubtargetP10;
+
   if (Subtarget.hasSPE())
     FlagSet |= PPC::MOF_SubtargetSPE;
 
diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td
index 5f2937d47a5195..2fd5978a23c80f 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrP10.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td
@@ -654,13 +654,10 @@ let Predicates = [PrefixInstrs] in {
                                  (ins s34imm:$SI),
                                  "pli $RT, $SI", IIC_IntSimple, []>;
   }
+}
 
+let Predicates = [PrefixInstrs, HasFPU] in {
   let mayLoad = 1, mayStore = 0 in {
-    defm PLXV :
-      8LS_DForm_R_SI34_XT6_RA5_MEM_p<25, (outs vsrc:$XST), (ins (memri34 $D, $RA):$addr),
-                                     (ins (memri34_pcrel $D, $RA):$addr),
-                                     (ins s34imm_pcrel:$D),
-                                     "plxv $XST, $addr", "plxv $XST, $D", IIC_LdStLFD>;
     defm PLFS :
       MLS_DForm_R_SI34_RTA5_MEM_p<48, (outs f4rc:$RST), (ins (memri34 $D, $RA):$addr),
                                   (ins (memri34_pcrel $D, $RA):$addr),
@@ -671,6 +668,28 @@ let Predicates = [PrefixInstrs] in {
                                   (ins  (memri34_pcrel $D, $RA):$addr),
                                   (ins s34imm_pcrel:$D), "plfd $RST, $addr",
                                   "plfd $RST, $D", IIC_LdStLFD>;
+  }
+  let mayStore = 1, mayLoad = 0 in {
+    defm PSTFS :
+      MLS_DForm_R_SI34_RTA5_MEM_p<52, (outs), (ins f4rc:$RST, (memri34 $D, $RA):$addr),
+                                  (ins f4rc:$RST, (memri34_pcrel $D, $RA):$addr),
+                                  (ins f4rc:$RST, s34imm_pcrel:$D),
+                                  "pstfs $RST, $addr", "pstfs $RST, $D", IIC_LdStLFD>;
+    defm PSTFD :
+      MLS_DForm_R_SI34_RTA5_MEM_p<54, (outs), (ins f8rc:$RST, (memri34 $D, $RA):$addr),
+                                  (ins f8rc:$RST, (memri34_pcrel $D, $RA):$addr),
+                                  (ins f8rc:$RST, s34imm_pcrel:$D),
+                                  "pstfd $RST, $addr", "pstfd $RST, $D", IIC_LdStLFD>;
+  }
+}
+
+let Predicates = [PrefixInstrs, HasP10Vector] in {
+  let mayLoad = 1, mayStore = 0 in {
+    defm PLXV :
+      8LS_DForm_R_SI34_XT6_RA5_MEM_p<25, (outs vsrc:$XST), (ins (memri34 $D, $RA):$addr),
+                                     (ins (memri34_pcrel $D, $RA):$addr),
+                                     (ins s34imm_pcrel:$D),
+                                     "plxv $XST, $addr", "plxv $XST, $D", IIC_LdStLFD>;
     defm PLXSSP :
       8LS_DForm_R_SI34_RTA5_MEM_p<43, (outs vfrc:$RST), (ins (memri34 $D, $RA):$addr),
                                   (ins (memri34_pcrel $D, $RA):$addr),
@@ -683,6 +702,28 @@ let Predicates = [PrefixInstrs] in {
                                   (ins s34imm_pcrel:$D),
                                   "plxsd $RST, $addr", "plxsd $RST, $D",
                                   IIC_LdStLFD>;
+  }
+ let mayStore = 1, mayLoad = 0 in {
+    defm PSTXV :
+      8LS_DForm_R_SI34_XT6_RA5_MEM_p<27, (outs), (ins vsrc:$XST, (memri34 $D, $RA):$addr),
+                                     (ins vsrc:$XST, (memri34_pcrel $D, $RA):$addr),
+                                     (ins vsrc:$XST, s34imm_pcrel:$D),
+                                     "pstxv $XST, $addr", "pstxv $XST, $D", IIC_LdStLFD>;
+    defm PSTXSSP :
+      8LS_DForm_R_SI34_RTA5_MEM_p<47, (outs), (ins vfrc:$RST, (memri34 $D, $RA):$addr),
+                                  (ins vfrc:$RST, (memri34_pcrel $D, $RA):$addr),
+                                  (ins vfrc:$RST, s34imm_pcrel:$D),
+                                  "pstxssp $RST, $addr", "pstxssp $RST, $D", IIC_LdStLFD>;
+    defm PSTXSD :
+      8LS_DForm_R_SI34_RTA5_MEM_p<46, (outs), (ins vfrc:$RST, (memri34 $D, $RA):$addr),
+                                  (ins vfrc:$RST, (memri34_pcrel $D, $RA):$addr),
+                                  (ins vfrc:$RST, s34imm_pcrel:$D),
+                                  "pstxsd $RST, $addr", "pstxsd $RST, $D", IIC_LdStLFD>;
+  }
+}
+
+let Predicates = [PrefixInstrs] in {
+  let mayLoad = 1, mayStore = 0 in {
     let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
       defm PLBZ8 :
         MLS_DForm_R_SI34_RTA5_MEM_p<34, (outs g8rc:$RST), (ins (memri34 $D, $RA):$addr),
@@ -745,31 +786,6 @@ let Predicates = [PrefixInstrs] in {
   }
 
   let mayStore = 1, mayLoad = 0 in {
-    defm PSTXV :
-      8LS_DForm_R_SI34_XT6_RA5_MEM_p<27, (outs), (ins vsrc:$XST, (memri34 $D, $RA):$addr),
-                                     (ins vsrc:$XST, (memri34_pcrel $D, $RA):$addr),
-                                     (ins vsrc:$XST, s34imm_pcrel:$D),
-                                     "pstxv $XST, $addr", "pstxv $XST, $D", IIC_LdStLFD>;
-    defm PSTFS :
-      MLS_DForm_R_SI34_RTA5_MEM_p<52, (outs), (ins f4rc:$RST, (memri34 $D, $RA):$addr),
-                                  (ins f4rc:$RST, (memri34_pcrel $D, $RA):$addr),
-                                  (ins f4rc:$RST, s34imm_pcrel:$D),
-                                  "pstfs $RST, $addr", "pstfs $RST, $D", IIC_LdStLFD>;
-    defm PSTFD :
-      MLS_DForm_R_SI34_RTA5_MEM_p<54, (outs), (ins f8rc:$RST, (memri34 $D, $RA):$addr),
-                                  (ins f8rc:$RST, (memri34_pcrel $D, $RA):$addr),
-                                  (ins f8rc:$RST, s34imm_pcrel:$D),
-                                  "pstfd $RST, $addr", "pstfd $RST, $D", IIC_LdStLFD>;
-    defm PSTXSSP :
-      8LS_DForm_R_SI34_RTA5_MEM_p<47, (outs), (ins vfrc:$RST, (memri34 $D, $RA):$addr),
-                                  (ins vfrc:$RST, (memri34_pcrel $D, $RA):$addr),
-                                  (ins vfrc:$RST, s34imm_pcrel:$D),
-                                  "pstxssp $RST, $addr", "pstxssp $RST, $D", IIC_LdStLFD>;
-    defm PSTXSD :
-      8LS_DForm_R_SI34_RTA5_MEM_p<46, (outs), (ins vfrc:$RST, (memri34 $D, $RA):$addr),
-                                  (ins vfrc:$RST, (memri34_pcrel $D, $RA):$addr),
-                                  (ins vfrc:$RST, s34imm_pcrel:$D),
-                                  "pstxsd $RST, $addr", "pstxsd $RST, $D", IIC_LdStLFD>;
     let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
       defm PSTB8 :
         MLS_DForm_R_SI34_RTA5_MEM_p<38, (outs), (ins g8rc:$RST, (memri34 $D, $RA):$addr),
@@ -1136,7 +1152,7 @@ let mayLoad = 0, mayStore = 1, Predicates = [PairedVectorMemops] in {
                                []>;
 }
 
-let mayLoad = 1, mayStore = 0, Predicates = [PairedVectorMemops, PrefixInstrs] in {
+let mayLoad = 1, mayStore = 0, Predicates = [PairedVectorMemops, PrefixInstrs, HasP10Vector] in {
   defm PLXVP :
     8LS_DForm_R_XTp5_SI34_MEM_p<58, (outs vsrprc:$XTp), (ins (memri34 $D, $RA):$addr),
                                 (ins (memri34_pcrel $D, $RA):$addr),
@@ -1145,7 +1161,7 @@ let mayLoad = 1, mayStore = 0, Predicates = [PairedVectorMemops, PrefixInstrs] i
                                 IIC_LdStLFD>;
 }
 
-let mayLoad = 0, mayStore = 1, Predicates = [PairedVectorMemops, PrefixInstrs] in {
+let mayLoad = 0, mayStore = 1, Predicates = [PairedVectorMemops, PrefixInstrs, HasP10Vector] in {
   defm PSTXVP :
     8LS_DForm_R_XTp5_SI34_MEM_p<62, (outs), (ins vsrprc:$XTp, (memri34 $D, $RA):$addr),
                                 (ins vsrprc:$XTp, (memri34_pcrel $D, $RA):$addr),
@@ -1157,7 +1173,7 @@ let Predicates = [PairedVectorMemops] in {
   // Intrinsics for Paired Vector Loads.
   def : Pat<(v256i1 (int_ppc_vsx_lxvp DQForm:$src)), (LXVP memrix16:$src)>;
   def : Pat<(v256i1 (int_ppc_vsx_lxvp XForm:$src)), (LXVPX XForm:$src)>;
-  let Predicates = [PairedVectorMemops, PrefixInstrs] in {
+  let Predicates = [PairedVectorMemops, PrefixInstrs, HasP10Vector] in {
     def : Pat<(v256i1 (int_ppc_vsx_lxvp PDForm:$src)), (PLXVP memri34:$src)>;
   }
   // Intrinsics for Paired Vector Stores.
@@ -1165,7 +1181,7 @@ let Predicates = [PairedVectorMemops] in {
             (STXVP $XSp, memrix16:$dst)>;
   def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, XForm:$dst),
             (STXVPX $XSp, XForm:$dst)>;
-  let Predicates = [PairedVectorMemops, PrefixInstrs] in {
+  let Predicates = [PairedVectorMemops, PrefixInstrs, HasP10Vector] in {
     def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, PDForm:$dst),
               (PSTXVP $XSp, memri34:$dst)>;
   }
@@ -1236,6 +1252,9 @@ let Predicates = [PCRelativeMemops] in {
   def : Pat<(store i64:$RS, (PPCmatpcreladdr PCRelForm:$ga)),
             (PSTDpc $RS, $ga, 0)>;
 
+}
+
+let Predicates = [PCRelativeMemops, HasFPU] in {
   // Load f32
   def : Pat<(f32 (load (PPCmatpcreladdr PCRelForm:$addr))), (PLFSpc $addr, 0)>;
 
@@ -1252,6 +1271,11 @@ let Predicates = [PCRelativeMemops] in {
   def : Pat<(store f64:$FRS, (PPCmatpcreladdr PCRelForm:$ga)),
             (PSTFDpc $FRS, $ga, 0)>;
 
+  def : Pat<(v4f32 (PPCldvsxlh (PPCmatpcreladdr PCRelForm:$addr))),
+            (SUBREG_TO_REG (i64 1), (PLFDpc $addr, 0), sub_64)>;
+}
+
+let Predicates = [PCRelativeMemops, HasP10Vector] in {
   // Load f128
   def : Pat<(f128 (load (PPCmatpcreladdr PCRelForm:$addr))),
             (COPY_TO_REGCLASS (PLXVpc $addr, 0), VRRC)>;
@@ -1288,6 +1312,14 @@ let Predicates = [PCRelativeMemops] in {
   def : Pat<(store v2f64:$XS, (PPCmatpcreladdr PCRelForm:$ga)),
             (PSTXVpc $XS, $ga, 0)>;
 
+  // Special Cases For PPCstore_scal_int_from_vsr
+  def : Pat<(PPCstore_scal_int_from_vsr f64:$src, (PPCmatpcreladdr PCRelForm:$dst), 8),
+            (PSTXSDpc $src, $dst, 0)>;
+  def : Pat<(PPCstore_scal_int_from_vsr f128:$src, (PPCmatpcreladdr PCRelForm:$dst), 8),
+            (PSTXSDpc (COPY_TO_REGCLASS $src, VFRC), $dst, 0)>;
+}
+
+let Predicates = [PCRelativeMemops] in {
   // Atomic Load
   def : Pat<(i32 (atomic_load_8 (PPCmatpcreladdr PCRelForm:$ga))),
             (PLBZpc $ga, 0)>;
@@ -1314,15 +1346,6 @@ let Predicates = [PCRelativeMemops] in {
   def : Pat<(atomic_store_64 i64:$RS, (PPCmatpcreladdr PCRelForm:$ga)),
             (PSTDpc $RS, $ga, 0)>;
 
-  // Special Cases For PPCstore_scal_int_from_vsr
-  def : Pat<(PPCstore_scal_int_from_vsr f64:$src, (PPCmatpcreladdr PCRelForm:$dst), 8),
-            (PSTXSDpc $src, $dst, 0)>;
-  def : Pat<(PPCstore_scal_int_from_vsr f128:$src, (PPCmatpcreladdr PCRelForm:$dst), 8),
-            (PSTXSDpc (COPY_TO_REGCLASS $src, VFRC), $dst, 0)>;
-
-  def : Pat<(v4f32 (PPCldvsxlh (PPCmatpcreladdr PCRelForm:$addr))),
-            (SUBREG_TO_REG (i64 1), (PLFDpc $addr, 0), sub_64)>;
-
   // If the PPCmatpcreladdr node is not caught by any other pattern it should be
   // caught here and turned into a paddi instruction to materialize the address.
   def : Pat<(PPCmatpcreladdr PCRelForm:$addr), (PADDI8pc 0, $addr)>;
@@ -1335,7 +1358,7 @@ let Predicates = [PCRelativeMemops] in {
             (PADDI8 $in, $addr)>;
 }
 
-let Predicates = [PrefixInstrs] in {
+let Predicates = [PrefixInstrs, HasP10Vector] in {
   def XXPERMX :
     8RR_XX4Form_IMM3_XTABC6<34, 0, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB,
                             vsrc:$XC, u3imm:$IMM),
@@ -2142,7 +2165,7 @@ let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX] in {
 class xxevalPattern <dag pattern, bits<8> imm> :
   Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {}
 
-let AddedComplexity = 400, Predicates = [PrefixInstrs] in {
+let AddedComplexity = 400, Predicates = [PrefixInstrs, HasP10Vector] in {
  def : Pat<(v4i32 (build_vector i32immNonAllOneNonZero:$A,
                                 i32immNonAllOneNonZero:$A,
                                 i32immNonAllOneNonZero:$A,
@@ -2279,7 +2302,7 @@ def : Pat<(f64 nzFPImmAsi64:$A),
             (PSTXSD (COPY_TO_REGCLASS $src, VFRC), PDForm:$dst)>;
 }
 
-let Predicates = [PrefixInstrs] in {
+let Predicates = [PrefixInstrs, HasP10Vector] in {
   def : Pat<(i32 imm34:$imm), (PLI (getImmAs64BitInt imm:$imm))>;
   def : Pat<(i64 imm34:$imm), (PLI8 (getImmAs64BitInt imm:$imm))>;
   def : Pat<(v16i8 (int_ppc_vsx_xxpermx v16i8:$A, v16i8:$B, v16i8:$C, timm:$D)),
@@ -2300,7 +2323,9 @@ let Predicates = [PrefixInstrs] in {
             (XXBLENDVW $A, $B, $C)>;
   def : Pat<(int_ppc_vsx_xxblendvd v2i64:$A, v2i64:$B, v2i64:$C),
             (XXBLENDVD $A, $B, $C)>;
+}
 
+let Predicates = [PrefixInstrs] in {
   // Anonymous patterns to select prefixed loads and stores.
   // Load i32
   def : Pat<(i32 (extloadi1 PDForm:$src)), (PLBZ memri34:$src)>;
@@ -2335,7 +2360,9 @@ let Predicates = [PrefixInstrs] in {
   def : Pat<(truncstorei16 i64:$rS, PDForm:$dst), (PSTH8 g8rc:$rS, memri34:$dst)>;
   def : Pat<(truncstorei32 i64:$rS, PDForm:$dst), (PSTW8 g8rc:$rS, memri34:$dst)>;
   def : Pat<(store i64:$rS, PDForm:$dst), (PSTD g8rc:$rS, memri34:$dst)>;
+}
 
+let Predicates = [PrefixInstrs, HasFPU] in {
   // Load / Store f32
   def : Pat<(f32 (load PDForm:$src)), (PLFS memri34:$src)>;
   def : Pat<(store f32:$FRS, PDForm:$dst), (PSTFS $FRS, memri34:$dst)>;
@@ -2345,7 +2372,13 @@ let Predicates = [PrefixInstrs] in {
             (COPY_TO_REGCLASS (PLFS memri34:$src), VSFRC)>;
   def : Pat<(f64 (load PDForm:$src)), (PLFD memri34:$src)>;
   def : Pat<(store f64:$FRS, PDForm:$dst), (PSTFD $FRS, memri34:$dst)>;
+  // Prefixed fpext to v2f64
+  def : Pat<(v4f32 (PPCldvsxlh PDForm:$src)),
+            (SUBREG_TO_REG (i64 1), (PLFD PDForm:$src), sub_64)>;
 
+}
+
+let Predicates = [PrefixInstrs] in {
   // Atomic Load
   def : Pat<(i32 (atomic_load_8 PDForm:$src)), (PLBZ memri34:$src)>;
   def : Pat<(i32 (atomic_load_16 PDForm:$src)), (PLHZ memri34:$src)>;
@@ -2357,10 +2390,6 @@ let Predicates = [PrefixInstrs] in {
   def : Pat<(atomic_store_16 i32:$RS, PDForm:$dst), (PSTH $RS, memri34:$dst)>;
   def : Pat<(atomic_store_32 i32:$RS, PDForm:$dst), (PSTW $RS, memri34:$dst)>;
   def : Pat<(atomic_store_64 i64:$RS, PDForm:$dst), (PSTD $RS, memri34:$dst)>;
-
-  // Prefixed fpext to v2f64
-  def : Pat<(v4f32 (PPCldvsxlh PDForm:$src)),
-            (SUBREG_TO_REG (i64 1), (PLFD PDForm:$src), sub_64)>;
 }
 
 def InsertEltShift {
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 7e4cd6c72aa87a..9e8da59615dfb3 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -1695,7 +1695,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   // transform it to the prefixed version so we don't have to use the XForm.
   if ((OpC == PPC::LXVP || OpC == PPC::STXVP) &&
       (!isInt<16>(Offset) || (Offset % offsetMinAlign(MI)) != 0) &&
-      Subtarget.hasPrefixInstrs()) {
+      Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) {
     unsigned NewOpc = OpC == PPC::LXVP ? PPC::PLXVP : PPC::PSTXVP;
     MI.setDesc(TII.get(NewOpc));
     OpC = NewOpc;
diff --git a/llvm/test/CodeGen/PowerPC/pr62372.ll b/llvm/test/CodeGen/PowerPC/pr62372.ll
new file mode 100644
index 00000000000000..8df236adc92d7b
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/pr62372.ll
@@ -0,0 +1,13 @@
+; RUN: llc -ppc-asm-full-reg-names -mcpu=pwr10 -mtriple powerpc64le-unknown-linux-gnu \
+; RUN: -o - %s | FileCheck %s
+
+@bar = dso_local global i32 0, align 4
+
+define dso_local ptr @foo() #0 {
+entry:
+  ret ptr @bar
+}
+
+attributes #0 = { "use-soft-float"="true" }
+
+; CHECK: paddi r3, 0, bar@PCREL, 1

From cd5045a76a02f87542b2ff3d78352c10aee6395b Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Wed, 29 May 2024 08:04:32 -0700
Subject: [PATCH 139/230] [ValueTypes] Use bit instead of int for boolean
 fields in ValueTypes.td. NFC

---
 llvm/include/llvm/CodeGen/ValueTypes.td | 10 +++++-----
 llvm/utils/TableGen/VTEmitter.cpp       | 14 +++++++-------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index e322cc04c1c769..0d8eaf1b2b1dd0 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -18,11 +18,11 @@ class ValueType<int size, int value> {
   int Value = value;
   int nElem = 1;
   ValueType ElementType = ?;
-  int isOverloaded = false;
-  int isInteger = false;
-  int isFP = false;
-  int isVector = false;
-  int isScalable = false;
+  bit isOverloaded = false;
+  bit isInteger = false;
+  bit isFP = false;
+  bit isVector = false;
+  bit isScalable = false;
 }
 
 class VTAny<int value> : ValueType<0, value> {
diff --git a/llvm/utils/TableGen/VTEmitter.cpp b/llvm/utils/TableGen/VTEmitter.cpp
index 5ec1f59318f784..9174fe48b62a9d 100644
--- a/llvm/utils/TableGen/VTEmitter.cpp
+++ b/llvm/utils/TableGen/VTEmitter.cpp
@@ -68,10 +68,10 @@ void VTEmitter::run(raw_ostream &OS) {
       continue;
     auto Name = VT->getValueAsString("LLVMName");
     auto Value = VT->getValueAsInt("Value");
-    bool IsInteger = VT->getValueAsInt("isInteger");
-    bool IsFP = VT->getValueAsInt("isFP");
-    bool IsVector = VT->getValueAsInt("isVector");
-    bool IsScalable = VT->getValueAsInt("isScalable");
+    bool IsInteger = VT->getValueAsBit("isInteger");
+    bool IsFP = VT->getValueAsBit("isFP");
+    bool IsVector = VT->getValueAsBit("isVector");
+    bool IsScalable = VT->getValueAsBit("isScalable");
 
     UpdateVTRange("INTEGER_FIXEDLEN_VECTOR_VALUETYPE", Name,
                   IsInteger && IsVector && !IsScalable);
@@ -92,7 +92,7 @@ void VTEmitter::run(raw_ostream &OS) {
        << Name << ", "
        << Value << ", "
        << VT->getValueAsInt("Size") << ", "
-       << VT->getValueAsInt("isOverloaded") << ", "
+       << VT->getValueAsBit("isOverloaded") << ", "
        << (IsInteger ? Name[0] == 'i' ? 3 : 1 : 0) << ", "
        << (IsFP ? Name[0] == 'f' ? 3 : 1 : 0) << ", "
        << IsVector << ", "
@@ -111,14 +111,14 @@ void VTEmitter::run(raw_ostream &OS) {
 
   OS << "#ifdef GET_VT_VECATTR // (Ty, Sc, nElem, ElTy, ElSz)\n";
   for (const auto *VT : VTsByNumber) {
-    if (!VT || !VT->getValueAsInt("isVector"))
+    if (!VT || !VT->getValueAsBit("isVector"))
       continue;
     const auto *ElTy = VT->getValueAsDef("ElementType");
     assert(ElTy);
     // clang-format off
     OS << "  GET_VT_VECATTR("
        << VT->getValueAsString("LLVMName") << ", "
-       << VT->getValueAsInt("isScalable") << ", "
+       << VT->getValueAsBit("isScalable") << ", "
        << VT->getValueAsInt("nElem") << ", "
        << ElTy->getName() << ", "
        << ElTy->getValueAsInt("Size") << ")\n";

From b15a0a37404f36bcd9c7995de8cd16f9cb5ac8af Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <1802579+farzonl@users.noreply.github.com>
Date: Wed, 29 May 2024 11:16:18 -0400
Subject: [PATCH 140/230] [clang] Add tanf16 builtin and support for tan
 constrained intrinsic (#93314)

In LLVM, the `llvm.experimental.constrained.cos` and
`llvm.experimental.constrained.sin` intrinsics are used for performing
cosine and sine calculations with additional constraints on
floating-point operations. This behavior is expected for all
floating-point math intrinsics. This change adds these constraints for
the `tan` intrinsic.

- `Builtins.td` - replace TanF128 with F16F128MathTemplate
- `CGBuiltin.cpp` - map existing tan builtins to the `tan` and
  `constrained_tan` intrinsics
- `ConstrainedOps.def` - map tan and constrained_tan to an ISD opcode
- `ISDOpcodes.h` - define tan and strict tan opcodes

Resolves #91421
---
 clang/include/clang/Basic/Builtins.td         |  6 ++--
 clang/lib/CodeGen/CGBuiltin.cpp               | 12 +++++++
 clang/test/CodeGen/X86/math-builtins.c        |  8 ++---
 .../test/CodeGen/constrained-math-builtins.c  | 13 +++++++
 clang/test/CodeGen/math-libcalls.c            | 12 +++----
 clang/test/CodeGenOpenCL/builtins-f16.cl      |  3 ++
 llvm/docs/LangRef.rst                         | 36 +++++++++++++++++++
 llvm/include/llvm/CodeGen/ISDOpcodes.h        |  2 ++
 llvm/include/llvm/IR/ConstrainedOps.def       |  1 +
 llvm/include/llvm/IR/Intrinsics.td            |  4 +++
 llvm/test/Assembler/fp-intrinsics-attr.ll     |  8 +++++
 llvm/test/Feature/fp-intrinsics.ll            | 11 ++++++
 12 files changed, 103 insertions(+), 13 deletions(-)

diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 11982af3fa609b..7bef5fd7ad40f2 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -482,11 +482,11 @@ def SqrtF16F128 : Builtin, F16F128MathTemplate {
   let Prototype = "T(T)";
 }
 
-def TanF128 : Builtin {
-  let Spellings = ["__builtin_tanf128"];
+def TanF16F128 : Builtin, F16F128MathTemplate {
+  let Spellings = ["__builtin_tan"];
   let Attributes = [FunctionWithBuiltinPrefix, NoThrow,
                     ConstIgnoringErrnoAndExceptions];
-  let Prototype = "__float128(__float128)";
+  let Prototype = "T(T)";
 }
 
 def TanhF128 : Builtin {
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 266bf41fd5577c..94a7036f6233cc 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -2923,6 +2923,18 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
       SetSqrtFPAccuracy(Call);
       return RValue::get(Call);
     }
+
+    case Builtin::BItan:
+    case Builtin::BItanf:
+    case Builtin::BItanl:
+    case Builtin::BI__builtin_tan:
+    case Builtin::BI__builtin_tanf:
+    case Builtin::BI__builtin_tanf16:
+    case Builtin::BI__builtin_tanl:
+    case Builtin::BI__builtin_tanf128:
+      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
+          *this, E, Intrinsic::tan, Intrinsic::experimental_constrained_tan));
+
     case Builtin::BItrunc:
     case Builtin::BItruncf:
     case Builtin::BItruncl:
diff --git a/clang/test/CodeGen/X86/math-builtins.c b/clang/test/CodeGen/X86/math-builtins.c
index 093239b4482609..1e0f129b986102 100644
--- a/clang/test/CodeGen/X86/math-builtins.c
+++ b/clang/test/CodeGen/X86/math-builtins.c
@@ -674,10 +674,10 @@ __builtin_sqrt(f);       __builtin_sqrtf(f);      __builtin_sqrtl(f); __builtin_
 
 __builtin_tan(f);        __builtin_tanf(f);       __builtin_tanl(f); __builtin_tanf128(f);
 
-// NO__ERRNO: declare double @tan(double noundef) [[READNONE]]
-// NO__ERRNO: declare float @tanf(float noundef) [[READNONE]]
-// NO__ERRNO: declare x86_fp80 @tanl(x86_fp80 noundef) [[READNONE]]
-// NO__ERRNO: declare fp128 @tanf128(fp128 noundef) [[READNONE]]
+// NO__ERRNO: declare double @llvm.tan.f64(double) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare float @llvm.tan.f32(float) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare x86_fp80 @llvm.tan.f80(x86_fp80) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare fp128 @llvm.tan.f128(fp128) [[READNONE_INTRINSIC]]
 // HAS_ERRNO: declare double @tan(double noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare float @tanf(float noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @tanl(x86_fp80 noundef) [[NOT_READNONE]]
diff --git a/clang/test/CodeGen/constrained-math-builtins.c b/clang/test/CodeGen/constrained-math-builtins.c
index 2de832dd2b6cae..6cc3a10a1e7946 100644
--- a/clang/test/CodeGen/constrained-math-builtins.c
+++ b/clang/test/CodeGen/constrained-math-builtins.c
@@ -183,6 +183,14 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c, _
 // CHECK: call x86_fp80 @llvm.experimental.constrained.sqrt.f80(x86_fp80 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
 // CHECK: call fp128 @llvm.experimental.constrained.sqrt.f128(fp128 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
 
+  __builtin_tan(f);        __builtin_tanf(f);       __builtin_tanl(f); __builtin_tanf128(f);
+
+// CHECK: call double @llvm.experimental.constrained.tan.f64(double %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call float @llvm.experimental.constrained.tan.f32(float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call x86_fp80 @llvm.experimental.constrained.tan.f80(x86_fp80 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call fp128 @llvm.experimental.constrained.tan.f128(fp128 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+
+
   __builtin_trunc(f);      __builtin_truncf(f);     __builtin_truncl(f); __builtin_truncf128(f);
 
 // CHECK: call double @llvm.experimental.constrained.trunc.f64(double %{{.*}}, metadata !"fpexcept.strict")
@@ -315,6 +323,11 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c, _
 // CHECK: declare x86_fp80 @llvm.experimental.constrained.sqrt.f80(x86_fp80, metadata, metadata)
 // CHECK: declare fp128 @llvm.experimental.constrained.sqrt.f128(fp128, metadata, metadata)
 
+// CHECK: declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata)
+// CHECK: declare float @llvm.experimental.constrained.tan.f32(float, metadata, metadata)
+// CHECK: declare x86_fp80 @llvm.experimental.constrained.tan.f80(x86_fp80, metadata, metadata)
+// CHECK: declare fp128 @llvm.experimental.constrained.tan.f128(fp128, metadata, metadata)
+
 // CHECK: declare double @llvm.experimental.constrained.trunc.f64(double, metadata)
 // CHECK: declare float @llvm.experimental.constrained.trunc.f32(float, metadata)
 // CHECK: declare x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80, metadata)
diff --git a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c
index 29c312ba0ecac2..a249182692762d 100644
--- a/clang/test/CodeGen/math-libcalls.c
+++ b/clang/test/CodeGen/math-libcalls.c
@@ -662,15 +662,15 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
 
   tan(f);        tanf(f);       tanl(f);
 
-// NO__ERRNO: declare double @tan(double noundef) [[READNONE]]
-// NO__ERRNO: declare float @tanf(float noundef) [[READNONE]]
-// NO__ERRNO: declare x86_fp80 @tanl(x86_fp80 noundef) [[READNONE]]
+// NO__ERRNO: declare double @llvm.tan.f64(double) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare float @llvm.tan.f32(float) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare x86_fp80 @llvm.tan.f80(x86_fp80) [[READNONE_INTRINSIC]]
 // HAS_ERRNO: declare double @tan(double noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare float @tanf(float noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @tanl(x86_fp80 noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare double @tan(double noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare float @tanf(float noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare x86_fp80 @tanl(x86_fp80 noundef) [[NOT_READNONE]]
+// HAS_MAYTRAP: declare double @llvm.experimental.constrained.tan.f64(
+// HAS_MAYTRAP: declare float @llvm.experimental.constrained.tan.f32(
+// HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.tan.f80(
 
   tanh(f);       tanhf(f);      tanhl(f);
 
diff --git a/clang/test/CodeGenOpenCL/builtins-f16.cl b/clang/test/CodeGenOpenCL/builtins-f16.cl
index adf7cdde154f51..d7bffdad5c548f 100644
--- a/clang/test/CodeGenOpenCL/builtins-f16.cl
+++ b/clang/test/CodeGenOpenCL/builtins-f16.cl
@@ -66,6 +66,9 @@ void test_half_builtins(half h0, half h1, half h2, int i0) {
   // CHECK: call half @llvm.sqrt.f16(half %h0)
   res = __builtin_sqrtf16(h0);
 
+  // CHECK: call half @llvm.tan.f16(half %h0)
+  res = __builtin_tanf16(h0);
+
   // CHECK: call half @llvm.trunc.f16(half %h0)
   res = __builtin_truncf16(h0);
 
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 7b64c477d13c7f..a650692d44d76e 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -26229,6 +26229,42 @@ same values as the libm ``cos`` functions would, and handles error
 conditions in the same way.
 
 
+'``llvm.experimental.constrained.tan``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type>
+      @llvm.experimental.constrained.tan(<type> <op1>,
+                                         metadata <rounding mode>,
+                                         metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.tan``' intrinsic returns the tangent of the
+first operand.
+
+Arguments:
+""""""""""
+
+The first argument and the return type are floating-point numbers of the same
+type.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+This function returns the tangent of the specified operand, returning the
+same values as the libm ``tan`` functions would, and handles error
+conditions in the same way.
+
+
 '``llvm.experimental.constrained.exp``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index d8af97957e48ec..22062f0efbbda1 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -415,6 +415,7 @@ enum NodeType {
   STRICT_FLDEXP,
   STRICT_FSIN,
   STRICT_FCOS,
+  STRICT_FTAN,
   STRICT_FEXP,
   STRICT_FEXP2,
   STRICT_FLOG,
@@ -934,6 +935,7 @@ enum NodeType {
   FCBRT,
   FSIN,
   FCOS,
+  FTAN,
   FPOW,
   FPOWI,
   /// FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
diff --git a/llvm/include/llvm/IR/ConstrainedOps.def b/llvm/include/llvm/IR/ConstrainedOps.def
index 41aa44de957f93..a7b37c5cb204da 100644
--- a/llvm/include/llvm/IR/ConstrainedOps.def
+++ b/llvm/include/llvm/IR/ConstrainedOps.def
@@ -95,6 +95,7 @@ DAG_FUNCTION(round,           1, 0, experimental_constrained_round,      FROUND)
 DAG_FUNCTION(roundeven,       1, 0, experimental_constrained_roundeven,  FROUNDEVEN)
 DAG_FUNCTION(sin,             1, 1, experimental_constrained_sin,        FSIN)
 DAG_FUNCTION(sqrt,            1, 1, experimental_constrained_sqrt,       FSQRT)
+DAG_FUNCTION(tan,             1, 1, experimental_constrained_tan,        FTAN)
 DAG_FUNCTION(trunc,           1, 0, experimental_constrained_trunc,      FTRUNC)
 
 // This is definition for fmuladd intrinsic function, that is converted into
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 107442623ab7bd..4c506a6ace23ea 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1218,6 +1218,10 @@ let IntrProperties = [IntrInaccessibleMemOnly, IntrWillReturn, IntrStrictFP] in
                                                     [ LLVMMatchType<0>,
                                                       llvm_metadata_ty,
                                                       llvm_metadata_ty ]>;
+  def int_experimental_constrained_tan  : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ],
+                                                    [ LLVMMatchType<0>,
+                                                      llvm_metadata_ty,
+                                                      llvm_metadata_ty ]>;
   def int_experimental_constrained_pow  : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ],
                                                     [ LLVMMatchType<0>,
                                                       LLVMMatchType<0>,
diff --git a/llvm/test/Assembler/fp-intrinsics-attr.ll b/llvm/test/Assembler/fp-intrinsics-attr.ll
index 6546d1a275c99f..613630e1a2b4d2 100644
--- a/llvm/test/Assembler/fp-intrinsics-attr.ll
+++ b/llvm/test/Assembler/fp-intrinsics-attr.ll
@@ -85,6 +85,11 @@ define void @func(double %a, double %b, double %c, i32 %i) strictfp {
                                                metadata !"round.dynamic",
                                                metadata !"fpexcept.strict")
 
+  %tan = call double @llvm.experimental.constrained.tan.f64(
+                                               double %a,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+
   %pow = call double @llvm.experimental.constrained.pow.f64(
                                                double %a, double %b,
                                                metadata !"round.dynamic",
@@ -244,6 +249,9 @@ declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata
 declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata)
 ; CHECK: @llvm.experimental.constrained.cos.f64({{.*}}) #[[ATTR1]]
 
+declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata)
+; CHECK: @llvm.experimental.constrained.tan.f64({{.*}}) #[[ATTR1]]
+
 declare double @llvm.experimental.constrained.pow.f64(double, double, metadata, metadata)
 ; CHECK: @llvm.experimental.constrained.pow.f64({{.*}}) #[[ATTR1]]
 
diff --git a/llvm/test/Feature/fp-intrinsics.ll b/llvm/test/Feature/fp-intrinsics.ll
index b92408a1bf1cd5..7759813dc2e114 100644
--- a/llvm/test/Feature/fp-intrinsics.ll
+++ b/llvm/test/Feature/fp-intrinsics.ll
@@ -151,6 +151,17 @@ entry:
   ret double %result
 }
 
+; Verify that tan(42.0) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: ftan
+; CHECK: call double @llvm.experimental.constrained.tan
+define double @ftan() #0 {
+entry:
+  %result = call double @llvm.experimental.constrained.tan.f64(double 42.0,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict") #0
+  ret double %result
+}
+
 ; Verify that exp(42.0) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f10
 ; CHECK: call double @llvm.experimental.constrained.exp

From fe82a3da36196157c0caa1ef2505186782f750d1 Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Wed, 29 May 2024 16:16:08 +0100
Subject: [PATCH 141/230] Revert "[Support] Remove terminfo dependency
 (#92865)"

This reverts commit 6bf450c7a60fa62c642e39836566da94bb9bbc91.

It breaks LLDB CI: https://green.lab.llvm.org/job/llvm.org/view/LLDB/job/as-lldb-cmake/4762/execution/node/97/log/

```
/Applications/Xcode-beta.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/c++ -Wdocumentation -fPIC -fvisibility-inlines-hidden -Werror=date-time -Werror=unguarded-availability-new -Wall -Wextra -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wmissing-field-initializers -pedantic -Wno-long-long -Wc++98-compat-extra-semi -Wimplicit-fallthrough -Wcovered-switch-default -Wno-noexcept-type -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wsuggest-override -Wstring-conversion -Wmisleading-indentation -Wctad-maybe-unsupported -fdiagnostics-color -Wno-deprecated-declarations -Wno-unknown-pragmas -Wno-strict-aliasing -Wno-deprecated-register -Wno-vla-extension -O3 -DNDEBUG -arch arm64 -isysroot /Applications/Xcode-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.2.sdk -mmacosx-version-min=14.1 -Wl,-search_paths_first -Wl,-headerpad_max_install_names -Wl,-dead_strip -Wl,-no_warn_duplicate_libraries tools/lldb/unittests/Editline/CMakeFiles/EditlineTests.dir/EditlineTest.cpp.o -o tools/lldb/unittests/Editline/EditlineTests  lib/libLLVMSupport.a  lib/libllvm_gtest_main.a  lib/libllvm_gtest.a  lib/liblldbHost.a  lib/liblldbUtility.a  lib/libLLVMTestingSupport.a  /Applications/Xcode-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.2.sdk/usr/lib/libxml2.tbd  /Applications/Xcode-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.2.sdk/usr/lib/libedit.tbd  lib/liblldbHostMacOSXObjCXX.a  lib/liblldbUtility.a  -framework Foundation  -framework CoreFoundation  -framework CoreServices  -framework Security  lib/libLLVMObject.a  lib/libLLVMIRReader.a  lib/libLLVMBitReader.a  lib/libLLVMAsmParser.a  lib/libLLVMCore.a  lib/libLLVMRemarks.a  lib/libLLVMBitstreamReader.a  lib/libLLVMMCParser.a  lib/libLLVMMC.a  lib/libLLVMDebugInfoCodeView.a  lib/libLLVMTextAPI.a  lib/libLLVMBinaryFormat.a  lib/libLLVMTargetParser.a  lib/libllvm_gtest.a  lib/libLLVMSupport.a  -lm  
/Applications/Xcode-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.2.sdk/usr/lib/libz.tbd  /opt/homebrew/lib/libzstd.dylib  lib/libLLVMDemangle.a  -lpthread && cd /Users/ec2-user/jenkins/workspace/llvm.org/as-lldb-cmake/lldb-build/tools/lldb/unittests/Editline && /opt/homebrew/Cellar/cmake/3.28.3/bin/cmake -E make_directory /Users/ec2-user/jenkins/workspace/llvm.org/as-lldb-cmake/lldb-build/tools/lldb/unittests/Editline/./Inputs
ld: Undefined symbols:
  _setupterm, referenced from:
      lldb_private::Editline::Editline(char const*, __sFILE*, __sFILE*, __sFILE*, std::__1::recursive_mutex&) in liblldbHost.a[35](Editline.cpp.o)
clang: error: linker command failed with exit code 1 (use -v to see invocation)
```
---
 clang/cmake/caches/Fuchsia-stage2.cmake       |  1 +
 clang/cmake/caches/Fuchsia.cmake              |  7 +++
 clang/cmake/caches/VectorEngine.cmake         |  4 +-
 clang/utils/analyzer/entrypoint.py            |  2 +-
 compiler-rt/cmake/config-ix.cmake             | 15 +++++
 .../symbolizer/scripts/build_symbolizer.sh    |  1 +
 compiler-rt/lib/xray/tests/CMakeLists.txt     |  5 ++
 lldb/docs/resources/build.rst                 |  1 +
 lldb/source/Core/CMakeLists.txt               |  3 +
 llvm/CMakeLists.txt                           |  2 +
 llvm/cmake/config-ix.cmake                    | 10 ++++
 llvm/cmake/modules/FindTerminfo.cmake         | 55 +++++++++++++++++
 llvm/cmake/modules/LLVMConfig.cmake.in        |  5 ++
 llvm/docs/ReleaseNotes.rst                    |  4 --
 llvm/include/llvm/Config/config.h.cmake       |  3 +
 llvm/lib/Support/CMakeLists.txt               | 11 ++++
 llvm/lib/Support/Unix/Process.inc             | 60 +++++++++++++++++--
 llvm/utils/gn/README.rst                      |  2 +-
 llvm/utils/gn/build/libs/terminfo/BUILD.gn    | 12 ++++
 llvm/utils/gn/build/libs/terminfo/enable.gni  |  4 ++
 .../llvm/include/llvm/Config/BUILD.gn         |  7 +++
 .../gn/secondary/llvm/lib/Support/BUILD.gn    |  1 +
 .../secondary/llvm/tools/llvm-config/BUILD.gn |  6 +-
 utils/bazel/.bazelrc                          |  3 +
 .../llvm/include/llvm/Config/config.h         |  3 +
 utils/bazel/llvm_configs/config.h.cmake       |  3 +
 26 files changed, 218 insertions(+), 12 deletions(-)
 create mode 100644 llvm/cmake/modules/FindTerminfo.cmake
 create mode 100644 llvm/utils/gn/build/libs/terminfo/BUILD.gn
 create mode 100644 llvm/utils/gn/build/libs/terminfo/enable.gni

diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake
index 66e764968e85ce..d5546e20873b3c 100644
--- a/clang/cmake/caches/Fuchsia-stage2.cmake
+++ b/clang/cmake/caches/Fuchsia-stage2.cmake
@@ -19,6 +19,7 @@ set(LLVM_ENABLE_LLD ON CACHE BOOL "")
 set(LLVM_ENABLE_LTO ON CACHE BOOL "")
 set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "")
 set(LLVM_ENABLE_PLUGINS OFF CACHE BOOL "")
+set(LLVM_ENABLE_TERMINFO OFF CACHE BOOL "")
 set(LLVM_ENABLE_UNWIND_TABLES OFF CACHE BOOL "")
 set(LLVM_ENABLE_Z3_SOLVER OFF CACHE BOOL "")
 set(LLVM_ENABLE_ZLIB ON CACHE BOOL "")
diff --git a/clang/cmake/caches/Fuchsia.cmake b/clang/cmake/caches/Fuchsia.cmake
index 4d3af3ad3f4031..30a3b9116a461f 100644
--- a/clang/cmake/caches/Fuchsia.cmake
+++ b/clang/cmake/caches/Fuchsia.cmake
@@ -12,6 +12,7 @@ set(LLVM_ENABLE_DIA_SDK OFF CACHE BOOL "")
 set(LLVM_ENABLE_LIBEDIT OFF CACHE BOOL "")
 set(LLVM_ENABLE_LIBXML2 OFF CACHE BOOL "")
 set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "")
+set(LLVM_ENABLE_TERMINFO OFF CACHE BOOL "")
 set(LLVM_ENABLE_UNWIND_TABLES OFF CACHE BOOL "")
 set(LLVM_ENABLE_Z3_SOLVER OFF CACHE BOOL "")
 set(LLVM_ENABLE_ZLIB OFF CACHE BOOL "")
@@ -33,6 +34,7 @@ set(_FUCHSIA_BOOTSTRAP_PASSTHROUGH
   LibXml2_ROOT
   LLVM_ENABLE_CURL
   LLVM_ENABLE_HTTPLIB
+  LLVM_ENABLE_TERMINFO
   LLVM_ENABLE_LIBEDIT
   CURL_ROOT
   OpenSSL_ROOT
@@ -45,6 +47,11 @@ set(_FUCHSIA_BOOTSTRAP_PASSTHROUGH
   CURSES_LIBRARIES
   PANEL_LIBRARIES
 
+  # Deprecated
+  Terminfo_ROOT
+
+  Terminfo_LIBRARIES
+
   # Deprecated
   LibEdit_ROOT
 
diff --git a/clang/cmake/caches/VectorEngine.cmake b/clang/cmake/caches/VectorEngine.cmake
index b429fb0997d7a0..2f968a21cc407e 100644
--- a/clang/cmake/caches/VectorEngine.cmake
+++ b/clang/cmake/caches/VectorEngine.cmake
@@ -13,7 +13,9 @@
 #   ninja
 #
 
-# Disable ZLIB, and ZSTD for VE since there is no pre-compiled libraries.
+# Disable TERMINFO, ZLIB, and ZSTD for VE since there are no pre-compiled
+# libraries.
+set(LLVM_ENABLE_TERMINFO OFF CACHE BOOL "")
 set(LLVM_ENABLE_ZLIB OFF CACHE BOOL "")
 set(LLVM_ENABLE_ZSTD OFF CACHE BOOL "")
 
diff --git a/clang/utils/analyzer/entrypoint.py b/clang/utils/analyzer/entrypoint.py
index 4deb42db0a0b1f..ff877060bad69e 100644
--- a/clang/utils/analyzer/entrypoint.py
+++ b/clang/utils/analyzer/entrypoint.py
@@ -54,7 +54,7 @@ def is_cmake_needed():
     "cmake -G Ninja -DCMAKE_BUILD_TYPE=Release "
     "-DCMAKE_INSTALL_PREFIX=/analyzer -DLLVM_TARGETS_TO_BUILD=X86 "
     '-DLLVM_ENABLE_PROJECTS="clang;openmp" -DLLVM_BUILD_RUNTIME=OFF '
-    "-DCLANG_ENABLE_ARCMT=OFF "
+    "-DLLVM_ENABLE_TERMINFO=OFF -DCLANG_ENABLE_ARCMT=OFF "
     "-DCLANG_ENABLE_STATIC_ANALYZER=ON"
 )
 
diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake
index bddaa37579fd7b..42edbe15edafb5 100644
--- a/compiler-rt/cmake/config-ix.cmake
+++ b/compiler-rt/cmake/config-ix.cmake
@@ -182,6 +182,21 @@ check_library_exists(m pow "" COMPILER_RT_HAS_LIBM)
 check_library_exists(pthread pthread_create "" COMPILER_RT_HAS_LIBPTHREAD)
 check_library_exists(execinfo backtrace "" COMPILER_RT_HAS_LIBEXECINFO)
 
+# Look for terminfo library, used in unittests that depend on LLVMSupport.
+if(LLVM_ENABLE_TERMINFO STREQUAL FORCE_ON)
+  set(MAYBE_REQUIRED REQUIRED)
+else()
+  set(MAYBE_REQUIRED)
+endif()
+if(LLVM_ENABLE_TERMINFO)
+  find_library(COMPILER_RT_TERMINFO_LIB NAMES terminfo tinfo curses ncurses ncursesw ${MAYBE_REQUIRED})
+endif()
+if(COMPILER_RT_TERMINFO_LIB)
+  set(LLVM_ENABLE_TERMINFO 1)
+else()
+  set(LLVM_ENABLE_TERMINFO 0)
+endif()
+
 if (ANDROID AND COMPILER_RT_HAS_LIBDL)
   # Android's libstdc++ has a dependency on libdl.
   list(APPEND CMAKE_REQUIRED_LIBRARIES dl)
diff --git a/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh b/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh
index b4702339db59cc..005bd6d584c593 100755
--- a/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh
+++ b/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh
@@ -139,6 +139,7 @@ if [[ ! -f ${LLVM_BUILD}/build.ninja ]]; then
     -DLLVM_INCLUDE_TESTS=OFF \
     -DLLVM_ENABLE_ZLIB=ON \
     -DLLVM_ENABLE_ZSTD=OFF \
+    -DLLVM_ENABLE_TERMINFO=OFF \
     -DLLVM_ENABLE_THREADS=OFF \
   $LLVM_SRC
 fi
diff --git a/compiler-rt/lib/xray/tests/CMakeLists.txt b/compiler-rt/lib/xray/tests/CMakeLists.txt
index 4c7e92b6ecc3d2..0a428b9a30b18b 100644
--- a/compiler-rt/lib/xray/tests/CMakeLists.txt
+++ b/compiler-rt/lib/xray/tests/CMakeLists.txt
@@ -54,6 +54,11 @@ set(XRAY_UNITTEST_LINK_FLAGS
   ${COMPILER_RT_CXX_LINK_LIBS})
 
 if (NOT APPLE)
+  # Needed by LLVMSupport.
+  append_list_if(
+    LLVM_ENABLE_TERMINFO
+    -l${COMPILER_RT_TERMINFO_LIB} XRAY_UNITTEST_LINK_FLAGS)
+
   # We add the library directories one at a time in our CFLAGS.
   foreach (DIR ${LLVM_LIBRARY_DIR})
     list(APPEND XRAY_UNITTEST_LINK_FLAGS -L${DIR})
diff --git a/lldb/docs/resources/build.rst b/lldb/docs/resources/build.rst
index 33b6a6f79def4b..09d3d15a940836 100644
--- a/lldb/docs/resources/build.rst
+++ b/lldb/docs/resources/build.rst
@@ -477,6 +477,7 @@ further by passing the appropriate cmake options, such as:
   -DLLDB_ENABLE_PYTHON=0
   -DLLDB_ENABLE_LIBEDIT=0
   -DLLDB_ENABLE_CURSES=0
+  -DLLVM_ENABLE_TERMINFO=0
 
 (see :ref:`Optional Dependencies` for more)
 
diff --git a/lldb/source/Core/CMakeLists.txt b/lldb/source/Core/CMakeLists.txt
index dbc620b91b1ed1..471fd9c1a33e59 100644
--- a/lldb/source/Core/CMakeLists.txt
+++ b/lldb/source/Core/CMakeLists.txt
@@ -11,6 +11,9 @@ set(LLDB_LIBEDIT_LIBS)
 
 if (LLDB_ENABLE_CURSES)
   list(APPEND LLDB_CURSES_LIBS ${PANEL_LIBRARIES} ${CURSES_LIBRARIES})
+  if(LLVM_ENABLE_TERMINFO)
+    list(APPEND LLDB_CURSES_LIBS ${Terminfo_LIBRARIES})
+  endif()
   if (LLVM_BUILD_STATIC)
     list(APPEND LLDB_CURSES_LIBS gpm)
   endif()
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 64898ab09772f4..cbf4db60a6e185 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -539,6 +539,8 @@ set(FFI_INCLUDE_DIR "" CACHE PATH "Additional directory, where CMake should sear
 set(LLVM_TARGET_ARCH "host"
   CACHE STRING "Set target to use for LLVM JIT or use \"host\" for automatic detection.")
 
+option(LLVM_ENABLE_TERMINFO "Use terminfo database if available." ON)
+
 set(LLVM_ENABLE_LIBXML2 "ON" CACHE STRING "Use libxml2 if available. Can be ON, OFF, or FORCE_ON")
 
 option(LLVM_ENABLE_LIBEDIT "Use libedit if available." ON)
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 0aae13e30f2ab4..7d2a49337e1e86 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -240,11 +240,21 @@ if(NOT LLVM_USE_SANITIZER MATCHES "Memory.*")
     else()
       set(HAVE_LIBEDIT 0)
     endif()
+    if(LLVM_ENABLE_TERMINFO)
+      if(LLVM_ENABLE_TERMINFO STREQUAL FORCE_ON)
+        find_package(Terminfo REQUIRED)
+      else()
+        find_package(Terminfo)
+      endif()
+      set(LLVM_ENABLE_TERMINFO "${Terminfo_FOUND}")
+    endif()
   else()
     set(HAVE_LIBEDIT 0)
+    set(LLVM_ENABLE_TERMINFO 0)
   endif()
 else()
   set(HAVE_LIBEDIT 0)
+  set(LLVM_ENABLE_TERMINFO 0)
 endif()
 
 if(LLVM_HAS_LOGF128)
diff --git a/llvm/cmake/modules/FindTerminfo.cmake b/llvm/cmake/modules/FindTerminfo.cmake
new file mode 100644
index 00000000000000..163af669706771
--- /dev/null
+++ b/llvm/cmake/modules/FindTerminfo.cmake
@@ -0,0 +1,55 @@
+# Attempts to discover terminfo library with a linkable setupterm function.
+#
+# Example usage:
+#
+# find_package(Terminfo)
+#
+# If successful, the following variables will be defined:
+# Terminfo_FOUND
+# Terminfo_LIBRARIES
+#
+# Additionally, the following import target will be defined:
+# Terminfo::terminfo
+
+find_library(Terminfo_LIBRARIES NAMES terminfo tinfo curses ncurses ncursesw)
+
+if(Terminfo_LIBRARIES)
+  include(CMakePushCheckState)
+  cmake_push_check_state()
+  list(APPEND CMAKE_REQUIRED_LIBRARIES ${Terminfo_LIBRARIES})
+  set(Terminfo_LINKABLE_SRC [=[
+    #ifdef __cplusplus
+    extern "C" {
+    #endif
+    int setupterm(char *term, int filedes, int *errret);
+    #ifdef __cplusplus
+    }
+    #endif
+    int main(void) { return setupterm(0, 0, 0); }
+    ]=])
+  if(DEFINED CMAKE_C_COMPILER)
+    include(CheckCSourceCompiles)
+    check_c_source_compiles("${Terminfo_LINKABLE_SRC}" Terminfo_LINKABLE)
+  else()
+    include(CheckCXXSourceCompiles)
+    check_cxx_source_compiles("${Terminfo_LINKABLE_SRC}" Terminfo_LINKABLE)
+  endif()
+  cmake_pop_check_state()
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Terminfo
+                                  FOUND_VAR
+                                    Terminfo_FOUND
+                                  REQUIRED_VARS
+                                    Terminfo_LIBRARIES
+                                    Terminfo_LINKABLE)
+mark_as_advanced(Terminfo_LIBRARIES
+                 Terminfo_LINKABLE)
+
+if(Terminfo_FOUND)
+  if(NOT TARGET Terminfo::terminfo)
+    add_library(Terminfo::terminfo UNKNOWN IMPORTED)
+    set_target_properties(Terminfo::terminfo PROPERTIES IMPORTED_LOCATION "${Terminfo_LIBRARIES}")
+  endif()
+endif()
diff --git a/llvm/cmake/modules/LLVMConfig.cmake.in b/llvm/cmake/modules/LLVMConfig.cmake.in
index 7e1501a89354c8..397bd5815b64e9 100644
--- a/llvm/cmake/modules/LLVMConfig.cmake.in
+++ b/llvm/cmake/modules/LLVMConfig.cmake.in
@@ -60,6 +60,11 @@ if(LLVM_ENABLE_LIBEDIT)
   find_package(LibEdit)
 endif()
 
+set(LLVM_ENABLE_TERMINFO @LLVM_ENABLE_TERMINFO@)
+if(LLVM_ENABLE_TERMINFO)
+  find_package(Terminfo)
+endif()
+
 set(LLVM_ENABLE_THREADS @LLVM_ENABLE_THREADS@)
 
 set(LLVM_ENABLE_UNWIND_TABLES @LLVM_ENABLE_UNWIND_TABLES@)
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index c7c2c2825f58b9..1e1ccb495c3669 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -63,10 +63,6 @@ Changes to LLVM infrastructure
 Changes to building LLVM
 ------------------------
 
-- The ``LLVM_ENABLE_TERMINFO`` flag has been removed. LLVM no longer depends on
-  terminfo and now always uses the ``TERM`` environment variable for color
-  support autodetection.
-
 Changes to TableGen
 -------------------
 
diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index ff30741c8f360a..977c182e9d2b0d 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -209,6 +209,9 @@
 /* Define to 1 if you have the <sys/types.h> header file. */
 #cmakedefine HAVE_SYS_TYPES_H ${HAVE_SYS_TYPES_H}
 
+/* Define if the setupterm() function is supported this platform. */
+#cmakedefine LLVM_ENABLE_TERMINFO ${LLVM_ENABLE_TERMINFO}
+
 /* Define to 1 if you have the <termios.h> header file. */
 #cmakedefine HAVE_TERMIOS_H ${HAVE_TERMIOS_H}
 
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index be4badc09efa58..03e888958a0711 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -56,6 +56,9 @@ elseif( CMAKE_HOST_UNIX )
     STRING(REGEX REPLACE "^lib" "" Backtrace_LIBFILE ${Backtrace_LIBFILE})
     set(system_libs ${system_libs} ${Backtrace_LIBFILE})
   endif()
+  if( LLVM_ENABLE_TERMINFO )
+    set(imported_libs ${imported_libs} Terminfo::terminfo)
+  endif()
   set(system_libs ${system_libs} ${LLVM_ATOMIC_LIB})
   set(system_libs ${system_libs} ${LLVM_PTHREAD_LIB})
   if( UNIX AND NOT (BEOS OR HAIKU) )
@@ -322,6 +325,14 @@ if(LLVM_ENABLE_ZSTD)
   set(llvm_system_libs ${llvm_system_libs} "${zstd_library}")
 endif()
 
+if(LLVM_ENABLE_TERMINFO)
+  if(NOT terminfo_library)
+    get_property(terminfo_library TARGET Terminfo::terminfo PROPERTY LOCATION)
+  endif()
+  get_library_name(${terminfo_library} terminfo_library)
+  set(llvm_system_libs ${llvm_system_libs} "${terminfo_library}")
+endif()
+
 set_property(TARGET LLVMSupport PROPERTY LLVM_SYSTEM_LIBS "${llvm_system_libs}")
 
 
diff --git a/llvm/lib/Support/Unix/Process.inc b/llvm/lib/Support/Unix/Process.inc
index 84b10ff5d1d08a..ae90924cae1b9b 100644
--- a/llvm/lib/Support/Unix/Process.inc
+++ b/llvm/lib/Support/Unix/Process.inc
@@ -341,9 +341,17 @@ unsigned Process::StandardErrColumns() {
   return getColumns();
 }
 
-static bool terminalHasColors() {
-  // Check if the current terminal is one of terminals that are known to support
-  // ANSI color escape codes.
+#ifdef LLVM_ENABLE_TERMINFO
+// We manually declare these extern functions because finding the correct
+// headers from various terminfo, curses, or other sources is harder than
+// writing their specs down.
+extern "C" int setupterm(char *term, int filedes, int *errret);
+extern "C" struct term *set_curterm(struct term *termp);
+extern "C" int del_curterm(struct term *termp);
+extern "C" int tigetnum(char *capname);
+#endif
+
+bool checkTerminalEnvironmentForColors() {
   if (const char *TermStr = std::getenv("TERM")) {
     return StringSwitch<bool>(TermStr)
         .Case("ansi", true)
@@ -360,10 +368,54 @@ static bool terminalHasColors() {
   return false;
 }
 
+static bool terminalHasColors(int fd) {
+#ifdef LLVM_ENABLE_TERMINFO
+  // First, acquire a global lock because these C routines are thread hostile.
+  static std::mutex TermColorMutex;
+  std::lock_guard<std::mutex> G(TermColorMutex);
+
+  // Detach the terminal that is current (if any), remembering it so that it
+  // can be reinstated on every path out of this function.
+  struct term *previous_term = set_curterm(nullptr);
+  int errret = 0;
+  if (setupterm(nullptr, fd, &errret) != 0) {
+    // Regardless of why, if we can't get terminfo, we shouldn't try to print
+    // colors. Put the caller's terminal back before bailing out.
+    (void)set_curterm(previous_term);
+    return false;
+  }
+
+  // Test whether the terminal as set up supports color output. We only care
+  // whether ANSI color escape codes will be interpreted, so instead of
+  // depending on curses' 'has_colors' we query the baseline terminfo "colors"
+  // capability and assume that any terminal reporting colors at all will
+  // translate the escape codes into whatever range it supports. More detailed
+  // tests can be added here if users report them as necessary.
+  //
+  // The 'tigetnum' routine returns -2 or -1 on errors, and might return 0 if
+  // the terminfo says that no colors are supported. On errors fall back to
+  // the TERM-based check.
+  int colors_ti = tigetnum(const_cast<char *>("colors"));
+  bool HasColors =
+      colors_ti >= 0 ? colors_ti > 0 : checkTerminalEnvironmentForColors();
+
+  // Now extract the structure allocated by setupterm and free its memory
+  // through a really silly dance.
+  struct term *termp = set_curterm(previous_term);
+  (void)del_curterm(termp); // Drop any errors here.
+
+  return HasColors;
+#else
+  // When the terminfo database is not available, check if the current terminal
+  // is one of terminals that are known to support ANSI color escape codes.
+  return checkTerminalEnvironmentForColors();
+#endif
+}
+
 bool Process::FileDescriptorHasColors(int fd) {
   // A file descriptor has colors if it is displayed and the terminal has
   // colors.
-  return FileDescriptorIsDisplayed(fd) && terminalHasColors();
+  return FileDescriptorIsDisplayed(fd) && terminalHasColors(fd);
 }
 
 bool Process::StandardOutHasColors() {
diff --git a/llvm/utils/gn/README.rst b/llvm/utils/gn/README.rst
index 52d03be533e55e..9ca545061099d8 100644
--- a/llvm/utils/gn/README.rst
+++ b/llvm/utils/gn/README.rst
@@ -131,7 +131,7 @@ configure is used for three classes of feature checks:
 
 For the last two points, it would be nice if LLVM didn't have a single
 ``config.h`` header, but one header per toggle. That way, when e.g.
-``llvm_enable_zlib`` is toggled, only the 3 files caring about that setting
+``llvm_enable_terminfo`` is toggled, only the 3 files caring about that setting
 would need to be rebuilt, instead of everything including ``config.h``.
 
 GN doesn't believe in users setting arbitrary cflags from an environment
diff --git a/llvm/utils/gn/build/libs/terminfo/BUILD.gn b/llvm/utils/gn/build/libs/terminfo/BUILD.gn
new file mode 100644
index 00000000000000..10003d61c4df91
--- /dev/null
+++ b/llvm/utils/gn/build/libs/terminfo/BUILD.gn
@@ -0,0 +1,12 @@
+import("//llvm/utils/gn/build/libs/terminfo/enable.gni")
+
+config("terminfo_config") {
+  visibility = [ ":terminfo" ]
+  libs = [ "ncurses" ]
+}
+
+group("terminfo") {
+  if (llvm_enable_terminfo) {
+    public_configs = [ ":terminfo_config" ]
+  }
+}
diff --git a/llvm/utils/gn/build/libs/terminfo/enable.gni b/llvm/utils/gn/build/libs/terminfo/enable.gni
new file mode 100644
index 00000000000000..79ea2b601857ff
--- /dev/null
+++ b/llvm/utils/gn/build/libs/terminfo/enable.gni
@@ -0,0 +1,4 @@
+declare_args() {
+  # Whether to link against terminfo.
+  llvm_enable_terminfo = false
+}
diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
index d8266fee05014b..2da26d102e7723 100644
--- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
@@ -10,6 +10,7 @@ import("//llvm/utils/gn/build/buildflags.gni")
 import("//llvm/utils/gn/build/libs/curl/enable.gni")
 import("//llvm/utils/gn/build/libs/edit/enable.gni")
 import("//llvm/utils/gn/build/libs/pthread/enable.gni")
+import("//llvm/utils/gn/build/libs/terminfo/enable.gni")
 import("//llvm/utils/gn/build/libs/xar/enable.gni")
 import("//llvm/utils/gn/build/libs/xml/enable.gni")
 import("//llvm/utils/gn/build/libs/zlib/enable.gni")
@@ -293,6 +294,12 @@ write_cmake_config("config") {
     values += [ "HAVE_LIBEDIT=" ]
   }
 
+  if (llvm_enable_terminfo) {
+    values += [ "LLVM_ENABLE_TERMINFO=1" ]
+  } else {
+    values += [ "LLVM_ENABLE_TERMINFO=" ]
+  }
+
   if (llvm_enable_libxml2) {
     values += [ "LLVM_ENABLE_LIBXML2=1" ]
   } else {
diff --git a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
index 7728455499bf3d..941d448b3367c1 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
@@ -6,6 +6,7 @@ static_library("Support") {
     "//llvm/include/llvm/Support:write_vcsrevision",
     "//llvm/lib/Demangle",
     "//llvm/utils/gn/build/libs/pthread",
+    "//llvm/utils/gn/build/libs/terminfo",
     "//llvm/utils/gn/build/libs/zlib",
   ]
 
diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-config/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-config/BUILD.gn
index 711e4e3b431511..bf50cd0fce46bd 100644
--- a/llvm/utils/gn/secondary/llvm/tools/llvm-config/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/tools/llvm-config/BUILD.gn
@@ -1,6 +1,7 @@
 import("//llvm/lib/Target/targets_string.gni")
 import("//llvm/utils/gn/build/buildflags.gni")
 import("//llvm/utils/gn/build/libs/pthread/enable.gni")
+import("//llvm/utils/gn/build/libs/terminfo/enable.gni")
 import("//llvm/utils/gn/build/libs/xml/enable.gni")
 import("//llvm/utils/gn/build/libs/zlib/enable.gni")
 import("//llvm/utils/gn/build/write_cmake_config.gni")
@@ -35,7 +36,7 @@ write_cmake_config("BuildVariables.inc") {
     lib = ""
   }
 
-  # Windows doesn't use any of libxml2, zlib by default.
+  # Windows doesn't use any of libxml2, terminfo, zlib by default.
   # Make GN not warn about these variables being unused.
   not_needed([
                "l",
@@ -62,6 +63,9 @@ write_cmake_config("BuildVariables.inc") {
   if (llvm_enable_libxml2) {
     system_libs += " ${l}xml2${lib}"
   }
+  if (llvm_enable_terminfo) {
+    system_libs += " ${l}ncurses${lib}"
+  }
   if (llvm_enable_zlib) {
     system_libs += " ${l}z${lib}"
   }
diff --git a/utils/bazel/.bazelrc b/utils/bazel/.bazelrc
index 09111bcdc834ec..5a6d1889076afa 100644
--- a/utils/bazel/.bazelrc
+++ b/utils/bazel/.bazelrc
@@ -51,6 +51,9 @@ build --experimental_cc_shared_library
 build:zlib_external --repo_env=BAZEL_LLVM_ZLIB_STRATEGY=external
 build:zlib_system --repo_env=BAZEL_LLVM_ZLIB_STRATEGY=system
 
+build:terminfo_external --repo_env=BAZEL_LLVM_TERMINFO_STRATEGY=external
+build:terminfo_system --repo_env=BAZEL_LLVM_TERMINFO_STRATEGY=system
+
 ###############################################################################
 # Options for "generic_clang" builds: these options should generally apply to
 # builds using a Clang-based compiler, and default to the `clang` executable on
diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h
index a4fb47d677ab15..e9385f45c5e5cd 100644
--- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h
+++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h
@@ -222,6 +222,9 @@
 /* Define to 1 if you have the <sys/types.h> header file. */
 #define HAVE_SYS_TYPES_H 1
 
+/* Define if the setupterm() function is supported this platform. */
+/* LLVM_ENABLE_TERMINFO defined in Bazel */
+
 /* Define to 1 if you have the <termios.h> header file. */
 #define HAVE_TERMIOS_H 1
 
diff --git a/utils/bazel/llvm_configs/config.h.cmake b/utils/bazel/llvm_configs/config.h.cmake
index ff30741c8f360a..977c182e9d2b0d 100644
--- a/utils/bazel/llvm_configs/config.h.cmake
+++ b/utils/bazel/llvm_configs/config.h.cmake
@@ -209,6 +209,9 @@
 /* Define to 1 if you have the <sys/types.h> header file. */
 #cmakedefine HAVE_SYS_TYPES_H ${HAVE_SYS_TYPES_H}
 
+/* Define if the setupterm() function is supported this platform. */
+#cmakedefine LLVM_ENABLE_TERMINFO ${LLVM_ENABLE_TERMINFO}
+
 /* Define to 1 if you have the <termios.h> header file. */
 #cmakedefine HAVE_TERMIOS_H ${HAVE_TERMIOS_H}
 

From 1de6011c34b185235cd65c2e3fb030015d182968 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Wed, 29 May 2024 08:35:32 -0700
Subject: [PATCH 142/230] [ValueTypes] Remove hardcoded 224 from VTEmitter.cpp.
 NFC

Add a new bit to ValueTypes.td to indicate whether a type should be
part of the [FIRST_VALUETYPE,LAST_VALUETYPE] range or not.

This was reviewed as part of #93654.
---
 llvm/include/llvm/CodeGen/ValueTypes.td | 6 ++++++
 llvm/utils/TableGen/VTEmitter.cpp       | 3 ++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index 0d8eaf1b2b1dd0..a6981b0ffa13c2 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -23,6 +23,9 @@ class ValueType<int size, int value> {
   bit isFP = false;
   bit isVector = false;
   bit isScalable = false;
+  // Indicates this VT should be included in the
+  // [FIRST_VALUETYPE,LAST_VALUETYPE] range.
+  bit isNormalValueType = true;
 }
 
 class VTAny<int value> : ValueType<0, value> {
@@ -287,6 +290,7 @@ def aarch64svcount
               : ValueType<16,  199>;  // AArch64 predicate-as-counter
 def spirvbuiltin : ValueType<0, 200>; // SPIR-V's builtin type
 
+let isNormalValueType = false in {
 def token      : ValueType<0, 248>;  // TokenTy
 def MetadataVT : ValueType<0, 249> { // Metadata
   let LLVMName = "Metadata";
@@ -316,6 +320,8 @@ def iPTR       : ValueType<0, 254>;
 // Should only be used in TableGen.
 def Any        : VTAny<255>;
 
+} // isNormalValueType = false
+
 } // end defset ValueTypes
 
 /// This class is for targets that want to use pointer types in patterns
diff --git a/llvm/utils/TableGen/VTEmitter.cpp b/llvm/utils/TableGen/VTEmitter.cpp
index 9174fe48b62a9d..64b54ed134232c 100644
--- a/llvm/utils/TableGen/VTEmitter.cpp
+++ b/llvm/utils/TableGen/VTEmitter.cpp
@@ -72,6 +72,7 @@ void VTEmitter::run(raw_ostream &OS) {
     bool IsFP = VT->getValueAsBit("isFP");
     bool IsVector = VT->getValueAsBit("isVector");
     bool IsScalable = VT->getValueAsBit("isScalable");
+    bool IsNormalValueType =  VT->getValueAsBit("isNormalValueType");
 
     UpdateVTRange("INTEGER_FIXEDLEN_VECTOR_VALUETYPE", Name,
                   IsInteger && IsVector && !IsScalable);
@@ -85,7 +86,7 @@ void VTEmitter::run(raw_ostream &OS) {
     UpdateVTRange("VECTOR_VALUETYPE", Name, IsVector);
     UpdateVTRange("INTEGER_VALUETYPE", Name, IsInteger && !IsVector);
     UpdateVTRange("FP_VALUETYPE", Name, IsFP && !IsVector);
-    UpdateVTRange("VALUETYPE", Name, Value < 224);
+    UpdateVTRange("VALUETYPE", Name, IsNormalValueType);
 
     // clang-format off
     OS << "  GET_VT_ATTR("

From 6d90ac1e06f31cae9806a8815158e2851cf8e987 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= <schuett@gmail.com>
Date: Wed, 29 May 2024 18:05:33 +0200
Subject: [PATCH 143/230] [GlobalIsel] Combine freeze (#93239)

---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |    4 -
 .../include/llvm/Target/GlobalISel/Combine.td |   24 +-
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |   12 +-
 .../GlobalISel/CombinerHelperVectorOps.cpp    |   53 -
 llvm/lib/CodeGen/GlobalISel/Utils.cpp         |  102 +-
 .../GlobalISel/combine-extract-vec-elt.mir    |    4 +-
 .../AArch64/GlobalISel/combine-freeze.mir     | 1154 +++++++++++++++++
 .../GlobalISel/combine-insert-vec-elt.mir     |    6 +-
 ...galizer-combiner-divrem-insertpt-crash.mir |    3 +-
 llvm/test/CodeGen/AArch64/fast-isel-select.ll |  594 +++++++--
 llvm/test/CodeGen/AMDGPU/div_i128.ll          |  490 +++----
 11 files changed, 1971 insertions(+), 475 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 2111e82e1a99d2..2ddf20ebe7af72 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -840,10 +840,6 @@ class CombinerHelper {
   /// Combine extract vector element.
   bool matchExtractVectorElement(MachineInstr &MI, BuildFnTy &MatchInfo);
 
-  /// Combine extract vector element with freeze on the vector register.
-  bool matchExtractVectorElementWithFreeze(const MachineOperand &MO,
-                                           BuildFnTy &MatchInfo);
-
   /// Combine extract vector element with a build vector on the vector register.
   bool matchExtractVectorElementWithBuildVector(const MachineOperand &MO,
                                                 BuildFnTy &MatchInfo);
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 8012f919227778..383589add7755c 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1532,13 +1532,6 @@ def extract_vector_element_build_vector_trunc8 : GICombineRule<
    [{ return Helper.matchExtractVectorElementWithBuildVectorTrunc(${root}, ${matchinfo}); }]),
    (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>;
 
-def extract_vector_element_freeze : GICombineRule<
-   (defs root:$root, build_fn_matchinfo:$matchinfo),
-   (match (G_FREEZE $src, $input),
-          (G_EXTRACT_VECTOR_ELT $root, $src, $idx),
-   [{ return Helper.matchExtractVectorElementWithFreeze(${root}, ${matchinfo}); }]),
-   (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>;
-
 def sext_trunc : GICombineRule<
    (defs root:$root, build_fn_matchinfo:$matchinfo),
    (match (G_TRUNC $src, $x, (MIFlags NoSWrap)),
@@ -1636,7 +1629,6 @@ extract_vector_element_build_vector_trunc5,
 extract_vector_element_build_vector_trunc6,
 extract_vector_element_build_vector_trunc7,
 extract_vector_element_build_vector_trunc8,
-extract_vector_element_freeze,
 extract_vector_element_shuffle_vector,
 insert_vector_element_extract_vector_element
 ]>;
@@ -1713,6 +1705,17 @@ def integer_reassoc_combines: GICombineGroup<[
   APlusBMinusCPlusA
 ]>;
 
+def freeze_of_non_undef_non_poison : GICombineRule<
+   (defs root:$root),
+   (match (G_FREEZE $root, $src),
+          [{ return isGuaranteedNotToBeUndefOrPoison(${src}.getReg(), MRI); }]),
+   (apply (GIReplaceReg $root, $src))>;
+
+def freeze_combines: GICombineGroup<[
+  freeze_of_non_undef_non_poison,
+  push_freeze_to_prevent_poison_from_propagating
+]>;
+
 // FIXME: These should use the custom predicate feature once it lands.
 def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
                                      undef_to_negative_one,
@@ -1771,7 +1774,7 @@ def constant_fold_binops : GICombineGroup<[constant_fold_binop,
                                            constant_fold_fp_binop]>;
 
 def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
-    vector_ops_combines,
+    vector_ops_combines, freeze_combines,
     insert_vec_elt_combines, extract_vec_elt_combines, combines_for_extload,
     combine_extracted_vector_load,
     undef_combines, identity_combines, phi_combines,
@@ -1793,8 +1796,7 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
     sub_add_reg, select_to_minmax, redundant_binop_in_equality,
     fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
     combine_concat_vector, double_icmp_zero_and_or_combine, match_addos,
-    sext_trunc, zext_trunc, combine_shuffle_concat,
-    push_freeze_to_prevent_poison_from_propagating]>;
+    sext_trunc, zext_trunc, combine_shuffle_concat]>;
 
 // A combine group used to for prelegalizer combiners at -O0. The combines in
 // this group have been selected based on experiments to balance code size and
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 4cc602b5c87092..dcc1335a4bd44b 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -265,11 +265,14 @@ bool CombinerHelper::matchFreezeOfSingleMaybePoisonOperand(
     }
   }
 
-  cast<GenericMachineInstr>(OrigDef)->dropPoisonGeneratingFlags();
-
   // Eliminate freeze if all operands are guaranteed non-poison.
   if (!MaybePoisonOperand) {
-    MatchInfo = [=](MachineIRBuilder &B) { MRI.replaceRegWith(DstOp, OrigOp); };
+    MatchInfo = [=](MachineIRBuilder &B) {
+      Observer.changingInstr(*OrigDef);
+      cast<GenericMachineInstr>(OrigDef)->dropPoisonGeneratingFlags();
+      Observer.changedInstr(*OrigDef);
+      B.buildCopy(DstOp, OrigOp);
+    };
     return true;
   }
 
@@ -277,6 +280,9 @@ bool CombinerHelper::matchFreezeOfSingleMaybePoisonOperand(
   LLT MaybePoisonOperandRegTy = MRI.getType(MaybePoisonOperandReg);
 
   MatchInfo = [=](MachineIRBuilder &B) mutable {
+    Observer.changingInstr(*OrigDef);
+    cast<GenericMachineInstr>(OrigDef)->dropPoisonGeneratingFlags();
+    Observer.changedInstr(*OrigDef);
     B.setInsertPt(*OrigDef->getParent(), OrigDef->getIterator());
     auto Freeze = B.buildFreeze(MaybePoisonOperandRegTy, MaybePoisonOperandReg);
     replaceRegOpWith(
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp
index 21b1eb26281742..b4765fb280f9dd 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp
@@ -144,59 +144,6 @@ bool CombinerHelper::matchExtractVectorElementWithDifferentIndices(
   return false;
 }
 
-bool CombinerHelper::matchExtractVectorElementWithFreeze(
-    const MachineOperand &MO, BuildFnTy &MatchInfo) {
-  MachineInstr *Root = getDefIgnoringCopies(MO.getReg(), MRI);
-  GExtractVectorElement *Extract = cast<GExtractVectorElement>(Root);
-
-  Register Vector = Extract->getVectorReg();
-
-  //
-  //  %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32)
-  //  %freeze:_(<2 x s32>) = G_FREEZE %bv(<2 x s32>)
-  //  %extract:_(s32) = G_EXTRACT_VECTOR_ELT %bv(<2 x s32>), %opaque(s64)
-  //
-  //  -->
-  //
-  //  %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32)
-  //  %extract:_(s32) = G_EXTRACT_VECTOR_ELT %bv(<2 x s32>), %opaque(s64)
-  //  %freeze:_(s32) = G_FREEZE %extract(s32)
-  //
-  //
-
-  // For G_FREEZE, the input and the output types are identical. Moving the
-  // freeze from the Vector into the front of the extract preserves the freeze
-  // semantics. The result is still freeze'd. Furthermore, the Vector register
-  // becomes easier to analyze. A build vector could have been hidden behind the
-  // freeze.
-
-  // We expect a freeze on the Vector register.
-  GFreeze *Freeze = getOpcodeDef<GFreeze>(Vector, MRI);
-  if (!Freeze)
-    return false;
-
-  Register Dst = Extract->getReg(0);
-  LLT DstTy = MRI.getType(Dst);
-
-  // We first have to check for one-use and legality of the freeze.
-  // The type of the extractVectorElement did not change.
-  if (!MRI.hasOneNonDBGUse(Freeze->getReg(0)) ||
-      !isLegalOrBeforeLegalizer({TargetOpcode::G_FREEZE, {DstTy}}))
-    return false;
-
-  Register Index = Extract->getIndexReg();
-
-  // We move the freeze from the Vector register in front of the
-  // extractVectorElement.
-  MatchInfo = [=](MachineIRBuilder &B) {
-    auto Extract =
-        B.buildExtractVectorElement(DstTy, Freeze->getSourceReg(), Index);
-    B.buildFreeze(Dst, Extract);
-  };
-
-  return true;
-}
-
 bool CombinerHelper::matchExtractVectorElementWithBuildVector(
     const MachineOperand &MO, BuildFnTy &MatchInfo) {
   MachineInstr *Root = getDefIgnoringCopies(MO.getReg(), MRI);
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index f455482e02943f..e8438be94b3cd2 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -1724,6 +1724,39 @@ bool llvm::isPreISelGenericFloatingPointOpcode(unsigned Opc) {
   }
 }
 
+/// Shifts return poison if shiftwidth is larger than or equal to the bitwidth.
+static bool shiftAmountKnownInRange(Register ShiftAmount,
+                                    const MachineRegisterInfo &MRI) {
+  LLT Ty = MRI.getType(ShiftAmount); // NOTE: type of the amount, not the value
+
+  if (Ty.isScalableVector())
+    return false; // Can't tell, just return false to be safe
+
+  if (Ty.isScalar()) {
+    std::optional<ValueAndVReg> Val =
+        getIConstantVRegValWithLookThrough(ShiftAmount, MRI);
+    if (!Val)
+      return false;
+    return Val->Value.ult(Ty.getScalarSizeInBits());
+  }
+
+  GBuildVector *BV = getOpcodeDef<GBuildVector>(ShiftAmount, MRI);
+  if (!BV)
+    return false;
+
+  unsigned Sources = BV->getNumSources();
+  for (unsigned I = 0; I < Sources; ++I) {
+    std::optional<ValueAndVReg> Val =
+        getIConstantVRegValWithLookThrough(BV->getSourceReg(I), MRI);
+    if (!Val)
+      return false;
+    if (!Val->Value.ult(Ty.getScalarSizeInBits()))
+      return false;
+  }
+
+  return true;
+}
+
 namespace {
 enum class UndefPoisonKind {
   PoisonOnly = (1 << 0),
@@ -1732,11 +1765,11 @@ enum class UndefPoisonKind {
 };
 }
 
-[[maybe_unused]] static bool includesPoison(UndefPoisonKind Kind) {
+static bool includesPoison(UndefPoisonKind Kind) {
   return (unsigned(Kind) & unsigned(UndefPoisonKind::PoisonOnly)) != 0;
 }
 
-[[maybe_unused]] static bool includesUndef(UndefPoisonKind Kind) {
+static bool includesUndef(UndefPoisonKind Kind) {
   return (unsigned(Kind) & unsigned(UndefPoisonKind::UndefOnly)) != 0;
 }
 
@@ -1745,18 +1778,55 @@ static bool canCreateUndefOrPoison(Register Reg, const MachineRegisterInfo &MRI,
                                    UndefPoisonKind Kind) {
   MachineInstr *RegDef = MRI.getVRegDef(Reg);
 
-  if (auto *GMI = dyn_cast<GenericMachineInstr>(RegDef)) {
-    if (ConsiderFlagsAndMetadata && includesPoison(Kind) &&
-        GMI->hasPoisonGeneratingFlags())
-      return true;
-  } else {
-    // Conservatively return true.
-    return true;
-  }
+  if (ConsiderFlagsAndMetadata && includesPoison(Kind))
+    if (auto *GMI = dyn_cast<GenericMachineInstr>(RegDef))
+      if (GMI->hasPoisonGeneratingFlags())
+        return true;
 
+  // Check whether opcode is a poison/undef-generating operation.
   switch (RegDef->getOpcode()) {
   case TargetOpcode::G_FREEZE:
+  case TargetOpcode::G_BUILD_VECTOR:
+  case TargetOpcode::G_CONSTANT_FOLD_BARRIER:
     return false;
+  case TargetOpcode::G_SHL:
+  case TargetOpcode::G_ASHR:
+  case TargetOpcode::G_LSHR:
+    return includesPoison(Kind) &&
+           !shiftAmountKnownInRange(RegDef->getOperand(2).getReg(), MRI);
+  case TargetOpcode::G_FPTOSI:
+  case TargetOpcode::G_FPTOUI:
+    // fptosi/ui yields poison if the resulting value does not fit in the
+    // destination type.
+    return true;
+  case TargetOpcode::G_CTLZ:
+  case TargetOpcode::G_CTTZ:
+  case TargetOpcode::G_ABS:
+  case TargetOpcode::G_CTPOP:
+  case TargetOpcode::G_BSWAP:
+  case TargetOpcode::G_BITREVERSE:
+  case TargetOpcode::G_FSHL:
+  case TargetOpcode::G_FSHR:
+  case TargetOpcode::G_SMAX:
+  case TargetOpcode::G_SMIN:
+  case TargetOpcode::G_UMAX:
+  case TargetOpcode::G_UMIN:
+  case TargetOpcode::G_PTRMASK:
+  case TargetOpcode::G_SADDO:
+  case TargetOpcode::G_SSUBO:
+  case TargetOpcode::G_UADDO:
+  case TargetOpcode::G_USUBO:
+  case TargetOpcode::G_SMULO:
+  case TargetOpcode::G_UMULO:
+  case TargetOpcode::G_SADDSAT:
+  case TargetOpcode::G_UADDSAT:
+  case TargetOpcode::G_SSUBSAT:
+  case TargetOpcode::G_USUBSAT:
+    return false;
+  case TargetOpcode::G_SSHLSAT:
+  case TargetOpcode::G_USHLSAT:
+    return includesPoison(Kind) &&
+           !shiftAmountKnownInRange(RegDef->getOperand(2).getReg(), MRI);
   default:
     return !isa<GCastOp>(RegDef) && !isa<GBinOp>(RegDef);
   }
@@ -1776,6 +1846,18 @@ static bool isGuaranteedNotToBeUndefOrPoison(Register Reg,
     return true;
   case TargetOpcode::G_IMPLICIT_DEF:
     return !includesUndef(Kind);
+  case TargetOpcode::G_CONSTANT:
+  case TargetOpcode::G_FCONSTANT:
+    return true;
+  case TargetOpcode::G_BUILD_VECTOR: {
+    GBuildVector *BV = cast<GBuildVector>(RegDef);
+    unsigned NumSources = BV->getNumSources();
+    for (unsigned I = 0; I < NumSources; ++I)
+      if (!::isGuaranteedNotToBeUndefOrPoison(BV->getSourceReg(I), MRI,
+                                              Depth + 1, Kind))
+        return false;
+    return true;
+  }
   default: {
     auto MOCheck = [&](const MachineOperand &MO) {
       if (!MO.isReg())
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir
index d5d33742148ada..70241e71aa593f 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir
@@ -361,8 +361,8 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: %vec:_(<2 x s64>) = COPY $q0
     ; CHECK-NEXT: %idx:_(s64) = COPY $x1
-    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT %vec(<2 x s64>), %idx(s64)
-    ; CHECK-NEXT: %extract:_(s64) = G_FREEZE [[EVEC]]
+    ; CHECK-NEXT: %fvec:_(<2 x s64>) = G_FREEZE %vec
+    ; CHECK-NEXT: %extract:_(s64) = G_EXTRACT_VECTOR_ELT %fvec(<2 x s64>), %idx(s64)
     ; CHECK-NEXT: $x0 = COPY %extract(s64)
     ; CHECK-NEXT: RET_ReallyLR implicit $x0
     %vec:_(<2 x s64>) = COPY $q0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir
new file mode 100644
index 00000000000000..5ec8ef5cdcb196
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir
@@ -0,0 +1,1154 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s
+
+...
+---
+name:            freeze_register
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_register
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY]]
+    ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = G_FREEZE %0
+    $x0 = COPY %1(s64)
+    RET_ReallyLR implicit $x0
+
+...
+---
+name:            freeze_constant
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_constant
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: $x0 = COPY [[C]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %1:_(s64) = G_CONSTANT i64 9
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+
+...
+---
+name:            freeze_fconstant
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_fconstant
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 9.000000e+00
+    ; CHECK-NEXT: $x0 = COPY [[C]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %1:_(s64) = G_FCONSTANT double 9.0
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_undef
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_undef
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[DEF]]
+    ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %1:_(s64) = G_IMPLICIT_DEF
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_freeze
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_freeze
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY]]
+    ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = G_FREEZE %0
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_buildvector
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_buildvector
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY]](s32), [[COPY]](s32), [[COPY]](s32)
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<4 x s32>) = G_FREEZE [[BUILD_VECTOR]]
+    ; CHECK-NEXT: $q0 = COPY [[FREEZE]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:_(s32) = COPY $w0
+    %1:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %0(s32), %0(s32), %0(s32)
+    %2:_(<4 x s32>) = G_FREEZE %1
+    $q0 = COPY %2(<4 x s32>)
+    RET_ReallyLR implicit $q0
+...
+---
+name:            freeze_buildvector_const
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_buildvector_const
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %c:_(s32) = G_CONSTANT i32 6
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR %c(s32), %c(s32), %c(s32), %c(s32)
+    ; CHECK-NEXT: $q0 = COPY [[BUILD_VECTOR]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:_(s32) = COPY $w0
+    %c:_(s32) = G_CONSTANT i32 6
+    %1:_(<4 x s32>) = G_BUILD_VECTOR %c(s32), %c(s32), %c(s32), %c(s32)
+    %2:_(<4 x s32>) = G_FREEZE %1
+    $q0 = COPY %2(<4 x s32>)
+    RET_ReallyLR implicit $q0
+...
+---
+name:            freeze_disjoint_or_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_disjoint_or_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: $x0 = COPY %c(s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = disjoint G_OR %c, %c
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_or_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_or_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: $x0 = COPY %c(s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_OR %c, %c
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_nneg_zext_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_nneg_zext_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 9
+    ; CHECK-NEXT: %c:_(s32) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %c(s32)
+    ; CHECK-NEXT: $x0 = COPY [[ZEXT]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s32) = G_CONSTANT i32 9
+    %c:_(s32) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = nneg G_ZEXT %c
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_zext_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_zext_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 9
+    ; CHECK-NEXT: %c:_(s32) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %c(s32)
+    ; CHECK-NEXT: $x0 = COPY [[ZEXT]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s32) = G_CONSTANT i32 9
+    %c:_(s32) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_ZEXT %c
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_udiv_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_udiv_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[UDIV:%[0-9]+]]:_(s64) = G_UDIV %c, %c
+    ; CHECK-NEXT: $x0 = COPY [[UDIV]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_UDIV %c, %c
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_exact_udiv_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_exact_udiv_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[UDIV:%[0-9]+]]:_(s64) = G_UDIV %c, %c
+    ; CHECK-NEXT: $x0 = COPY [[UDIV]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = exact G_UDIV %c, %c
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_mul_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_mul_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL %c, %c
+    ; CHECK-NEXT: $x0 = COPY [[MUL]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_MUL %c, %c
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_nsw_mul_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_nsw_mul_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL %c, %c
+    ; CHECK-NEXT: $x0 = COPY [[MUL]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = nsw G_MUL %c, %c
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_trunc_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_trunc_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %c(s64)
+    ; CHECK-NEXT: $w0 = COPY [[TRUNC]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s32) = G_TRUNC %c
+    %2:_(s32) = G_FREEZE %1
+    $w0 = COPY %2(s32)
+    RET_ReallyLR implicit $w0
+...
+---
+name:            freeze_nuw_trunc_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_nuw_trunc_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %c(s64)
+    ; CHECK-NEXT: $w0 = COPY [[TRUNC]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s32) = nuw G_TRUNC %c
+    %2:_(s32) = G_FREEZE %1
+    $w0 = COPY %2(s32)
+    RET_ReallyLR implicit $w0
+...
+---
+name:            freeze_add_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_add_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD %c, %c
+    ; CHECK-NEXT: $x0 = COPY [[ADD]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_ADD %c, %c
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_nuw_add_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_nuw_add_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD %c, %c
+    ; CHECK-NEXT: $x0 = COPY [[ADD]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = nuw G_ADD %c, %c
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_xor_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_xor_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR %c, %c
+    ; CHECK-NEXT: $x0 = COPY [[XOR]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_XOR %c, %c
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_fptosi_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_fptosi_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[FPTOSI:%[0-9]+]]:_(s64) = G_FPTOSI %c(s64)
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[FPTOSI]]
+    ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_FPTOSI %c
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_fptoui_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_fptoui_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[FPTOUI:%[0-9]+]]:_(s64) = G_FPTOUI %c(s64)
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[FPTOUI]]
+    ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_FPTOUI %c
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_shl_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_shl_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL %c, %c(s64)
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[SHL]]
+    ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_SHL %c, %c
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_ashr_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_ashr_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR %c, %c(s64)
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[ASHR]]
+    ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_ASHR %c, %c
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_lshr_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_lshr_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR %c, %c(s64)
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[LSHR]]
+    ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_LSHR %c, %c
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_ctlz_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_ctlz_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[CTLZ:%[0-9]+]]:_(s64) = G_CTLZ %c(s64)
+    ; CHECK-NEXT: $x0 = COPY [[CTLZ]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_CTLZ %c
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_cttz_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_cttz_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[CTTZ:%[0-9]+]]:_(s64) = G_CTTZ %c(s64)
+    ; CHECK-NEXT: $x0 = COPY [[CTTZ]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_CTTZ %c
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_abs_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_abs_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[ABS:%[0-9]+]]:_(s64) = G_ABS %c
+    ; CHECK-NEXT: $x0 = COPY [[ABS]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_ABS %c
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_bswap_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_bswap_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[BSWAP:%[0-9]+]]:_(s64) = G_BSWAP %c
+    ; CHECK-NEXT: $x0 = COPY [[BSWAP]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_BSWAP %c
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_bitreverse_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_bitreverse_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[BITREVERSE:%[0-9]+]]:_(s64) = G_BITREVERSE %c
+    ; CHECK-NEXT: $x0 = COPY [[BITREVERSE]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_BITREVERSE %c
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_icmp_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_icmp_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %cmp:_(s1) = G_ICMP intpred(eq), %c(s64), %d
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %cmp
+    ; CHECK-NEXT: %ext:_(s64) = G_ZEXT [[FREEZE]](s1)
+    ; CHECK-NEXT: $x0 = COPY %ext(s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %cmp:_(s1) = G_ICMP intpred(eq), %c(s64), %d
+    %2:_(s1) = G_FREEZE %cmp
+    %ext:_(s64) = G_ZEXT %2(s1)
+    $x0 = COPY %ext(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_fcmp_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_fcmp_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %cmp:_(s1) = G_FCMP floatpred(oeq), %c(s64), %d
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %cmp
+    ; CHECK-NEXT: %ext:_(s64) = G_ZEXT [[FREEZE]](s1)
+    ; CHECK-NEXT: $x0 = COPY %ext(s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %cmp:_(s1) = G_FCMP floatpred(oeq), %c(s64), %d
+    %2:_(s1) = G_FREEZE %cmp
+    %ext:_(s64) = G_ZEXT %2(s1)
+    $x0 = COPY %ext(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_fshl_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_fshl_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[ROTL:%[0-9]+]]:_(s64) = G_ROTL %c, %c(s64)
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[ROTL]]
+    ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_FSHL %c, %c, %c
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_fshr_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_fshr_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[ROTR:%[0-9]+]]:_(s64) = G_ROTR %c, %c(s64)
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[ROTR]]
+    ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_FSHR %c, %c, %c
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_smax_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_smax_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[SMAX:%[0-9]+]]:_(s64) = G_SMAX %c, %d
+    ; CHECK-NEXT: $x0 = COPY [[SMAX]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_SMAX %c, %d
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_smin_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_smin_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[SMIN:%[0-9]+]]:_(s64) = G_SMIN %c, %d
+    ; CHECK-NEXT: $x0 = COPY [[SMIN]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_SMIN %c, %d
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_umax_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_umax_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[UMAX:%[0-9]+]]:_(s64) = G_UMAX %c, %d
+    ; CHECK-NEXT: $x0 = COPY [[UMAX]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_UMAX %c, %d
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_umin_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_umin_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[UMIN:%[0-9]+]]:_(s64) = G_UMIN %c, %d
+    ; CHECK-NEXT: $x0 = COPY [[UMIN]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_UMIN %c, %d
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_ptrmask_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_ptrmask_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %p:_(p0) = COPY $x0
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(p0) = G_FREEZE %p
+    ; CHECK-NEXT: [[PTRMASK:%[0-9]+]]:_(p0) = G_PTRMASK [[FREEZE]], %cst(s64)
+    ; CHECK-NEXT: $x0 = COPY [[PTRMASK]](p0)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %p:_(p0) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(p0) = G_PTRMASK %p, %cst
+    %2:_(p0) = G_FREEZE %1
+    $x0 = COPY %2(p0)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_saddo_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_saddo_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD %c, %d
+    ; CHECK-NEXT: $x0 = COPY [[ADD]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64), %o:_(s1) = G_SADDO %c, %d
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_ssubo_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_ssubo_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %4:_(s64), %o:_(s1) = G_SSUBO %c, %d
+    ; CHECK-NEXT: $x0 = COPY %4(s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64), %o:_(s1) = G_SSUBO %c, %d
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_uaddo_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_uaddo_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD %c, %d
+    ; CHECK-NEXT: $x0 = COPY [[ADD]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64), %o:_(s1) = G_UADDO %c, %d
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_usubo_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_usubo_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %4:_(s64), %o:_(s1) = G_USUBO %c, %d
+    ; CHECK-NEXT: $x0 = COPY %4(s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64), %o:_(s1) = G_USUBO %c, %d
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_smulo_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_smulo_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %4:_(s64), %o:_(s1) = G_SMULO %c, %d
+    ; CHECK-NEXT: $x0 = COPY %4(s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64), %o:_(s1) = G_SMULO %c, %d
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_umulo_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_umulo_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %4:_(s64), %o:_(s1) = G_UMULO %c, %d
+    ; CHECK-NEXT: $x0 = COPY %4(s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64), %o:_(s1) = G_UMULO %c, %d
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_saddsat_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_saddsat_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[SADDSAT:%[0-9]+]]:_(s64) = G_SADDSAT %c, %d
+    ; CHECK-NEXT: $x0 = COPY [[SADDSAT]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_SADDSAT %c, %d
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_uaddsat_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_uaddsat_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[UADDSAT:%[0-9]+]]:_(s64) = G_UADDSAT %c, %d
+    ; CHECK-NEXT: $x0 = COPY [[UADDSAT]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_UADDSAT %c, %d
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_ssubsat_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_ssubsat_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[SSUBSAT:%[0-9]+]]:_(s64) = G_SSUBSAT %c, %d
+    ; CHECK-NEXT: $x0 = COPY [[SSUBSAT]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_SSUBSAT %c, %d
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_usubsat_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_usubsat_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[USUBSAT:%[0-9]+]]:_(s64) = G_USUBSAT %c, %d
+    ; CHECK-NEXT: $x0 = COPY [[USUBSAT]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_USUBSAT %c, %d
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_sshlsat_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_sshlsat_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[SSHLSAT:%[0-9]+]]:_(s64) = G_SSHLSAT %c, %d(s64)
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[SSHLSAT]]
+    ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_SSHLSAT %c, %d
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            freeze_ushlsat_fold_barrier
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: freeze_ushlsat_fold_barrier
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s64) = G_CONSTANT i64 9
+    ; CHECK-NEXT: %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: [[USHLSAT:%[0-9]+]]:_(s64) = G_USHLSAT %c, %d(s64)
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[USHLSAT]]
+    ; CHECK-NEXT: $x0 = COPY [[FREEZE]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $x0
+    %cst:_(s64) = G_CONSTANT i64 9
+    %c:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %d:_(s64) = G_CONSTANT_FOLD_BARRIER %cst
+    %1:_(s64) = G_USHLSAT %c, %d
+    %2:_(s64) = G_FREEZE %1
+    $x0 = COPY %2(s64)
+    RET_ReallyLR implicit $x0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-insert-vec-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-insert-vec-elt.mir
index 0c67a867580ccd..c000a8e635bc6b 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-insert-vec-elt.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-insert-vec-elt.mir
@@ -253,10 +253,10 @@ body:             |
     ; CHECK: liveins: $x0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 127
-    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s8>) = G_BUILD_VECTOR [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
-    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<32 x s8>) = G_FREEZE [[BUILD_VECTOR]]
-    ; CHECK-NEXT: G_STORE [[FREEZE]](<32 x s8>), [[COPY]](p0) :: (store (<32 x s8>))
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s8>) = G_BUILD_VECTOR [[C]](s8), [[C]](s8), [[C]](s8), [[DEF]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8)
+    ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<32 x s8>), [[COPY]](p0) :: (store (<32 x s8>))
     ; CHECK-NEXT: RET_ReallyLR
     %3:_(s8) = G_CONSTANT i8 127
     %2:_(<32 x s8>) = G_BUILD_VECTOR %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-crash.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-crash.mir
index ca403f85156113..767ece62b8731f 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-crash.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-crash.mir
@@ -24,8 +24,7 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
-  ; CHECK-NEXT:   [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[C1]]
-  ; CHECK-NEXT:   [[UDIV:%[0-9]+]]:_(s64) = G_UDIV [[FREEZE]], [[C]]
+  ; CHECK-NEXT:   [[UDIV:%[0-9]+]]:_(s64) = G_UDIV [[C1]], [[C]]
   ; CHECK-NEXT:   G_STORE [[UDIV]](s64), [[COPY]](p0) :: (store (s64))
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
diff --git a/llvm/test/CodeGen/AArch64/fast-isel-select.ll b/llvm/test/CodeGen/AArch64/fast-isel-select.ll
index 6ad4a5ae572e0e..65701343ccc1e5 100644
--- a/llvm/test/CodeGen/AArch64/fast-isel-select.ll
+++ b/llvm/test/CodeGen/AArch64/fast-isel-select.ll
@@ -1,175 +1,382 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=aarch64-apple-darwin                             -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-apple-darwin -global-isel -verify-machineinstrs < %s | FileCheck %s --check-prefix=GISEL
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-FASTISEL
+; RUN: llc -mtriple=aarch64-apple-darwin -global-isel -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GISEL
 
 ; First test the different supported value types for select.
 define zeroext i1 @select_i1(i1 zeroext %c, i1 zeroext %a, i1 zeroext %b) {
-; CHECK-LABEL: select_i1
-; CHECK:       {{cmp w0, #0|tst w0, #0x1}}
-; CHECK-NEXT:  csel {{w[0-9]+}}, w1, w2, ne
+; GISEL-LABEL: select_i1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    tst w0, #0x1
+; GISEL-NEXT:    csel w0, w1, w2, ne
+; GISEL-NEXT:    ret
+; CHECK-FASTISEL-LABEL: select_i1:
+; CHECK-FASTISEL:       ; %bb.0:
+; CHECK-FASTISEL-NEXT:    tst w0, #0x1
+; CHECK-FASTISEL-NEXT:    csel w8, w1, w2, ne
+; CHECK-FASTISEL-NEXT:    and w0, w8, #0x1
+; CHECK-FASTISEL-NEXT:    ret
+;
+; CHECK-GISEL-LABEL: select_i1:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    tst w0, #0x1
+; CHECK-GISEL-NEXT:    csel w0, w1, w2, ne
+; CHECK-GISEL-NEXT:    ret
   %1 = select i1 %c, i1 %a, i1 %b
   ret i1 %1
 }
 
 define zeroext i8 @select_i8(i1 zeroext %c, i8 zeroext %a, i8 zeroext %b) {
-; CHECK-LABEL: select_i8
-; CHECK:       {{cmp w0, #0|tst w0, #0x1}}
-; CHECK-NEXT:  csel {{w[0-9]+}}, w1, w2, ne
+; GISEL-LABEL: select_i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    tst w0, #0x1
+; GISEL-NEXT:    csel w0, w1, w2, ne
+; GISEL-NEXT:    ret
+; CHECK-FASTISEL-LABEL: select_i8:
+; CHECK-FASTISEL:       ; %bb.0:
+; CHECK-FASTISEL-NEXT:    tst w0, #0x1
+; CHECK-FASTISEL-NEXT:    csel w8, w1, w2, ne
+; CHECK-FASTISEL-NEXT:    uxtb w0, w8
+; CHECK-FASTISEL-NEXT:    ret
+;
+; CHECK-GISEL-LABEL: select_i8:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    tst w0, #0x1
+; CHECK-GISEL-NEXT:    csel w0, w1, w2, ne
+; CHECK-GISEL-NEXT:    ret
   %1 = select i1 %c, i8 %a, i8 %b
   ret i8 %1
 }
 
 define zeroext i16 @select_i16(i1 zeroext %c, i16 zeroext %a, i16 zeroext %b) {
-; CHECK-LABEL: select_i16
-; CHECK:       {{cmp w0, #0|tst w0, #0x1}}
-; CHECK-NEXT:  csel {{w[0-9]+}}, w1, w2, ne
+; GISEL-LABEL: select_i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    tst w0, #0x1
+; GISEL-NEXT:    csel w0, w1, w2, ne
+; GISEL-NEXT:    ret
+; CHECK-FASTISEL-LABEL: select_i16:
+; CHECK-FASTISEL:       ; %bb.0:
+; CHECK-FASTISEL-NEXT:    tst w0, #0x1
+; CHECK-FASTISEL-NEXT:    csel w8, w1, w2, ne
+; CHECK-FASTISEL-NEXT:    uxth w0, w8
+; CHECK-FASTISEL-NEXT:    ret
+;
+; CHECK-GISEL-LABEL: select_i16:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    tst w0, #0x1
+; CHECK-GISEL-NEXT:    csel w0, w1, w2, ne
+; CHECK-GISEL-NEXT:    ret
   %1 = select i1 %c, i16 %a, i16 %b
   ret i16 %1
 }
 
 define i32 @select_i32(i1 zeroext %c, i32 %a, i32 %b) {
-; CHECK-LABEL: select_i32
-; CHECK:       {{cmp w0, #0|tst w0, #0x1}}
-; CHECK-NEXT:  csel {{w[0-9]+}}, w1, w2, ne
+; GISEL-LABEL: select_i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    tst w0, #0x1
+; GISEL-NEXT:    csel w0, w1, w2, ne
+; GISEL-NEXT:    ret
+; CHECK-FASTISEL-LABEL: select_i32:
+; CHECK-FASTISEL:       ; %bb.0:
+; CHECK-FASTISEL-NEXT:    tst w0, #0x1
+; CHECK-FASTISEL-NEXT:    csel w0, w1, w2, ne
+; CHECK-FASTISEL-NEXT:    ret
+;
+; CHECK-GISEL-LABEL: select_i32:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    tst w0, #0x1
+; CHECK-GISEL-NEXT:    csel w0, w1, w2, ne
+; CHECK-GISEL-NEXT:    ret
   %1 = select i1 %c, i32 %a, i32 %b
   ret i32 %1
 }
 
 define i64 @select_i64(i1 zeroext %c, i64 %a, i64 %b) {
-; CHECK-LABEL: select_i64
-; CHECK:       {{cmp w0, #0|tst w0, #0x1}}
-; CHECK-NEXT:  csel {{x[0-9]+}}, x1, x2, ne
+; GISEL-LABEL: select_i64:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    tst w0, #0x1
+; GISEL-NEXT:    csel x0, x1, x2, ne
+; GISEL-NEXT:    ret
+; CHECK-FASTISEL-LABEL: select_i64:
+; CHECK-FASTISEL:       ; %bb.0:
+; CHECK-FASTISEL-NEXT:    tst w0, #0x1
+; CHECK-FASTISEL-NEXT:    csel x0, x1, x2, ne
+; CHECK-FASTISEL-NEXT:    ret
+;
+; CHECK-GISEL-LABEL: select_i64:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    tst w0, #0x1
+; CHECK-GISEL-NEXT:    csel x0, x1, x2, ne
+; CHECK-GISEL-NEXT:    ret
   %1 = select i1 %c, i64 %a, i64 %b
   ret i64 %1
 }
 
 define float @select_f32(i1 zeroext %c, float %a, float %b) {
-; CHECK-LABEL: select_f32
-; CHECK:       {{cmp w0, #0|tst w0, #0x1}}
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, ne
-; GISEL-LABEL: select_f32
-; GISEL:       {{cmp w0, #0|tst w0, #0x1}}
-; GISEL-NEXT:  fcsel {{s[0-9]+}}, s0, s1, ne
+; GISEL-LABEL: select_f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    tst w0, #0x1
+; GISEL-NEXT:    fcsel s0, s0, s1, ne
+; GISEL-NEXT:    ret
+; CHECK-FASTISEL-LABEL: select_f32:
+; CHECK-FASTISEL:       ; %bb.0:
+; CHECK-FASTISEL-NEXT:    tst w0, #0x1
+; CHECK-FASTISEL-NEXT:    fcsel s0, s0, s1, ne
+; CHECK-FASTISEL-NEXT:    ret
+;
+; CHECK-GISEL-LABEL: select_f32:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    tst w0, #0x1
+; CHECK-GISEL-NEXT:    fcsel s0, s0, s1, ne
+; CHECK-GISEL-NEXT:    ret
   %1 = select i1 %c, float %a, float %b
   ret float %1
 }
 
 define double @select_f64(i1 zeroext %c, double %a, double %b) {
-; CHECK-LABEL: select_f64
-; CHECK:       {{cmp w0, #0|tst w0, #0x1}}
-; CHECK-NEXT:  fcsel {{d[0-9]+}}, d0, d1, ne
-; GISEL-LABEL: select_f64
-; GISEL:       {{cmp w0, #0|tst w0, #0x1}}
-; GISEL-NEXT:  fcsel {{d[0-9]+}}, d0, d1, ne
+; GISEL-LABEL: select_f64:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    tst w0, #0x1
+; GISEL-NEXT:    fcsel d0, d0, d1, ne
+; GISEL-NEXT:    ret
+; CHECK-FASTISEL-LABEL: select_f64:
+; CHECK-FASTISEL:       ; %bb.0:
+; CHECK-FASTISEL-NEXT:    tst w0, #0x1
+; CHECK-FASTISEL-NEXT:    fcsel d0, d0, d1, ne
+; CHECK-FASTISEL-NEXT:    ret
+;
+; CHECK-GISEL-LABEL: select_f64:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    tst w0, #0x1
+; CHECK-GISEL-NEXT:    fcsel d0, d0, d1, ne
+; CHECK-GISEL-NEXT:    ret
   %1 = select i1 %c, double %a, double %b
   ret double %1
 }
 
 ; Now test the folding of all compares.
 define float @select_fcmp_false(float %x, float %a, float %b) {
-; CHECK-LABEL: select_fcmp_false
-; CHECK:       fmov {{s[0-9]+}}, s2
+; CHECK-FASTISEL-LABEL: select_fcmp_false:
+; CHECK-FASTISEL:       ; %bb.0:
+; CHECK-FASTISEL-NEXT:    fmov s0, s2
+; CHECK-FASTISEL-NEXT:    ret
+;
+; CHECK-GISEL-LABEL: select_fcmp_false:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    fcmp s0, s0
+; CHECK-GISEL-NEXT:    fcsel s0, s1, s2, gt
+; CHECK-GISEL-NEXT:    ret
+; GISEL-LABEL: select_fcmp_false:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s0
+; GISEL-NEXT:    fcsel s0, s1, s2, gt
+; GISEL-NEXT:    ret
   %1 = fcmp ogt float %x, %x
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_fcmp_ogt(float %x, float %y, float %a, float %b) {
-; CHECK-LABEL: select_fcmp_ogt
-; CHECK:       fcmp s0, s1
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, gt
+; CHECK-LABEL: select_fcmp_ogt:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    fcsel s0, s2, s3, gt
+; CHECK-NEXT:    ret
+; GISEL-LABEL: select_fcmp_ogt:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    fcsel s0, s2, s3, gt
+; GISEL-NEXT:    ret
   %1 = fcmp ogt float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_fcmp_oge(float %x, float %y, float %a, float %b) {
-; CHECK-LABEL: select_fcmp_oge
-; CHECK:       fcmp s0, s1
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, ge
+; CHECK-LABEL: select_fcmp_oge:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    fcsel s0, s2, s3, ge
+; CHECK-NEXT:    ret
+; GISEL-LABEL: select_fcmp_oge:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    fcsel s0, s2, s3, ge
+; GISEL-NEXT:    ret
   %1 = fcmp oge float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_fcmp_olt(float %x, float %y, float %a, float %b) {
-; CHECK-LABEL: select_fcmp_olt
-; CHECK:       fcmp s0, s1
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, mi
+; CHECK-LABEL: select_fcmp_olt:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    fcsel s0, s2, s3, mi
+; CHECK-NEXT:    ret
+; GISEL-LABEL: select_fcmp_olt:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    fcsel s0, s2, s3, mi
+; GISEL-NEXT:    ret
   %1 = fcmp olt float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_fcmp_ole(float %x, float %y, float %a, float %b) {
-; CHECK-LABEL: select_fcmp_ole
-; CHECK:       fcmp s0, s1
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, ls
+; CHECK-LABEL: select_fcmp_ole:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    fcsel s0, s2, s3, ls
+; CHECK-NEXT:    ret
+; GISEL-LABEL: select_fcmp_ole:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    fcsel s0, s2, s3, ls
+; GISEL-NEXT:    ret
   %1 = fcmp ole float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_fcmp_one(float %x, float %y, float %a, float %b) {
-; CHECK-LABEL: select_fcmp_one
-; CHECK:       fcmp s0, s1
-; CHECK-NEXT:  fcsel [[REG:s[0-9]+]], s2, s3, mi
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, [[REG]], gt
+; CHECK-FASTISEL-LABEL: select_fcmp_one:
+; CHECK-FASTISEL:       ; %bb.0:
+; CHECK-FASTISEL-NEXT:    fcmp s0, s1
+; CHECK-FASTISEL-NEXT:    fcsel s0, s2, s3, mi
+; CHECK-FASTISEL-NEXT:    fcsel s0, s2, s0, gt
+; CHECK-FASTISEL-NEXT:    ret
+;
+; CHECK-GISEL-LABEL: select_fcmp_one:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    fcmp s0, s1
+; CHECK-GISEL-NEXT:    cset w8, mi
+; CHECK-GISEL-NEXT:    cset w9, gt
+; CHECK-GISEL-NEXT:    orr w8, w8, w9
+; CHECK-GISEL-NEXT:    tst w8, #0x1
+; CHECK-GISEL-NEXT:    fcsel s0, s2, s3, ne
+; CHECK-GISEL-NEXT:    ret
+; GISEL-LABEL: select_fcmp_one:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    cset w8, mi
+; GISEL-NEXT:    cset w9, gt
+; GISEL-NEXT:    orr w8, w8, w9
+; GISEL-NEXT:    tst w8, #0x1
+; GISEL-NEXT:    fcsel s0, s2, s3, ne
+; GISEL-NEXT:    ret
   %1 = fcmp one float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_fcmp_ord(float %x, float %y, float %a, float %b) {
-; CHECK-LABEL: select_fcmp_ord
-; CHECK:       fcmp s0, s1
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, vc
+; CHECK-LABEL: select_fcmp_ord:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    fcsel s0, s2, s3, vc
+; CHECK-NEXT:    ret
+; GISEL-LABEL: select_fcmp_ord:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    fcsel s0, s2, s3, vc
+; GISEL-NEXT:    ret
   %1 = fcmp ord float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_fcmp_uno(float %x, float %y, float %a, float %b) {
-; CHECK-LABEL: select_fcmp_uno
-; CHECK:       fcmp s0, s1
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, vs
+; CHECK-LABEL: select_fcmp_uno:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    fcsel s0, s2, s3, vs
+; CHECK-NEXT:    ret
+; GISEL-LABEL: select_fcmp_uno:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    fcsel s0, s2, s3, vs
+; GISEL-NEXT:    ret
   %1 = fcmp uno float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_fcmp_ueq(float %x, float %y, float %a, float %b) {
-; CHECK-LABEL: select_fcmp_ueq
-; CHECK:       fcmp s0, s1
-; CHECK-NEXT:  fcsel [[REG:s[0-9]+]], s2, s3, eq
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, [[REG]], vs
+; CHECK-FASTISEL-LABEL: select_fcmp_ueq:
+; CHECK-FASTISEL:       ; %bb.0:
+; CHECK-FASTISEL-NEXT:    fcmp s0, s1
+; CHECK-FASTISEL-NEXT:    fcsel s0, s2, s3, eq
+; CHECK-FASTISEL-NEXT:    fcsel s0, s2, s0, vs
+; CHECK-FASTISEL-NEXT:    ret
+;
+; CHECK-GISEL-LABEL: select_fcmp_ueq:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    fcmp s0, s1
+; CHECK-GISEL-NEXT:    cset w8, eq
+; CHECK-GISEL-NEXT:    cset w9, vs
+; CHECK-GISEL-NEXT:    orr w8, w8, w9
+; CHECK-GISEL-NEXT:    tst w8, #0x1
+; CHECK-GISEL-NEXT:    fcsel s0, s2, s3, ne
+; CHECK-GISEL-NEXT:    ret
+; GISEL-LABEL: select_fcmp_ueq:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    cset w8, eq
+; GISEL-NEXT:    cset w9, vs
+; GISEL-NEXT:    orr w8, w8, w9
+; GISEL-NEXT:    tst w8, #0x1
+; GISEL-NEXT:    fcsel s0, s2, s3, ne
+; GISEL-NEXT:    ret
   %1 = fcmp ueq float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_fcmp_ugt(float %x, float %y, float %a, float %b) {
-; CHECK-LABEL: select_fcmp_ugt
-; CHECK:       fcmp s0, s1
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, hi
+; CHECK-LABEL: select_fcmp_ugt:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    fcsel s0, s2, s3, hi
+; CHECK-NEXT:    ret
+; GISEL-LABEL: select_fcmp_ugt:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    fcsel s0, s2, s3, hi
+; GISEL-NEXT:    ret
   %1 = fcmp ugt float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_fcmp_uge(float %x, float %y, float %a, float %b) {
-; CHECK-LABEL: select_fcmp_uge
-; CHECK:       fcmp s0, s1
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, pl
+; CHECK-LABEL: select_fcmp_uge:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    fcsel s0, s2, s3, pl
+; CHECK-NEXT:    ret
+; GISEL-LABEL: select_fcmp_uge:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    fcsel s0, s2, s3, pl
+; GISEL-NEXT:    ret
   %1 = fcmp uge float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_fcmp_ult(float %x, float %y, float %a, float %b) {
-; CHECK-LABEL: select_fcmp_ult
-; CHECK:       fcmp s0, s1
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, lt
+; CHECK-LABEL: select_fcmp_ult:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    fcsel s0, s2, s3, lt
+; CHECK-NEXT:    ret
+; GISEL-LABEL: select_fcmp_ult:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    fcsel s0, s2, s3, lt
+; GISEL-NEXT:    ret
   %1 = fcmp ult float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -177,116 +384,221 @@ define float @select_fcmp_ult(float %x, float %y, float %a, float %b) {
 
 
 define float @select_fcmp_ule(float %x, float %y, float %a, float %b) {
-; CHECK-LABEL: select_fcmp_ule
-; CHECK:       fcmp s0, s1
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, le
+; CHECK-LABEL: select_fcmp_ule:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    fcsel s0, s2, s3, le
+; CHECK-NEXT:    ret
+; GISEL-LABEL: select_fcmp_ule:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    fcsel s0, s2, s3, le
+; GISEL-NEXT:    ret
   %1 = fcmp ule float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_fcmp_une(float %x, float %y, float %a, float %b) {
-; CHECK-LABEL: select_fcmp_une
-; CHECK:       fcmp s0, s1
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, ne
+; CHECK-LABEL: select_fcmp_une:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    fcsel s0, s2, s3, ne
+; CHECK-NEXT:    ret
+; GISEL-LABEL: select_fcmp_une:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    fcsel s0, s2, s3, ne
+; GISEL-NEXT:    ret
   %1 = fcmp une float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_fcmp_true(float %x, float %a, float %b) {
-; CHECK-LABEL: select_fcmp_true
-; CHECK:       fmov {{s[0-9]+}}, s1
+; CHECK-FASTISEL-LABEL: select_fcmp_true:
+; CHECK-FASTISEL:       ; %bb.0:
+; CHECK-FASTISEL-NEXT:    fmov s0, s1
+; CHECK-FASTISEL-NEXT:    ret
+;
+; CHECK-GISEL-LABEL: select_fcmp_true:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    fcmp s0, s0
+; CHECK-GISEL-NEXT:    cset w8, eq
+; CHECK-GISEL-NEXT:    cset w9, vs
+; CHECK-GISEL-NEXT:    orr w8, w8, w9
+; CHECK-GISEL-NEXT:    tst w8, #0x1
+; CHECK-GISEL-NEXT:    fcsel s0, s1, s2, ne
+; CHECK-GISEL-NEXT:    ret
+; GISEL-LABEL: select_fcmp_true:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s0
+; GISEL-NEXT:    cset w8, eq
+; GISEL-NEXT:    cset w9, vs
+; GISEL-NEXT:    orr w8, w8, w9
+; GISEL-NEXT:    tst w8, #0x1
+; GISEL-NEXT:    fcsel s0, s1, s2, ne
+; GISEL-NEXT:    ret
   %1 = fcmp ueq float %x, %x
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_icmp_eq(i32 %x, i32 %y, float %a, float %b) {
-; CHECK-LABEL: select_icmp_eq
-; CHECK:       cmp w0, w1
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, eq
+; CHECK-LABEL: select_icmp_eq:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    cmp w0, w1
+; CHECK-NEXT:    fcsel s0, s0, s1, eq
+; CHECK-NEXT:    ret
+; GISEL-LABEL: select_icmp_eq:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmp w0, w1
+; GISEL-NEXT:    fcsel s0, s0, s1, eq
+; GISEL-NEXT:    ret
   %1 = icmp eq i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_icmp_ne(i32 %x, i32 %y, float %a, float %b) {
-; CHECK-LABEL: select_icmp_ne
-; CHECK:       cmp w0, w1
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, ne
+; CHECK-LABEL: select_icmp_ne:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    cmp w0, w1
+; CHECK-NEXT:    fcsel s0, s0, s1, ne
+; CHECK-NEXT:    ret
+; GISEL-LABEL: select_icmp_ne:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmp w0, w1
+; GISEL-NEXT:    fcsel s0, s0, s1, ne
+; GISEL-NEXT:    ret
   %1 = icmp ne i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_icmp_ugt(i32 %x, i32 %y, float %a, float %b) {
-; CHECK-LABEL: select_icmp_ugt
-; CHECK:       cmp w0, w1
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, hi
+; CHECK-LABEL: select_icmp_ugt:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    cmp w0, w1
+; CHECK-NEXT:    fcsel s0, s0, s1, hi
+; CHECK-NEXT:    ret
+; GISEL-LABEL: select_icmp_ugt:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmp w0, w1
+; GISEL-NEXT:    fcsel s0, s0, s1, hi
+; GISEL-NEXT:    ret
   %1 = icmp ugt i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_icmp_uge(i32 %x, i32 %y, float %a, float %b) {
-; CHECK-LABEL: select_icmp_uge
-; CHECK:       cmp w0, w1
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, hs
+; CHECK-LABEL: select_icmp_uge:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    cmp w0, w1
+; CHECK-NEXT:    fcsel s0, s0, s1, hs
+; CHECK-NEXT:    ret
+; GISEL-LABEL: select_icmp_uge:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmp w0, w1
+; GISEL-NEXT:    fcsel s0, s0, s1, hs
+; GISEL-NEXT:    ret
   %1 = icmp uge i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_icmp_ult(i32 %x, i32 %y, float %a, float %b) {
-; CHECK-LABEL: select_icmp_ult
-; CHECK:       cmp w0, w1
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, lo
+; CHECK-LABEL: select_icmp_ult:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    cmp w0, w1
+; CHECK-NEXT:    fcsel s0, s0, s1, lo
+; CHECK-NEXT:    ret
+; GISEL-LABEL: select_icmp_ult:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmp w0, w1
+; GISEL-NEXT:    fcsel s0, s0, s1, lo
+; GISEL-NEXT:    ret
   %1 = icmp ult i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_icmp_ule(i32 %x, i32 %y, float %a, float %b) {
-; CHECK-LABEL: select_icmp_ule
-; CHECK:       cmp w0, w1
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, ls
+; CHECK-LABEL: select_icmp_ule:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    cmp w0, w1
+; CHECK-NEXT:    fcsel s0, s0, s1, ls
+; CHECK-NEXT:    ret
+; GISEL-LABEL: select_icmp_ule:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmp w0, w1
+; GISEL-NEXT:    fcsel s0, s0, s1, ls
+; GISEL-NEXT:    ret
   %1 = icmp ule i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_icmp_sgt(i32 %x, i32 %y, float %a, float %b) {
-; CHECK-LABEL: select_icmp_sgt
-; CHECK:       cmp w0, w1
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, gt
+; CHECK-LABEL: select_icmp_sgt:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    cmp w0, w1
+; CHECK-NEXT:    fcsel s0, s0, s1, gt
+; CHECK-NEXT:    ret
+; GISEL-LABEL: select_icmp_sgt:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmp w0, w1
+; GISEL-NEXT:    fcsel s0, s0, s1, gt
+; GISEL-NEXT:    ret
   %1 = icmp sgt i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_icmp_sge(i32 %x, i32 %y, float %a, float %b) {
-; CHECK-LABEL: select_icmp_sge
-; CHECK:       cmp w0, w1
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, ge
+; CHECK-LABEL: select_icmp_sge:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    cmp w0, w1
+; CHECK-NEXT:    fcsel s0, s0, s1, ge
+; CHECK-NEXT:    ret
+; GISEL-LABEL: select_icmp_sge:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmp w0, w1
+; GISEL-NEXT:    fcsel s0, s0, s1, ge
+; GISEL-NEXT:    ret
   %1 = icmp sge i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_icmp_slt(i32 %x, i32 %y, float %a, float %b) {
-; CHECK-LABEL: select_icmp_slt
-; CHECK:       cmp w0, w1
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, lt
+; CHECK-LABEL: select_icmp_slt:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    cmp w0, w1
+; CHECK-NEXT:    fcsel s0, s0, s1, lt
+; CHECK-NEXT:    ret
+; GISEL-LABEL: select_icmp_slt:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmp w0, w1
+; GISEL-NEXT:    fcsel s0, s0, s1, lt
+; GISEL-NEXT:    ret
   %1 = icmp slt i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_icmp_sle(i32 %x, i32 %y, float %a, float %b) {
-; CHECK-LABEL: select_icmp_sle
-; CHECK:       cmp w0, w1
-; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, le
+; CHECK-LABEL: select_icmp_sle:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    cmp w0, w1
+; CHECK-NEXT:    fcsel s0, s0, s1, le
+; CHECK-NEXT:    ret
+; GISEL-LABEL: select_icmp_sle:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmp w0, w1
+; GISEL-NEXT:    fcsel s0, s0, s1, le
+; GISEL-NEXT:    ret
   %1 = icmp sle i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -294,30 +606,86 @@ define float @select_icmp_sle(i32 %x, i32 %y, float %a, float %b) {
 
 ; Test peephole optimizations for select.
 define zeroext i1 @select_opt1(i1 zeroext %c, i1 zeroext %a) {
-; CHECK-LABEL: select_opt1
-; CHECK:       orr {{w[0-9]+}}, w0, w1
+; GISEL-LABEL: select_opt1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    orr w8, w0, w1
+; GISEL-NEXT:    and w0, w8, #0x1
+; GISEL-NEXT:    ret
+; CHECK-FASTISEL-LABEL: select_opt1:
+; CHECK-FASTISEL:       ; %bb.0:
+; CHECK-FASTISEL-NEXT:    orr w8, w0, w1
+; CHECK-FASTISEL-NEXT:    and w0, w8, #0x1
+; CHECK-FASTISEL-NEXT:    ret
+;
+; CHECK-GISEL-LABEL: select_opt1:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    orr w8, w0, w1
+; CHECK-GISEL-NEXT:    and w0, w8, #0x1
+; CHECK-GISEL-NEXT:    ret
   %1 = select i1 %c, i1 true, i1 %a
   ret i1 %1
 }
 
 define zeroext i1 @select_opt2(i1 zeroext %c, i1 zeroext %a) {
-; CHECK-LABEL: select_opt2
-; CHECK:       eor [[REG:w[0-9]+]], w0, #0x1
-; CHECK:       orr {{w[0-9]+}}, [[REG]], w1
+; GISEL-LABEL: select_opt2:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    eor w8, w0, #0x1
+; GISEL-NEXT:    orr w8, w8, w1
+; GISEL-NEXT:    and w0, w8, #0x1
+; GISEL-NEXT:    ret
+; CHECK-FASTISEL-LABEL: select_opt2:
+; CHECK-FASTISEL:       ; %bb.0:
+; CHECK-FASTISEL-NEXT:    eor w8, w0, #0x1
+; CHECK-FASTISEL-NEXT:    orr w8, w8, w1
+; CHECK-FASTISEL-NEXT:    and w0, w8, #0x1
+; CHECK-FASTISEL-NEXT:    ret
+;
+; CHECK-GISEL-LABEL: select_opt2:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    eor w8, w0, #0x1
+; CHECK-GISEL-NEXT:    orr w8, w8, w1
+; CHECK-GISEL-NEXT:    and w0, w8, #0x1
+; CHECK-GISEL-NEXT:    ret
   %1 = select i1 %c, i1 %a, i1 true
   ret i1 %1
 }
 
 define zeroext i1 @select_opt3(i1 zeroext %c, i1 zeroext %a) {
-; CHECK-LABEL: select_opt3
-; CHECK:       bic {{w[0-9]+}}, w1, w0
+; GISEL-LABEL: select_opt3:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    eor w8, w0, #0x1
+; GISEL-NEXT:    and w0, w8, w1
+; GISEL-NEXT:    ret
+; CHECK-FASTISEL-LABEL: select_opt3:
+; CHECK-FASTISEL:       ; %bb.0:
+; CHECK-FASTISEL-NEXT:    bic w8, w1, w0
+; CHECK-FASTISEL-NEXT:    and w0, w8, #0x1
+; CHECK-FASTISEL-NEXT:    ret
+;
+; CHECK-GISEL-LABEL: select_opt3:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    eor w8, w0, #0x1
+; CHECK-GISEL-NEXT:    and w0, w8, w1
+; CHECK-GISEL-NEXT:    ret
   %1 = select i1 %c, i1 false, i1 %a
   ret i1 %1
 }
 
 define zeroext i1 @select_opt4(i1 zeroext %c, i1 zeroext %a) {
-; CHECK-LABEL: select_opt4
-; CHECK:       and {{w[0-9]+}}, w0, w1
+; GISEL-LABEL: select_opt4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    and w0, w0, w1
+; GISEL-NEXT:    ret
+; CHECK-FASTISEL-LABEL: select_opt4:
+; CHECK-FASTISEL:       ; %bb.0:
+; CHECK-FASTISEL-NEXT:    and w8, w0, w1
+; CHECK-FASTISEL-NEXT:    and w0, w8, #0x1
+; CHECK-FASTISEL-NEXT:    ret
+;
+; CHECK-GISEL-LABEL: select_opt4:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    and w0, w0, w1
+; CHECK-GISEL-NEXT:    ret
   %1 = select i1 %c, i1 %a, i1 false
   ret i1 %1
 }
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index b2f9bf89d9ec60..7d8eba1e870804 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -1509,49 +1509,39 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX9-G-O0-NEXT:    v_xor_b32_e64 v1, v12, v1
-; GFX9-G-O0-NEXT:    v_xor_b32_e64 v4, v12, v2
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v5
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v6
+; GFX9-G-O0-NEXT:    v_xor_b32_e64 v2, v12, v2
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, v5
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v6
+; GFX9-G-O0-NEXT:    v_xor_b32_e64 v4, v10, v4
 ; GFX9-G-O0-NEXT:    v_xor_b32_e64 v3, v10, v3
-; GFX9-G-O0-NEXT:    v_xor_b32_e64 v2, v10, v2
 ; GFX9-G-O0-NEXT:    v_sub_co_u32_e64 v1, s[6:7], v1, v12
-; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v7, s[6:7], v4, v12, s[6:7]
-; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v6, s[6:7], v3, v10, s[6:7]
-; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v5, s[6:7], v2, v10, s[6:7]
-; GFX9-G-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v7
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v6
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, v5
-; GFX9-G-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-G-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, v13
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, v14
+; GFX9-G-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v2, s[6:7], v2, v12, s[6:7]
+; GFX9-G-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v6, s[6:7], v4, v10, s[6:7]
+; GFX9-G-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v3, s[6:7], v3, v10, s[6:7]
+; GFX9-G-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, v13
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v8, v14
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, v15
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v16
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v5, v6
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, v7
-; GFX9-G-O0-NEXT:    v_xor_b32_e64 v5, v11, v5
-; GFX9-G-O0-NEXT:    v_xor_b32_e64 v8, v11, v6
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, v8
+; GFX9-G-O0-NEXT:    v_xor_b32_e64 v8, v11, v5
+; GFX9-G-O0-NEXT:    v_xor_b32_e64 v5, v11, v4
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, v13
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, v14
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, v14
 ; GFX9-G-O0-NEXT:    v_xor_b32_e64 v7, v9, v7
-; GFX9-G-O0-NEXT:    v_xor_b32_e64 v6, v9, v6
-; GFX9-G-O0-NEXT:    v_sub_co_u32_e64 v5, s[6:7], v5, v11
-; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v15, s[6:7], v8, v11, s[6:7]
-; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v14, s[6:7], v7, v9, s[6:7]
-; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v13, s[6:7], v6, v9, s[6:7]
-; GFX9-G-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, v15
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, v14
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v8, v13
-; GFX9-G-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-G-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    v_xor_b32_e64 v4, v9, v4
+; GFX9-G-O0-NEXT:    v_sub_co_u32_e64 v8, s[6:7], v8, v11
 ; GFX9-G-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v5, s[6:7], v5, v11, s[6:7]
+; GFX9-G-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v7, s[6:7], v7, v9, s[6:7]
+; GFX9-G-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v4, s[6:7], v4, v9, s[6:7]
+; GFX9-G-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    v_xor_b32_e64 v13, v11, v12
 ; GFX9-G-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    v_xor_b32_e64 v11, v11, v12
@@ -1560,97 +1550,69 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    v_xor_b32_e64 v9, v9, v10
 ; GFX9-G-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v11, v6
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v10, v5
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v8
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, v7
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v9, v10
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v10, v11
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v12, v13
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v11, v14
-; GFX9-G-O0-NEXT:    v_or_b32_e64 v9, v9, v12
-; GFX9-G-O0-NEXT:    v_or_b32_e64 v11, v10, v11
+; GFX9-G-O0-NEXT:    v_or_b32_e64 v9, v8, v7
+; GFX9-G-O0-NEXT:    v_or_b32_e64 v11, v5, v4
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v10, v11
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v12, s5
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v11, s4
 ; GFX9-G-O0-NEXT:    v_cmp_eq_u64_e64 s[6:7], v[9:10], v[11:12]
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v11, v2
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v10, v1
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v4
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, v3
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v9, v10
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v10, v11
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v12, v13
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v11, v14
-; GFX9-G-O0-NEXT:    v_or_b32_e64 v9, v9, v12
-; GFX9-G-O0-NEXT:    v_or_b32_e64 v11, v10, v11
+; GFX9-G-O0-NEXT:    v_or_b32_e64 v9, v1, v6
+; GFX9-G-O0-NEXT:    v_or_b32_e64 v11, v2, v3
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v10, v11
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v12, s5
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v11, s4
 ; GFX9-G-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[9:10], v[11:12]
 ; GFX9-G-O0-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v11, v6
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v10, v5
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v9, v8
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v8, v7
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, s5
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v5, s4
-; GFX9-G-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[8:9], v[5:6]
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, v10
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v5, v11
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v9, v7
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v10, v4
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v12, s5
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v11, s4
+; GFX9-G-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[9:10], v[11:12]
 ; GFX9-G-O0-NEXT:    v_ffbh_u32_e64 v5, v5
-; GFX9-G-O0-NEXT:    v_ffbh_u32_e64 v6, v6
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, 32
-; GFX9-G-O0-NEXT:    v_add_u32_e64 v6, v6, v7
-; GFX9-G-O0-NEXT:    v_min_u32_e64 v5, v5, v6
+; GFX9-G-O0-NEXT:    v_ffbh_u32_e64 v8, v8
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v9, 32
+; GFX9-G-O0-NEXT:    v_add_u32_e64 v8, v8, v9
+; GFX9-G-O0-NEXT:    v_min_u32_e64 v5, v5, v8
 ; GFX9-G-O0-NEXT:    s_mov_b32 s10, 64
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, s10
-; GFX9-G-O0-NEXT:    v_add_u32_e64 v6, v5, v6
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, v8
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v5, v9
-; GFX9-G-O0-NEXT:    v_ffbh_u32_e64 v5, v5
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v8, s10
+; GFX9-G-O0-NEXT:    v_add_u32_e64 v5, v5, v8
+; GFX9-G-O0-NEXT:    v_ffbh_u32_e64 v4, v4
 ; GFX9-G-O0-NEXT:    v_ffbh_u32_e64 v7, v7
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v8, 32
 ; GFX9-G-O0-NEXT:    v_add_u32_e64 v7, v7, v8
-; GFX9-G-O0-NEXT:    v_min_u32_e64 v5, v5, v7
-; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[8:9]
+; GFX9-G-O0-NEXT:    v_min_u32_e64 v4, v4, v7
+; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[8:9]
 ; GFX9-G-O0-NEXT:    s_mov_b32 s16, 0
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v12, v2
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v11, v1
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v10, v4
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v9, v3
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, s5
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, s4
-; GFX9-G-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[9:10], v[6:7]
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, v11
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, v12
-; GFX9-G-O0-NEXT:    v_ffbh_u32_e64 v6, v6
-; GFX9-G-O0-NEXT:    v_ffbh_u32_e64 v7, v7
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, v6
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v8, v3
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v10, s5
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v9, s4
+; GFX9-G-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], v[9:10]
+; GFX9-G-O0-NEXT:    v_ffbh_u32_e64 v5, v2
+; GFX9-G-O0-NEXT:    v_ffbh_u32_e64 v7, v1
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v8, 32
 ; GFX9-G-O0-NEXT:    v_add_u32_e64 v7, v7, v8
-; GFX9-G-O0-NEXT:    v_min_u32_e64 v6, v6, v7
+; GFX9-G-O0-NEXT:    v_min_u32_e64 v5, v5, v7
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, s10
-; GFX9-G-O0-NEXT:    v_add_u32_e64 v7, v6, v7
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v8, v9
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, v10
-; GFX9-G-O0-NEXT:    v_ffbh_u32_e64 v6, v6
-; GFX9-G-O0-NEXT:    v_ffbh_u32_e64 v8, v8
+; GFX9-G-O0-NEXT:    v_add_u32_e64 v7, v5, v7
+; GFX9-G-O0-NEXT:    v_ffbh_u32_e64 v5, v3
+; GFX9-G-O0-NEXT:    v_ffbh_u32_e64 v8, v6
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v9, 32
 ; GFX9-G-O0-NEXT:    v_add_u32_e64 v8, v8, v9
-; GFX9-G-O0-NEXT:    v_min_u32_e64 v6, v6, v8
-; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[8:9]
+; GFX9-G-O0-NEXT:    v_min_u32_e64 v5, v5, v8
+; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s[8:9]
 ; GFX9-G-O0-NEXT:    s_mov_b32 s15, 0
 ; GFX9-G-O0-NEXT:    s_mov_b32 s11, 0
 ; GFX9-G-O0-NEXT:    s_mov_b32 s14, 0
 ; GFX9-G-O0-NEXT:    s_mov_b32 s10, 0
-; GFX9-G-O0-NEXT:    v_sub_co_u32_e64 v6, s[8:9], v5, v6
-; GFX9-G-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    v_sub_co_u32_e64 v7, s[8:9], v4, v5
+; GFX9-G-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, s16
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v5, s16
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, s16
-; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v7, s[8:9], v5, v7, s[8:9]
-; GFX9-G-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v4, s[8:9], v4, v5, s[8:9]
+; GFX9-G-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v5, s15
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v8, s14
 ; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v9, s[8:9], v5, v8, s[8:9]
@@ -1659,8 +1621,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v8, s10
 ; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v8, s[8:9], v5, v8, s[8:9]
 ; GFX9-G-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v10, v6
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v11, v7
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v10, v7
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v11, v4
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v12, v9
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, v8
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v15, s5
@@ -1685,35 +1647,27 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_or_b32_e64 v5, v5, v10
 ; GFX9-G-O0-NEXT:    s_mov_b32 s7, 0x7f
 ; GFX9-G-O0-NEXT:    s_mov_b32 s6, 0
-; GFX9-G-O0-NEXT:    v_xor_b32_e64 v6, v6, s7
-; GFX9-G-O0-NEXT:    v_xor_b32_e64 v7, v7, s6
-; GFX9-G-O0-NEXT:    v_or_b32_e64 v6, v6, v9
-; GFX9-G-O0-NEXT:    v_or_b32_e64 v8, v7, v8
-; GFX9-G-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, v8
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v9, s5
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v8, s4
-; GFX9-G-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[6:7], v[8:9]
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, v2
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, v1
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v9, v4
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v8, v3
-; GFX9-G-O0-NEXT:    v_and_b32_e32 v1, 1, v5
-; GFX9-G-O0-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v1
+; GFX9-G-O0-NEXT:    v_xor_b32_e64 v7, v7, s7
+; GFX9-G-O0-NEXT:    v_xor_b32_e64 v4, v4, s6
+; GFX9-G-O0-NEXT:    v_or_b32_e64 v7, v7, v9
+; GFX9-G-O0-NEXT:    v_or_b32_e64 v4, v4, v8
+; GFX9-G-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v10, s5
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v9, s4
+; GFX9-G-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[7:8], v[9:10]
+; GFX9-G-O0-NEXT:    v_and_b32_e32 v4, 1, v5
+; GFX9-G-O0-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v4
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, v6
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v7
-; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[6:7]
-; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[6:7]
+; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[6:7]
+; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v4, v2, v4, s[6:7]
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v3
-; GFX9-G-O0-NEXT:    v_and_b32_e32 v3, 1, v5
-; GFX9-G-O0-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v3
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v4
+; GFX9-G-O0-NEXT:    v_and_b32_e32 v4, 1, v5
+; GFX9-G-O0-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v4
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, v8
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v9
 ; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[6:7]
 ; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[6:7]
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
@@ -1883,10 +1837,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-G-O0-NEXT:    v_readlane_b32 s6, v16, 6
 ; GFX9-G-O0-NEXT:    v_readlane_b32 s7, v16, 7
-; GFX9-G-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
@@ -1899,14 +1853,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, v2
@@ -1915,7 +1869,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX9-G-O0-NEXT:    s_mov_b32 s8, 1
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, s8
-; GFX9-G-O0-NEXT:    v_lshlrev_b64 v[21:22], v2, v[0:1]
+; GFX9-G-O0-NEXT:    v_lshlrev_b64 v[14:15], v2, v[0:1]
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX9-G-O0-NEXT:    v_lshlrev_b64 v[4:5], v2, v[3:4]
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr0 killed $exec
@@ -1929,9 +1883,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX9-G-O0-NEXT:    v_or_b32_e64 v7, v2, v3
 ; GFX9-G-O0-NEXT:    v_or_b32_e64 v5, v0, v1
-; GFX9-G-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr12_vgpr13 killed $exec
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, v14
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, v15
+; GFX9-G-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr23_vgpr24 killed $exec
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, v25
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, v26
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr0 killed $exec
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec
 ; GFX9-G-O0-NEXT:    s_mov_b32 s9, 31
@@ -1939,81 +1893,73 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_lshrrev_b32_e64 v3, v0, v1
 ; GFX9-G-O0-NEXT:    s_mov_b32 s9, 0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v21
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, v22
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v14
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, v15
 ; GFX9-G-O0-NEXT:    v_or_b32_e64 v4, v2, v3
 ; GFX9-G-O0-NEXT:    v_or_b32_e64 v9, v0, v1
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v12
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v13
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v12, v14
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, v15
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v23
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v24
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v25
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v15, v26
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-G-O0-NEXT:    v_lshlrev_b64 v[23:24], v0, v[2:3]
+; GFX9-G-O0-NEXT:    v_lshlrev_b64 v[27:28], v0, v[2:3]
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-G-O0-NEXT:    v_lshlrev_b64 v[0:1], v0, v[12:13]
-; GFX9-G-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr2 killed $exec
+; GFX9-G-O0-NEXT:    v_lshlrev_b64 v[0:1], v0, v[14:15]
+; GFX9-G-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr2 killed $exec
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec
 ; GFX9-G-O0-NEXT:    s_mov_b32 s8, 31
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, s8
-; GFX9-G-O0-NEXT:    v_lshrrev_b32_e64 v14, v2, v3
+; GFX9-G-O0-NEXT:    v_lshrrev_b32_e64 v23, v2, v3
 ; GFX9-G-O0-NEXT:    s_mov_b32 s8, 0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, s8
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(8)
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v29, v31
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v30, v32
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v21, v33
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v22, v34
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v25, v33
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v26, v34
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, v29
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, v30
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v15, v23
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, v24
-; GFX9-G-O0-NEXT:    v_or_b32_e64 v0, v0, v15
-; GFX9-G-O0-NEXT:    v_or_b32_e64 v13, v1, v13
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v24, v27
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v15, v28
+; GFX9-G-O0-NEXT:    v_or_b32_e64 v0, v0, v24
+; GFX9-G-O0-NEXT:    v_or_b32_e64 v15, v1, v15
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, v13
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v15, v21
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, v22
-; GFX9-G-O0-NEXT:    v_or3_b32 v12, v12, v14, v15
-; GFX9-G-O0-NEXT:    v_or3_b32 v2, v2, v3, v13
-; GFX9-G-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, v2
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, v15
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v24, v25
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v15, v26
+; GFX9-G-O0-NEXT:    v_or3_b32 v14, v14, v23, v24
+; GFX9-G-O0-NEXT:    v_or3_b32 v2, v2, v3, v15
+; GFX9-G-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v15, v2
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v12
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v13
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v14
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v15
 ; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-G-O0-NEXT:    v_sub_co_u32_e64 v11, s[8:9], v11, v4
-; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v10, s[8:9], v10, v9, s[8:9]
-; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v8, s[8:9], v8, v7, s[8:9]
-; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v10, s[8:9], v6, v5, s[8:9]
+; GFX9-G-O0-NEXT:    v_sub_co_u32_e64 v13, s[8:9], v13, v4
+; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v12, s[8:9], v12, v9, s[8:9]
+; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v10, s[8:9], v10, v7, s[8:9]
+; GFX9-G-O0-NEXT:    v_subb_co_u32_e64 v12, s[8:9], v6, v5, s[8:9]
 ; GFX9-G-O0-NEXT:    s_mov_b32 s8, 31
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, s8
-; GFX9-G-O0-NEXT:    v_ashrrev_i32_e64 v8, v6, v10
+; GFX9-G-O0-NEXT:    v_ashrrev_i32_e64 v10, v6, v12
 ; GFX9-G-O0-NEXT:    s_mov_b32 s8, 31
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, s8
-; GFX9-G-O0-NEXT:    v_ashrrev_i32_e64 v6, v6, v10
+; GFX9-G-O0-NEXT:    v_ashrrev_i32_e64 v6, v6, v12
 ; GFX9-G-O0-NEXT:    s_mov_b32 s9, 1
 ; GFX9-G-O0-NEXT:    s_mov_b32 s8, 0
-; GFX9-G-O0-NEXT:    v_and_b32_e64 v12, v8, s9
-; GFX9-G-O0-NEXT:    v_and_b32_e64 v10, v8, s8
+; GFX9-G-O0-NEXT:    v_and_b32_e64 v12, v10, s9
+; GFX9-G-O0-NEXT:    v_and_b32_e64 v14, v10, s8
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, v10
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v11, s5
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v10, s4
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, v14
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v24, s5
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v23, s4
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr12_vgpr13 killed $vgpr12_vgpr13 def $vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v15, v11
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v10
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v23, v25
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v24, v26
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v21, v27
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v22, v28
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v11, v23
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v10, v24
-; GFX9-G-O0-NEXT:    v_and_b32_e64 v11, v8, v11
-; GFX9-G-O0-NEXT:    v_and_b32_e64 v10, v8, v10
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v8, v21
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v21, v22
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v23
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v15, v24
+; GFX9-G-O0-NEXT:    v_and_b32_e64 v11, v10, v11
+; GFX9-G-O0-NEXT:    v_and_b32_e64 v10, v10, v22
 ; GFX9-G-O0-NEXT:    v_and_b32_e64 v8, v6, v8
 ; GFX9-G-O0-NEXT:    v_and_b32_e64 v6, v6, v21
 ; GFX9-G-O0-NEXT:    v_sub_co_u32_e64 v4, s[8:9], v4, v11
@@ -2114,66 +2060,62 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-G-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    s_mov_b64 exec, s[20:21]
-; GFX9-G-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    s_mov_b32 s4, 64
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v16, v5
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v15, v4
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v22, v7
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v21, v6
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v23, v18
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v24, v17
+; GFX9-G-O0-NEXT:    ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v22, v4
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-G-O0-NEXT:    v_sub_u32_e64 v4, v13, v4
+; GFX9-G-O0-NEXT:    v_sub_u32_e64 v4, v19, v4
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v5, s4
-; GFX9-G-O0-NEXT:    v_sub_u32_e64 v5, v5, v13
+; GFX9-G-O0-NEXT:    v_sub_u32_e64 v5, v5, v19
 ; GFX9-G-O0-NEXT:    s_mov_b32 s6, 0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, s4
-; GFX9-G-O0-NEXT:    v_cmp_lt_u32_e64 s[4:5], v13, v6
+; GFX9-G-O0-NEXT:    v_cmp_lt_u32_e64 s[4:5], v19, v6
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, s6
-; GFX9-G-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v13, v6
-; GFX9-G-O0-NEXT:    v_lshrrev_b64 v[6:7], v13, v[21:22]
-; GFX9-G-O0-NEXT:    v_lshrrev_b64 v[26:27], v13, v[15:16]
+; GFX9-G-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v19, v6
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT:    v_lshrrev_b64 v[6:7], v19, v[21:22]
+; GFX9-G-O0-NEXT:    v_lshrrev_b64 v[26:27], v19, v[23:24]
 ; GFX9-G-O0-NEXT:    v_lshlrev_b64 v[24:25], v5, v[21:22]
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v26
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v20, v26
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v5, v27
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v23, v24
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, v25
-; GFX9-G-O0-NEXT:    v_or_b32_e64 v14, v14, v23
-; GFX9-G-O0-NEXT:    v_or_b32_e64 v13, v5, v13
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v19, v25
+; GFX9-G-O0-NEXT:    v_or_b32_e64 v20, v20, v23
+; GFX9-G-O0-NEXT:    v_or_b32_e64 v19, v5, v19
 ; GFX9-G-O0-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX9-G-O0-NEXT:    v_lshrrev_b64 v[21:22], v4, v[21:22]
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, v21
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v5, v22
-; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v14, s[4:5]
-; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[4:5]
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v15
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, v16
-; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v14, s[6:7]
-; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v13, v5, v13, s[6:7]
+; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v20, s[4:5]
+; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v19, s[4:5]
+; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v18, s[6:7]
+; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v17, v5, v17, s[6:7]
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v5, v13
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v6
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v5, v17
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v18, v6
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, 0
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[4:5]
+; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v17, v17, v18, s[4:5]
 ; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[4:5]
-; GFX9-G-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v6
+; GFX9-G-O0-NEXT:    ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v18, v6
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr4_vgpr5 killed $vgpr4_vgpr5 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, v13
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, v14
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v16, v17
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v15, v18
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v19
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, v20
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, v17
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, v18
 ; GFX9-G-O0-NEXT:    s_mov_b32 s4, -1
 ; GFX9-G-O0-NEXT:    s_mov_b32 s10, -1
 ; GFX9-G-O0-NEXT:    s_mov_b32 s7, -1
@@ -2226,14 +2168,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    s_mov_b64 exec, s[20:21]
-; GFX9-G-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-G-O0-NEXT:    s_mov_b32 s6, 1
 ; GFX9-G-O0-NEXT:    s_mov_b32 s10, 0
@@ -2241,48 +2183,50 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    s_mov_b32 s8, 0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v5, s6
 ; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-G-O0-NEXT:    v_add_co_u32_e64 v5, s[6:7], v2, v5
+; GFX9-G-O0-NEXT:    v_add_co_u32_e64 v5, s[6:7], v3, v5
 ; GFX9-G-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, s10
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v8, s10
 ; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-G-O0-NEXT:    v_addc_co_u32_e64 v6, s[6:7], v4, v6, s[6:7]
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, s9
-; GFX9-G-O0-NEXT:    v_addc_co_u32_e64 v8, s[6:7], v3, v4, s[6:7]
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, s8
-; GFX9-G-O0-NEXT:    v_addc_co_u32_e64 v7, s[6:7], v1, v3, s[6:7]
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, v5
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v6
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v15, v8
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v16, v7
-; GFX9-G-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    v_addc_co_u32_e64 v6, s[6:7], v6, v8, s[6:7]
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v8, s9
+; GFX9-G-O0-NEXT:    v_addc_co_u32_e64 v8, s[6:7], v7, v8, s[6:7]
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, s8
+; GFX9-G-O0-NEXT:    v_addc_co_u32_e64 v7, s[6:7], v2, v7, s[6:7]
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v5
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v15, v6
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v16, v8
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v17, v7
+; GFX9-G-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-G-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_mov_b32 s6, 0x7f
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-G-O0-NEXT:    v_sub_co_u32_e64 v4, s[6:7], v1, v2
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-G-O0-NEXT:    v_sub_co_u32_e64 v9, s[6:7], v2, v3
 ; GFX9-G-O0-NEXT:    s_mov_b32 s7, 64
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v10
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-G-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v14, v1
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v11, v10
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v12, v4
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-G-O0-NEXT:    v_sub_u32_e64 v3, v4, v1
+; GFX9-G-O0-NEXT:    v_sub_u32_e64 v3, v9, v1
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-G-O0-NEXT:    v_sub_u32_e64 v9, v1, v4
+; GFX9-G-O0-NEXT:    v_sub_u32_e64 v15, v1, v9
 ; GFX9-G-O0-NEXT:    s_mov_b32 s6, 0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-G-O0-NEXT:    v_cmp_lt_u32_e64 s[8:9], v4, v1
+; GFX9-G-O0-NEXT:    v_cmp_lt_u32_e64 s[8:9], v9, v1
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-G-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v4, v1
-; GFX9-G-O0-NEXT:    v_lshlrev_b64 v[1:2], v4, v[13:14]
-; GFX9-G-O0-NEXT:    v_lshrrev_b64 v[18:19], v9, v[13:14]
-; GFX9-G-O0-NEXT:    v_lshlrev_b64 v[16:17], v4, v[11:12]
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v10, v18
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, v19
+; GFX9-G-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v9, v1
+; GFX9-G-O0-NEXT:    v_lshlrev_b64 v[1:2], v9, v[13:14]
+; GFX9-G-O0-NEXT:    v_lshrrev_b64 v[18:19], v15, v[13:14]
+; GFX9-G-O0-NEXT:    v_lshlrev_b64 v[16:17], v9, v[11:12]
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v12, v18
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v9, v19
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v15, v16
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v9, v17
-; GFX9-G-O0-NEXT:    v_or_b32_e64 v10, v10, v15
-; GFX9-G-O0-NEXT:    v_or_b32_e64 v4, v4, v9
+; GFX9-G-O0-NEXT:    v_mov_b32_e32 v11, v17
+; GFX9-G-O0-NEXT:    v_or_b32_e64 v12, v12, v15
+; GFX9-G-O0-NEXT:    v_or_b32_e64 v11, v9, v11
 ; GFX9-G-O0-NEXT:    v_lshlrev_b64 v[13:14], v3, v[13:14]
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v9, v1
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v2
@@ -2294,10 +2238,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v9, v13
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, v14
-; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[8:9]
-; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[8:9]
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v10, v11
-; GFX9-G-O0-NEXT:    v_mov_b32_e32 v4, v12
+; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v9, v9, v12, s[8:9]
+; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[8:9]
 ; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[6:7]
 ; GFX9-G-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[6:7]
 ; GFX9-G-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec

From c9c244423ffb8071bb838c3606052e12af537047 Mon Sep 17 00:00:00 2001
From: Angel Zhang <angel.zhang@amd.com>
Date: Wed, 29 May 2024 12:19:32 -0400
Subject: [PATCH 144/230] [mlir][spirv] Add integration test for
 `vector.interleave` and `vector.shuffle` (#93595)

- Add integration tests for `vector.shuffle` and `vector.interleave`,
  as mentioned in issue #91978.
- Add `VectorToSPIRV` patterns to `GPUToSPIRVPass`

---------

Co-authored-by: Jakub Kuderski <kubakuderski@gmail.com>
---
 .../Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp  |  2 +
 .../mlir-vulkan-runner/vector-interleave.mlir | 53 +++++++++++++++++++
 .../mlir-vulkan-runner/vector-shuffle.mlir    | 53 +++++++++++++++++++
 3 files changed, 108 insertions(+)
 create mode 100644 mlir/test/mlir-vulkan-runner/vector-interleave.mlir
 create mode 100644 mlir/test/mlir-vulkan-runner/vector-shuffle.mlir

diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
index 1d1db913e3df23..53e73ec0d81bf0 100644
--- a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
+++ b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
@@ -18,6 +18,7 @@
 #include "mlir/Conversion/GPUToSPIRV/GPUToSPIRV.h"
 #include "mlir/Conversion/MemRefToSPIRV/MemRefToSPIRV.h"
 #include "mlir/Conversion/SCFToSPIRV/SCFToSPIRV.h"
+#include "mlir/Conversion/VectorToSPIRV/VectorToSPIRV.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
@@ -132,6 +133,7 @@ void GPUToSPIRVPass::runOnOperation() {
     mlir::arith::populateArithToSPIRVPatterns(typeConverter, patterns);
     populateMemRefToSPIRVPatterns(typeConverter, patterns);
     populateFuncToSPIRVPatterns(typeConverter, patterns);
+    populateVectorToSPIRVPatterns(typeConverter, patterns);
 
     if (failed(applyFullConversion(gpuModule, *target, std::move(patterns))))
       return signalPassFailure();
diff --git a/mlir/test/mlir-vulkan-runner/vector-interleave.mlir b/mlir/test/mlir-vulkan-runner/vector-interleave.mlir
new file mode 100644
index 00000000000000..2f5c319e2f5c5d
--- /dev/null
+++ b/mlir/test/mlir-vulkan-runner/vector-interleave.mlir
@@ -0,0 +1,53 @@
+// RUN: mlir-vulkan-runner %s \
+// RUN:  --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils \
+// RUN:  --entry-point-result=void | FileCheck %s
+
+// CHECK: [0, 2, 1, 3]
+module attributes {
+  gpu.container_module,
+  spirv.target_env = #spirv.target_env<
+    #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]>, #spirv.resource_limits<>>
+} {
+  gpu.module @kernels {
+    gpu.func @kernel_vector_interleave(%arg0 : memref<2xi32>, %arg1 : memref<2xi32>, %arg2 : memref<4xi32>)
+      kernel attributes { spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [1, 1, 1]>} {
+      %c0 = arith.constant 0 : index
+      %vec0 = vector.load %arg0[%c0] : memref<2xi32>, vector<2xi32>
+      %vec1 = vector.load %arg1[%c0] : memref<2xi32>, vector<2xi32>
+      %result = vector.interleave %vec0, %vec1 : vector<2xi32> -> vector<4xi32>
+      vector.store %result, %arg2[%c0] : memref<4xi32>, vector<4xi32>
+      gpu.return
+    }
+  }
+
+  func.func @main() {
+    // Allocate 3 buffers.
+    %buf0 = memref.alloc() : memref<2xi32>
+    %buf1 = memref.alloc() : memref<2xi32>
+    %buf2 = memref.alloc() : memref<4xi32>
+    
+    %idx0 = arith.constant 0 : index
+    %idx1 = arith.constant 1 : index
+    %idx4 = arith.constant 4 : index
+
+    // Initialize input buffer.
+    %buf0_vals = arith.constant dense<[0, 1]> : vector<2xi32>
+    %buf1_vals = arith.constant dense<[2, 3]> : vector<2xi32>
+    vector.store %buf0_vals, %buf0[%idx0] : memref<2xi32>, vector<2xi32>
+    vector.store %buf1_vals, %buf1[%idx0] : memref<2xi32>, vector<2xi32>
+
+    // Initialize output buffer.
+    %value0 = arith.constant 0 : i32
+    %buf3 = memref.cast %buf2 : memref<4xi32> to memref<?xi32>
+    call @fillResource1DInt(%buf3, %value0) : (memref<?xi32>, i32) -> ()
+
+    gpu.launch_func @kernels::@kernel_vector_interleave
+        blocks in (%idx4, %idx1, %idx1) threads in (%idx1, %idx1, %idx1)
+        args(%buf0 : memref<2xi32>, %buf1 : memref<2xi32>, %buf2 : memref<4xi32>)
+    %buf4 = memref.cast %buf3 : memref<?xi32> to memref<*xi32>
+    call @printMemrefI32(%buf4) : (memref<*xi32>) -> ()
+    return
+  }
+  func.func private @fillResource1DInt(%0 : memref<?xi32>, %1 : i32)
+  func.func private @printMemrefI32(%ptr : memref<*xi32>)
+}
diff --git a/mlir/test/mlir-vulkan-runner/vector-shuffle.mlir b/mlir/test/mlir-vulkan-runner/vector-shuffle.mlir
new file mode 100644
index 00000000000000..e29e054ccd46be
--- /dev/null
+++ b/mlir/test/mlir-vulkan-runner/vector-shuffle.mlir
@@ -0,0 +1,53 @@
+// RUN: mlir-vulkan-runner %s \
+// RUN:  --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils \
+// RUN:  --entry-point-result=void | FileCheck %s
+
+// CHECK: [2, 1, 3]
+module attributes {
+  gpu.container_module,
+  spirv.target_env = #spirv.target_env<
+    #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]>, #spirv.resource_limits<>>
+} {
+  gpu.module @kernels {
+    gpu.func @kernel_vector_shuffle(%arg0 : memref<2xi32>, %arg1 : memref<2xi32>, %arg2 : memref<3xi32>)
+      kernel attributes { spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [1, 1, 1]>} {
+      %c0 = arith.constant 0 : index
+      %vec0 = vector.load %arg0[%c0] : memref<2xi32>, vector<2xi32>
+      %vec1 = vector.load %arg1[%c0] : memref<2xi32>, vector<2xi32>
+      %result = vector.shuffle %vec0, %vec1[2, 1, 3] : vector<2xi32>, vector<2xi32>
+      vector.store %result, %arg2[%c0] : memref<3xi32>, vector<3xi32>
+      gpu.return
+    }
+  }
+
+  func.func @main() {
+    // Allocate 3 buffers.
+    %buf0 = memref.alloc() : memref<2xi32>
+    %buf1 = memref.alloc() : memref<2xi32>
+    %buf2 = memref.alloc() : memref<3xi32>
+    
+    %idx0 = arith.constant 0 : index
+    %idx1 = arith.constant 1 : index
+    %idx4 = arith.constant 4 : index
+
+    // Initialize input buffer
+    %buf0_vals = arith.constant dense<[0, 1]> : vector<2xi32>
+    %buf1_vals = arith.constant dense<[2, 3]> : vector<2xi32>
+    vector.store %buf0_vals, %buf0[%idx0] : memref<2xi32>, vector<2xi32>
+    vector.store %buf1_vals, %buf1[%idx0] : memref<2xi32>, vector<2xi32>
+
+    // Initialize output buffer.
+    %value0 = arith.constant 0 : i32
+    %buf3 = memref.cast %buf2 : memref<3xi32> to memref<?xi32>
+    call @fillResource1DInt(%buf3, %value0) : (memref<?xi32>, i32) -> ()
+
+    gpu.launch_func @kernels::@kernel_vector_shuffle
+        blocks in (%idx4, %idx1, %idx1) threads in (%idx1, %idx1, %idx1)
+        args(%buf0 : memref<2xi32>, %buf1 : memref<2xi32>, %buf2 : memref<3xi32>)
+    %buf4 = memref.cast %buf3 : memref<?xi32> to memref<*xi32>
+    call @printMemrefI32(%buf4) : (memref<*xi32>) -> ()
+    return
+  }
+  func.func private @fillResource1DInt(%0 : memref<?xi32>, %1 : i32)
+  func.func private @printMemrefI32(%ptr : memref<*xi32>)
+}

From cfb209b92a26f16ed7413b32da20fc436eff8c58 Mon Sep 17 00:00:00 2001
From: Vy Nguyen <oontvoo@users.noreply.github.com>
Date: Wed, 29 May 2024 12:22:42 -0400
Subject: [PATCH 145/230] [lldb][lldb-dap] Cleanup breakpoint filters. (#87550)

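Details:
- Remove the Swift breakpoint filter because this version of LLDB does not
  support Swift.
- Only return Objective-C filters when working on macOS.

The exception breakpoint filters are now populated in `request_initialize`,
after the debugger is created, and a language's filters are kept only if the
new `SBDebugger::SupportsLanguage` reports that language as supported.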
---
 lldb/include/lldb/API/SBDebugger.h    |  2 ++
 lldb/include/lldb/Symbol/TypeSystem.h |  1 +
 lldb/source/API/SBDebugger.cpp        |  4 +++
 lldb/source/Symbol/TypeSystem.cpp     | 11 ++++++++
 lldb/tools/lldb-dap/DAP.cpp           | 39 ++++++++++++++++++++-------
 lldb/tools/lldb-dap/DAP.h             |  4 ++-
 lldb/tools/lldb-dap/lldb-dap.cpp      |  6 +++--
 7 files changed, 54 insertions(+), 13 deletions(-)

diff --git a/lldb/include/lldb/API/SBDebugger.h b/lldb/include/lldb/API/SBDebugger.h
index af19b1faf3bf51..84ea9c0f772e16 100644
--- a/lldb/include/lldb/API/SBDebugger.h
+++ b/lldb/include/lldb/API/SBDebugger.h
@@ -57,6 +57,8 @@ class LLDB_API SBDebugger {
 
   static const char *GetBroadcasterClass();
 
+  static bool SupportsLanguage(lldb::LanguageType language);
+
   lldb::SBBroadcaster GetBroadcaster();
 
   /// Get progress data from a SBEvent whose type is eBroadcastBitProgress.
diff --git a/lldb/include/lldb/Symbol/TypeSystem.h b/lldb/include/lldb/Symbol/TypeSystem.h
index b4025c173a1861..7d48f9b316138c 100644
--- a/lldb/include/lldb/Symbol/TypeSystem.h
+++ b/lldb/include/lldb/Symbol/TypeSystem.h
@@ -209,6 +209,7 @@ class TypeSystem : public PluginInterface,
   // TypeSystems can support more than one language
   virtual bool SupportsLanguage(lldb::LanguageType language) = 0;
 
+  static bool SupportsLanguageStatic(lldb::LanguageType language);
   // Type Completion
 
   virtual bool GetCompleteType(lldb::opaque_compiler_type_t type) = 0;
diff --git a/lldb/source/API/SBDebugger.cpp b/lldb/source/API/SBDebugger.cpp
index 7ef0d6efd4aaa5..29da7d33dd80b8 100644
--- a/lldb/source/API/SBDebugger.cpp
+++ b/lldb/source/API/SBDebugger.cpp
@@ -1742,3 +1742,7 @@ bool SBDebugger::InterruptRequested()   {
     return m_opaque_sp->InterruptRequested();
   return false;
 }
+
+bool SBDebugger::SupportsLanguage(lldb::LanguageType language) {
+  return TypeSystem::SupportsLanguageStatic(language);
+}
diff --git a/lldb/source/Symbol/TypeSystem.cpp b/lldb/source/Symbol/TypeSystem.cpp
index 4956f10a0b0a73..5d56d9b1829dac 100644
--- a/lldb/source/Symbol/TypeSystem.cpp
+++ b/lldb/source/Symbol/TypeSystem.cpp
@@ -335,3 +335,14 @@ TypeSystemMap::GetTypeSystemForLanguage(lldb::LanguageType language,
   }
   return GetTypeSystemForLanguage(language);
 }
+
+bool TypeSystem::SupportsLanguageStatic(lldb::LanguageType language) {
+  if (language == eLanguageTypeUnknown)
+    return false;
+
+  LanguageSet languages =
+      PluginManager::GetAllTypeSystemSupportedLanguagesForTypes();
+  if (languages.Empty())
+    return false;
+  return languages[language];
+}
diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp
index d419f821999e6c..807d27c2c869d9 100644
--- a/lldb/tools/lldb-dap/DAP.cpp
+++ b/lldb/tools/lldb-dap/DAP.cpp
@@ -32,14 +32,7 @@ namespace lldb_dap {
 DAP g_dap;
 
 DAP::DAP()
-    : broadcaster("lldb-dap"),
-      exception_breakpoints(
-          {{"cpp_catch", "C++ Catch", lldb::eLanguageTypeC_plus_plus},
-           {"cpp_throw", "C++ Throw", lldb::eLanguageTypeC_plus_plus},
-           {"objc_catch", "Objective-C Catch", lldb::eLanguageTypeObjC},
-           {"objc_throw", "Objective-C Throw", lldb::eLanguageTypeObjC},
-           {"swift_catch", "Swift Catch", lldb::eLanguageTypeSwift},
-           {"swift_throw", "Swift Throw", lldb::eLanguageTypeSwift}}),
+    : broadcaster("lldb-dap"), exception_breakpoints(),
       focus_tid(LLDB_INVALID_THREAD_ID), stop_at_entry(false), is_attach(false),
       enable_auto_variable_summaries(false),
       enable_synthetic_child_debugging(false),
@@ -65,8 +58,32 @@ DAP::DAP()
 
 DAP::~DAP() = default;
 
+void DAP::PopulateExceptionBreakpoints() {
+  exception_breakpoints = {};
+  if (debugger.SupportsLanguage(lldb::eLanguageTypeC_plus_plus)) {
+    exception_breakpoints->emplace_back("cpp_catch", "C++ Catch",
+                                        lldb::eLanguageTypeC_plus_plus);
+    exception_breakpoints->emplace_back("cpp_throw", "C++ Throw",
+                                        lldb::eLanguageTypeC_plus_plus);
+  }
+  if (debugger.SupportsLanguage(lldb::eLanguageTypeObjC)) {
+    exception_breakpoints->emplace_back("objc_catch", "Objective-C Catch",
+                                        lldb::eLanguageTypeObjC);
+    exception_breakpoints->emplace_back("objc_throw", "Objective-C Throw",
+                                        lldb::eLanguageTypeObjC);
+  }
+  if (debugger.SupportsLanguage(lldb::eLanguageTypeSwift)) {
+    exception_breakpoints->emplace_back("swift_catch", "Swift Catch",
+                                        lldb::eLanguageTypeSwift);
+    exception_breakpoints->emplace_back("swift_throw", "Swift Throw",
+                                        lldb::eLanguageTypeSwift);
+  }
+}
+
 ExceptionBreakpoint *DAP::GetExceptionBreakpoint(const std::string &filter) {
-  for (auto &bp : exception_breakpoints) {
+  assert(exception_breakpoints.has_value() &&
+         "PopulateExceptionBreakpoints must be called first");
+  for (auto &bp : *exception_breakpoints) {
     if (bp.filter == filter)
       return &bp;
   }
@@ -74,7 +91,9 @@ ExceptionBreakpoint *DAP::GetExceptionBreakpoint(const std::string &filter) {
 }
 
 ExceptionBreakpoint *DAP::GetExceptionBreakpoint(const lldb::break_id_t bp_id) {
-  for (auto &bp : exception_breakpoints) {
+  assert(exception_breakpoints.has_value() &&
+         "PopulateExceptionBreakpoints must be called first");
+  for (auto &bp : *exception_breakpoints) {
     if (bp.bp.GetID() == bp_id)
       return &bp;
   }
diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h
index a88ee3e1dec6bc..d114b886a15970 100644
--- a/lldb/tools/lldb-dap/DAP.h
+++ b/lldb/tools/lldb-dap/DAP.h
@@ -156,7 +156,7 @@ struct DAP {
   std::unique_ptr<std::ofstream> log;
   llvm::StringMap<SourceBreakpointMap> source_breakpoints;
   FunctionBreakpointMap function_breakpoints;
-  std::vector<ExceptionBreakpoint> exception_breakpoints;
+  std::optional<std::vector<ExceptionBreakpoint>> exception_breakpoints;
   std::vector<std::string> init_commands;
   std::vector<std::string> pre_run_commands;
   std::vector<std::string> post_run_commands;
@@ -228,6 +228,8 @@ struct DAP {
 
   llvm::json::Value CreateTopLevelScopes();
 
+  void PopulateExceptionBreakpoints();
+
   /// \return
   ///   Attempt to determine if an expression is a variable expression or
   ///   lldb command using a hueristic based on the first term of the
diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp
index 7746afb6cbbf38..470c9f84c6a203 100644
--- a/lldb/tools/lldb-dap/lldb-dap.cpp
+++ b/lldb/tools/lldb-dap/lldb-dap.cpp
@@ -16,6 +16,7 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <optional>
 #include <sys/stat.h>
 #include <sys/types.h>
 #if defined(_WIN32)
@@ -1586,6 +1587,7 @@ void request_initialize(const llvm::json::Object &request) {
   bool source_init_file = GetBoolean(arguments, "sourceInitFile", true);
 
   g_dap.debugger = lldb::SBDebugger::Create(source_init_file, log_cb, nullptr);
+  g_dap.PopulateExceptionBreakpoints();
   auto cmd = g_dap.debugger.GetCommandInterpreter().AddMultiwordCommand(
       "lldb-dap", "Commands for managing lldb-dap.");
   if (GetBoolean(arguments, "supportsStartDebuggingRequest", false)) {
@@ -1621,7 +1623,7 @@ void request_initialize(const llvm::json::Object &request) {
   body.try_emplace("supportsEvaluateForHovers", true);
   // Available filters or options for the setExceptionBreakpoints request.
   llvm::json::Array filters;
-  for (const auto &exc_bp : g_dap.exception_breakpoints) {
+  for (const auto &exc_bp : *g_dap.exception_breakpoints) {
     filters.emplace_back(CreateExceptionBreakpointFilter(exc_bp));
   }
   body.try_emplace("exceptionBreakpointFilters", std::move(filters));
@@ -2476,7 +2478,7 @@ void request_setExceptionBreakpoints(const llvm::json::Object &request) {
   // Keep a list of any exception breakpoint filter names that weren't set
   // so we can clear any exception breakpoints if needed.
   std::set<std::string> unset_filters;
-  for (const auto &bp : g_dap.exception_breakpoints)
+  for (const auto &bp : *g_dap.exception_breakpoints)
     unset_filters.insert(bp.filter);
 
   for (const auto &value : *filters) {

From 2ceec68e1630b40a37448c44fea63f9114848235 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Wed, 29 May 2024 09:24:16 -0700
Subject: [PATCH 146/230] [ValueTypes] Rename FlagVT to Glue in ValueTypes.td.
 NFC

Nothing ever refers to it as `FlagVT`, so we can name the def `Glue`
directly and drop the `LLVMName = "Glue"` override.
---
 llvm/include/llvm/CodeGen/ValueTypes.td | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index a6981b0ffa13c2..963b6a71de3801 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -274,9 +274,7 @@ def nxv4f64  : VTScalableVec<4,  f64, 188>;  // n x  4 x  f64 vector value
 def nxv8f64  : VTScalableVec<8,  f64, 189>;  // n x  8 x  f64 vector value
 
 def x86mmx    : ValueType<64,   190>;  // X86 MMX value
-def FlagVT    : ValueType<0,    191> { // Pre-RA sched glue
-  let LLVMName = "Glue";
-}
+def Glue      : ValueType<0,    191>;  // Pre-RA sched glue
 def isVoid    : ValueType<0,    192>;  // Produces no value
 def untyped   : ValueType<8,    193> { // Produces an untyped value
   let LLVMName = "Untyped";

From 949ef57dd20f8d3f3257376b91af71ab8c380338 Mon Sep 17 00:00:00 2001
From: Konstantin Zhuravlyov <kzhuravl_dev@outlook.com>
Date: Wed, 29 May 2024 12:52:34 -0400
Subject: [PATCH 147/230] AMDGPU/NFC: Reserve 0x058 EF_AMDGPU_MACHs (#93696)

---
 llvm/docs/AMDGPUUsage.rst            | 1 +
 llvm/include/llvm/BinaryFormat/ELF.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index b827524e6b8db4..95b54548f4fa8a 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1972,6 +1972,7 @@ The AMDGPU backend uses the following ELF header:
      *reserved*                                 0x055      Reserved.
      *reserved*                                 0x056      Reserved.
      *reserved*                                 0x057      Reserved.
+     *reserved*                                 0x058      Reserved.
      ========================================== ========== =============================
 
 Sections
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index 67cacaed2e12e0..9a538252d9beff 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -798,6 +798,7 @@ enum : unsigned {
   EF_AMDGPU_MACH_AMDGCN_RESERVED_0X55 = 0x055,
   EF_AMDGPU_MACH_AMDGCN_RESERVED_0X56 = 0x056,
   EF_AMDGPU_MACH_AMDGCN_RESERVED_0X57 = 0x057,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X58 = 0x058,
   // clang-format on
 
   // First/last AMDGCN-based processors.

From 4e251e7cad6c27b7476edd8e1dc4b98d5a8efe76 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 29 May 2024 17:57:23 +0100
Subject: [PATCH 148/230] Fix MSVC "result of 32-bit shift implicitly converted
 to 64 bits" warning. NFC.

---
 llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 9208b096affad9..6f0cae2edab17f 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -8013,7 +8013,7 @@ LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
         Tmp2 = MIRBuilder.buildLShr(Ty, Src, ShAmt);
       }
 
-      auto Mask = MIRBuilder.buildConstant(Ty, 1U << J);
+      auto Mask = MIRBuilder.buildConstant(Ty, 1ULL << J);
       Tmp2 = MIRBuilder.buildAnd(Ty, Tmp2, Mask);
       if (I == 0)
         Tmp = Tmp2;

From 2665b2a6ddb1625799536c45ca15605a6f24c081 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 29 May 2024 18:05:41 +0100
Subject: [PATCH 149/230] [X86] Pull out combineConstantPoolLoads helper from
 combineLoad. NFC.

The logic in combineLoad is already pretty dense, and a future patch will complicate it further.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 137 +++++++++++++++---------
 1 file changed, 86 insertions(+), 51 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2d8343ffa1a0b3..24340e135b08b9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -50823,10 +50823,83 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl,
+                                        SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        const X86Subtarget &Subtarget) {
+  auto *Ld = cast<LoadSDNode>(N);
+  EVT RegVT = Ld->getValueType(0);
+  EVT MemVT = Ld->getMemoryVT();
+  SDValue Ptr = Ld->getBasePtr();
+  SDValue Chain = Ld->getChain();
+  ISD::LoadExtType Ext = Ld->getExtensionType();
+
+  if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple())
+    return SDValue();
+
+  if (!(RegVT.is128BitVector() || RegVT.is256BitVector()))
+    return SDValue();
+
+  auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
+                         ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
+    for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
+      if (Undefs[I])
+        continue;
+      if (UserUndefs[I] || Bits[I] != UserBits[I])
+        return false;
+    }
+    return true;
+  };
+
+  // Look through all other loads/broadcasts in the chain for another constant
+  // pool entry.
+  for (SDNode *User : Chain->uses()) {
+    auto *UserLd = dyn_cast<MemSDNode>(User);
+    if (User != N && UserLd &&
+        (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
+         User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
+         ISD::isNormalLoad(User)) &&
+        UserLd->getChain() == Chain && !User->hasAnyUseOfValue(1) &&
+        User->getValueSizeInBits(0).getFixedValue() >
+            RegVT.getFixedSizeInBits()) {
+      EVT UserVT = User->getValueType(0);
+      SDValue UserPtr = UserLd->getBasePtr();
+      const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
+      const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
+
+      // See if we are loading a constant that matches in the lower
+      // bits of a longer constant (but from a different constant pool ptr).
+      if (LdC && UserC && UserPtr != Ptr) {
+        unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
+        unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
+        if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
+          APInt Undefs, UserUndefs;
+          SmallVector<APInt> Bits, UserBits;
+          unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
+                                      UserVT.getScalarSizeInBits());
+          if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
+                                            Bits) &&
+              getTargetConstantBitsFromNode(SDValue(User, 0), NumBits,
+                                            UserUndefs, UserBits)) {
+            if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
+              SDValue Extract = extractSubVector(
+                  SDValue(User, 0), 0, DAG, SDLoc(N), RegVT.getSizeInBits());
+              Extract = DAG.getBitcast(RegVT, Extract);
+              return DCI.CombineTo(N, Extract, SDValue(User, 1));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return SDValue();
+}
+
 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
-  LoadSDNode *Ld = cast<LoadSDNode>(N);
+  auto *Ld = cast<LoadSDNode>(N);
   EVT RegVT = Ld->getValueType(0);
   EVT MemVT = Ld->getMemoryVT();
   SDLoc dl(Ld);
@@ -50885,7 +50958,7 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
     }
   }
 
-  // If we also load/broadcast this to a wider type, then just extract the
+  // If we also broadcast this vector to a wider type, then just extract the
   // lowest subvector.
   if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
       (RegVT.is128BitVector() || RegVT.is256BitVector())) {
@@ -50894,61 +50967,23 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
     for (SDNode *User : Chain->uses()) {
       auto *UserLd = dyn_cast<MemSDNode>(User);
       if (User != N && UserLd &&
-          (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
-           User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
-           ISD::isNormalLoad(User)) &&
-          UserLd->getChain() == Chain && !User->hasAnyUseOfValue(1) &&
+          User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
+          UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
+          UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
+          !User->hasAnyUseOfValue(1) &&
           User->getValueSizeInBits(0).getFixedValue() >
               RegVT.getFixedSizeInBits()) {
-        if (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
-            UserLd->getBasePtr() == Ptr &&
-            UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits()) {
-          SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
-                                             RegVT.getSizeInBits());
-          Extract = DAG.getBitcast(RegVT, Extract);
-          return DCI.CombineTo(N, Extract, SDValue(User, 1));
-        }
-        auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
-                               ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
-          for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
-            if (Undefs[I])
-              continue;
-            if (UserUndefs[I] || Bits[I] != UserBits[I])
-              return false;
-          }
-          return true;
-        };
-        // See if we are loading a constant that matches in the lower
-        // bits of a longer constant (but from a different constant pool ptr).
-        EVT UserVT = User->getValueType(0);
-        SDValue UserPtr = UserLd->getBasePtr();
-        const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
-        const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
-        if (LdC && UserC && UserPtr != Ptr) {
-          unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
-          unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
-          if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
-            APInt Undefs, UserUndefs;
-            SmallVector<APInt> Bits, UserBits;
-            unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
-                                        UserVT.getScalarSizeInBits());
-            if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
-                                              Bits) &&
-                getTargetConstantBitsFromNode(SDValue(User, 0), NumBits,
-                                              UserUndefs, UserBits)) {
-              if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
-                SDValue Extract = extractSubVector(
-                    SDValue(User, 0), 0, DAG, SDLoc(N), RegVT.getSizeInBits());
-                Extract = DAG.getBitcast(RegVT, Extract);
-                return DCI.CombineTo(N, Extract, SDValue(User, 1));
-              }
-            }
-          }
-        }
+        SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
+                                           RegVT.getSizeInBits());
+        Extract = DAG.getBitcast(RegVT, Extract);
+        return DCI.CombineTo(N, Extract, SDValue(User, 1));
       }
     }
   }
 
+  if (SDValue V = combineConstantPoolLoads(Ld, dl, DAG, DCI, Subtarget))
+    return V;
+
   // Cast ptr32 and ptr64 pointers to the default address space before a load.
   unsigned AddrSpace = Ld->getAddressSpace();
   if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||

From 737a3018e826f5452f181a550be90b9135d8eda5 Mon Sep 17 00:00:00 2001
From: Mingming Liu <mingmingl@google.com>
Date: Wed, 29 May 2024 10:15:17 -0700
Subject: [PATCH 150/230] [nfc][InstrFDO] Add Header::getIndexedProfileVersion
 and use it to decide profile version. (#93613)

This is a split of https://github.com/llvm/llvm-project/pull/93346 as
discussed.
---
 llvm/include/llvm/ProfileData/InstrProf.h |  4 ++++
 llvm/lib/ProfileData/InstrProf.cpp        | 11 ++++++++---
 llvm/lib/ProfileData/InstrProfReader.cpp  |  8 ++++----
 3 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index 2cee928b210e2e..15b9eb688e27e5 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -1211,6 +1211,10 @@ struct Header {
   // Returns the size of the header in bytes for all valid fields based on the
   // version. I.e a older version header will return a smaller size.
   size_t size() const;
+
+  // Return the indexed profile version, i.e., the least significant 32 bits
+  // in Header.Version.
+  uint64_t getIndexedProfileVersion() const;
 };
 
 // Profile summary data recorded in the profile data file in indexed
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index f9cd71b37002fe..dcf6aac8b59968 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -1656,10 +1656,11 @@ Expected<Header> Header::readFromBuffer(const unsigned char *Buffer) {
 
   // Read the version.
   H.Version = read(Buffer, offsetOf(&Header::Version));
-  if (GET_VERSION(H.Version) > IndexedInstrProf::ProfVersion::CurrentVersion)
+  if (H.getIndexedProfileVersion() >
+      IndexedInstrProf::ProfVersion::CurrentVersion)
     return make_error<InstrProfError>(instrprof_error::unsupported_version);
 
-  switch (GET_VERSION(H.Version)) {
+  switch (H.getIndexedProfileVersion()) {
     // When a new field is added in the header add a case statement here to
     // populate it.
     static_assert(
@@ -1689,8 +1690,12 @@ Expected<Header> Header::readFromBuffer(const unsigned char *Buffer) {
   return H;
 }
 
+uint64_t Header::getIndexedProfileVersion() const {
+  return GET_VERSION(Version);
+}
+
 size_t Header::size() const {
-  switch (GET_VERSION(Version)) {
+  switch (getIndexedProfileVersion()) {
     // When a new field is added to the header add a case statement here to
     // compute the size as offset of the new field + size of the new field. This
     // relies on the field being added to the end of the list.
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index 798236c295194a..a5ae0c6fa62444 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -1328,7 +1328,7 @@ Error IndexedInstrProfReader::readHeader() {
 
   // The MemProfOffset field in the header is only valid when the format
   // version is higher than 8 (when it was introduced).
-  if (GET_VERSION(Header->Version) >= 8 &&
+  if (Header->getIndexedProfileVersion() >= 8 &&
       Header->Version & VARIANT_MASK_MEMPROF) {
     if (Error E = MemProfReader.deserialize(Start, Header->MemProfOffset))
       return E;
@@ -1336,7 +1336,7 @@ Error IndexedInstrProfReader::readHeader() {
 
   // BinaryIdOffset field in the header is only valid when the format version
   // is higher than 9 (when it was introduced).
-  if (GET_VERSION(Header->Version) >= 9) {
+  if (Header->getIndexedProfileVersion() >= 9) {
     const unsigned char *Ptr = Start + Header->BinaryIdOffset;
     // Read binary ids size.
     BinaryIdsSize =
@@ -1350,7 +1350,7 @@ Error IndexedInstrProfReader::readHeader() {
                                         "corrupted binary ids");
   }
 
-  if (GET_VERSION(Header->Version) >= 12) {
+  if (Header->getIndexedProfileVersion() >= 12) {
     const unsigned char *Ptr = Start + Header->VTableNamesOffset;
 
     CompressedVTableNamesLen =
@@ -1363,7 +1363,7 @@ Error IndexedInstrProfReader::readHeader() {
       return make_error<InstrProfError>(instrprof_error::truncated);
   }
 
-  if (GET_VERSION(Header->Version) >= 10 &&
+  if (Header->getIndexedProfileVersion() >= 10 &&
       Header->Version & VARIANT_MASK_TEMPORAL_PROF) {
     const unsigned char *Ptr = Start + Header->TemporalProfTracesOffset;
     const auto *PtrEnd = (const unsigned char *)DataBuffer->getBufferEnd();

From 8c5a7a1fc4890fcae50f8e8a61d5a2e2b1ebd7e5 Mon Sep 17 00:00:00 2001
From: Vadim D <36827317+vvd170501@users.noreply.github.com>
Date: Wed, 29 May 2024 20:29:57 +0300
Subject: [PATCH 151/230] [clangd] Add config option to allow detection of
 unused angled includes (#87208)

This PR adds a new `AnalyzeAngledIncludes` option to `Includes` section
of clangd config. This option enables unused include checks for all includes
that use the `<>` syntax, not just standard library includes.
---
 clang-tools-extra/clangd/Config.h             |  5 +-
 clang-tools-extra/clangd/ConfigCompile.cpp    | 60 ++++++++++++-------
 clang-tools-extra/clangd/ConfigFragment.h     |  4 ++
 clang-tools-extra/clangd/ConfigYAML.cpp       |  4 ++
 clang-tools-extra/clangd/IncludeCleaner.cpp   | 32 ++++++----
 clang-tools-extra/clangd/IncludeCleaner.h     |  4 +-
 clang-tools-extra/clangd/ParsedAST.cpp        |  3 +-
 .../clangd/unittests/ConfigCompileTests.cpp   |  6 ++
 .../clangd/unittests/ConfigYAMLTests.cpp      | 15 +++++
 .../clangd/unittests/IncludeCleanerTests.cpp  | 44 ++++++++++++++
 clang-tools-extra/docs/ReleaseNotes.rst       |  5 ++
 11 files changed, 144 insertions(+), 38 deletions(-)

diff --git a/clang-tools-extra/clangd/Config.h b/clang-tools-extra/clangd/Config.h
index 4371c80a6c5877..41143b9ebc8d27 100644
--- a/clang-tools-extra/clangd/Config.h
+++ b/clang-tools-extra/clangd/Config.h
@@ -110,10 +110,11 @@ struct Config {
     IncludesPolicy UnusedIncludes = IncludesPolicy::Strict;
     IncludesPolicy MissingIncludes = IncludesPolicy::None;
 
-    /// IncludeCleaner will not diagnose usages of these headers matched by
-    /// these regexes.
     struct {
+      /// IncludeCleaner will not diagnose usages of these headers matched by
+      /// these regexes.
       std::vector<std::function<bool(llvm::StringRef)>> IgnoreHeader;
+      bool AnalyzeAngledIncludes = false;
     } Includes;
   } Diagnostics;
 
diff --git a/clang-tools-extra/clangd/ConfigCompile.cpp b/clang-tools-extra/clangd/ConfigCompile.cpp
index 5bb2eb4a9f803f..f32f674443ffeb 100644
--- a/clang-tools-extra/clangd/ConfigCompile.cpp
+++ b/clang-tools-extra/clangd/ConfigCompile.cpp
@@ -572,32 +572,46 @@ struct FragmentCompiler {
 #else
     static llvm::Regex::RegexFlags Flags = llvm::Regex::NoFlags;
 #endif
-    auto Filters = std::make_shared<std::vector<llvm::Regex>>();
-    for (auto &HeaderPattern : F.IgnoreHeader) {
-      // Anchor on the right.
-      std::string AnchoredPattern = "(" + *HeaderPattern + ")$";
-      llvm::Regex CompiledRegex(AnchoredPattern, Flags);
-      std::string RegexError;
-      if (!CompiledRegex.isValid(RegexError)) {
-        diag(Warning,
-             llvm::formatv("Invalid regular expression '{0}': {1}",
-                           *HeaderPattern, RegexError)
-                 .str(),
-             HeaderPattern.Range);
-        continue;
+    std::shared_ptr<std::vector<llvm::Regex>> Filters;
+    if (!F.IgnoreHeader.empty()) {
+      Filters = std::make_shared<std::vector<llvm::Regex>>();
+      for (auto &HeaderPattern : F.IgnoreHeader) {
+        // Anchor on the right.
+        std::string AnchoredPattern = "(" + *HeaderPattern + ")$";
+        llvm::Regex CompiledRegex(AnchoredPattern, Flags);
+        std::string RegexError;
+        if (!CompiledRegex.isValid(RegexError)) {
+          diag(Warning,
+               llvm::formatv("Invalid regular expression '{0}': {1}",
+                             *HeaderPattern, RegexError)
+                   .str(),
+               HeaderPattern.Range);
+          continue;
+        }
+        Filters->push_back(std::move(CompiledRegex));
       }
-      Filters->push_back(std::move(CompiledRegex));
     }
-    if (Filters->empty())
+    // Kept as an optional so that AnalyzeAngledIncludes is overridden only
+    // when the current fragment sets it explicitly; otherwise the value
+    // already present in the config is left untouched.
+    std::optional<bool> AnalyzeAngledIncludes;
+    if (F.AnalyzeAngledIncludes.has_value())
+      AnalyzeAngledIncludes = **F.AnalyzeAngledIncludes;
+    if (!Filters && !AnalyzeAngledIncludes.has_value())
       return;
-    auto Filter = [Filters](llvm::StringRef Path) {
-      for (auto &Regex : *Filters)
-        if (Regex.match(Path))
-          return true;
-      return false;
-    };
-    Out.Apply.push_back([Filter](const Params &, Config &C) {
-      C.Diagnostics.Includes.IgnoreHeader.emplace_back(Filter);
+    Out.Apply.push_back([Filters = std::move(Filters),
+                         AnalyzeAngledIncludes](const Params &, Config &C) {
+      if (Filters) {
+        auto Filter = [Filters](llvm::StringRef Path) {
+          for (auto &Regex : *Filters)
+            if (Regex.match(Path))
+              return true;
+          return false;
+        };
+        C.Diagnostics.Includes.IgnoreHeader.emplace_back(std::move(Filter));
+      }
+      if (AnalyzeAngledIncludes.has_value())
+        C.Diagnostics.Includes.AnalyzeAngledIncludes = *AnalyzeAngledIncludes;
     });
   }
 
diff --git a/clang-tools-extra/clangd/ConfigFragment.h b/clang-tools-extra/clangd/ConfigFragment.h
index 7fa61108c78a05..f3e51a9b6dbc4b 100644
--- a/clang-tools-extra/clangd/ConfigFragment.h
+++ b/clang-tools-extra/clangd/ConfigFragment.h
@@ -254,6 +254,10 @@ struct Fragment {
       /// unused or missing. These can match any suffix of the header file in
       /// question.
       std::vector<Located<std::string>> IgnoreHeader;
+
+      /// If false (default), unused angled (`<>`) includes are not diagnosed.
+      /// Standard library headers are analyzed regardless of this option.
+      std::optional<Located<bool>> AnalyzeAngledIncludes;
     };
     IncludesBlock Includes;
 
diff --git a/clang-tools-extra/clangd/ConfigYAML.cpp b/clang-tools-extra/clangd/ConfigYAML.cpp
index ce09af819247ae..3e9b6a07d3b325 100644
--- a/clang-tools-extra/clangd/ConfigYAML.cpp
+++ b/clang-tools-extra/clangd/ConfigYAML.cpp
@@ -169,6 +169,10 @@ class Parser {
       if (auto Values = scalarValues(N))
         F.IgnoreHeader = std::move(*Values);
     });
+    Dict.handle("AnalyzeAngledIncludes", [&](Node &N) {
+      if (auto Value = boolValue(N, "AnalyzeAngledIncludes"))
+        F.AnalyzeAngledIncludes = *Value;
+    });
     Dict.parse(N);
   }
 
diff --git a/clang-tools-extra/clangd/IncludeCleaner.cpp b/clang-tools-extra/clangd/IncludeCleaner.cpp
index 8e48f546d94e77..01b47679790f1d 100644
--- a/clang-tools-extra/clangd/IncludeCleaner.cpp
+++ b/clang-tools-extra/clangd/IncludeCleaner.cpp
@@ -68,24 +68,30 @@ bool isIgnored(llvm::StringRef HeaderPath, HeaderFilter IgnoreHeaders) {
 }
 
 bool mayConsiderUnused(const Inclusion &Inc, ParsedAST &AST,
-                       const include_cleaner::PragmaIncludes *PI) {
+                       const include_cleaner::PragmaIncludes *PI,
+                       bool AnalyzeAngledIncludes) {
   assert(Inc.HeaderID);
   auto HID = static_cast<IncludeStructure::HeaderID>(*Inc.HeaderID);
   auto FE = AST.getSourceManager().getFileManager().getFileRef(
       AST.getIncludeStructure().getRealPath(HID));
   assert(FE);
   if (FE->getDir() == AST.getPreprocessor()
-                  .getHeaderSearchInfo()
-                  .getModuleMap()
-                  .getBuiltinDir()) 
+                          .getHeaderSearchInfo()
+                          .getModuleMap()
+                          .getBuiltinDir())
     return false;
   if (PI && PI->shouldKeep(*FE))
     return false;
   // FIXME(kirillbobyrev): We currently do not support the umbrella headers.
   // System headers are likely to be standard library headers.
-  // Until we have good support for umbrella headers, don't warn about them.
-  if (Inc.Written.front() == '<')
-    return tooling::stdlib::Header::named(Inc.Written).has_value();
+  // Until we have good support for umbrella headers, don't warn about them
+  // (unless analysis is explicitly enabled).
+  if (Inc.Written.front() == '<') {
+    if (tooling::stdlib::Header::named(Inc.Written))
+      return true;
+    if (!AnalyzeAngledIncludes)
+      return false;
+  }
   if (PI) {
     // Check if main file is the public interface for a private header. If so we
     // shouldn't diagnose it as unused.
@@ -266,7 +272,8 @@ Fix fixAll(const Fix &RemoveAllUnused, const Fix &AddAllMissing) {
 
 std::vector<const Inclusion *>
 getUnused(ParsedAST &AST,
-          const llvm::DenseSet<IncludeStructure::HeaderID> &ReferencedFiles) {
+          const llvm::DenseSet<IncludeStructure::HeaderID> &ReferencedFiles,
+          bool AnalyzeAngledIncludes) {
   trace::Span Tracer("IncludeCleaner::getUnused");
   std::vector<const Inclusion *> Unused;
   for (const Inclusion &MFI : AST.getIncludeStructure().MainFileIncludes) {
@@ -275,7 +282,8 @@ getUnused(ParsedAST &AST,
     auto IncludeID = static_cast<IncludeStructure::HeaderID>(*MFI.HeaderID);
     if (ReferencedFiles.contains(IncludeID))
       continue;
-    if (!mayConsiderUnused(MFI, AST, &AST.getPragmaIncludes())) {
+    if (!mayConsiderUnused(MFI, AST, &AST.getPragmaIncludes(),
+                           AnalyzeAngledIncludes)) {
       dlog("{0} was not used, but is not eligible to be diagnosed as unused",
            MFI.Written);
       continue;
@@ -347,7 +355,8 @@ include_cleaner::Includes convertIncludes(const ParsedAST &AST) {
   return ConvertedIncludes;
 }
 
-IncludeCleanerFindings computeIncludeCleanerFindings(ParsedAST &AST) {
+IncludeCleanerFindings
+computeIncludeCleanerFindings(ParsedAST &AST, bool AnalyzeAngledIncludes) {
   // Interaction is only polished for C/CPP.
   if (AST.getLangOpts().ObjC)
     return {};
@@ -432,7 +441,8 @@ IncludeCleanerFindings computeIncludeCleanerFindings(ParsedAST &AST) {
            MapInfo::getHashValue(RHS.Symbol);
   });
   MissingIncludes.erase(llvm::unique(MissingIncludes), MissingIncludes.end());
-  std::vector<const Inclusion *> UnusedIncludes = getUnused(AST, Used);
+  std::vector<const Inclusion *> UnusedIncludes =
+      getUnused(AST, Used, AnalyzeAngledIncludes);
   return {std::move(UnusedIncludes), std::move(MissingIncludes)};
 }
 
diff --git a/clang-tools-extra/clangd/IncludeCleaner.h b/clang-tools-extra/clangd/IncludeCleaner.h
index 624e2116be7da3..a01146d14e3c17 100644
--- a/clang-tools-extra/clangd/IncludeCleaner.h
+++ b/clang-tools-extra/clangd/IncludeCleaner.h
@@ -53,7 +53,9 @@ struct IncludeCleanerFindings {
   std::vector<MissingIncludeDiagInfo> MissingIncludes;
 };
 
-IncludeCleanerFindings computeIncludeCleanerFindings(ParsedAST &AST);
+IncludeCleanerFindings
+computeIncludeCleanerFindings(ParsedAST &AST,
+                              bool AnalyzeAngledIncludes = false);
 
 using HeaderFilter = llvm::ArrayRef<std::function<bool(llvm::StringRef)>>;
 std::vector<Diag>
diff --git a/clang-tools-extra/clangd/ParsedAST.cpp b/clang-tools-extra/clangd/ParsedAST.cpp
index 3ff759415f7c8b..2bd1fbcad2ada0 100644
--- a/clang-tools-extra/clangd/ParsedAST.cpp
+++ b/clang-tools-extra/clangd/ParsedAST.cpp
@@ -373,7 +373,8 @@ std::vector<Diag> getIncludeCleanerDiags(ParsedAST &AST, llvm::StringRef Code,
       Cfg.Diagnostics.UnusedIncludes == Config::IncludesPolicy::None;
   if (SuppressMissing && SuppressUnused)
     return {};
-  auto Findings = computeIncludeCleanerFindings(AST);
+  auto Findings = computeIncludeCleanerFindings(
+      AST, Cfg.Diagnostics.Includes.AnalyzeAngledIncludes);
   if (SuppressMissing)
     Findings.MissingIncludes.clear();
   if (SuppressUnused)
diff --git a/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp b/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp
index f0ffc429c0ca90..4ecfdf0184ab40 100644
--- a/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp
+++ b/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp
@@ -277,6 +277,12 @@ TEST_F(ConfigCompileTests, DiagnosticsIncludeCleaner) {
   };
   EXPECT_TRUE(HeaderFilter("foo.h"));
   EXPECT_FALSE(HeaderFilter("bar.h"));
+
+  Frag = {};
+  EXPECT_FALSE(Conf.Diagnostics.Includes.AnalyzeAngledIncludes);
+  Frag.Diagnostics.Includes.AnalyzeAngledIncludes = true;
+  EXPECT_TRUE(compileAndApply());
+  EXPECT_TRUE(Conf.Diagnostics.Includes.AnalyzeAngledIncludes);
 }
 
 TEST_F(ConfigCompileTests, DiagnosticSuppression) {
diff --git a/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp b/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp
index 44a6647d4c0a81..10d67dead342c3 100644
--- a/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp
+++ b/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp
@@ -278,6 +278,21 @@ TEST(ParseYAML, IncludesIgnoreHeader) {
               ElementsAre(val("foo"), val("bar")));
 }
 
+TEST(ParseYAML, IncludesAnalyzeAngledIncludes) {
+  CapturedDiags Diags;
+  Annotations YAML(R"yaml(
+Diagnostics:
+  Includes:
+    AnalyzeAngledIncludes: true
+  )yaml");
+  auto Results =
+      Fragment::parseYAML(YAML.code(), "config.yaml", Diags.callback());
+  ASSERT_THAT(Diags.Diagnostics, IsEmpty());
+  ASSERT_EQ(Results.size(), 1u);
+  EXPECT_THAT(Results[0].Diagnostics.Includes.AnalyzeAngledIncludes,
+              llvm::ValueIs(val(true)));
+}
+
 TEST(ParseYAML, Style) {
   CapturedDiags Diags;
   Annotations YAML(R"yaml(
diff --git a/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp b/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp
index 142310837bd9ce..7027232460354c 100644
--- a/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp
+++ b/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp
@@ -108,6 +108,7 @@ TEST(IncludeCleaner, GetUnusedHeaders) {
     #include "unguarded.h"
     #include "unused.h"
     #include <system_header.h>
+    #include <non_system_angled_header.h>
     void foo() {
       a();
       b();
@@ -122,6 +123,7 @@ TEST(IncludeCleaner, GetUnusedHeaders) {
   TU.AdditionalFiles["dir/c.h"] = guard("void c();");
   TU.AdditionalFiles["unused.h"] = guard("void unused();");
   TU.AdditionalFiles["dir/unused.h"] = guard("void dirUnused();");
+  TU.AdditionalFiles["dir/non_system_angled_header.h"] = guard("");
   TU.AdditionalFiles["system/system_header.h"] = guard("");
   TU.AdditionalFiles["unguarded.h"] = "";
   TU.ExtraArgs.push_back("-I" + testPath("dir"));
@@ -135,6 +137,48 @@ TEST(IncludeCleaner, GetUnusedHeaders) {
                            Pointee(writtenInclusion("\"dir/unused.h\""))));
 }
 
+TEST(IncludeCleaner, IgnoredAngledHeaders) {
+  // Currently the default behavior is to ignore unused angled includes
+  auto TU = TestTU::withCode(R"cpp(
+    #include <system_header.h>
+    #include <system_unused.h>
+    #include <non_system_angled_unused.h>
+    SystemClass x;
+  )cpp");
+  TU.AdditionalFiles["system/system_header.h"] = guard("class SystemClass {};");
+  TU.AdditionalFiles["system/system_unused.h"] = guard("");
+  TU.AdditionalFiles["dir/non_system_angled_unused.h"] = guard("");
+  TU.ExtraArgs = {
+      "-isystem" + testPath("system"),
+      "-I" + testPath("dir"),
+  };
+  auto AST = TU.build();
+  IncludeCleanerFindings Findings = computeIncludeCleanerFindings(AST);
+  EXPECT_THAT(Findings.UnusedIncludes, IsEmpty());
+}
+
+TEST(IncludeCleaner, UnusedAngledHeaders) {
+  auto TU = TestTU::withCode(R"cpp(
+    #include <system_header.h>
+    #include <system_unused.h>
+    #include <non_system_angled_unused.h>
+    SystemClass x;
+  )cpp");
+  TU.AdditionalFiles["system/system_header.h"] = guard("class SystemClass {};");
+  TU.AdditionalFiles["system/system_unused.h"] = guard("");
+  TU.AdditionalFiles["dir/non_system_angled_unused.h"] = guard("");
+  TU.ExtraArgs = {
+      "-isystem" + testPath("system"),
+      "-I" + testPath("dir"),
+  };
+  auto AST = TU.build();
+  IncludeCleanerFindings Findings = computeIncludeCleanerFindings(AST, true);
+  EXPECT_THAT(Findings.UnusedIncludes,
+              UnorderedElementsAre(
+                  Pointee(writtenInclusion("<system_unused.h>")),
+                  Pointee(writtenInclusion("<non_system_angled_unused.h>"))));
+}
+
 TEST(IncludeCleaner, ComputeMissingHeaders) {
   Annotations MainFile(R"cpp(
     #include "a.h"
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 3e3195f6f68139..a5e87d26d96c38 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -84,6 +84,11 @@ Objective-C
 Miscellaneous
 ^^^^^^^^^^^^^
 
+- Added a boolean option `AnalyzeAngledIncludes` to the `Includes` config section,
+  which enables unused-include detection for all angled ("system") headers.
+  Umbrella headers are not yet supported, so enabling this option
+  may result in false positives.
+
 Improvements to clang-doc
 -------------------------
 

From 265589785ccf043492e4e0ab88c2830eae7d3496 Mon Sep 17 00:00:00 2001
From: Miro Bucko <mbucko@meta.com>
Date: Thu, 30 May 2024 00:37:57 +0700
Subject: [PATCH 152/230] [nfc][lldb] Move FastSearch from
 CommandObjectMemoryFind to Process (#93688)

Moving CommandObjectMemoryFind::FastSearch() to Process::FindInMemory(). Plan to expose FindInMemory as public API in SBProcess.
---
 lldb/include/lldb/Target/Process.h           | 22 +++++++
 lldb/source/Commands/CommandObjectMemory.cpp | 61 +-------------------
 lldb/source/Target/Process.cpp               | 54 +++++++++++++++++
 3 files changed, 78 insertions(+), 59 deletions(-)

diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h
index 637d34c29715c1..eec337c15f7edd 100644
--- a/lldb/include/lldb/Target/Process.h
+++ b/lldb/include/lldb/Target/Process.h
@@ -2663,6 +2663,28 @@ void PruneThreadPlans();
     return m_source_file_cache;
   }
 
+  /// Find a pattern within a memory region.
+  ///
+  /// This function searches for a pattern represented by the provided buffer
+  /// within the memory range specified by the low and high addresses. It uses
+  /// a bad character heuristic to optimize the search process.
+  ///
+  /// \param[in] low The starting address of the memory region to be searched.
+  /// (inclusive)
+  ///
+  /// \param[in] high The ending address of the memory region to be searched.
+  /// (exclusive)
+  ///
+  /// \param[in] buf A pointer to the buffer containing the pattern to be
+  /// searched.
+  ///
+  /// \param[in] size The size of the buffer in bytes.
+  ///
+  /// \return The address where the pattern was found or LLDB_INVALID_ADDRESS if
+  /// not found.
+  lldb::addr_t FindInMemory(lldb::addr_t low, lldb::addr_t high,
+                            const uint8_t *buf, size_t size);
+
 protected:
   friend class Trace;
 
diff --git a/lldb/source/Commands/CommandObjectMemory.cpp b/lldb/source/Commands/CommandObjectMemory.cpp
index b78a0492cca558..1c13484dede648 100644
--- a/lldb/source/Commands/CommandObjectMemory.cpp
+++ b/lldb/source/Commands/CommandObjectMemory.cpp
@@ -977,35 +977,6 @@ class CommandObjectMemoryFind : public CommandObjectParsed {
   Options *GetOptions() override { return &m_option_group; }
 
 protected:
-  class ProcessMemoryIterator {
-  public:
-    ProcessMemoryIterator(ProcessSP process_sp, lldb::addr_t base)
-        : m_process_sp(process_sp), m_base_addr(base) {
-      lldbassert(process_sp.get() != nullptr);
-    }
-
-    bool IsValid() { return m_is_valid; }
-
-    uint8_t operator[](lldb::addr_t offset) {
-      if (!IsValid())
-        return 0;
-
-      uint8_t retval = 0;
-      Status error;
-      if (0 ==
-          m_process_sp->ReadMemory(m_base_addr + offset, &retval, 1, error)) {
-        m_is_valid = false;
-        return 0;
-      }
-
-      return retval;
-    }
-
-  private:
-    ProcessSP m_process_sp;
-    lldb::addr_t m_base_addr;
-    bool m_is_valid = true;
-  };
   void DoExecute(Args &command, CommandReturnObject &result) override {
     // No need to check "process" for validity as eCommandRequiresProcess
     // ensures it is valid
@@ -1106,8 +1077,8 @@ class CommandObjectMemoryFind : public CommandObjectParsed {
     found_location = low_addr;
     bool ever_found = false;
     while (count) {
-      found_location = FastSearch(found_location, high_addr, buffer.GetBytes(),
-                                  buffer.GetByteSize());
+      found_location = process->FindInMemory(
+          found_location, high_addr, buffer.GetBytes(), buffer.GetByteSize());
       if (found_location == LLDB_INVALID_ADDRESS) {
         if (!ever_found) {
           result.AppendMessage("data not found within the range.\n");
@@ -1144,34 +1115,6 @@ class CommandObjectMemoryFind : public CommandObjectParsed {
     result.SetStatus(lldb::eReturnStatusSuccessFinishResult);
   }
 
-  lldb::addr_t FastSearch(lldb::addr_t low, lldb::addr_t high, uint8_t *buffer,
-                          size_t buffer_size) {
-    const size_t region_size = high - low;
-
-    if (region_size < buffer_size)
-      return LLDB_INVALID_ADDRESS;
-
-    std::vector<size_t> bad_char_heuristic(256, buffer_size);
-    ProcessSP process_sp = m_exe_ctx.GetProcessSP();
-    ProcessMemoryIterator iterator(process_sp, low);
-
-    for (size_t idx = 0; idx < buffer_size - 1; idx++) {
-      decltype(bad_char_heuristic)::size_type bcu_idx = buffer[idx];
-      bad_char_heuristic[bcu_idx] = buffer_size - idx - 1;
-    }
-    for (size_t s = 0; s <= (region_size - buffer_size);) {
-      int64_t j = buffer_size - 1;
-      while (j >= 0 && buffer[j] == iterator[s + j])
-        j--;
-      if (j < 0)
-        return low + s;
-      else
-        s += bad_char_heuristic[iterator[s + buffer_size - 1]];
-    }
-
-    return LLDB_INVALID_ADDRESS;
-  }
-
   OptionGroupOptions m_option_group;
   OptionGroupFindMemory m_memory_options;
   OptionGroupMemoryTag m_memory_tag_options;
diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp
index 216d2f21abfef0..1e321f8bde3919 100644
--- a/lldb/source/Target/Process.cpp
+++ b/lldb/source/Target/Process.cpp
@@ -112,6 +112,33 @@ class ProcessOptionValueProperties
   }
 };
 
+class ProcessMemoryIterator {
+public:
+  ProcessMemoryIterator(Process &process, lldb::addr_t base)
+      : m_process(process), m_base_addr(base) {}
+
+  bool IsValid() { return m_is_valid; }
+
+  uint8_t operator[](lldb::addr_t offset) {
+    if (!IsValid())
+      return 0;
+
+    uint8_t retval = 0;
+    Status error;
+    if (0 == m_process.ReadMemory(m_base_addr + offset, &retval, 1, error)) {
+      m_is_valid = false;
+      return 0;
+    }
+
+    return retval;
+  }
+
+private:
+  Process &m_process;
+  const lldb::addr_t m_base_addr;
+  bool m_is_valid = true;
+};
+
 static constexpr OptionEnumValueElement g_follow_fork_mode_values[] = {
     {
         eFollowParent,
@@ -3191,6 +3218,33 @@ Status Process::Halt(bool clear_thread_plans, bool use_run_lock) {
   return Status();
 }
 
+lldb::addr_t Process::FindInMemory(lldb::addr_t low, lldb::addr_t high,
+                                   const uint8_t *buf, size_t size) {
+  // Reject a null/empty pattern or an empty/inverted range up front; the
+  // unsigned `size - 1` and `high - low` below would otherwise wrap around.
+  if (buf == nullptr || size == 0 || low >= high || high - low < size)
+    return LLDB_INVALID_ADDRESS;
+  const size_t region_size = high - low;
+
+  // Bad-character shift table keyed by the byte under the pattern's end.
+  std::vector<size_t> bad_char_heuristic(256, size);
+  for (size_t idx = 0; idx < size - 1; idx++)
+    bad_char_heuristic[buf[idx]] = size - idx - 1;
+
+  ProcessMemoryIterator iterator(*this, low);
+  for (size_t s = 0; s <= (region_size - size);) {
+    int64_t j = size - 1; // Compare right to left; j < 0 means a full match.
+    while (j >= 0 && buf[j] == iterator[s + j])
+      j--;
+    if (!iterator.IsValid()) // A failed read yields zeros, not real data.
+      return LLDB_INVALID_ADDRESS;
+    if (j < 0)
+      return low + s;
+    s += bad_char_heuristic[iterator[s + size - 1]];
+  }
+  return LLDB_INVALID_ADDRESS;
+}
+
 Status Process::StopForDestroyOrDetach(lldb::EventSP &exit_event_sp) {
   Status error;
 

From 9595eb10ae9a5661a596dff19bf39365140548e3 Mon Sep 17 00:00:00 2001
From: Hui <hui.xie1990@gmail.com>
Date: Wed, 29 May 2024 18:46:39 +0100
Subject: [PATCH 153/230] [libc++][test] Close LWG3018 and add tests (#93047)

---
 libcxx/docs/Status/Cxx20Issues.csv            |  2 +-
 .../pointer_deleter.pass.cpp                  | 22 +++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv
index 54517ab002b86b..e748ff6ad749b7 100644
--- a/libcxx/docs/Status/Cxx20Issues.csv
+++ b/libcxx/docs/Status/Cxx20Issues.csv
@@ -191,7 +191,7 @@
 "","","","","",""
 "`1203 <https://wg21.link/LWG1203>`__","More useful rvalue stream insertion","Prague","|Complete|","12.0"
 "`2859 <https://wg21.link/LWG2859>`__","Definition of *reachable* in [ptr.launder] misses pointer arithmetic from pointer-interconvertible object","Prague","",""
-"`3018 <https://wg21.link/LWG3018>`__","``shared_ptr``\  of function type","Prague","",""
+"`3018 <https://wg21.link/LWG3018>`__","``shared_ptr``\  of function type","Prague","|Nothing To Do|",""
 "`3050 <https://wg21.link/LWG3050>`__","Conversion specification problem in ``chrono::duration``\  constructor","Prague","|Complete|","19.0","|chrono|"
 "`3141 <https://wg21.link/LWG3141>`__","``CopyConstructible``\  doesn't preserve source values","Prague","|Nothing to do|",""
 "`3150 <https://wg21.link/LWG3150>`__","``UniformRandomBitGenerator``\  should validate ``min``\  and ``max``\ ","Prague","|Complete|","13.0","|ranges|"
diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp
index 9c1e9b72be573c..562acf56d96fe1 100644
--- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp
+++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp
@@ -48,6 +48,27 @@ static_assert(!std::is_constructible<std::shared_ptr<int[5]>, int*, bad_deleter>
 static_assert(!std::is_constructible<std::shared_ptr<int[5]>, int(*)[5], test_deleter<int> >::value, "");
 #endif
 
+int f() { return 5; }
+
+// https://cplusplus.github.io/LWG/issue3018
+// LWG 3018. shared_ptr of function type
+struct function_pointer_deleter {
+  function_pointer_deleter(bool& deleter_called) : deleter_called_(deleter_called) {}
+
+  void operator()(int (*)()) const { deleter_called_ = true; }
+
+  bool& deleter_called_;
+};
+
+void test_function_type() {
+  bool deleter_called = false;
+  {
+    std::shared_ptr<int()> p(&f, function_pointer_deleter(deleter_called));
+    assert((*p)() == 5);
+  }
+  assert(deleter_called);
+}
+
 int main(int, char**)
 {
     {
@@ -94,5 +115,6 @@ int main(int, char**)
     }
 #endif // TEST_STD_VER >= 11
 
+  test_function_type();
   return 0;
 }

From c54657887b2cd88f0745c151fec0b15a8a7d1e44 Mon Sep 17 00:00:00 2001
From: Mingming Liu <mingmingl@google.com>
Date: Wed, 29 May 2024 10:50:44 -0700
Subject: [PATCH 154/230] [nfc][InstrProfWriter]Store header fields in a vector
 and back patch once (#93594)

This is a split of https://github.com/llvm/llvm-project/pull/93346 as
discussed.
---
 llvm/lib/ProfileData/InstrProfWriter.cpp | 62 ++++++------------------
 1 file changed, 16 insertions(+), 46 deletions(-)

diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index b16714ae8b9a2d..e732882337d468 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -893,52 +893,22 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
   }
   InfoObj->CSSummaryBuilder = nullptr;
 
-  const size_t MemProfOffset = BackPatchStartOffset + sizeof(uint64_t);
-  const size_t BinaryIdOffset = MemProfOffset + sizeof(uint64_t);
-  const size_t TemporalProfTracesOffset = BinaryIdOffset + sizeof(uint64_t);
-  const size_t VTableNamesOffset = TemporalProfTracesOffset + sizeof(uint64_t);
-  if (!WritePrevVersion) {
-    // Now do the final patch:
-    PatchItem PatchItems[] = {
-        // Patch the Header.HashOffset field.
-        {BackPatchStartOffset, &HashTableStart, 1},
-        // Patch the Header.MemProfOffset (=0 for profiles without MemProf
-        // data).
-        {MemProfOffset, &MemProfSectionStart, 1},
-        // Patch the Header.BinaryIdSectionOffset.
-        {BinaryIdOffset, &BinaryIdSectionStart, 1},
-        // Patch the Header.TemporalProfTracesOffset (=0 for profiles without
-        // traces).
-        {TemporalProfTracesOffset, &TemporalProfTracesSectionStart, 1},
-        {VTableNamesOffset, &VTableNamesSectionStart, 1},
-        // Patch the summary data.
-        {SummaryOffset, reinterpret_cast<uint64_t *>(TheSummary.get()),
-         (int)(SummarySize / sizeof(uint64_t))},
-        {CSSummaryOffset, reinterpret_cast<uint64_t *>(TheCSSummary.get()),
-         (int)CSSummarySize}};
-
-    OS.patch(PatchItems);
-  } else {
-    // Now do the final patch:
-    PatchItem PatchItems[] = {
-        // Patch the Header.HashOffset field.
-        {BackPatchStartOffset, &HashTableStart, 1},
-        // Patch the Header.MemProfOffset (=0 for profiles without MemProf
-        // data).
-        {MemProfOffset, &MemProfSectionStart, 1},
-        // Patch the Header.BinaryIdSectionOffset.
-        {BinaryIdOffset, &BinaryIdSectionStart, 1},
-        // Patch the Header.TemporalProfTracesOffset (=0 for profiles without
-        // traces).
-        {TemporalProfTracesOffset, &TemporalProfTracesSectionStart, 1},
-        // Patch the summary data.
-        {SummaryOffset, reinterpret_cast<uint64_t *>(TheSummary.get()),
-         (int)(SummarySize / sizeof(uint64_t))},
-        {CSSummaryOffset, reinterpret_cast<uint64_t *>(TheCSSummary.get()),
-         (int)CSSummarySize}};
-
-    OS.patch(PatchItems);
-  }
+  SmallVector<uint64_t, 8> HeaderOffsets = {HashTableStart, MemProfSectionStart,
+                                            BinaryIdSectionStart,
+                                            TemporalProfTracesSectionStart};
+  if (!WritePrevVersion)
+    HeaderOffsets.push_back(VTableNamesSectionStart);
+
+  PatchItem PatchItems[] = {
+      // Patch the Header fields
+      {BackPatchStartOffset, HeaderOffsets.data(), (int)HeaderOffsets.size()},
+      // Patch the summary data.
+      {SummaryOffset, reinterpret_cast<uint64_t *>(TheSummary.get()),
+       (int)(SummarySize / sizeof(uint64_t))},
+      {CSSummaryOffset, reinterpret_cast<uint64_t *>(TheCSSummary.get()),
+       (int)CSSummarySize}};
+
+  OS.patch(PatchItems);
 
   for (const auto &I : FunctionData)
     for (const auto &F : I.getValue())

From 1f67f34a5cf993f03eca8936bfb7203778c2997a Mon Sep 17 00:00:00 2001
From: Florian Mayer <fmayer@google.com>
Date: Wed, 20 Mar 2024 17:25:47 -0700
Subject: [PATCH 155/230] [MTE] add stack frame history buffer

this will allow us to find offending objects in a symbolization step,
like we can do with hwasan.

needs matching changes in AOSP:
https://android-review.git.corp.google.com/q/topic:%22stackhistorybuffer%22

Pull Request: https://github.com/llvm/llvm-project/pull/86356
---
 .../Target/AArch64/AArch64FrameLowering.cpp   |  3 +-
 .../Target/AArch64/AArch64StackTagging.cpp    | 64 ++++++++++++++++-
 .../CodeGen/AArch64/stack-tagging-prologue.ll | 69 +++++++++++++++++++
 3 files changed, 134 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index dc7759367687b7..cd532671f50189 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2500,7 +2500,8 @@ AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
   return resolveFrameIndexReference(
       MF, FI, FrameReg,
       /*PreferFP=*/
-      MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
+      MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress) ||
+          MF.getFunction().hasFnAttribute(Attribute::SanitizeMemTag),
       /*ForSimm=*/false);
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index aabc5d5d22e2d3..eab3a90e57e209 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -11,6 +11,7 @@
 #include "AArch64InstrInfo.h"
 #include "AArch64Subtarget.h"
 #include "AArch64TargetMachine.h"
+#include "llvm/ADT/APInt.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -21,6 +22,7 @@
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/StackSafetyAnalysis.h"
+#include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -82,6 +84,26 @@ static cl::opt<size_t> ClMaxLifetimes(
     cl::desc("How many lifetime ends to handle for a single alloca."),
     cl::Optional);
 
+// Mode for selecting how to insert frame record info into the stack ring
+// buffer.
+enum RecordStackHistoryMode {
+  // Do not record frame record info.
+  none,
+
+  // Insert instructions into the prologue for storing into the stack ring
+  // buffer directly.
+  instr,
+};
+
+static cl::opt<RecordStackHistoryMode> ClRecordStackHistory(
+    "stack-tagging-record-stack-history",
+    cl::desc("Record stack frames with tagged allocations in a thread-local "
+             "ring buffer"),
+    cl::values(clEnumVal(none, "Do not record stack ring history"),
+               clEnumVal(instr, "Insert instructions into the prologue for "
+                                "storing into the stack ring buffer")),
+    cl::Hidden, cl::init(none));
+
 static const Align kTagGranuleSize = Align(16);
 
 namespace {
@@ -309,6 +331,7 @@ class AArch64StackTagging : public FunctionPass {
                                    uint64_t Size, InitializerBuilder &IB);
 
   Instruction *insertBaseTaggedPointer(
+      const Module &M,
       const MapVector<AllocaInst *, memtag::AllocaInfo> &Allocas,
       const DominatorTree *DT);
   bool runOnFunction(Function &F) override;
@@ -437,6 +460,7 @@ void AArch64StackTagging::untagAlloca(AllocaInst *AI, Instruction *InsertBefore,
 }
 
 Instruction *AArch64StackTagging::insertBaseTaggedPointer(
+    const Module &M,
     const MapVector<AllocaInst *, memtag::AllocaInfo> &AllocasToInstrument,
     const DominatorTree *DT) {
   BasicBlock *PrologueBB = nullptr;
@@ -458,6 +482,41 @@ Instruction *AArch64StackTagging::insertBaseTaggedPointer(
   Instruction *Base =
       IRB.CreateCall(IRG_SP, {Constant::getNullValue(IRB.getInt64Ty())});
   Base->setName("basetag");
+  auto TargetTriple = Triple(M.getTargetTriple());
+  // This is not a stable ABI for now, so only allow in dev builds with API
+  // level 10000.
+  // The ThreadLong format is the same as with HWASan, but the entries for
+  // stack MTE take two slots (16 bytes).
+  if (ClRecordStackHistory == instr && TargetTriple.isAndroid() &&
+      TargetTriple.isAArch64() && !TargetTriple.isAndroidVersionLT(10000) &&
+      !AllocasToInstrument.empty()) {
+    constexpr int StackMteSlot = -3;
+    constexpr uint64_t TagMask = 0xFULL << 56;
+
+    auto *IntptrTy = IRB.getIntPtrTy(M.getDataLayout());
+    Value *SlotPtr = memtag::getAndroidSlotPtr(IRB, StackMteSlot);
+    auto *ThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr);
+    Value *TaggedFP = IRB.CreateOr(
+        memtag::getFP(IRB),
+        IRB.CreateAnd(IRB.CreatePtrToInt(Base, IntptrTy), TagMask));
+    Value *PC = memtag::getPC(TargetTriple, IRB);
+    Value *RecordPtr = IRB.CreateIntToPtr(ThreadLong, IRB.getPtrTy(0));
+    IRB.CreateStore(PC, RecordPtr);
+    IRB.CreateStore(TaggedFP, IRB.CreateConstGEP1_64(IntptrTy, RecordPtr, 1));
+    // Update the ring buffer. Top byte of ThreadLong defines the size of the
+    // buffer in pages, it must be a power of two, and the start of the buffer
+    // must be aligned by twice that much. Therefore wrap around of the ring
+    // buffer is simply Addr &= ~((ThreadLong >> 56) << 12).
+    // The use of AShr instead of LShr is due to
+    //   https://bugs.llvm.org/show_bug.cgi?id=39030
+    // Runtime library makes sure not to use the highest bit.
+    Value *WrapMask = IRB.CreateXor(
+        IRB.CreateShl(IRB.CreateAShr(ThreadLong, 56), 12, "", true, true),
+        ConstantInt::get(IntptrTy, (uint64_t)-1));
+    Value *ThreadLongNew = IRB.CreateAnd(
+        IRB.CreateAdd(ThreadLong, ConstantInt::get(IntptrTy, 16)), WrapMask);
+    IRB.CreateStore(ThreadLongNew, SlotPtr);
+  }
   return Base;
 }
 
@@ -513,7 +572,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
   SetTagFunc =
       Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag);
 
-  Instruction *Base = insertBaseTaggedPointer(SInfo.AllocasToInstrument, DT);
+  Instruction *Base =
+      insertBaseTaggedPointer(*Fn.getParent(), SInfo.AllocasToInstrument, DT);
 
   int NextTag = 0;
   for (auto &I : SInfo.AllocasToInstrument) {
@@ -575,6 +635,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
       for (auto *II : Info.LifetimeEnd)
         II->eraseFromParent();
     }
+
+    memtag::annotateDebugRecords(Info, static_cast<unsigned long>(Tag));
   }
 
   // If we have instrumented at least one alloca, all unrecognized lifetime
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll b/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll
new file mode 100644
index 00000000000000..3f55f3cc9a2e2e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll
@@ -0,0 +1,69 @@
+; RUN: opt < %s -aarch64-stack-tagging -stack-tagging-use-stack-safety=0 -S -o - | FileCheck %s --check-prefixes=CHECK
+; RUN: opt < %s -aarch64-stack-tagging -stack-tagging-use-stack-safety=0 -S -stack-tagging-record-stack-history=instr -o - | FileCheck %s --check-prefixes=INSTR
+; RUN llc -mattr=+mte -stack-tagging-use-stack-safety=0 -stack-tagging-record-stack-history=instr %s -o - | FileCheck %s --check-prefixes=ASMINSTR
+
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android10000"
+
+declare void @use8(ptr)
+declare void @use32(ptr)
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
+
+define dso_local void @noUse32(ptr) sanitize_memtag {
+entry:
+  ret void
+}
+
+define void @OneVar() sanitize_memtag {
+entry:
+  %x = alloca i32, align 4
+  call void @use32(ptr %x)
+  ret void
+}
+
+; CHECK-LABEL: define void @OneVar(
+; CHECK:  [[BASE:%.*]] = call ptr @llvm.aarch64.irg.sp(i64 0)
+; CHECK:  [[X:%.*]] = alloca { i32, [12 x i8] }, align 16
+; CHECK:  [[TX:%.*]] = call ptr @llvm.aarch64.tagp.{{.*}}(ptr [[X]], ptr [[BASE]], i64 0)
+; CHECK:  ret void
+
+; INSTR-LABEL: define void @OneVar(
+; INSTR:  [[BASE:%.*]] = call ptr @llvm.aarch64.irg.sp(i64 0)
+; INSTR:  [[TLS:%.*]] = call ptr @llvm.thread.pointer()
+; INSTR:  [[TLS_SLOT:%.*]] = getelementptr i8, ptr [[TLS]], i32 -24
+; INSTR:  [[TLS_VALUE:%.*]] = load i64, ptr %1, align 8
+; INSTR:  [[FP:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
+; INSTR:  [[FP_INT:%.*]] = ptrtoint ptr %3 to i64
+; INSTR:  [[BASE_INT:%.*]] = ptrtoint ptr %basetag to i64
+; INSTR:  [[BASE_TAG:%.*]] = and i64 [[BASE_INT]], 1080863910568919040
+; INSTR:  [[TAGGED_FP:%.*]] = or i64 [[FP_INT]], [[BASE_TAG]]
+; INSTR:  [[PC:%.*]] = call i64 @llvm.read_register.i64(metadata !0)
+; INSTR:  [[TLS_VALUE_PTR:%.*]] = inttoptr i64 [[TLS_VALUE]] to ptr
+; INSTR:  store i64 [[PC]], ptr [[TLS_VALUE_PTR]], align 8
+; INSTR:  [[SECOND_SLOT:%.*]] = getelementptr i64, ptr [[TLS_VALUE_PTR]], i64 1
+; INSTR:  store i64 [[TAGGED_FP]], ptr [[SECOND_SLOT]], align 8
+; INSTR:  [[SIZE_IN_PAGES:%.*]] = ashr i64 [[TLS_VALUE]], 56
+; INSTR:  [[WRAP_MASK_INTERMEDIARY:%.*]] = shl nuw nsw i64 [[SIZE_IN_PAGES]], 12
+; INSTR:  [[WRAP_MASK:%.*]] = xor i64 [[WRAP_MASK_INTERMEDIARY]], -1
+; INSTR:  [[NEXT_TLS_VALUE_BEFORE_WRAP:%.*]] = add i64 [[TLS_VALUE]], 16
+; INSTR:  [[NEXT_TLS_VALUE:%.*]] = and i64 [[NEXT_TLS_VALUE_BEFORE_WRAP]], [[WRAP_MASK]]
+; INSTR:  store i64 [[NEXT_TLS_VALUE]], ptr [[TLS_SLOT]], align 8
+; INSTR:  [[X:%.*]] = alloca { i32, [12 x i8] }, align 16
+; INSTR:  [[TX:%.*]] = call ptr @llvm.aarch64.tagp.{{.*}}(ptr [[X]], ptr [[BASE]], i64 0)
+; INSTR:  [[PC:!.*]] = !{!"pc"}
+
+; ASMINSTR-LABEL: OneVar:
+; ASMINSTR:  mrs	[[TLS:x.*]], TPIDR_EL0
+; ASMINSTR:  irg	[[BASE:x.*]], sp
+; ASMINSTR:  adr	[[PC:x.*]], #0
+; ASMINSTR:  ldur	[[TLS_SLOT:x.*]], [[[TLS]], #-24]
+; ASMINSTR:  and	[[SP_TAG:x.*]], [[BASE]], #0xf00000000000000
+; ASMINSTR:  orr	[[TAGGED_FP]], x29, [[SP_TAG]]
+; ASMINSTR:  asr	[[TLS_SIZE:x.*]], [[TLS_SLOT]], #56
+; ASMINSTR:  add	[[NEXT_TLS_VALUE_BEFORE_WRAP:x.*]], [[TLS_SLOT]], #16
+; ASMINSTR:  stp	[[PC]], [[TAGGED_FP]], [[[TLS_SLOT]]]
+; ASMINSTR:  bic	[[NEXT_TLS_VALUE:x.*]], [[NEXT_TLS_VALUE_BEFORE_WRAP]], [[TLS_SIZE]], lsl #12
+; ASMINSTR:  stur	[[NEXT_TLS_VALUE]], [[[TLS]], #-24]
+; ASMINSTR:  stg	[[BASE]], [[[BASE]]]

From 3313f28897a87ec313ec0b52ef71c14d3b9ff652 Mon Sep 17 00:00:00 2001
From: Florian Mayer <fmayer@google.com>
Date: Wed, 29 May 2024 11:21:29 -0700
Subject: [PATCH 156/230] Revert "[MTE] add stack frame history buffer"

This reverts commit 1f67f34a5cf993f03eca8936bfb7203778c2997a.
---
 .../Target/AArch64/AArch64FrameLowering.cpp   |  3 +-
 .../Target/AArch64/AArch64StackTagging.cpp    | 64 +----------------
 .../CodeGen/AArch64/stack-tagging-prologue.ll | 69 -------------------
 3 files changed, 2 insertions(+), 134 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index cd532671f50189..dc7759367687b7 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2500,8 +2500,7 @@ AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
   return resolveFrameIndexReference(
       MF, FI, FrameReg,
       /*PreferFP=*/
-      MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress) ||
-          MF.getFunction().hasFnAttribute(Attribute::SanitizeMemTag),
+      MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
       /*ForSimm=*/false);
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index eab3a90e57e209..aabc5d5d22e2d3 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -11,7 +11,6 @@
 #include "AArch64InstrInfo.h"
 #include "AArch64Subtarget.h"
 #include "AArch64TargetMachine.h"
-#include "llvm/ADT/APInt.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -22,7 +21,6 @@
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/StackSafetyAnalysis.h"
-#include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -84,26 +82,6 @@ static cl::opt<size_t> ClMaxLifetimes(
     cl::desc("How many lifetime ends to handle for a single alloca."),
     cl::Optional);
 
-// Mode for selecting how to insert frame record info into the stack ring
-// buffer.
-enum RecordStackHistoryMode {
-  // Do not record frame record info.
-  none,
-
-  // Insert instructions into the prologue for storing into the stack ring
-  // buffer directly.
-  instr,
-};
-
-static cl::opt<RecordStackHistoryMode> ClRecordStackHistory(
-    "stack-tagging-record-stack-history",
-    cl::desc("Record stack frames with tagged allocations in a thread-local "
-             "ring buffer"),
-    cl::values(clEnumVal(none, "Do not record stack ring history"),
-               clEnumVal(instr, "Insert instructions into the prologue for "
-                                "storing into the stack ring buffer")),
-    cl::Hidden, cl::init(none));
-
 static const Align kTagGranuleSize = Align(16);
 
 namespace {
@@ -331,7 +309,6 @@ class AArch64StackTagging : public FunctionPass {
                                    uint64_t Size, InitializerBuilder &IB);
 
   Instruction *insertBaseTaggedPointer(
-      const Module &M,
       const MapVector<AllocaInst *, memtag::AllocaInfo> &Allocas,
       const DominatorTree *DT);
   bool runOnFunction(Function &F) override;
@@ -460,7 +437,6 @@ void AArch64StackTagging::untagAlloca(AllocaInst *AI, Instruction *InsertBefore,
 }
 
 Instruction *AArch64StackTagging::insertBaseTaggedPointer(
-    const Module &M,
     const MapVector<AllocaInst *, memtag::AllocaInfo> &AllocasToInstrument,
     const DominatorTree *DT) {
   BasicBlock *PrologueBB = nullptr;
@@ -482,41 +458,6 @@ Instruction *AArch64StackTagging::insertBaseTaggedPointer(
   Instruction *Base =
       IRB.CreateCall(IRG_SP, {Constant::getNullValue(IRB.getInt64Ty())});
   Base->setName("basetag");
-  auto TargetTriple = Triple(M.getTargetTriple());
-  // This is not a stable ABI for now, so only allow in dev builds with API
-  // level 10000.
-  // The ThreadLong format is the same as with HWASan, but the entries for
-  // stack MTE take two slots (16 bytes).
-  if (ClRecordStackHistory == instr && TargetTriple.isAndroid() &&
-      TargetTriple.isAArch64() && !TargetTriple.isAndroidVersionLT(10000) &&
-      !AllocasToInstrument.empty()) {
-    constexpr int StackMteSlot = -3;
-    constexpr uint64_t TagMask = 0xFULL << 56;
-
-    auto *IntptrTy = IRB.getIntPtrTy(M.getDataLayout());
-    Value *SlotPtr = memtag::getAndroidSlotPtr(IRB, StackMteSlot);
-    auto *ThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr);
-    Value *TaggedFP = IRB.CreateOr(
-        memtag::getFP(IRB),
-        IRB.CreateAnd(IRB.CreatePtrToInt(Base, IntptrTy), TagMask));
-    Value *PC = memtag::getPC(TargetTriple, IRB);
-    Value *RecordPtr = IRB.CreateIntToPtr(ThreadLong, IRB.getPtrTy(0));
-    IRB.CreateStore(PC, RecordPtr);
-    IRB.CreateStore(TaggedFP, IRB.CreateConstGEP1_64(IntptrTy, RecordPtr, 1));
-    // Update the ring buffer. Top byte of ThreadLong defines the size of the
-    // buffer in pages, it must be a power of two, and the start of the buffer
-    // must be aligned by twice that much. Therefore wrap around of the ring
-    // buffer is simply Addr &= ~((ThreadLong >> 56) << 12).
-    // The use of AShr instead of LShr is due to
-    //   https://bugs.llvm.org/show_bug.cgi?id=39030
-    // Runtime library makes sure not to use the highest bit.
-    Value *WrapMask = IRB.CreateXor(
-        IRB.CreateShl(IRB.CreateAShr(ThreadLong, 56), 12, "", true, true),
-        ConstantInt::get(IntptrTy, (uint64_t)-1));
-    Value *ThreadLongNew = IRB.CreateAnd(
-        IRB.CreateAdd(ThreadLong, ConstantInt::get(IntptrTy, 16)), WrapMask);
-    IRB.CreateStore(ThreadLongNew, SlotPtr);
-  }
   return Base;
 }
 
@@ -572,8 +513,7 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
   SetTagFunc =
       Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag);
 
-  Instruction *Base =
-      insertBaseTaggedPointer(*Fn.getParent(), SInfo.AllocasToInstrument, DT);
+  Instruction *Base = insertBaseTaggedPointer(SInfo.AllocasToInstrument, DT);
 
   int NextTag = 0;
   for (auto &I : SInfo.AllocasToInstrument) {
@@ -635,8 +575,6 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
       for (auto *II : Info.LifetimeEnd)
         II->eraseFromParent();
     }
-
-    memtag::annotateDebugRecords(Info, static_cast<unsigned long>(Tag));
   }
 
   // If we have instrumented at least one alloca, all unrecognized lifetime
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll b/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll
deleted file mode 100644
index 3f55f3cc9a2e2e..00000000000000
--- a/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll
+++ /dev/null
@@ -1,69 +0,0 @@
-; RUN: opt < %s -aarch64-stack-tagging -stack-tagging-use-stack-safety=0 -S -o - | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -aarch64-stack-tagging -stack-tagging-use-stack-safety=0 -S -stack-tagging-record-stack-history=instr -o - | FileCheck %s --check-prefixes=INSTR
-; RUN llc -mattr=+mte -stack-tagging-use-stack-safety=0 -stack-tagging-record-stack-history=instr %s -o - | FileCheck %s --check-prefixes=ASMINSTR
-
-
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64--linux-android10000"
-
-declare void @use8(ptr)
-declare void @use32(ptr)
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
-
-define dso_local void @noUse32(ptr) sanitize_memtag {
-entry:
-  ret void
-}
-
-define void @OneVar() sanitize_memtag {
-entry:
-  %x = alloca i32, align 4
-  call void @use32(ptr %x)
-  ret void
-}
-
-; CHECK-LABEL: define void @OneVar(
-; CHECK:  [[BASE:%.*]] = call ptr @llvm.aarch64.irg.sp(i64 0)
-; CHECK:  [[X:%.*]] = alloca { i32, [12 x i8] }, align 16
-; CHECK:  [[TX:%.*]] = call ptr @llvm.aarch64.tagp.{{.*}}(ptr [[X]], ptr [[BASE]], i64 0)
-; CHECK:  ret void
-
-; INSTR-LABEL: define void @OneVar(
-; INSTR:  [[BASE:%.*]] = call ptr @llvm.aarch64.irg.sp(i64 0)
-; INSTR:  [[TLS:%.*]] = call ptr @llvm.thread.pointer()
-; INSTR:  [[TLS_SLOT:%.*]] = getelementptr i8, ptr [[TLS]], i32 -24
-; INSTR:  [[TLS_VALUE:%.*]] = load i64, ptr %1, align 8
-; INSTR:  [[FP:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
-; INSTR:  [[FP_INT:%.*]] = ptrtoint ptr %3 to i64
-; INSTR:  [[BASE_INT:%.*]] = ptrtoint ptr %basetag to i64
-; INSTR:  [[BASE_TAG:%.*]] = and i64 [[BASE_INT]], 1080863910568919040
-; INSTR:  [[TAGGED_FP:%.*]] = or i64 [[FP_INT]], [[BASE_TAG]]
-; INSTR:  [[PC:%.*]] = call i64 @llvm.read_register.i64(metadata !0)
-; INSTR:  [[TLS_VALUE_PTR:%.*]] = inttoptr i64 [[TLS_VALUE]] to ptr
-; INSTR:  store i64 [[PC]], ptr [[TLS_VALUE_PTR]], align 8
-; INSTR:  [[SECOND_SLOT:%.*]] = getelementptr i64, ptr [[TLS_VALUE_PTR]], i64 1
-; INSTR:  store i64 [[TAGGED_FP]], ptr [[SECOND_SLOT]], align 8
-; INSTR:  [[SIZE_IN_PAGES:%.*]] = ashr i64 [[TLS_VALUE]], 56
-; INSTR:  [[WRAP_MASK_INTERMEDIARY:%.*]] = shl nuw nsw i64 [[SIZE_IN_PAGES]], 12
-; INSTR:  [[WRAP_MASK:%.*]] = xor i64 [[WRAP_MASK_INTERMEDIARY]], -1
-; INSTR:  [[NEXT_TLS_VALUE_BEFORE_WRAP:%.*]] = add i64 [[TLS_VALUE]], 16
-; INSTR:  [[NEXT_TLS_VALUE:%.*]] = and i64 [[NEXT_TLS_VALUE_BEFORE_WRAP]], [[WRAP_MASK]]
-; INSTR:  store i64 [[NEXT_TLS_VALUE]], ptr [[TLS_SLOT]], align 8
-; INSTR:  [[X:%.*]] = alloca { i32, [12 x i8] }, align 16
-; INSTR:  [[TX:%.*]] = call ptr @llvm.aarch64.tagp.{{.*}}(ptr [[X]], ptr [[BASE]], i64 0)
-; INSTR:  [[PC:!.*]] = !{!"pc"}
-
-; ASMINSTR-LABEL: OneVar:
-; ASMINSTR:  mrs	[[TLS:x.*]], TPIDR_EL0
-; ASMINSTR:  irg	[[BASE:x.*]], sp
-; ASMINSTR:  adr	[[PC:x.*]], #0
-; ASMINSTR:  ldur	[[TLS_SLOT:x.*]], [[[TLS]], #-24]
-; ASMINSTR:  and	[[SP_TAG:x.*]], [[BASE]], #0xf00000000000000
-; ASMINSTR:  orr	[[TAGGED_FP]], x29, [[SP_TAG]]
-; ASMINSTR:  asr	[[TLS_SIZE:x.*]], [[TLS_SLOT]], #56
-; ASMINSTR:  add	[[NEXT_TLS_VALUE_BEFORE_WRAP:x.*]], [[TLS_SLOT]], #16
-; ASMINSTR:  stp	[[PC]], [[TAGGED_FP]], [[[TLS_SLOT]]]
-; ASMINSTR:  bic	[[NEXT_TLS_VALUE:x.*]], [[NEXT_TLS_VALUE_BEFORE_WRAP]], [[TLS_SIZE]], lsl #12
-; ASMINSTR:  stur	[[NEXT_TLS_VALUE]], [[[TLS]], #-24]
-; ASMINSTR:  stg	[[BASE]], [[[BASE]]]

From 1a2f3309765fdc143fdc3809211fb85d2e2ca341 Mon Sep 17 00:00:00 2001
From: Matheus Izvekov <mizvekov@gmail.com>
Date: Wed, 29 May 2024 15:23:44 -0300
Subject: [PATCH 157/230] [clang] Improve ast-dumper text printing of
 TemplateArgument (#93431)

This improves and unifies our approach to printing all template
arguments.

The same approach to printing types is extended to all
TemplateArguments: A sugared version is printed in quotes, followed by
printing the canonical form, unless they would print the same.

Special improvements are done to add more detail to template template
arguments.

It's planned in a future patch to use this improved TemplateName printer
for other places besides TemplateArguments.

Note: The sugared/desugared printing does not show up for TemplateNames
in tests yet, because we do a poor job of preserving their type sugar.
This will be improved in a future patch.
---
 clang/docs/ReleaseNotes.rst                   |   2 +
 clang/include/clang/AST/TextNodeDumper.h      |   2 +
 clang/lib/AST/TextNodeDumper.cpp              | 104 +++++++++++++++---
 clang/test/AST/ast-dump-decl.cpp              |  25 +++--
 ...penmp-begin-declare-variant_template_2.cpp |   6 +-
 clang/test/AST/ast-dump-template-name.cpp     |  54 +++++++++
 clang/test/AST/ast-dump-using-template.cpp    |   8 +-
 .../constraints-explicit-instantiation.cpp    |   6 +-
 clang/test/OpenMP/align_clause_ast_print.cpp  |   2 +-
 clang/test/OpenMP/generic_loop_ast_print.cpp  |   2 +-
 clang/test/OpenMP/interop_ast_print.cpp       |   2 +-
 clang/test/SemaOpenACC/sub-array-ast.cpp      |   2 +-
 .../aggregate-deduction-candidate.cpp         |  18 +--
 clang/test/SemaTemplate/attributes.cpp        |  64 +++++------
 clang/test/SemaTemplate/deduction-guide.cpp   |  19 ++--
 clang/test/SemaTemplate/make_integer_seq.cpp  |  68 +++++++-----
 clang/test/SemaTemplate/type_pack_element.cpp |  20 ++--
 17 files changed, 276 insertions(+), 128 deletions(-)
 create mode 100644 clang/test/AST/ast-dump-template-name.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index bd92818f0c09d0..e1c6d55eeeacdf 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -98,6 +98,8 @@ ABI Changes in This Version
 AST Dumping Potentially Breaking Changes
 ----------------------------------------
 
+- The text ast-dumper has improved printing of TemplateArguments.
+
 Clang Frontend Potentially Breaking Changes
 -------------------------------------------
 - Removed support for constructing on-stack ``TemplateArgumentList``\ s; interfaces should instead
diff --git a/clang/include/clang/AST/TextNodeDumper.h b/clang/include/clang/AST/TextNodeDumper.h
index 1fede6e462e925..63fa16c9ec47c1 100644
--- a/clang/include/clang/AST/TextNodeDumper.h
+++ b/clang/include/clang/AST/TextNodeDumper.h
@@ -213,6 +213,8 @@ class TextNodeDumper
   void dumpTemplateSpecializationKind(TemplateSpecializationKind TSK);
   void dumpNestedNameSpecifier(const NestedNameSpecifier *NNS);
   void dumpConceptReference(const ConceptReference *R);
+  void dumpTemplateArgument(const TemplateArgument &TA);
+  void dumpTemplateName(TemplateName TN);
 
   void dumpDeclRef(const Decl *D, StringRef Label = {});
 
diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp
index 4a1e94ffe283ba..627f8d3477d4e6 100644
--- a/clang/lib/AST/TextNodeDumper.cpp
+++ b/clang/lib/AST/TextNodeDumper.cpp
@@ -947,6 +947,26 @@ void TextNodeDumper::dumpDeclRef(const Decl *D, StringRef Label) {
   });
 }
 
+void TextNodeDumper::dumpTemplateArgument(const TemplateArgument &TA) {
+  llvm::SmallString<128> Str;
+  llvm::raw_svector_ostream SS(Str); // Unbuffered: Str is always current.
+  TA.print(PrintPolicy, SS, /*IncludeType=*/true);
+  OS << " '" << Str << "'";
+
+  // Appending ":'<canonical>'" needs an ASTContext. Context is a pointer; if
+  // it is unset, stop after the as-written form instead of dereferencing it.
+  if (!Context)
+    return;
+  TemplateArgument CanonTA = Context->getCanonicalTemplateArgument(TA);
+  if (CanonTA.structurallyEquals(TA))
+    return;
+  llvm::SmallString<128> CanonStr;
+  llvm::raw_svector_ostream CanonSS(CanonStr);
+  CanonTA.print(PrintPolicy, CanonSS, /*IncludeType=*/true);
+  if (CanonStr != Str)
+    OS << ":'" << CanonStr << "'";
+}
+
 const char *TextNodeDumper::getCommandName(unsigned CommandID) {
   if (Traits)
     return Traits->getCommandInfo(CommandID)->Name;
@@ -1086,45 +1106,101 @@ void TextNodeDumper::VisitNullTemplateArgument(const TemplateArgument &) {
 
 void TextNodeDumper::VisitTypeTemplateArgument(const TemplateArgument &TA) {
   OS << " type";
-  dumpType(TA.getAsType());
+  dumpTemplateArgument(TA);
 }
 
 void TextNodeDumper::VisitDeclarationTemplateArgument(
     const TemplateArgument &TA) {
   OS << " decl";
+  dumpTemplateArgument(TA);
   dumpDeclRef(TA.getAsDecl());
 }
 
-void TextNodeDumper::VisitNullPtrTemplateArgument(const TemplateArgument &) {
+void TextNodeDumper::VisitNullPtrTemplateArgument(const TemplateArgument &TA) {
   OS << " nullptr";
+  dumpTemplateArgument(TA);
 }
 
 void TextNodeDumper::VisitIntegralTemplateArgument(const TemplateArgument &TA) {
-  OS << " integral " << TA.getAsIntegral();
+  OS << " integral";
+  dumpTemplateArgument(TA);
+}
+
+void TextNodeDumper::dumpTemplateName(TemplateName TN) { // Dump TN by kind.
+  switch (TN.getKind()) {
+  case TemplateName::Template: // Child: the template's declaration.
+    AddChild([=] { Visit(TN.getAsTemplateDecl()); });
+    return;
+  case TemplateName::UsingTemplate: { // Children: shadow decl, then target.
+    const UsingShadowDecl *USD = TN.getAsUsingShadowDecl();
+    AddChild([=] { Visit(USD); });
+    AddChild("target", [=] { Visit(USD->getTargetDecl()); });
+    return;
+  }
+  case TemplateName::QualifiedTemplate: {
+    OS << " qualified";
+    const QualifiedTemplateName *QTN = TN.getAsQualifiedTemplateName();
+    if (QTN->hasTemplateKeyword())
+      OS << " keyword";
+    dumpNestedNameSpecifier(QTN->getQualifier());
+    dumpTemplateName(QTN->getUnderlyingTemplate()); // Recurse on inner name.
+    return;
+  }
+  case TemplateName::DependentTemplate: {
+    OS << " dependent";
+    const DependentTemplateName *DTN = TN.getAsDependentTemplateName();
+    dumpNestedNameSpecifier(DTN->getQualifier()); // Name is not dumped.
+    return;
+  }
+  case TemplateName::SubstTemplateTemplateParm: {
+    OS << " subst";
+    const SubstTemplateTemplateParmStorage *STS =
+        TN.getAsSubstTemplateTemplateParm();
+    OS << " index " << STS->getIndex();
+    if (std::optional<unsigned int> PackIndex = STS->getPackIndex())
+      OS << " pack_index " << *PackIndex;
+    if (const TemplateTemplateParmDecl *P = STS->getParameter())
+      AddChild("parameter", [=] { Visit(P); });
+    dumpDeclRef(STS->getAssociatedDecl(), "associated");
+    AddChild("replacement", [=] { dumpTemplateName(STS->getReplacement()); });
+    return;
+  }
+  // FIXME: Implement these; for now only the kind keyword is printed.
+  case TemplateName::OverloadedTemplate:
+    OS << " overloaded";
+    return;
+  case TemplateName::AssumedTemplate:
+    OS << " assumed";
+    return;
+  case TemplateName::SubstTemplateTemplateParmPack:
+    OS << " subst_pack";
+    return;
+  }
+  llvm_unreachable("Unexpected TemplateName Kind"); // Every case returns.
+}
 
 void TextNodeDumper::VisitTemplateTemplateArgument(const TemplateArgument &TA) {
-  if (TA.getAsTemplate().getKind() == TemplateName::UsingTemplate)
-    OS << " using";
-  OS << " template ";
-  TA.getAsTemplate().dump(OS);
+  OS << " template";
+  dumpTemplateArgument(TA);
+  dumpTemplateName(TA.getAsTemplate());
 }
 
 void TextNodeDumper::VisitTemplateExpansionTemplateArgument(
     const TemplateArgument &TA) {
-  if (TA.getAsTemplateOrTemplatePattern().getKind() ==
-      TemplateName::UsingTemplate)
-    OS << " using";
-  OS << " template expansion ";
-  TA.getAsTemplateOrTemplatePattern().dump(OS);
+  OS << " template expansion";
+  dumpTemplateArgument(TA);
+  dumpTemplateName(TA.getAsTemplateOrTemplatePattern());
 }
 
-void TextNodeDumper::VisitExpressionTemplateArgument(const TemplateArgument &) {
+void TextNodeDumper::VisitExpressionTemplateArgument(
+    const TemplateArgument &TA) {
   OS << " expr";
+  dumpTemplateArgument(TA);
 }
 
-void TextNodeDumper::VisitPackTemplateArgument(const TemplateArgument &) {
+void TextNodeDumper::VisitPackTemplateArgument(const TemplateArgument &TA) {
   OS << " pack";
+  dumpTemplateArgument(TA);
 }
 
 static void dumpBasePath(raw_ostream &OS, const CastExpr *Node) {
diff --git a/clang/test/AST/ast-dump-decl.cpp b/clang/test/AST/ast-dump-decl.cpp
index e062d4f068a403..b861ba8be15b50 100644
--- a/clang/test/AST/ast-dump-decl.cpp
+++ b/clang/test/AST/ast-dump-decl.cpp
@@ -459,21 +459,23 @@ namespace testClassTemplateDecl {
 
 // CHECK:       ClassTemplateDecl 0x{{.+}} <{{.+}}:[[@LINE-148]]:3, col:31> col:31 TestTemplateDefaultNonType{{$}}
 // CHECK-NEXT:  |-NonTypeTemplateParmDecl 0x{{.+}} <col:12, col:20> col:16 'int' depth 0 index 0 I{{$}}
-// CHECK-NEXT:  | `-TemplateArgument <col:20> expr{{$}}
+// CHECK-NEXT:  | `-TemplateArgument <col:20> expr '42'{{$}}
 // CHECK-NEXT:  |   `-IntegerLiteral 0x{{.+}} <col:20> 'int' 42{{$}}
 // CHECK-NEXT:  `-CXXRecordDecl 0x{{.+}} <col:24, col:31> col:31 struct TestTemplateDefaultNonType{{$}}
 
 // CHECK:       ClassTemplateDecl 0x{{.+}} <{{.+}}:{{.*}}:3, col:68> col:68 TestTemplateTemplateDefaultType{{$}}
 // CHECK-NEXT:  |-TemplateTemplateParmDecl 0x{{.+}} <col:12, col:42> col:37 depth 0 index 0 TT{{$}}
 // CHECK-NEXT:  | |-TemplateTypeParmDecl 0x{{.+}} <col:21> col:29 typename depth 1 index 0{{$}}
-// CHECK-NEXT:  | `-TemplateArgument <col:42> template TestClassTemplate{{$}}
-// CHECK-NEXT:  `-CXXRecordDecl 0x{{.+}} <col:61, col:68> col:68 struct TestTemplateTemplateDefaultType{{$}}
+// CHECK-NEXT:  | `-TemplateArgument <col:42> template 'testClassTemplateDecl::TestClassTemplate'{{$}}
+// CHECK-NEXT:  |   `-ClassTemplateDecl 0x{{.+}} <line:{{.+}}:3, line:{{.+}}:3> line:{{.+}}:30 TestClassTemplate{{$}}
+// CHECK-NEXT:  `-CXXRecordDecl 0x{{.+}} <line:{{.*}}:61, col:68> col:68 struct TestTemplateTemplateDefaultType{{$}}
 
 // CHECK:       ClassTemplateDecl 0x{{.+}} prev 0x{{.+}} <{{.+}}:{{.*}}:3, col:82> col:48 TestTemplateTemplateDefaultType{{$}}
 // CHECK-NEXT:  |-TemplateTemplateParmDecl 0x{{.+}} <col:12, col:37> col:37 depth 0 index 0 TT{{$}}
 // CHECK-NEXT:  | |-TemplateTypeParmDecl 0x{{.+}} <col:21> col:29 typename depth 1 index 0{{$}}
-// CHECK-NEXT:  | `-TemplateArgument <line:{{.*}}:42> template TestClassTemplate{{$}}
-// CHECK-NEXT:  |   `-inherited from TemplateTemplateParm 0x{{.+}} 'TT'{{$}}
+// CHECK-NEXT:  | `-TemplateArgument <line:{{.*}}:42> template 'testClassTemplateDecl::TestClassTemplate'{{$}}
+// CHECK-NEXT:  |   |-inherited from TemplateTemplateParm 0x{{.+}} 'TT'{{$}}
+// CHECK-NEXT:  |   `-ClassTemplateDecl 0x{{.+}} <line:{{.+}}:3, line:{{.+}}:3> line:{{.+}}:30 TestClassTemplate{{$}}
 // CHECK-NEXT:  `-CXXRecordDecl 0x{{.+}} prev 0x{{.+}} <line:{{.*}}:41, col:82> col:48 struct TestTemplateTemplateDefaultType definition{{$}}
 // CHECK-NEXT:    |-DefinitionData empty aggregate standard_layout trivially_copyable pod trivial literal has_constexpr_non_copy_move_ctor can_const_default_init{{$}}
 // CHECK-NEXT:    | |-DefaultConstructor exists trivial constexpr needs_implicit defaulted_is_constexpr{{$}}
@@ -683,7 +685,8 @@ namespace TestTemplateTemplateParmDecl {
 // CHECK:        FunctionTemplateDecl
 // CHECK-NEXT:     TemplateTemplateParmDecl{{.*}} T
 // CHECK-NEXT:       TemplateTypeParmDecl{{.*}} typename
-// CHECK-NEXT:       TemplateArgument{{.*}} template A
+// CHECK-NEXT:       TemplateArgument{{.*}} template 'TestTemplateTemplateParmDecl::A'
+// CHECK-NEXT:         ClassTemplateDecl {{.*}} A
 // CHECK-NEXT:     TemplateTemplateParmDecl{{.*}} ... U
 // CHECK-NEXT:       TemplateTypeParmDecl{{.*}} typename
 
@@ -710,12 +713,12 @@ namespace TestTemplateArgument {
   template<int> class testIntegral { };
   template class testIntegral<1>;
   // CHECK:      ClassTemplateSpecializationDecl{{.*}} class testIntegral
-  // CHECK:        TemplateArgument{{.*}} integral 1
+  // CHECK:        TemplateArgument{{.*}} integral '1'
 
   template<template<typename> class> class testTemplate { };
   template class testTemplate<A>;
   // CHECK:      ClassTemplateSpecializationDecl{{.*}} class testTemplate
-  // CHECK:        TemplateArgument{{.*}} A
+  // CHECK:        TemplateArgument{{.*}} 'TestTemplateArgument::A'
 
   template<template<typename> class ...T> class C {
     B<T...> testTemplateExpansion;
@@ -731,10 +734,10 @@ namespace TestTemplateArgument {
   template<int, int ...> class testPack { };
   template class testPack<0, 1, 2>;
   // CHECK:      ClassTemplateSpecializationDecl{{.*}} class testPack
-  // CHECK:        TemplateArgument{{.*}} integral 0
+  // CHECK:        TemplateArgument{{.*}} integral '0'
   // CHECK-NEXT:   TemplateArgument{{.*}} pack
-  // CHECK-NEXT:     TemplateArgument{{.*}} integral 1
-  // CHECK-NEXT:     TemplateArgument{{.*}} integral 2
+  // CHECK-NEXT:     TemplateArgument{{.*}} integral '1'
+  // CHECK-NEXT:     TemplateArgument{{.*}} integral '2'
 }
 
 namespace testUsingDecl {
diff --git a/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_2.cpp b/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_2.cpp
index da46cef7f3f1bc..6fe05e33a5fb87 100644
--- a/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_2.cpp
+++ b/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_2.cpp
@@ -79,7 +79,7 @@ int test() {
 // CHECK-NEXT: | |   `-ReturnStmt [[ADDR_22:0x[a-z0-9]*]] <line:10:3, col:10>
 // CHECK-NEXT: | |     `-IntegerLiteral [[ADDR_23:0x[a-z0-9]*]] <col:10> 'int' 0
 // CHECK-NEXT: | `-FunctionDecl [[ADDR_24:0x[a-z0-9]*]] <line:9:1, line:11:1> line:9:5 used also_before_mismatch 'int ({{.*}})'
-// CHECK-NEXT: |   |-TemplateArgument integral 0
+// CHECK-NEXT: |   |-TemplateArgument integral '0'
 // CHECK-NEXT: |   `-CompoundStmt [[ADDR_25:0x[a-z0-9]*]] <col:32, line:11:1>
 // CHECK-NEXT: |     `-ReturnStmt [[ADDR_26:0x[a-z0-9]*]] <line:10:3, col:10>
 // CHECK-NEXT: |       `-IntegerLiteral [[ADDR_23]] <col:10> 'int' 0
@@ -179,7 +179,7 @@ int test() {
 // CHECK-NEXT: | | `-OMPDeclareVariantAttr [[ADDR_101:0x[a-z0-9]*]] <<invalid sloc>> Implicit implementation={extension(allow_templates)}
 // CHECK-NEXT: | |   `-DeclRefExpr [[ADDR_102:0x[a-z0-9]*]] <col:1> 'int ({{.*}})' {{.*}}Function [[ADDR_103:0x[a-z0-9]*]] 'only_def[implementation={extension(allow_templates)}]' 'int ({{.*}})'
 // CHECK-NEXT: | `-FunctionDecl [[ADDR_104:0x[a-z0-9]*]] <col:1, col:18> col:5 used only_def 'int ({{.*}})'
-// CHECK-NEXT: |   |-TemplateArgument integral 0
+// CHECK-NEXT: |   |-TemplateArgument integral '0'
 // CHECK-NEXT: |   `-OMPDeclareVariantAttr [[ADDR_105:0x[a-z0-9]*]] <<invalid sloc>> Implicit implementation={extension(allow_templates)}
 // CHECK-NEXT: |     `-DeclRefExpr [[ADDR_106:0x[a-z0-9]*]] <col:1> 'int ({{.*}})' {{.*}}Function [[ADDR_107:0x[a-z0-9]*]] 'only_def[implementation={extension(allow_templates)}]' 'int ({{.*}})'
 // CHECK-NEXT: |-FunctionTemplateDecl [[ADDR_108:0x[a-z0-9]*]] <line:37:1, line:40:1> line:38:1 only_def[implementation={extension(allow_templates)}]
@@ -189,7 +189,7 @@ int test() {
 // CHECK-NEXT: | |   `-ReturnStmt [[ADDR_110:0x[a-z0-9]*]] <line:39:3, col:10>
 // CHECK-NEXT: | |     `-IntegerLiteral [[ADDR_111:0x[a-z0-9]*]] <col:10> 'int' 0
 // CHECK-NEXT: | `-FunctionDecl [[ADDR_107]] <line:38:1, line:40:1> line:38:1 only_def[implementation={extension(allow_templates)}] 'int ({{.*}})'
-// CHECK-NEXT: |   |-TemplateArgument integral 0
+// CHECK-NEXT: |   |-TemplateArgument integral '0'
 // CHECK-NEXT: |   `-CompoundStmt [[ADDR_112:0x[a-z0-9]*]] <col:20, line:40:1>
 // CHECK-NEXT: |     `-ReturnStmt [[ADDR_113:0x[a-z0-9]*]] <line:39:3, col:10>
 // CHECK-NEXT: |       `-IntegerLiteral [[ADDR_111]] <col:10> 'int' 0
diff --git a/clang/test/AST/ast-dump-template-name.cpp b/clang/test/AST/ast-dump-template-name.cpp
new file mode 100644
index 00000000000000..39100711b60a13
--- /dev/null
+++ b/clang/test/AST/ast-dump-template-name.cpp
@@ -0,0 +1,54 @@
+// RUN: %clang_cc1 -std=c++26 -ast-dump -ast-dump-filter=Test %s | FileCheck %s
+
+template <template <class> class TT> using N = TT<int>;
+
+namespace qualified {
+  namespace foo {
+    template <class T> struct A;
+  } // namespace foo
+  using TestQualified = N<foo::A>;
+} // namespace qualified
+
+// CHECK:      Dumping qualified::TestQualified:
+// CHECK-NEXT: TypeAliasDecl
+// CHECK-NEXT: `-ElaboratedType
+// CHECK-NEXT:   `-TemplateSpecializationType
+// CHECK-NEXT:     |-TemplateArgument template 'qualified::foo::A' qualified{{$}}
+// CHECK-NEXT:     | |-NestedNameSpecifier Namespace 0x{{.+}} 'foo'{{$}}
+// CHECK-NEXT:     | `-ClassTemplateDecl {{.+}} A{{$}}
+
+namespace dependent {
+  template <class T> struct B {
+    using TestDependent = N<T::template X>;
+  };
+} // namespace dependent
+
+// CHECK:      Dumping dependent::B::TestDependent:
+// CHECK-NEXT: TypeAliasDecl
+// CHECK-NEXT: `-ElaboratedType
+// CHECK-NEXT:   `-TemplateSpecializationType
+// CHECK-NEXT:     |-TemplateArgument template 'template X' dependent{{$}}
+// CHECK-NEXT:     | `-NestedNameSpecifier TypeSpec 'T'{{$}}
+
+namespace subst {
+  template <class> struct A;
+
+  template <template <class> class TT> struct B {
+    template <template <class> class> struct C {};
+    using type = C<TT>;
+  };
+  using TestSubst = B<A>::type;
+} // namespace subst
+
+// CHECK:      Dumping subst::TestSubst:
+// CHECK-NEXT: TypeAliasDecl
+// CHECK-NEXT: `-ElaboratedType
+// CHECK-NEXT:   `-TypedefType
+// CHECK-NEXT:     |-TypeAlias
+// CHECK-NEXT:     `-ElaboratedType
+// CHECK-NEXT:       `-TemplateSpecializationType
+// CHECK-NEXT:         |-TemplateArgument template 'subst::A' subst index 0{{$}}
+// CHECK-NEXT:         | |-parameter: TemplateTemplateParmDecl {{.+}} depth 0 index 0 TT{{$}}
+// CHECK-NEXT:         | |-associated ClassTemplateSpecialization {{.+}} 'B'{{$}}
+// CHECK-NEXT:         | `-replacement:
+// CHECK-NEXT:         |   `-ClassTemplateDecl {{.+}} A{{$}}
diff --git a/clang/test/AST/ast-dump-using-template.cpp b/clang/test/AST/ast-dump-using-template.cpp
index de3ce277fd24f4..69b199fd0606c1 100644
--- a/clang/test/AST/ast-dump-using-template.cpp
+++ b/clang/test/AST/ast-dump-using-template.cpp
@@ -28,9 +28,11 @@ using B = X<S>;
 // CHECK:      TypeAliasDecl
 // CHECK-NEXT: `-ElaboratedType {{.*}} 'X<ns::S>' sugar
 // CHECK-NEXT:   `-TemplateSpecializationType {{.*}} 'X<ns::S>' sugar X
-// CHECK-NEXT:     |-TemplateArgument using template S
-// CHECK-NEXT:       `-RecordType {{.*}} 'X<ns::S>'
-// CHECK-NEXT:         `-ClassTemplateSpecialization {{.*}} 'X'
+// CHECK-NEXT:     |-TemplateArgument template 'ns::S'
+// CHECK-NEXT:     | |-UsingShadowDecl {{.*}} implicit ClassTemplate {{.*}} 'S'
+// CHECK-NEXT:     | `-target: ClassTemplateDecl {{.*}} S
+// CHECK-NEXT:     `-RecordType {{.*}} 'X<ns::S>'
+// CHECK-NEXT:       `-ClassTemplateSpecialization {{.*}} 'X'
 
 // TemplateName in DeducedTemplateSpecializationType.
 S DeducedTemplateSpecializationT(123);
diff --git a/clang/test/AST/constraints-explicit-instantiation.cpp b/clang/test/AST/constraints-explicit-instantiation.cpp
index 10b6432f2db8c2..79948ad1e85567 100644
--- a/clang/test/AST/constraints-explicit-instantiation.cpp
+++ b/clang/test/AST/constraints-explicit-instantiation.cpp
@@ -21,17 +21,17 @@ struct A {
 
 // This checks that `canary1<1>` and `canaray2<2>` are instantiated, thus
 // indirectly validating that the correct candidates of `A::f` were really
-// instantiated each time. 
+// instantiated each time.
 // The `static_assert`s validate we don't instantiate wrong candidates.
 
 // CHECK:{{.*}}FunctionTemplateDecl {{.*}} canary1
 // CHECK:      {{.*}}TemplateArgument integral
-// CHECK-SAME: {{1$}}
+// CHECK-SAME: {{'1'$}}
 template struct A<1>;
 
 // CHECK:      {{.*}}FunctionTemplateDecl {{.*}} canary2
 // CHECK:      {{.*}}TemplateArgument integral
-// CHECK-SAME: {{2$}}
+// CHECK-SAME: {{'2'$}}
 template struct A<2>;
 
 template struct A<3>;
diff --git a/clang/test/OpenMP/align_clause_ast_print.cpp b/clang/test/OpenMP/align_clause_ast_print.cpp
index 87000f9c41bae4..c5e27a5d21d020 100644
--- a/clang/test/OpenMP/align_clause_ast_print.cpp
+++ b/clang/test/OpenMP/align_clause_ast_print.cpp
@@ -114,7 +114,7 @@ int template_test() {
 // DUMP: FunctionDecl {{.*}}run 'double ()'
 // DUMP: TemplateArgument type 'double'
 // DUMP: BuiltinType {{.*}}'double'
-// DUMP: TemplateArgument integral 1
+// DUMP: TemplateArgument integral '1U'
 // DUMP: OMPAllocateDeclAttr {{.*}}Implicit OMPNullMemAlloc
 // DUMP: ConstantExpr {{.*}}'unsigned int'
 // DUMP: value: Int 1
diff --git a/clang/test/OpenMP/generic_loop_ast_print.cpp b/clang/test/OpenMP/generic_loop_ast_print.cpp
index df806405571cf7..b61ee79615d047 100644
--- a/clang/test/OpenMP/generic_loop_ast_print.cpp
+++ b/clang/test/OpenMP/generic_loop_ast_print.cpp
@@ -50,7 +50,7 @@
 //PRINT: }
 //DUMP: FunctionDecl{{.*}}templ_foo 'void (int)'
 //DUMP: TemplateArgument type 'int'
-//DUMP: TemplateArgument integral 2
+//DUMP: TemplateArgument integral '2'
 //DUMP: ParmVarDecl{{.*}}'int'
 //DUMP: OMPSimdDirective
 //DUMP: OMPCollapseClause
diff --git a/clang/test/OpenMP/interop_ast_print.cpp b/clang/test/OpenMP/interop_ast_print.cpp
index 7b9dda577c8403..fed6febc63085e 100644
--- a/clang/test/OpenMP/interop_ast_print.cpp
+++ b/clang/test/OpenMP/interop_ast_print.cpp
@@ -268,7 +268,7 @@ void fooTemp() {
 
   //PRINT: #pragma omp interop init(prefer_type(3,4,"level_one"), target : interop_var)
   //DUMP: FunctionDecl{{.*}}fooTemp
-  //DUMP: TemplateArgument integral 3
+  //DUMP: TemplateArgument integral '3'
   //DUMP: OMPInteropDirective
   //DUMP: OMPInitClause
   //DUMP: DeclRefExpr{{.*}}'omp_interop_t'{{.*}}'interop_var'
diff --git a/clang/test/SemaOpenACC/sub-array-ast.cpp b/clang/test/SemaOpenACC/sub-array-ast.cpp
index 094976e1642752..43cc55a3f9a510 100644
--- a/clang/test/SemaOpenACC/sub-array-ast.cpp
+++ b/clang/test/SemaOpenACC/sub-array-ast.cpp
@@ -357,7 +357,7 @@ void Templ(int i){
   // CHECK-NEXT: FunctionDecl{{.*}} Templ 'void (int)' implicit_instantiation
   // CHECK-NEXT: TemplateArgument{{.*}} 'int'
   // CHECK-NEXT: BuiltinType{{.*}} 'int'
-  // CHECK-NEXT: TemplateArgument integral 3
+  // CHECK-NEXT: TemplateArgument integral '3U'
   // CHECK-NEXT: TemplateArgument decl
   // CHECK-NEXT: Var{{.*}} 'CEArray' 'const int[5]'
   // CHECK-NEXT: ParmVarDecl{{.*}} i 'int'
diff --git a/clang/test/SemaTemplate/aggregate-deduction-candidate.cpp b/clang/test/SemaTemplate/aggregate-deduction-candidate.cpp
index 7f535651bb8157..db72783b99c389 100644
--- a/clang/test/SemaTemplate/aggregate-deduction-candidate.cpp
+++ b/clang/test/SemaTemplate/aggregate-deduction-candidate.cpp
@@ -58,7 +58,7 @@ namespace Basic {
   D d2 = {1, 2, 3}; // cxx17-error {{no viable}}
 
   D d3(1, 2); // expected-error {{no viable}}
-  // CTAD succeed but brace elision is not allowed for parenthesized aggregate init. 
+  // CTAD succeed but brace elision is not allowed for parenthesized aggregate init.
   D d4(1, 2, 3); // expected-error {{no viable}}
 
   // CHECK-LABEL: Dumping Basic::<deduction guide for C>:
@@ -160,7 +160,7 @@ namespace Basic {
 }
 
 namespace Array {
-  typedef __SIZE_TYPE__ size_t;
+  typedef unsigned long size_t;
   template <typename T, size_t N> struct A { // cxx20-note 2 {{candidate}} cxx17-note 14 {{candidate}}
     T array[N];
   };
@@ -183,7 +183,7 @@ namespace Array {
   // CHECK: `-CXXDeductionGuideDecl {{.*}} implicit used <deduction guide for A> 'auto (int (&&)[3]) -> Array::A<int, 3>'
   // CHECK:   |-TemplateArgument type 'int'
   // CHECK:   | `-BuiltinType {{.*}} 'int'
-  // CHECK:   |-TemplateArgument integral 3
+  // CHECK:   |-TemplateArgument integral '3UL'
   // CHECK:   `-ParmVarDecl {{.*}} 'int (&&)[3]'
   // CHECK: FunctionProtoType {{.*}} 'auto (T (&&)[N]) -> A<T, N>' dependent trailing_return cdecl
   // CHECK: |-InjectedClassNameType {{.*}} 'A<T, N>' dependent
@@ -203,7 +203,7 @@ namespace Array {
   // CHECK: `-CXXDeductionGuideDecl {{.*}} implicit used <deduction guide for A> 'auto (const char (&)[5]) -> Array::A<char, 5>'
   // CHECK:   |-TemplateArgument type 'char'
   // CHECK:   | `-BuiltinType {{.*}} 'char'
-  // CHECK:   |-TemplateArgument integral 5
+  // CHECK:   |-TemplateArgument integral '5UL'
   // CHECK:   `-ParmVarDecl {{.*}} 'const char (&)[5]'
   // CHECK: FunctionProtoType {{.*}} 'auto (const T (&)[N]) -> A<T, N>' dependent trailing_return cdecl
   // CHECK: |-InjectedClassNameType {{.*}} 'A<T, N>' dependent
@@ -223,7 +223,7 @@ namespace BraceElision {
 
   A a1 = {0, 1}; // cxx17-error {{no viable}}
 
-  // CTAD succeed but brace elision is not allowed for parenthesized aggregate init. 
+  // CTAD succeed but brace elision is not allowed for parenthesized aggregate init.
   A a2(0, 1); // cxx20-error {{array initializer must be an initializer list}} cxx17-error {{no viable}}
 
   // CHECK-LABEL: Dumping BraceElision::<deduction guide for A>:
@@ -265,8 +265,8 @@ namespace TrailingPack {
   // CHECK: |-TemplateTypeParmDecl {{.*}} referenced typename depth 0 index 0 ... T
   // CHECK: |-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for A> 'auto (T...) -> A<T...>'
   // CHECK: | `-ParmVarDecl {{.*}} 'T...' pack
-  // CHECK: `-CXXDeductionGuideDecl {{.*}} implicit used <deduction guide for A> 
-  // CHECK-SAME: 'auto (TrailingPack::(lambda at {{.*}}), TrailingPack::(lambda at {{.*}})) -> 
+  // CHECK: `-CXXDeductionGuideDecl {{.*}} implicit used <deduction guide for A>
+  // CHECK-SAME: 'auto (TrailingPack::(lambda at {{.*}}), TrailingPack::(lambda at {{.*}})) ->
   // CHECK-SAME:     TrailingPack::A<TrailingPack::(lambda at {{.*}}), TrailingPack::(lambda at {{.*}})>'
   // CHECK: |-TemplateArgument pack
   // CHECK: | |-TemplateArgument type 'TrailingPack::(lambda at {{.*}})'
@@ -326,8 +326,8 @@ namespace DeduceArity {
   // CHECK: |-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for F> 'auto (Types<T...>, T...) -> F<T...>'
   // CHECK: | |-ParmVarDecl {{.*}} 'Types<T...>'
   // CHECK: | `-ParmVarDecl {{.*}} 'T...' pack
-  // CHECK: |-CXXDeductionGuideDecl {{.*}} implicit used <deduction guide for F> 
-  // CHECK-SAME: 'auto (Types<X, Y, Z>, DeduceArity::X, DeduceArity::Y, DeduceArity::Z) -> 
+  // CHECK: |-CXXDeductionGuideDecl {{.*}} implicit used <deduction guide for F>
+  // CHECK-SAME: 'auto (Types<X, Y, Z>, DeduceArity::X, DeduceArity::Y, DeduceArity::Z) ->
   // CHECK-SAME:     DeduceArity::F<DeduceArity::X, DeduceArity::Y, DeduceArity::Z>'
   // CHECK: | |-TemplateArgument pack
   // CHECK: | | |-TemplateArgument type 'DeduceArity::X'
diff --git a/clang/test/SemaTemplate/attributes.cpp b/clang/test/SemaTemplate/attributes.cpp
index 9fd448a5e9353e..f6c9f13f0842d2 100644
--- a/clang/test/SemaTemplate/attributes.cpp
+++ b/clang/test/SemaTemplate/attributes.cpp
@@ -33,11 +33,11 @@ namespace attribute_aligned {
       static_assert(sizeof(t) == sizeof(T), "my_aligned_storage size wrong");
       static_assert(alignof(t) == alignof(T), "my_aligned_storage align wrong"); // expected-warning{{'alignof' applied to an expression is a GNU extension}}
     }
-    
+
   private:
     my_aligned_storage<sizeof(T), alignof(T)> t;
   };
-  
+
   C<double> cd;
 }
 
@@ -73,9 +73,9 @@ void UseAnnotations() { HasAnnotations<int>(); }
 // CHECK-NEXT:         DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'Is' 'int'
 // CHECK-NEXT:   FunctionDecl {{.*}} used HasPackAnnotations 'void ()'
 // CHECK-NEXT:     TemplateArgument{{.*}} pack
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 1
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 2
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 3
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '1'
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '2'
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '3'
 // CHECK-NEXT:     AnnotateAttr {{.*}} "ANNOTATE_BAZ"
 // CHECK-NEXT:       ConstantExpr {{.*}} 'int'
 // CHECK-NEXT:         value: Int 1
@@ -128,9 +128,9 @@ void UseOnlyPackAnnotations() {
 // CHECK-NEXT:     TemplateArgument{{.*}} type 'int'
 // CHECK-NEXT:       BuiltinType {{.*}} 'int'
 // CHECK-NEXT:     TemplateArgument{{.*}} pack
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 1
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 2
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 3
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '1'
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '2'
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '3'
 // CHECK-NEXT:     AnnotateAttr {{.*}} "ANNOTATE_BOO"
 // CHECK-NEXT:       ConstantExpr {{.*}} 'int'
 // CHECK-NEXT:         value: Int 1
@@ -159,9 +159,9 @@ void UseOnlyPackAnnotations() {
 // CHECK-NEXT:     TemplateArgument type 'float'
 // CHECK-NEXT:       BuiltinType {{.*}} 'float'
 // CHECK-NEXT:     TemplateArgument{{.*}} pack
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 3
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 2
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 1
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '3'
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '2'
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '1'
 // CHECK-NEXT:     AnnotateAttr {{.*}} "ANNOTATE_FOZ"
 // CHECK-NEXT:       ConstantExpr {{.*}} 'int'
 // CHECK-NEXT:         value: Int 4
@@ -184,9 +184,9 @@ void UseOnlyPackAnnotations() {
 // CHECK-NEXT:     TemplateArgument type 'bool'
 // CHECK-NEXT:       BuiltinType {{.*}} 'bool'
 // CHECK-NEXT:     TemplateArgument{{.*}} pack
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 7
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 8
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 9
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '7'
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '8'
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '9'
 // CHECK-NEXT:     AnnotateAttr {{.*}} "ANNOTATE_FOZ"
 // CHECK-NEXT:       ConstantExpr {{.*}} 'int'
 // CHECK-NEXT:         value: Int 7
@@ -215,9 +215,9 @@ void UseOnlyPackAnnotations() {
 // CHECK-NEXT:     TemplateArgument type 'char'
 // CHECK-NEXT:       BuiltinType {{.*}} 'char'
 // CHECK-NEXT:     TemplateArgument{{.*}} pack
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 1
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 2
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 3
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '1'
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '2'
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '3'
 // CHECK-NEXT:     CXXRecordDecl {{.*}} implicit struct AnnotatedPackTemplateStruct
 // CHECK-NEXT:   ClassTemplateSpecializationDecl {{.*}} struct AnnotatedPackTemplateStruct definition
 // CHECK-NEXT:     DefinitionData
@@ -312,9 +312,9 @@ void UseAnnotatedPackTemplateStructSpecializations() {
 // CHECK-NEXT:     TemplateArgument{{.*}} type 'int'
 // CHECK-NEXT:       BuiltinType {{.*}} 'int'
 // CHECK-NEXT:     TemplateArgument{{.*}} pack
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 1
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 2
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 3
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '1'
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '2'
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '3'
 // CHECK-NEXT:     AnnotateAttr {{.*}} "ANNOTATE_BIR"
 // CHECK-NEXT:       ConstantExpr {{.*}} 'int'
 // CHECK-NEXT:         value: Int 1
@@ -343,9 +343,9 @@ void UseAnnotatedPackTemplateStructSpecializations() {
 // CHECK-NEXT:     TemplateArgument{{.*}} type 'float'
 // CHECK-NEXT:       BuiltinType {{.*}} 'float'
 // CHECK-NEXT:     TemplateArgument{{.*}} pack
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 3
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 2
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 1
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '3'
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '2'
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '1'
 // CHECK-NEXT:     CXXRecordDecl {{.*}} implicit struct InvalidAnnotatedPackTemplateStruct
 // CHECK-NEXT:   ClassTemplateSpecializationDecl {{.*}} struct InvalidAnnotatedPackTemplateStruct definition
 // CHECK-NEXT:     DefinitionData
@@ -358,9 +358,9 @@ void UseAnnotatedPackTemplateStructSpecializations() {
 // CHECK-NEXT:     TemplateArgument{{.*}} type 'bool'
 // CHECK-NEXT:       BuiltinType {{.*}} 'bool'
 // CHECK-NEXT:     TemplateArgument{{.*}} pack
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 7
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 8
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 9
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '7'
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '8'
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '9'
 // CHECK-NEXT:     CXXRecordDecl {{.*}} implicit struct InvalidAnnotatedPackTemplateStruct
 // CHECK-NEXT:   ClassTemplateSpecializationDecl {{.*}} struct InvalidAnnotatedPackTemplateStruct definition
 // CHECK-NEXT:     DefinitionData
@@ -420,9 +420,9 @@ void UseAnnotatedPackTemplateStructSpecializations() {
 // CHECK-NEXT:   TemplateArgument{{.*}} type 'char'
 // CHECK-NEXT:     BuiltinType {{.*}} 'char'
 // CHECK-NEXT:   TemplateArgument{{.*}} pack
-// CHECK-NEXT:     TemplateArgument{{.*}} integral 5
-// CHECK-NEXT:     TemplateArgument{{.*}} integral 6
-// CHECK-NEXT:     TemplateArgument{{.*}} integral 7
+// CHECK-NEXT:     TemplateArgument{{.*}} integral '5'
+// CHECK-NEXT:     TemplateArgument{{.*}} integral '6'
+// CHECK-NEXT:     TemplateArgument{{.*}} integral '7'
 // CHECK-NEXT:   CXXRecordDecl {{.*}} implicit struct InvalidAnnotatedPackTemplateStruct
 template <typename T, int... Is> struct InvalidAnnotatedPackTemplateStruct{};
 template <int... Is> struct [[clang::annotate("ANNOTATE_BIR", Is...)]] InvalidAnnotatedPackTemplateStruct<int, Is...>{};
@@ -444,9 +444,9 @@ void UseInvalidAnnotatedPackTemplateStruct() {
 // CHECK-NEXT:         DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'Is' 'int'
 // CHECK-NEXT:   FunctionDecl {{.*}} used RedeclaredAnnotatedFunc 'void ()'
 // CHECK-NEXT:     TemplateArgument{{.*}} pack
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 1
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 2
-// CHECK-NEXT:       TemplateArgument{{.*}} integral 3
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '1'
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '2'
+// CHECK-NEXT:       TemplateArgument{{.*}} integral '3'
 // CHECK-NEXT:     CompoundStmt
 // CHECK-NEXT:     AnnotateAttr {{.*}} "ANNOTATE_FAR"
 // CHECK-NEXT:       ConstantExpr {{.*}} 'int'
diff --git a/clang/test/SemaTemplate/deduction-guide.cpp b/clang/test/SemaTemplate/deduction-guide.cpp
index 91c35d98fbf578..96b4cd9622a24f 100644
--- a/clang/test/SemaTemplate/deduction-guide.cpp
+++ b/clang/test/SemaTemplate/deduction-guide.cpp
@@ -29,9 +29,9 @@ using AT = A<int[3], int, int, short>;
 // CHECK:   | |-TemplateArgument type 'int'
 // CHECK:   | `-TemplateArgument type 'short'
 // CHECK:   |-TemplateArgument pack
-// CHECK:   | |-TemplateArgument integral 3
-// CHECK:   | |-TemplateArgument integral 3
-// CHECK:   | `-TemplateArgument integral 4
+// CHECK:   | |-TemplateArgument integral '3'
+// CHECK:   | |-TemplateArgument integral '3'
+// CHECK:   | `-TemplateArgument integral '(short)4'
 // CHECK:   |-TemplateArgument pack
 // CHECK:   | |-TemplateArgument decl
 // CHECK:   | | `-Var {{.*}} 'arr1' 'int[3]'
@@ -73,7 +73,7 @@ using BT = B<char, 'x'>;
 // CHECK: | `-ParmVarDecl {{.*}} 'X<W, V>'
 // CHECK: `-CXXDeductionGuideDecl {{.*}} 'auto (X<nullptr, 'x'>) -> B<char, 'x'>'
 // CHECK:   |-TemplateArgument type 'char'
-// CHECK:   |-TemplateArgument integral 120
+// CHECK:   |-TemplateArgument integral ''x''
 // CHECK:   |-TemplateArgument type 'std::nullptr_t'
 // CHECK:   |-TemplateArgument nullptr
 // CHECK:   `-ParmVarDecl {{.*}} 'X<nullptr, 'x'>'
@@ -108,9 +108,9 @@ using CT = C<int>;
 // CHECK: | `-ParmVarDecl {{.*}} 'type-parameter-0-2'
 // CHECK: `-CXXDeductionGuideDecl {{.*}} 'auto (int, Y<B>, int) -> C<int>'
 // CHECK:  |-TemplateArgument type 'int'
-// CHECK:  |-TemplateArgument template B
+// CHECK:  |-TemplateArgument template 'B'
 // CHECK:  |-TemplateArgument type 'int'
-// CHECK:  |-TemplateArgument integral 0
+// CHECK:  |-TemplateArgument integral '0'
 // CHECK:  |-ParmVarDecl {{.*}} 'int'
 // CHECK:  |-ParmVarDecl {{.*}} 'Y<B>'
 // CHECK:  `-ParmVarDecl {{.*}} 'int'
@@ -231,7 +231,7 @@ F s(0);
 // CHECK: |-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for F> 'auto (type-parameter-0-1) -> F<>'
 // CHECK: | `-ParmVarDecl {{.*}} 'type-parameter-0-1'
 // CHECK: `-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for F> 'auto (int) -> F<>'
-// CHECK:   |-TemplateArgument integral 120
+// CHECK:   |-TemplateArgument integral ''x''
 // CHECK:   |-TemplateArgument type 'int'
 // CHECK:   | `-BuiltinType {{.*}} 'int'
 // CHECK:   `-ParmVarDecl {{.*}} 'int'
@@ -320,13 +320,14 @@ namespace TTP {
 // CHECK-NEXT:  `-CXXDeductionGuideDecl {{.+}} 'auto (A<int>) -> TTP::B<int>'
 // CHECK-NEXT:    |-TemplateArgument type 'int'
 // CHECK-NEXT:    | `-BuiltinType {{.+}} 'int'{{$}}
-// CHECK-NEXT:    |-TemplateArgument template A
+// CHECK-NEXT:    |-TemplateArgument template 'TTP::A'{{$}}
+// CHECK-NEXT:    | `-ClassTemplateDecl {{.+}} A{{$}}
 // CHECK-NEXT:    `-ParmVarDecl {{.+}} 'A<int>':'TTP::A<int>'{{$}}
 // CHECK-NEXT:  FunctionProtoType {{.+}} 'auto (<T>) -> B<T>' dependent trailing_return cdecl{{$}}
 // CHECK-NEXT:  |-InjectedClassNameType {{.+}} 'B<T>' dependent{{$}}
 // CHECK-NEXT:  | `-CXXRecord {{.+}} 'B'{{$}}
 // CHECK-NEXT:  `-ElaboratedType {{.+}} '<T>' sugar dependent{{$}}
 // CHECK-NEXT:    `-TemplateSpecializationType {{.+}} '<T>' dependent {{$}}
-// CHECK-NEXT:      `-TemplateArgument type 'T'{{$}}
+// CHECK-NEXT:      `-TemplateArgument type 'T':'type-parameter-0-0'{{$}}
 // CHECK-NEXT:        `-TemplateTypeParmType {{.+}} 'T' dependent depth 0 index 0{{$}}
 // CHECK-NEXT:          `-TemplateTypeParm {{.+}} 'T'{{$}}
diff --git a/clang/test/SemaTemplate/make_integer_seq.cpp b/clang/test/SemaTemplate/make_integer_seq.cpp
index c5a1e27053689d..107ea8a25da4eb 100644
--- a/clang/test/SemaTemplate/make_integer_seq.cpp
+++ b/clang/test/SemaTemplate/make_integer_seq.cpp
@@ -6,17 +6,18 @@ using test1 = __make_integer_seq<A, int, 1>;
 //      CHECK: |-TypeAliasDecl 0x{{[0-9A-Fa-f]+}} <line:5:1, col:43> col:7 test1 '__make_integer_seq<A, int, 1>':'A<int, 0>'
 // CHECK-NEXT:   `-ElaboratedType 0x{{[0-9A-Fa-f]+}} '__make_integer_seq<A, int, 1>' sugar
 // CHECK-NEXT:     `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} '__make_integer_seq<A, int, 1>' sugar alias __make_integer_seq
-// CHECK-NEXT:       |-TemplateArgument template A
+// CHECK-NEXT:       |-TemplateArgument template 'A'
+// CHECK-NEXT:       | `-ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <line:{{.+}}:1, col:41> col:38 A
 // CHECK-NEXT:       |-TemplateArgument type 'int'
 // CHECK-NEXT:       | `-BuiltinType 0x{{[0-9A-Fa-f]+}} 'int'
-// CHECK-NEXT:       |-TemplateArgument expr
-// CHECK-NEXT:       | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} <col:42> 'int'
+// CHECK-NEXT:       |-TemplateArgument expr '1'
+// CHECK-NEXT:       | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} <line:{{.+}}:42> 'int'
 // CHECK-NEXT:       |   |-value: Int 1
 // CHECK-NEXT:       |   `-IntegerLiteral 0x{{[0-9A-Fa-f]+}} <col:42> 'int' 1
 // CHECK-NEXT:       `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} 'A<int, 0>' sugar A
 // CHECK-NEXT:         |-TemplateArgument type 'int'
 // CHECK-NEXT:         | `-BuiltinType 0x{{[0-9A-Fa-f]+}} 'int'
-// CHECK-NEXT:         |-TemplateArgument expr
+// CHECK-NEXT:         |-TemplateArgument expr '0'
 // CHECK-NEXT:         | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} <col:42> 'int'
 // CHECK-NEXT:         |   |-value: Int 0
 // CHECK-NEXT:         |   `-IntegerLiteral 0x{{[0-9A-Fa-f]+}} <col:42> 'int' 0
@@ -25,24 +26,25 @@ using test1 = __make_integer_seq<A, int, 1>;
 
 template <class B1, B1 B2> using B = __make_integer_seq<A, B1, B2>;
 using test2 = B<int, 1>;
-//      CHECK: |-TypeAliasDecl 0x{{[0-9A-Fa-f]+}} <line:27:1, col:23> col:7 test2 'B<int, 1>':'A<int, 0>'
+//      CHECK: |-TypeAliasDecl 0x{{[0-9A-Fa-f]+}} <line:{{.+}}:1, col:23> col:7 test2 'B<int, 1>':'A<int, 0>'
 // CHECK-NEXT:   `-ElaboratedType 0x{{[0-9A-Fa-f]+}} 'B<int, 1>' sugar
 // CHECK-NEXT:     `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} 'B<int, 1>' sugar alias B
 // CHECK-NEXT:       |-TemplateArgument type 'int'
 // CHECK-NEXT:       | `-BuiltinType 0x{{[0-9A-Fa-f]+}} 'int'
-// CHECK-NEXT:       |-TemplateArgument expr
+// CHECK-NEXT:       |-TemplateArgument expr '1'
 // CHECK-NEXT:       | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} <col:22> 'int'
 // CHECK-NEXT:       |   |-value: Int 1
 // CHECK-NEXT:       |   `-IntegerLiteral 0x{{[0-9A-Fa-f]+}} <col:22> 'int' 1
 // CHECK-NEXT:       `-ElaboratedType 0x{{[0-9A-Fa-f]+}} '__make_integer_seq<A, int, 1>' sugar
 // CHECK-NEXT:         `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} '__make_integer_seq<A, int, 1>' sugar alias __make_integer_seq
-// CHECK-NEXT:           |-TemplateArgument template A
+// CHECK-NEXT:           |-TemplateArgument template 'A'
+// CHECK-NEXT:           | `-ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <line:{{.+}}:1, col:41> col:38 A
 // CHECK-NEXT:           |-TemplateArgument type 'int'
 // CHECK-NEXT:           | `-SubstTemplateTypeParmType 0x{{[0-9A-Fa-f]+}} 'int' sugar class depth 0 index 0 B1
 // CHECK-NEXT:           |   |-TypeAliasTemplate 0x{{[0-9A-Fa-f]+}} 'B'
 // CHECK-NEXT:           |   `-BuiltinType 0x{{[0-9A-Fa-f]+}} 'int'
-// CHECK-NEXT:           |-TemplateArgument expr
-// CHECK-NEXT:           | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} <line:26:64> 'int'
+// CHECK-NEXT:           |-TemplateArgument expr '1'
+// CHECK-NEXT:           | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} <line:{{.+}}:64> 'int'
 // CHECK-NEXT:           |   |-value: Int 1
 // CHECK-NEXT:           |   `-SubstNonTypeTemplateParmExpr 0x{{[0-9A-Fa-f]+}} <col:64> 'int'
 // CHECK-NEXT:           |     |-NonTypeTemplateParmDecl 0x{{[0-9A-Fa-f]+}} <col:21, col:24> col:24 referenced 'B1' depth 0 index 1 B2
@@ -52,7 +54,7 @@ using test2 = B<int, 1>;
 // CHECK-NEXT:             | `-SubstTemplateTypeParmType 0x{{[0-9A-Fa-f]+}} 'int' sugar class depth 0 index 0 B1
 // CHECK-NEXT:             |   |-TypeAliasTemplate 0x{{[0-9A-Fa-f]+}} 'B'
 // CHECK-NEXT:             |   `-BuiltinType 0x{{[0-9A-Fa-f]+}} 'int'
-// CHECK-NEXT:             |-TemplateArgument expr
+// CHECK-NEXT:             |-TemplateArgument expr '0'
 // CHECK-NEXT:             | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} <col:64> 'int'
 // CHECK-NEXT:             |   |-value: Int 0
 // CHECK-NEXT:             |   `-IntegerLiteral 0x{{[0-9A-Fa-f]+}} <col:64> 'int' 0
@@ -61,58 +63,64 @@ using test2 = B<int, 1>;
 
 template <template <class T, T...> class S, class T, int N> struct C {
   using test3 = __make_integer_seq<S, T, N>;
-//      CHECK: |-TypeAliasDecl 0x{{[0-9A-Fa-f]+}} <line:63:3, col:43> col:9 test3 '__make_integer_seq<S, T, N>':'__make_integer_seq<template-parameter-0-0, type-parameter-0-1, N>'
+//      CHECK: |-TypeAliasDecl 0x{{[0-9A-Fa-f]+}} <line:{{.+}}:3, col:43> col:9 test3 '__make_integer_seq<S, T, N>':'__make_integer_seq<template-parameter-0-0, type-parameter-0-1, N>'
 // CHECK-NEXT:   `-ElaboratedType 0x{{[0-9A-Fa-f]+}} '__make_integer_seq<S, T, N>' sugar dependent
 // CHECK-NEXT:     `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} '__make_integer_seq<S, T, N>' sugar dependent alias __make_integer_seq
-// CHECK-NEXT:       |-TemplateArgument template S
+// CHECK-NEXT:       |-TemplateArgument template 'S'
+// CHECK-NEXT:       | | `-TemplateTemplateParmDecl 0x{{[0-9A-Fa-f]+}} <line:{{.+}}:11, col:42> col:42 depth 0 index 0 S
 // CHECK-NEXT:       |-TemplateArgument type 'T'
 // CHECK-NEXT:       | `-TemplateTypeParmType 0x{{[0-9A-Fa-f]+}} 'T' dependent depth 0 index 1
 // CHECK-NEXT:       |   `-TemplateTypeParm 0x{{[0-9A-Fa-f]+}} 'T'
-// CHECK-NEXT:       |-TemplateArgument expr
-// CHECK-NEXT:       | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:42> 'T' <Dependent>
+// CHECK-NEXT:       |-TemplateArgument expr 'N'
+// CHECK-NEXT:       | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <line:{{.+}}:42> 'T' <Dependent>
 // CHECK-NEXT:       |   `-DeclRefExpr 0x{{[0-9A-Fa-f]+}} <col:42> 'int' NonTypeTemplateParm 0x{{[0-9A-Fa-f]+}} 'N' 'int'
 // CHECK-NEXT:       `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} '__make_integer_seq<template-parameter-0-0, type-parameter-0-1, N>' dependent __make_integer_seq
-// CHECK-NEXT:         |-TemplateArgument template
+// CHECK-NEXT:         |-TemplateArgument template 'template-parameter-0-0'
+// CHECK-NEXT:         | `-TemplateTemplateParmDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> depth 0 index 0
 // CHECK-NEXT:         |-TemplateArgument type 'type-parameter-0-1'
 // CHECK-NEXT:         | `-TemplateTypeParmType 0x{{[0-9A-Fa-f]+}} 'type-parameter-0-1' dependent depth 0 index 1
-// CHECK-NEXT:         `-TemplateArgument expr
+// CHECK-NEXT:         `-TemplateArgument expr 'N'
 // CHECK-NEXT:           `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:42> 'T' <Dependent>
 // CHECK-NEXT:             `-DeclRefExpr 0x{{[0-9A-Fa-f]+}} <col:42> 'int' NonTypeTemplateParm 0x{{[0-9A-Fa-f]+}} 'N' 'int'
 
   using test4 = __make_integer_seq<A, T, 1>;
-//      CHECK: |-TypeAliasDecl 0x{{[0-9A-Fa-f]+}} <line:82:3, col:43> col:9 test4 '__make_integer_seq<A, T, 1>':'__make_integer_seq<A, type-parameter-0-1, 1>'
+//      CHECK: |-TypeAliasDecl 0x{{[0-9A-Fa-f]+}} <line:{{.+}}:3, col:43> col:9 test4 '__make_integer_seq<A, T, 1>':'__make_integer_seq<A, type-parameter-0-1, 1>'
 // CHECK-NEXT:   `-ElaboratedType 0x{{[0-9A-Fa-f]+}} '__make_integer_seq<A, T, 1>' sugar dependent
 // CHECK-NEXT:     `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} '__make_integer_seq<A, T, 1>' sugar dependent alias __make_integer_seq
-// CHECK-NEXT:       |-TemplateArgument template A
+// CHECK-NEXT:       |-TemplateArgument template 'A'
+// CHECK-NEXT:       | `-ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <line:{{.+}}:1, col:41> col:38 A
 // CHECK-NEXT:       |-TemplateArgument type 'T'
 // CHECK-NEXT:       | `-TemplateTypeParmType 0x{{[0-9A-Fa-f]+}} 'T' dependent depth 0 index 1
 // CHECK-NEXT:       |   `-TemplateTypeParm 0x{{[0-9A-Fa-f]+}} 'T'
-// CHECK-NEXT:       |-TemplateArgument expr
-// CHECK-NEXT:       | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:42> 'T' <Dependent>
+// CHECK-NEXT:       |-TemplateArgument expr '1'
+// CHECK-NEXT:       | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <line:{{.+}}:42> 'T' <Dependent>
 // CHECK-NEXT:       |   `-IntegerLiteral 0x{{[0-9A-Fa-f]+}} <col:42> 'int' 1
 // CHECK-NEXT:       `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} '__make_integer_seq<A, type-parameter-0-1, 1>' dependent __make_integer_seq
-// CHECK-NEXT:         |-TemplateArgument template A
+// CHECK-NEXT:         |-TemplateArgument template 'A'
+// CHECK-NEXT:         | `-ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <line:{{.+}}:1, col:41> col:38 A
 // CHECK-NEXT:         |-TemplateArgument type 'type-parameter-0-1'
 // CHECK-NEXT:         | `-TemplateTypeParmType 0x{{[0-9A-Fa-f]+}} 'type-parameter-0-1' dependent depth 0 index 1
-// CHECK-NEXT:         `-TemplateArgument expr
-// CHECK-NEXT:           `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:42> 'T' <Dependent>
+// CHECK-NEXT:         `-TemplateArgument expr '1'
+// CHECK-NEXT:           `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <line:{{.+}}:42> 'T' <Dependent>
 // CHECK-NEXT:             `-IntegerLiteral 0x{{[0-9A-Fa-f]+}} <col:42> 'int' 1
 
   using test5 = __make_integer_seq<A, int, N>;
-//      CHECK: `-TypeAliasDecl 0x{{[0-9A-Fa-f]+}} <line:101:3, col:45> col:9 test5 '__make_integer_seq<A, int, N>'
+//      CHECK: `-TypeAliasDecl 0x{{[0-9A-Fa-f]+}} <line:{{.+}}:3, col:45> col:9 test5 '__make_integer_seq<A, int, N>'
 // CHECK-NEXT:   `-ElaboratedType 0x{{[0-9A-Fa-f]+}} '__make_integer_seq<A, int, N>' sugar dependent
 // CHECK-NEXT:     `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} '__make_integer_seq<A, int, N>' sugar dependent alias __make_integer_seq
-// CHECK-NEXT:       |-TemplateArgument template A
+// CHECK-NEXT:       |-TemplateArgument template 'A'
+// CHECK-NEXT:       | `-ClassTemplateDecl 0x{{.+}} <line:{{.+}}:1, col:41> col:38 A
 // CHECK-NEXT:       |-TemplateArgument type 'int'
 // CHECK-NEXT:       | `-BuiltinType 0x{{[0-9A-Fa-f]+}} 'int'
-// CHECK-NEXT:       |-TemplateArgument expr
-// CHECK-NEXT:       | `-DeclRefExpr 0x{{[0-9A-Fa-f]+}} <col:44> 'int' NonTypeTemplateParm 0x{{[0-9A-Fa-f]+}} 'N' 'int'
+// CHECK-NEXT:       |-TemplateArgument expr 'N'
+// CHECK-NEXT:       | `-DeclRefExpr 0x{{[0-9A-Fa-f]+}} <line:{{.+}}:44> 'int' NonTypeTemplateParm 0x{{[0-9A-Fa-f]+}} 'N' 'int'
 // CHECK-NEXT:       `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} '__make_integer_seq<A, int, N>' dependent __make_integer_seq
-// CHECK-NEXT:         |-TemplateArgument template A
+// CHECK-NEXT:         |-TemplateArgument template 'A'
+// CHECK-NEXT:         | `-ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <line:{{.+}}:1, col:41> col:38 A
 // CHECK-NEXT:         |-TemplateArgument type 'int'
 // CHECK-NEXT:         | `-BuiltinType 0x{{[0-9A-Fa-f]+}} 'int'
-// CHECK-NEXT:         `-TemplateArgument expr
-// CHECK-NEXT:           `-DeclRefExpr 0x{{[0-9A-Fa-f]+}} <col:44> 'int' NonTypeTemplateParm 0x{{[0-9A-Fa-f]+}} 'N' 'int'
+// CHECK-NEXT:         `-TemplateArgument expr 'N'
+// CHECK-NEXT:           `-DeclRefExpr 0x{{[0-9A-Fa-f]+}} <line:{{.+}}:44> 'int' NonTypeTemplateParm 0x{{[0-9A-Fa-f]+}} 'N' 'int'
 };
 
 // expected-no-diagnostics
diff --git a/clang/test/SemaTemplate/type_pack_element.cpp b/clang/test/SemaTemplate/type_pack_element.cpp
index 9e23ef1ff3cfb6..abf0ddbddeafd4 100644
--- a/clang/test/SemaTemplate/type_pack_element.cpp
+++ b/clang/test/SemaTemplate/type_pack_element.cpp
@@ -4,7 +4,7 @@ using test1 = __type_pack_element<0, int>;
 //      CHECK: |-TypeAliasDecl 0x{{[0-9A-Fa-f]+}} <<stdin>:3:1, col:41> col:7 test1 '__type_pack_element<0, int>':'int'
 // CHECK-NEXT:   `-ElaboratedType 0x{{[0-9A-Fa-f]+}} '__type_pack_element<0, int>' sugar
 // CHECK-NEXT:     `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} '__type_pack_element<0, int>' sugar alias __type_pack_element
-// CHECK-NEXT:       |-TemplateArgument expr
+// CHECK-NEXT:       |-TemplateArgument expr '0'
 // CHECK-NEXT:       | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} <col:35> 'unsigned long'
 // CHECK-NEXT:       |   |-value: Int 0
 // CHECK-NEXT:       |   `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:35> 'unsigned long' <IntegralCast>
@@ -18,7 +18,7 @@ template<int N, class ...Ts> struct A {
 //      CHECK: |-TypeAliasDecl 0x{{[0-9A-Fa-f]+}} <line:17:3, col:45> col:9 test2 '__type_pack_element<N, Ts...>':'__type_pack_element<N, type-parameter-0-1...>'
 // CHECK-NEXT:   `-ElaboratedType 0x{{[0-9A-Fa-f]+}} '__type_pack_element<N, Ts...>' sugar dependent
 // CHECK-NEXT:     `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} '__type_pack_element<N, Ts...>' sugar dependent alias __type_pack_element
-// CHECK-NEXT:       |-TemplateArgument expr
+// CHECK-NEXT:       |-TemplateArgument expr 'N'
 // CHECK-NEXT:       | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:37> 'unsigned long' <IntegralCast>
 // CHECK-NEXT:       |   `-DeclRefExpr 0x{{[0-9A-Fa-f]+}} <col:37> 'int' NonTypeTemplateParm 0x{{[0-9A-Fa-f]+}} 'N' 'int'
 // CHECK-NEXT:       |-TemplateArgument type 'Ts...'
@@ -26,10 +26,10 @@ template<int N, class ...Ts> struct A {
 // CHECK-NEXT:       |   `-TemplateTypeParmType 0x{{[0-9A-Fa-f]+}} 'Ts' dependent contains_unexpanded_pack depth 0 index 1 pack
 // CHECK-NEXT:       |     `-TemplateTypeParm 0x{{[0-9A-Fa-f]+}} 'Ts'
 // CHECK-NEXT:       `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} '__type_pack_element<N, type-parameter-0-1...>' dependent __type_pack_element
-// CHECK-NEXT:         |-TemplateArgument expr
+// CHECK-NEXT:         |-TemplateArgument expr 'N'
 // CHECK-NEXT:         | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:37> 'unsigned long' <IntegralCast>
 // CHECK-NEXT:         |   `-DeclRefExpr 0x{{[0-9A-Fa-f]+}} <col:37> 'int' NonTypeTemplateParm 0x{{[0-9A-Fa-f]+}} 'N' 'int'
-// CHECK-NEXT:         `-TemplateArgument pack
+// CHECK-NEXT:         `-TemplateArgument pack '<type-parameter-0-1...>'
 // CHECK-NEXT:           `-TemplateArgument type 'type-parameter-0-1...'
 // CHECK-NEXT:             `-PackExpansionType 0x{{[0-9A-Fa-f]+}} 'type-parameter-0-1...' dependent
 // CHECK-NEXT:               `-TemplateTypeParmType 0x{{[0-9A-Fa-f]+}} 'type-parameter-0-1' dependent contains_unexpanded_pack depth 0 index 1 pack
@@ -38,7 +38,7 @@ template<int N, class ...Ts> struct A {
 //      CHECK: |-TypeAliasDecl 0x{{[0-9A-Fa-f]+}} <line:37:3, col:45> col:9 test3 '__type_pack_element<0, Ts...>':'__type_pack_element<0, type-parameter-0-1...>'
 // CHECK-NEXT:   `-ElaboratedType 0x{{[0-9A-Fa-f]+}} '__type_pack_element<0, Ts...>' sugar dependent
 // CHECK-NEXT:     `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} '__type_pack_element<0, Ts...>' sugar dependent alias __type_pack_element
-// CHECK-NEXT:       |-TemplateArgument expr
+// CHECK-NEXT:       |-TemplateArgument expr '0'
 // CHECK-NEXT:       | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} <col:37> 'unsigned long'
 // CHECK-NEXT:       |   |-value: Int 0
 // CHECK-NEXT:       |   `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:37> 'unsigned long' <IntegralCast>
@@ -48,8 +48,8 @@ template<int N, class ...Ts> struct A {
 // CHECK-NEXT:       |   `-TemplateTypeParmType 0x{{[0-9A-Fa-f]+}} 'Ts' dependent contains_unexpanded_pack depth 0 index 1 pack
 // CHECK-NEXT:       |     `-TemplateTypeParm 0x{{[0-9A-Fa-f]+}} 'Ts'
 // CHECK-NEXT:       `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} '__type_pack_element<0, type-parameter-0-1...>' dependent __type_pack_element
-// CHECK-NEXT:         |-TemplateArgument integral 0
-// CHECK-NEXT:         `-TemplateArgument pack
+// CHECK-NEXT:         |-TemplateArgument integral '0UL'
+// CHECK-NEXT:         `-TemplateArgument pack '<type-parameter-0-1...>'
 // CHECK-NEXT:           `-TemplateArgument type 'type-parameter-0-1...'
 // CHECK-NEXT:             `-PackExpansionType 0x{{[0-9A-Fa-f]+}} 'type-parameter-0-1...' dependent
 // CHECK-NEXT:               `-TemplateTypeParmType 0x{{[0-9A-Fa-f]+}} 'type-parameter-0-1' dependent contains_unexpanded_pack depth 0 index 1 pack
@@ -58,16 +58,16 @@ template<int N, class ...Ts> struct A {
 //      CHECK: `-TypeAliasDecl 0x{{[0-9A-Fa-f]+}} <line:57:3, col:43> col:9 test4 '__type_pack_element<N, int>'
 // CHECK-NEXT:   `-ElaboratedType 0x{{[0-9A-Fa-f]+}} '__type_pack_element<N, int>' sugar dependent
 // CHECK-NEXT:     `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} '__type_pack_element<N, int>' sugar dependent alias __type_pack_element
-// CHECK-NEXT:       |-TemplateArgument expr
+// CHECK-NEXT:       |-TemplateArgument expr 'N'
 // CHECK-NEXT:       | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:37> 'unsigned long' <IntegralCast>
 // CHECK-NEXT:       |   `-DeclRefExpr 0x{{[0-9A-Fa-f]+}} <col:37> 'int' NonTypeTemplateParm 0x{{[0-9A-Fa-f]+}} 'N' 'int'
 // CHECK-NEXT:       |-TemplateArgument type 'int'
 // CHECK-NEXT:       | `-BuiltinType 0x{{[0-9A-Fa-f]+}} 'int'
 // CHECK-NEXT:       `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} '__type_pack_element<N, int>' dependent __type_pack_element
-// CHECK-NEXT:         |-TemplateArgument expr
+// CHECK-NEXT:         |-TemplateArgument expr 'N'
 // CHECK-NEXT:         | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:37> 'unsigned long' <IntegralCast>
 // CHECK-NEXT:         |   `-DeclRefExpr 0x{{[0-9A-Fa-f]+}} <col:37> 'int' NonTypeTemplateParm 0x{{[0-9A-Fa-f]+}} 'N' 'int'
-// CHECK-NEXT:         `-TemplateArgument pack
+// CHECK-NEXT:         `-TemplateArgument pack '<int>'
 // CHECK-NEXT:           `-TemplateArgument type 'int'
 // CHECK-NEXT:             `-BuiltinType 0x{{[0-9A-Fa-f]+}} 'int'
 };

From d2f7a38b7db78adb5c36fff1f7e12814be1a6c30 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 29 May 2024 11:25:47 -0700
Subject: [PATCH 158/230] [X86] Fix a warning

This patch fixes:

  llvm/lib/Target/X86/X86ISelLowering.cpp:50832:7: error: unused
  variable 'MemVT' [-Werror,-Wunused-variable]
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 24340e135b08b9..ac30e8846be559 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -50829,7 +50829,6 @@ static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl,
                                         const X86Subtarget &Subtarget) {
   auto *Ld = cast<LoadSDNode>(N);
   EVT RegVT = Ld->getValueType(0);
-  EVT MemVT = Ld->getMemoryVT();
   SDValue Ptr = Ld->getBasePtr();
   SDValue Chain = Ld->getChain();
   ISD::LoadExtType Ext = Ld->getExtensionType();

From dcbd1fbdf46e74e9be80ec8e3e865b8347e8532b Mon Sep 17 00:00:00 2001
From: thetruestblue <92476612+thetruestblue@users.noreply.github.com>
Date: Wed, 29 May 2024 11:31:17 -0700
Subject: [PATCH 159/230] [TEST][Darwin] Change x86_64h UNSUPPORTED lit feature
 used in san cov test (#93706)

Fix x86_64 lit feature. x86-target-arch not set for x86_64h

x86-target-arch not set for x86_64.
---
 .../TestCases/sanitizer_coverage_trace_pc_guard.cpp             | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_trace_pc_guard.cpp b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_trace_pc_guard.cpp
index ee47a1228fcc5c..eea92d0ba9e838 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_trace_pc_guard.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_trace_pc_guard.cpp
@@ -3,7 +3,7 @@
 // REQUIRES: has_sancovcc
 // UNSUPPORTED: ubsan,i386-darwin,target={{(powerpc64|s390x|thumb).*}}
 // This test is failing for lsan on darwin on x86_64h.
-// UNSUPPORTED: darwin && x86-target-arch && lsan
+// UNSUPPORTED: x86_64h-darwin && lsan
 // XFAIL: tsan
 // XFAIL: android && asan
 

From 9fe7aef1889300a17a594efb55358ebd032a81a2 Mon Sep 17 00:00:00 2001
From: aengelke <engelke@in.tum.de>
Date: Wed, 29 May 2024 20:38:34 +0200
Subject: [PATCH 160/230] [CodeGen] Don't check attrs for stack realign
 (#92564)

shouldRealignStack/canRealignStack are repeatedly called in PEI (through
hasStackRealignment). Checking function attributes is expensive, so
cache this data in the MachineFrameInfo, which had most data already.

This slightly changes the semantics of `MachineFrameInfo::ForcedRealign`
to be also true when the `stackrealign` attribute is set.
---
 llvm/include/llvm/CodeGen/MachineFrameInfo.h | 8 ++++++++
 llvm/lib/CodeGen/MachineFunction.cpp         | 5 +++--
 llvm/lib/CodeGen/TargetRegisterInfo.cpp      | 9 ++-------
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
index a2c78e9e093d01..466fed7fb3a297 100644
--- a/llvm/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
@@ -346,6 +346,8 @@ class MachineFrameInfo {
 
   MachineFrameInfo(const MachineFrameInfo &) = delete;
 
+  bool isStackRealignable() const { return StackRealignable; }
+
   /// Return true if there are any stack objects in this function.
   bool hasStackObjects() const { return !Objects.empty(); }
 
@@ -603,6 +605,12 @@ class MachineFrameInfo {
   /// Make sure the function is at least Align bytes aligned.
   void ensureMaxAlignment(Align Alignment);
 
+  /// Return true if stack realignment is forced by function attributes or if
+  /// the maximum alignment exceeds the stack alignment.
+  bool shouldRealignStack() const {
+    return ForcedRealign || MaxAlignment > StackAlignment;
+  }
+
   /// Return true if this function adjusts the stack -- e.g.,
   /// when calling another function. This is only valid during and after
   /// prolog/epilog code insertion.
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 8366ad2859069f..4182e753541258 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -200,10 +200,11 @@ void MachineFunction::init() {
   // explicitly asked us not to.
   bool CanRealignSP = STI->getFrameLowering()->isStackRealignable() &&
                       !F.hasFnAttribute("no-realign-stack");
+  bool ForceRealignSP = F.hasFnAttribute(Attribute::StackAlignment) ||
+                        F.hasFnAttribute("stackrealign");
   FrameInfo = new (Allocator) MachineFrameInfo(
       getFnStackAlignment(STI, F), /*StackRealignable=*/CanRealignSP,
-      /*ForcedRealign=*/CanRealignSP &&
-          F.hasFnAttribute(Attribute::StackAlignment));
+      /*ForcedRealign=*/ForceRealignSP && CanRealignSP);
 
   setUnsafeStackSize(F, *FrameInfo);
 
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index 4e06393f4cc1d3..ffc8055dd27e8f 100644
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -474,16 +474,11 @@ bool TargetRegisterInfo::isCalleeSavedPhysReg(
 }
 
 bool TargetRegisterInfo::canRealignStack(const MachineFunction &MF) const {
-  return !MF.getFunction().hasFnAttribute("no-realign-stack");
+  return MF.getFrameInfo().isStackRealignable();
 }
 
 bool TargetRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
-  const MachineFrameInfo &MFI = MF.getFrameInfo();
-  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
-  const Function &F = MF.getFunction();
-  return F.hasFnAttribute("stackrealign") ||
-         (MFI.getMaxAlign() > TFI->getStackAlign()) ||
-         F.hasFnAttribute(Attribute::StackAlignment);
+  return MF.getFrameInfo().shouldRealignStack();
 }
 
 bool TargetRegisterInfo::regmaskSubsetEqual(const uint32_t *mask0,

From 472184db24f2f5f7ddca76d7aadd13a3dca05991 Mon Sep 17 00:00:00 2001
From: Keith Smiley <keithbsmiley@gmail.com>
Date: Wed, 29 May 2024 11:40:14 -0700
Subject: [PATCH 161/230] [bazel] Port #93595 (#93716)

---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 5d2248a8fe3608..e624d1fc67f446 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -6031,6 +6031,7 @@ cc_library(
         ":SPIRVDialect",
         ":Support",
         ":TransformUtils",
+        ":VectorToSPIRV",
         "//llvm:Support",
     ],
 )

From 8aceb7a53d82f9566a7cf4775effa4089b22a75b Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Wed, 29 May 2024 11:47:21 -0700
Subject: [PATCH 162/230] [ValueTypes] Remove MVT::MAX_ALLOWED_VALUETYPE. NFC
 (#93654)

Despite the comment, this isn't used to size bit vectors or tables.
That's done by VALUETYPE_SIZE. MAX_ALLOWED_VALUETYPE is only used by
some static_asserts that compare it to VALUETYPE_SIZE.

This patch removes it and most of the static_asserts. I left one where I
compared VALUETYPE_SIZE to token which is the first type that isn't part
of the VALUETYPE range. This isn't strictly needed, we'd probably catch
duplication error from VTEmitter.cpp first.
---
 llvm/include/llvm/CodeGenTypes/MachineValueType.h | 7 +------
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h     | 2 --
 llvm/lib/CodeGen/TargetLoweringBase.cpp           | 3 ---
 3 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/llvm/include/llvm/CodeGenTypes/MachineValueType.h b/llvm/include/llvm/CodeGenTypes/MachineValueType.h
index 3b2a9b535c0945..e008503f734b95 100644
--- a/llvm/include/llvm/CodeGenTypes/MachineValueType.h
+++ b/llvm/include/llvm/CodeGenTypes/MachineValueType.h
@@ -45,15 +45,10 @@ namespace llvm {
 #undef GET_VT_RANGES
 
       VALUETYPE_SIZE = LAST_VALUETYPE + 1,
-
-      // This is the current maximum for LAST_VALUETYPE.
-      // MVT::MAX_ALLOWED_VALUETYPE is used for asserts and to size bit vectors
-      // This value must be a multiple of 32.
-      MAX_ALLOWED_VALUETYPE = 224,
     };
 
     static_assert(FIRST_VALUETYPE > 0);
-    static_assert(LAST_VALUETYPE < MAX_ALLOWED_VALUETYPE);
+    static_assert(LAST_VALUETYPE < token);
 
     SimpleValueType SimpleTy = INVALID_SIMPLE_VALUE_TYPE;
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index ba3c7582d5a8a2..bec9cb49b58649 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -168,8 +168,6 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   explicit DAGTypeLegalizer(SelectionDAG &dag)
     : TLI(dag.getTargetLoweringInfo()), DAG(dag),
     ValueTypeActions(TLI.getValueTypeActions()) {
-    static_assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE,
-                  "Too many value types for ValueTypeActions to hold!");
   }
 
   /// This is the main entry point for the type legalizer.  This does a
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 82a59918b085b3..f2e4632b248f4b 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1430,9 +1430,6 @@ TargetLoweringBase::findRepresentativeClass(const TargetRegisterInfo *TRI,
 /// this allows us to compute derived properties we expose.
 void TargetLoweringBase::computeRegisterProperties(
     const TargetRegisterInfo *TRI) {
-  static_assert(MVT::VALUETYPE_SIZE <= MVT::MAX_ALLOWED_VALUETYPE,
-                "Too many value types for ValueTypeActions to hold!");
-
   // Everything defaults to needing one register.
   for (unsigned i = 0; i != MVT::VALUETYPE_SIZE; ++i) {
     NumRegistersForVT[i] = 1;

From b3bbb2de6fab74b714f38c0bf0822e1634b0d158 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Wed, 29 May 2024 11:53:01 -0700
Subject: [PATCH 163/230] [RISCV] Verify the VL and Mask on the outer
 TRUNCATE_VECTOR_VL in combineTruncOfSraSext. (#93578)

We checked the VL and mask of any additional TRUNCATE_VECTOR_VL
nodes we peek through, but not the outermost.

This moves the check to the outer node and then verifies all the
additional nodes have the same VL and Mask.

Stacked on #93574
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 35 +++++++++++----------
 llvm/test/CodeGen/RISCV/rvv/vsra-sdnode.ll  | 14 ++++++---
 2 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e99c6208594e3b..f4da46f82a8108 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -16128,23 +16128,26 @@ static bool matchIndexAsWiderOp(EVT VT, SDValue Index, SDValue Mask,
   return true;
 }
 
+// trunc (sra sext (X), zext (Y)) -> sra (X, smin (Y, scalarsize(Y) - 1))
+// This would be beneficial for the cases where X and Y are both the same value
+// type of low precision vectors. Since the truncate would be lowered into
+// n-levels TRUNCATE_VECTOR_VL to satisfy RVV's SEW*2->SEW truncate
+// restriction, such pattern would be expanded into a series of "vsetvli"
+// and "vnsrl" instructions later to reach this point.
 static SDValue combineTruncOfSraSext(SDNode *N, SelectionDAG &DAG) {
-  // trunc (sra sext (X), zext (Y)) -> sra (X, smin (Y, scalarsize(Y) - 1))
-  // This would be benefit for the cases where X and Y are both the same value
-  // type of low precision vectors. Since the truncate would be lowered into
-  // n-levels TRUNCATE_VECTOR_VL to satisfy RVV's SEW*2->SEW truncate
-  // restriction, such pattern would be expanded into a series of "vsetvli"
-  // and "vnsrl" instructions later to reach this point.
-  auto IsTruncNode = [](SDValue V) {
-    if (V.getOpcode() != RISCVISD::TRUNCATE_VECTOR_VL)
-      return false;
-    SDValue VL = V.getOperand(2);
-    auto *C = dyn_cast<ConstantSDNode>(VL);
-    // Assume all TRUNCATE_VECTOR_VL nodes use VLMAX for VMSET_VL operand
-    bool IsVLMAXForVMSET = (C && C->isAllOnes()) ||
-                           (isa<RegisterSDNode>(VL) &&
-                            cast<RegisterSDNode>(VL)->getReg() == RISCV::X0);
-    return V.getOperand(1).getOpcode() == RISCVISD::VMSET_VL && IsVLMAXForVMSET;
+  SDValue Mask = N->getOperand(1);
+  SDValue VL = N->getOperand(2);
+
+  bool IsVLMAX = isAllOnesConstant(VL) ||
+                 (isa<RegisterSDNode>(VL) &&
+                  cast<RegisterSDNode>(VL)->getReg() == RISCV::X0);
+  if (!IsVLMAX || Mask.getOpcode() != RISCVISD::VMSET_VL ||
+      Mask.getOperand(0) != VL)
+    return SDValue();
+
+  auto IsTruncNode = [&](SDValue V) {
+    return V.getOpcode() == RISCVISD::TRUNCATE_VECTOR_VL &&
+           V.getOperand(1) == Mask && V.getOperand(2) == VL;
   };
 
   SDValue Op = N->getOperand(0);
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsra-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vsra-sdnode.ll
index 8dbb57fd15cf16..382c8297473b78 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsra-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsra-sdnode.ll
@@ -937,13 +937,17 @@ define <vscale x 8 x i32> @vsra_vi_mask_nxv8i32(<vscale x 8 x i32> %va, <vscale
 
 ; Negative test. We shouldn't look through the vp.trunc as it isn't vlmax like
 ; the rest of the code.
-define <vscale x 1 x i8> @vsra_vv_nxv1i8_sext_zext_mixed_trunc(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl) {
+define <vscale x 1 x i8> @vsra_vv_nxv1i8_sext_zext_mixed_trunc(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vsra_vv_nxv1i8_sext_zext_mixed_trunc:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 7
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vmin.vx v9, v8, a0
-; CHECK-NEXT:    vsra.vv v8, v8, v9
+; CHECK-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsext.vf4 v9, v8
+; CHECK-NEXT:    vzext.vf4 v10, v8
+; CHECK-NEXT:    vsra.vv v8, v9, v10
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
 ; CHECK-NEXT:    ret
   %sexted_va = sext <vscale x 1 x i8> %va to <vscale x 1 x i32>
   %zexted_vb = zext <vscale x 1 x i8> %va to <vscale x 1 x i32>

From 99b9ab45cd67648a7b6c2ba02041072fe4de346b Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 29 May 2024 12:18:24 -0700
Subject: [PATCH 164/230] [memprof] Reorder MemProf sections in profile
 (#93640)

This patch teaches the V3 format to serialize Frames, call stacks, and
IndexedMemProfRecords, in that order.

I'm planning to use linear IDs for Frames.  That is, Frames will be
numbered 0, 1, 2, and so on in the order we serialize them.  In turn,
we will serialize the call stacks in terms of those linear IDs.

Likewise, I'm planning to use linear IDs for call stacks and then
serialize IndexedMemProfRecords in terms of those linear IDs for call
stacks.

With the new order, we can successively free data structures as we
serialize them.  That is, once we serialize Frames, we can free the
Frames' data proper and just retain mappings from FrameIds to linear
IDs.  A similar story applies to call stacks.
---
 .../llvm/ProfileData/InstrProfReader.h        |   5 +
 llvm/lib/ProfileData/InstrProfReader.cpp      | 124 ++++++++++++++----
 llvm/lib/ProfileData/InstrProfWriter.cpp      |  22 ++--
 3 files changed, 111 insertions(+), 40 deletions(-)

diff --git a/llvm/include/llvm/ProfileData/InstrProfReader.h b/llvm/include/llvm/ProfileData/InstrProfReader.h
index 46aa1b6c2bfe7c..8d475fb0486248 100644
--- a/llvm/include/llvm/ProfileData/InstrProfReader.h
+++ b/llvm/include/llvm/ProfileData/InstrProfReader.h
@@ -660,6 +660,11 @@ class IndexedMemProfReader {
   /// MemProf call stack data on-disk indexed via call stack id.
   std::unique_ptr<MemProfCallStackHashTable> MemProfCallStackTable;
 
+  Error deserializeV012(const unsigned char *Start, const unsigned char *Ptr,
+                        uint64_t FirstWord, memprof::IndexedVersion Version);
+  Error deserializeV3(const unsigned char *Start, const unsigned char *Ptr,
+                      memprof::IndexedVersion Version);
+
 public:
   IndexedMemProfReader() = default;
 
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index a5ae0c6fa62444..1b36ca1a733a2c 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -1202,35 +1202,10 @@ IndexedInstrProfReader::readSummary(IndexedInstrProf::ProfVersion Version,
   }
 }
 
-Error IndexedMemProfReader::deserialize(const unsigned char *Start,
-                                        uint64_t MemProfOffset) {
-  const unsigned char *Ptr = Start + MemProfOffset;
-
-  // Read the first 64-bit word, which may be RecordTableOffset in
-  // memprof::MemProfVersion0 or the MemProf version number in
-  // memprof::MemProfVersion1 and above.
-  const uint64_t FirstWord =
-      support::endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
-
-  if (FirstWord == memprof::Version1 || FirstWord == memprof::Version2 ||
-      FirstWord == memprof::Version3) {
-    // Everything is good.  We can proceed to deserialize the rest.
-    Version = static_cast<memprof::IndexedVersion>(FirstWord);
-  } else if (FirstWord >= 24) {
-    // This is a heuristic/hack to detect memprof::MemProfVersion0,
-    // which does not have a version field in the header.
-    // In memprof::MemProfVersion0, FirstWord will be RecordTableOffset,
-    // which should be at least 24 because of the MemProf header size.
-    Version = memprof::Version0;
-  } else {
-    return make_error<InstrProfError>(
-        instrprof_error::unsupported_version,
-        formatv("MemProf version {} not supported; "
-                "requires version between {} and {}, inclusive",
-                FirstWord, memprof::MinimumSupportedVersion,
-                memprof::MaximumSupportedVersion));
-  }
-
+Error IndexedMemProfReader::deserializeV012(const unsigned char *Start,
+                                            const unsigned char *Ptr,
+                                            uint64_t FirstWord,
+                                            memprof::IndexedVersion Version) {
   // The value returned from RecordTableGenerator.Emit.
   const uint64_t RecordTableOffset =
       Version == memprof::Version0
@@ -1280,6 +1255,97 @@ Error IndexedMemProfReader::deserialize(const unsigned char *Start,
         /*Payload=*/Start + CallStackPayloadOffset,
         /*Base=*/Start));
 
+  return Error::success();
+}
+
+Error IndexedMemProfReader::deserializeV3(const unsigned char *Start,
+                                          const unsigned char *Ptr,
+                                          memprof::IndexedVersion Version) {
+  // The value returned from FrameTableGenerator.Emit.
+  const uint64_t FrameTableOffset =
+      support::endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
+  // The offset in the stream right before invoking
+  // CallStackTableGenerator.Emit.
+  const uint64_t CallStackPayloadOffset =
+      support::endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
+  // The value returned from CallStackTableGenerator.Emit.
+  const uint64_t CallStackTableOffset =
+      support::endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
+  // The offset in the stream right before invoking RecordTableGenerator.Emit.
+  const uint64_t RecordPayloadOffset =
+      support::endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
+  // The value returned from RecordTableGenerator.Emit.
+  const uint64_t RecordTableOffset =
+      support::endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
+
+  // Read the schema.
+  auto SchemaOr = memprof::readMemProfSchema(Ptr);
+  if (!SchemaOr)
+    return SchemaOr.takeError();
+  Schema = SchemaOr.get();
+
+  // Initialize the frame table reader with the payload and bucket offsets.
+  MemProfFrameTable.reset(MemProfFrameHashTable::Create(
+      /*Buckets=*/Start + FrameTableOffset,
+      /*Payload=*/Ptr,
+      /*Base=*/Start));
+
+  MemProfCallStackTable.reset(MemProfCallStackHashTable::Create(
+      /*Buckets=*/Start + CallStackTableOffset,
+      /*Payload=*/Start + CallStackPayloadOffset,
+      /*Base=*/Start));
+
+  // Now initialize the table reader with a pointer into data buffer.
+  MemProfRecordTable.reset(MemProfRecordHashTable::Create(
+      /*Buckets=*/Start + RecordTableOffset,
+      /*Payload=*/Start + RecordPayloadOffset,
+      /*Base=*/Start, memprof::RecordLookupTrait(Version, Schema)));
+
+  return Error::success();
+}
+
+Error IndexedMemProfReader::deserialize(const unsigned char *Start,
+                                        uint64_t MemProfOffset) {
+  const unsigned char *Ptr = Start + MemProfOffset;
+
+  // Read the first 64-bit word, which may be RecordTableOffset in
+  // memprof::MemProfVersion0 or the MemProf version number in
+  // memprof::MemProfVersion1 and above.
+  const uint64_t FirstWord =
+      support::endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
+
+  if (FirstWord == memprof::Version1 || FirstWord == memprof::Version2 ||
+      FirstWord == memprof::Version3) {
+    // Everything is good.  We can proceed to deserialize the rest.
+    Version = static_cast<memprof::IndexedVersion>(FirstWord);
+  } else if (FirstWord >= 24) {
+    // This is a heuristic/hack to detect memprof::MemProfVersion0,
+    // which does not have a version field in the header.
+    // In memprof::MemProfVersion0, FirstWord will be RecordTableOffset,
+    // which should be at least 24 because of the MemProf header size.
+    Version = memprof::Version0;
+  } else {
+    return make_error<InstrProfError>(
+        instrprof_error::unsupported_version,
+        formatv("MemProf version {} not supported; "
+                "requires version between {} and {}, inclusive",
+                FirstWord, memprof::MinimumSupportedVersion,
+                memprof::MaximumSupportedVersion));
+  }
+
+  switch (Version) {
+  case memprof::Version0:
+  case memprof::Version1:
+  case memprof::Version2:
+    if (Error E = deserializeV012(Start, Ptr, FirstWord, Version))
+      return E;
+    break;
+  case memprof::Version3:
+    if (Error E = deserializeV3(Start, Ptr, Version))
+      return E;
+    break;
+  }
+
 #ifdef EXPENSIVE_CHECKS
   // Go through all the records and verify that CSId has been correctly
   // populated.  Do this only under EXPENSIVE_CHECKS.  Otherwise, we
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index e732882337d468..7e0c9a159d9328 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -619,48 +619,48 @@ static Error writeMemProfV2(ProfOStream &OS,
 
 // Write out MemProf Version3 as follows:
 // uint64_t Version
-// uint64_t RecordTableOffset = RecordTableGenerator.Emit
-// uint64_t FramePayloadOffset = Offset for the frame payload
 // uint64_t FrameTableOffset = FrameTableGenerator.Emit
 // uint64_t CallStackPayloadOffset = Offset for the call stack payload
 // uint64_t CallStackTableOffset = CallStackTableGenerator.Emit
+// uint64_t RecordPayloadOffset = Offset for the record payload
+// uint64_t RecordTableOffset = RecordTableGenerator.Emit
 // uint64_t Num schema entries
 // uint64_t Schema entry 0
 // uint64_t Schema entry 1
 // ....
 // uint64_t Schema entry N - 1
-// OnDiskChainedHashTable MemProfRecordData
 // OnDiskChainedHashTable MemProfFrameData
 // OnDiskChainedHashTable MemProfCallStackData
+// OnDiskChainedHashTable MemProfRecordData
 static Error writeMemProfV3(ProfOStream &OS,
                             memprof::IndexedMemProfData &MemProfData,
                             bool MemProfFullSchema) {
   OS.write(memprof::Version3);
   uint64_t HeaderUpdatePos = OS.tell();
-  OS.write(0ULL); // Reserve space for the memprof record table offset.
-  OS.write(0ULL); // Reserve space for the memprof frame payload offset.
   OS.write(0ULL); // Reserve space for the memprof frame table offset.
   OS.write(0ULL); // Reserve space for the memprof call stack payload offset.
   OS.write(0ULL); // Reserve space for the memprof call stack table offset.
+  OS.write(0ULL); // Reserve space for the memprof record payload offset.
+  OS.write(0ULL); // Reserve space for the memprof record table offset.
 
   auto Schema = memprof::getHotColdSchema();
   if (MemProfFullSchema)
     Schema = memprof::getFullSchema();
   writeMemProfSchema(OS, Schema);
 
-  uint64_t RecordTableOffset = writeMemProfRecords(OS, MemProfData.RecordData,
-                                                   &Schema, memprof::Version3);
-
-  uint64_t FramePayloadOffset = OS.tell();
   uint64_t FrameTableOffset = writeMemProfFrames(OS, MemProfData.FrameData);
 
   uint64_t CallStackPayloadOffset = OS.tell();
   uint64_t CallStackTableOffset =
       writeMemProfCallStacks(OS, MemProfData.CallStackData);
 
+  uint64_t RecordPayloadOffset = OS.tell();
+  uint64_t RecordTableOffset = writeMemProfRecords(OS, MemProfData.RecordData,
+                                                   &Schema, memprof::Version3);
+
   uint64_t Header[] = {
-      RecordTableOffset,      FramePayloadOffset,   FrameTableOffset,
-      CallStackPayloadOffset, CallStackTableOffset,
+      FrameTableOffset,    CallStackPayloadOffset, CallStackTableOffset,
+      RecordPayloadOffset, RecordTableOffset,
   };
   OS.patch({{HeaderUpdatePos, Header, std::size(Header)}});
 

From 7348bb23abdf59e503c815037ce9835f5dd15df2 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <1802579+farzonl@users.noreply.github.com>
Date: Wed, 29 May 2024 15:32:38 -0400
Subject: [PATCH 165/230] Revert "[clang] Add tanf16 builtin and support for
 tan constrained intrinsic (#93314)" (#93721)

This reverts commit b15a0a37404f36bcd9c7995de8cd16f9cb5ac8af.

This should undo PR https://github.com/llvm/llvm-project/pull/93314.
We will need to re-open https://github.com/llvm/llvm-project/issues/91421
and wait for https://github.com/llvm/llvm-project/pull/90503 to land.
---
 clang/include/clang/Basic/Builtins.td         |  6 ++--
 clang/lib/CodeGen/CGBuiltin.cpp               | 12 -------
 clang/test/CodeGen/X86/math-builtins.c        |  8 ++---
 .../test/CodeGen/constrained-math-builtins.c  | 13 -------
 clang/test/CodeGen/math-libcalls.c            | 12 +++----
 clang/test/CodeGenOpenCL/builtins-f16.cl      |  3 --
 llvm/docs/LangRef.rst                         | 36 -------------------
 llvm/include/llvm/CodeGen/ISDOpcodes.h        |  2 --
 llvm/include/llvm/IR/ConstrainedOps.def       |  1 -
 llvm/include/llvm/IR/Intrinsics.td            |  4 ---
 llvm/test/Assembler/fp-intrinsics-attr.ll     |  8 -----
 llvm/test/Feature/fp-intrinsics.ll            | 11 ------
 12 files changed, 13 insertions(+), 103 deletions(-)

diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 7bef5fd7ad40f2..11982af3fa609b 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -482,11 +482,11 @@ def SqrtF16F128 : Builtin, F16F128MathTemplate {
   let Prototype = "T(T)";
 }
 
-def TanF16F128 : Builtin, F16F128MathTemplate {
-  let Spellings = ["__builtin_tan"];
+def TanF128 : Builtin {
+  let Spellings = ["__builtin_tanf128"];
   let Attributes = [FunctionWithBuiltinPrefix, NoThrow,
                     ConstIgnoringErrnoAndExceptions];
-  let Prototype = "T(T)";
+  let Prototype = "__float128(__float128)";
 }
 
 def TanhF128 : Builtin {
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 94a7036f6233cc..266bf41fd5577c 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -2923,18 +2923,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
       SetSqrtFPAccuracy(Call);
       return RValue::get(Call);
     }
-
-    case Builtin::BItan:
-    case Builtin::BItanf:
-    case Builtin::BItanl:
-    case Builtin::BI__builtin_tan:
-    case Builtin::BI__builtin_tanf:
-    case Builtin::BI__builtin_tanf16:
-    case Builtin::BI__builtin_tanl:
-    case Builtin::BI__builtin_tanf128:
-      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
-          *this, E, Intrinsic::tan, Intrinsic::experimental_constrained_tan));
-
     case Builtin::BItrunc:
     case Builtin::BItruncf:
     case Builtin::BItruncl:
diff --git a/clang/test/CodeGen/X86/math-builtins.c b/clang/test/CodeGen/X86/math-builtins.c
index 1e0f129b986102..093239b4482609 100644
--- a/clang/test/CodeGen/X86/math-builtins.c
+++ b/clang/test/CodeGen/X86/math-builtins.c
@@ -674,10 +674,10 @@ __builtin_sqrt(f);       __builtin_sqrtf(f);      __builtin_sqrtl(f); __builtin_
 
 __builtin_tan(f);        __builtin_tanf(f);       __builtin_tanl(f); __builtin_tanf128(f);
 
-// NO__ERRNO: declare double @llvm.tan.f64(double) [[READNONE_INTRINSIC]]
-// NO__ERRNO: declare float @llvm.tan.f32(float) [[READNONE_INTRINSIC]]
-// NO__ERRNO: declare x86_fp80 @llvm.tan.f80(x86_fp80) [[READNONE_INTRINSIC]]
-// NO__ERRNO: declare fp128 @llvm.tan.f128(fp128) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare double @tan(double noundef) [[READNONE]]
+// NO__ERRNO: declare float @tanf(float noundef) [[READNONE]]
+// NO__ERRNO: declare x86_fp80 @tanl(x86_fp80 noundef) [[READNONE]]
+// NO__ERRNO: declare fp128 @tanf128(fp128 noundef) [[READNONE]]
 // HAS_ERRNO: declare double @tan(double noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare float @tanf(float noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @tanl(x86_fp80 noundef) [[NOT_READNONE]]
diff --git a/clang/test/CodeGen/constrained-math-builtins.c b/clang/test/CodeGen/constrained-math-builtins.c
index 6cc3a10a1e7946..2de832dd2b6cae 100644
--- a/clang/test/CodeGen/constrained-math-builtins.c
+++ b/clang/test/CodeGen/constrained-math-builtins.c
@@ -183,14 +183,6 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c, _
 // CHECK: call x86_fp80 @llvm.experimental.constrained.sqrt.f80(x86_fp80 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
 // CHECK: call fp128 @llvm.experimental.constrained.sqrt.f128(fp128 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
 
-  __builtin_tan(f);        __builtin_tanf(f);       __builtin_tanl(f); __builtin_tanf128(f);
-
-// CHECK: call double @llvm.experimental.constrained.tan.f64(double %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK: call float @llvm.experimental.constrained.tan.f32(float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK: call x86_fp80 @llvm.experimental.constrained.tan.f80(x86_fp80 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK: call fp128 @llvm.experimental.constrained.tan.f128(fp128 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
-
-
   __builtin_trunc(f);      __builtin_truncf(f);     __builtin_truncl(f); __builtin_truncf128(f);
 
 // CHECK: call double @llvm.experimental.constrained.trunc.f64(double %{{.*}}, metadata !"fpexcept.strict")
@@ -323,11 +315,6 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c, _
 // CHECK: declare x86_fp80 @llvm.experimental.constrained.sqrt.f80(x86_fp80, metadata, metadata)
 // CHECK: declare fp128 @llvm.experimental.constrained.sqrt.f128(fp128, metadata, metadata)
 
-// CHECK: declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata)
-// CHECK: declare float @llvm.experimental.constrained.tan.f32(float, metadata, metadata)
-// CHECK: declare x86_fp80 @llvm.experimental.constrained.tan.f80(x86_fp80, metadata, metadata)
-// CHECK: declare fp128 @llvm.experimental.constrained.tan.f128(fp128, metadata, metadata)
-
 // CHECK: declare double @llvm.experimental.constrained.trunc.f64(double, metadata)
 // CHECK: declare float @llvm.experimental.constrained.trunc.f32(float, metadata)
 // CHECK: declare x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80, metadata)
diff --git a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c
index a249182692762d..29c312ba0ecac2 100644
--- a/clang/test/CodeGen/math-libcalls.c
+++ b/clang/test/CodeGen/math-libcalls.c
@@ -662,15 +662,15 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
 
   tan(f);        tanf(f);       tanl(f);
 
-// NO__ERRNO: declare double @llvm.tan.f64(double) [[READNONE_INTRINSIC]]
-// NO__ERRNO: declare float @llvm.tan.f32(float) [[READNONE_INTRINSIC]]
-// NO__ERRNO: declare x86_fp80 @llvm.tan.f80(x86_fp80) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare double @tan(double noundef) [[READNONE]]
+// NO__ERRNO: declare float @tanf(float noundef) [[READNONE]]
+// NO__ERRNO: declare x86_fp80 @tanl(x86_fp80 noundef) [[READNONE]]
 // HAS_ERRNO: declare double @tan(double noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare float @tanf(float noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @tanl(x86_fp80 noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare double @llvm.experimental.constrained.tan.f64(
-// HAS_MAYTRAP: declare float @llvm.experimental.constrained.tan.f32(
-// HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.tan.f80(
+// HAS_MAYTRAP: declare double @tan(double noundef) [[NOT_READNONE]]
+// HAS_MAYTRAP: declare float @tanf(float noundef) [[NOT_READNONE]]
+// HAS_MAYTRAP: declare x86_fp80 @tanl(x86_fp80 noundef) [[NOT_READNONE]]
 
   tanh(f);       tanhf(f);      tanhl(f);
 
diff --git a/clang/test/CodeGenOpenCL/builtins-f16.cl b/clang/test/CodeGenOpenCL/builtins-f16.cl
index d7bffdad5c548f..adf7cdde154f51 100644
--- a/clang/test/CodeGenOpenCL/builtins-f16.cl
+++ b/clang/test/CodeGenOpenCL/builtins-f16.cl
@@ -66,9 +66,6 @@ void test_half_builtins(half h0, half h1, half h2, int i0) {
   // CHECK: call half @llvm.sqrt.f16(half %h0)
   res = __builtin_sqrtf16(h0);
 
-  // CHECK: call half @llvm.tan.f16(half %h0)
-  res = __builtin_tanf16(h0);
-
   // CHECK: call half @llvm.trunc.f16(half %h0)
   res = __builtin_truncf16(h0);
 
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index a650692d44d76e..7b64c477d13c7f 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -26229,42 +26229,6 @@ same values as the libm ``cos`` functions would, and handles error
 conditions in the same way.
 
 
-'``llvm.experimental.constrained.tan``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Syntax:
-"""""""
-
-::
-
-      declare <type>
-      @llvm.experimental.constrained.tan(<type> <op1>,
-                                         metadata <rounding mode>,
-                                         metadata <exception behavior>)
-
-Overview:
-"""""""""
-
-The '``llvm.experimental.constrained.tan``' intrinsic returns the tangent of the
-first operand.
-
-Arguments:
-""""""""""
-
-The first argument and the return type are floating-point numbers of the same
-type.
-
-The second and third arguments specify the rounding mode and exception
-behavior as described above.
-
-Semantics:
-""""""""""
-
-This function returns the tangent of the specified operand, returning the
-same values as the libm ``tan`` functions would, and handles error
-conditions in the same way.
-
-
 '``llvm.experimental.constrained.exp``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 22062f0efbbda1..d8af97957e48ec 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -415,7 +415,6 @@ enum NodeType {
   STRICT_FLDEXP,
   STRICT_FSIN,
   STRICT_FCOS,
-  STRICT_FTAN,
   STRICT_FEXP,
   STRICT_FEXP2,
   STRICT_FLOG,
@@ -935,7 +934,6 @@ enum NodeType {
   FCBRT,
   FSIN,
   FCOS,
-  FTAN,
   FPOW,
   FPOWI,
   /// FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
diff --git a/llvm/include/llvm/IR/ConstrainedOps.def b/llvm/include/llvm/IR/ConstrainedOps.def
index a7b37c5cb204da..41aa44de957f93 100644
--- a/llvm/include/llvm/IR/ConstrainedOps.def
+++ b/llvm/include/llvm/IR/ConstrainedOps.def
@@ -95,7 +95,6 @@ DAG_FUNCTION(round,           1, 0, experimental_constrained_round,      FROUND)
 DAG_FUNCTION(roundeven,       1, 0, experimental_constrained_roundeven,  FROUNDEVEN)
 DAG_FUNCTION(sin,             1, 1, experimental_constrained_sin,        FSIN)
 DAG_FUNCTION(sqrt,            1, 1, experimental_constrained_sqrt,       FSQRT)
-DAG_FUNCTION(tan,             1, 1, experimental_constrained_tan,        FTAN)
 DAG_FUNCTION(trunc,           1, 0, experimental_constrained_trunc,      FTRUNC)
 
 // This is definition for fmuladd intrinsic function, that is converted into
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 4c506a6ace23ea..107442623ab7bd 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1218,10 +1218,6 @@ let IntrProperties = [IntrInaccessibleMemOnly, IntrWillReturn, IntrStrictFP] in
                                                     [ LLVMMatchType<0>,
                                                       llvm_metadata_ty,
                                                       llvm_metadata_ty ]>;
-  def int_experimental_constrained_tan  : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ],
-                                                    [ LLVMMatchType<0>,
-                                                      llvm_metadata_ty,
-                                                      llvm_metadata_ty ]>;
   def int_experimental_constrained_pow  : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ],
                                                     [ LLVMMatchType<0>,
                                                       LLVMMatchType<0>,
diff --git a/llvm/test/Assembler/fp-intrinsics-attr.ll b/llvm/test/Assembler/fp-intrinsics-attr.ll
index 613630e1a2b4d2..6546d1a275c99f 100644
--- a/llvm/test/Assembler/fp-intrinsics-attr.ll
+++ b/llvm/test/Assembler/fp-intrinsics-attr.ll
@@ -85,11 +85,6 @@ define void @func(double %a, double %b, double %c, i32 %i) strictfp {
                                                metadata !"round.dynamic",
                                                metadata !"fpexcept.strict")
 
-  %tan = call double @llvm.experimental.constrained.tan.f64(
-                                               double %a,
-                                               metadata !"round.dynamic",
-                                               metadata !"fpexcept.strict")
-
   %pow = call double @llvm.experimental.constrained.pow.f64(
                                                double %a, double %b,
                                                metadata !"round.dynamic",
@@ -249,9 +244,6 @@ declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata
 declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata)
 ; CHECK: @llvm.experimental.constrained.cos.f64({{.*}}) #[[ATTR1]]
 
-declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata)
-; CHECK: @llvm.experimental.constrained.tan.f64({{.*}}) #[[ATTR1]]
-
 declare double @llvm.experimental.constrained.pow.f64(double, double, metadata, metadata)
 ; CHECK: @llvm.experimental.constrained.pow.f64({{.*}}) #[[ATTR1]]
 
diff --git a/llvm/test/Feature/fp-intrinsics.ll b/llvm/test/Feature/fp-intrinsics.ll
index 7759813dc2e114..b92408a1bf1cd5 100644
--- a/llvm/test/Feature/fp-intrinsics.ll
+++ b/llvm/test/Feature/fp-intrinsics.ll
@@ -151,17 +151,6 @@ entry:
   ret double %result
 }
 
-; Verify that tan(42.0) isn't simplified when the rounding mode is unknown.
-; CHECK-LABEL: ftan
-; CHECK: call double @llvm.experimental.constrained.tan
-define double @ftan() #0 {
-entry:
-  %result = call double @llvm.experimental.constrained.tan.f64(double 42.0,
-                                               metadata !"round.dynamic",
-                                               metadata !"fpexcept.strict") #0
-  ret double %result
-}
-
 ; Verify that exp(42.0) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f10
 ; CHECK: call double @llvm.experimental.constrained.exp

From 1cff74130f30aaf47a995c5f6b637a04eaab2617 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Wed, 29 May 2024 20:59:34 +0100
Subject: [PATCH 166/230] [RISCV] Merge RISCVCoalesceVSETVLI back into
 RISCVInsertVSETVLI (#92869)

We no longer need to separate the passes now that #70549 is landed and
this will unblock #89089.

It's not strictly NFC because it will move coalescing before register
allocation when -riscv-vsetvl-after-rvv-regalloc is disabled. But this
makes it closer to the original behaviour.
---
 llvm/lib/Target/RISCV/RISCV.h                 |  3 -
 llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp  | 83 ++++---------------
 llvm/lib/Target/RISCV/RISCVTargetMachine.cpp  |  3 -
 llvm/test/CodeGen/RISCV/O0-pipeline.ll        |  1 -
 llvm/test/CodeGen/RISCV/O3-pipeline.ll        |  1 -
 .../CodeGen/RISCV/rvv/coalesce-vsetvli.mir    | 66 ---------------
 .../test/CodeGen/RISCV/rvv/vsetvli-insert.mir | 79 +++++++++++++++++-
 7 files changed, 92 insertions(+), 144 deletions(-)
 delete mode 100644 llvm/test/CodeGen/RISCV/rvv/coalesce-vsetvli.mir

diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
index 2b8688c5de61f7..dcf4c65c44dff5 100644
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -62,9 +62,6 @@ FunctionPass *createRISCVInsertVSETVLIPass();
 void initializeRISCVInsertVSETVLIPass(PassRegistry &);
 extern char &RISCVInsertVSETVLIID;
 
-FunctionPass *createRISCVCoalesceVSETVLIPass();
-void initializeRISCVCoalesceVSETVLIPass(PassRegistry &);
-
 FunctionPass *createRISCVPostRAExpandPseudoPass();
 void initializeRISCVPostRAExpandPseudoPass(PassRegistry &);
 FunctionPass *createRISCVInsertReadWriteCSRPass();
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 2c0a807e446856..1a4f34b2d2215a 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -378,10 +378,10 @@ static bool areCompatibleVTYPEs(uint64_t CurVType, uint64_t NewVType,
 
 /// Return the fields and properties demanded by the provided instruction.
 DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) {
-  // This function works in RISCVCoalesceVSETVLI too. We can still use the value
-  // of a SEW, VL, or Policy operand even though it might not be the exact value
-  // in the VL or VTYPE, since we only care about what the instruction
-  // originally demanded.
+  // This function works in coalesceVSETVLI too. We can still use the value of a
+  // SEW, VL, or Policy operand even though it might not be the exact value in
+  // the VL or VTYPE, since we only care about what the instruction originally
+  // demanded.
 
   // Most instructions don't use any of these subfeilds.
   DemandedFields Res;
@@ -900,36 +900,7 @@ class RISCVInsertVSETVLI : public MachineFunctionPass {
   void emitVSETVLIs(MachineBasicBlock &MBB);
   void doPRE(MachineBasicBlock &MBB);
   void insertReadVL(MachineBasicBlock &MBB);
-};
-
-class RISCVCoalesceVSETVLI : public MachineFunctionPass {
-public:
-  static char ID;
-  const RISCVSubtarget *ST;
-  const TargetInstrInfo *TII;
-  MachineRegisterInfo *MRI;
-  LiveIntervals *LIS;
-
-  RISCVCoalesceVSETVLI() : MachineFunctionPass(ID) {}
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesCFG();
-
-    AU.addRequired<LiveIntervals>();
-    AU.addPreserved<LiveIntervals>();
-    AU.addRequired<SlotIndexes>();
-    AU.addPreserved<SlotIndexes>();
-    AU.addPreserved<LiveDebugVariables>();
-    AU.addPreserved<LiveStacks>();
-
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-
-  StringRef getPassName() const override { return RISCV_COALESCE_VSETVLI_NAME; }
-
-private:
-  bool coalesceVSETVLIs(MachineBasicBlock &MBB);
+  void coalesceVSETVLIs(MachineBasicBlock &MBB) const;
 };
 
 } // end anonymous namespace
@@ -940,11 +911,6 @@ char &llvm::RISCVInsertVSETVLIID = RISCVInsertVSETVLI::ID;
 INITIALIZE_PASS(RISCVInsertVSETVLI, DEBUG_TYPE, RISCV_INSERT_VSETVLI_NAME,
                 false, false)
 
-char RISCVCoalesceVSETVLI::ID = 0;
-
-INITIALIZE_PASS(RISCVCoalesceVSETVLI, "riscv-coalesce-vsetvli",
-                RISCV_COALESCE_VSETVLI_NAME, false, false)
-
 // Return a VSETVLIInfo representing the changes made by this VSETVLI or
 // VSETIVLI instruction.
 static VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI,
@@ -1650,7 +1616,7 @@ static bool canMutatePriorConfig(const MachineInstr &PrevMI,
   return areCompatibleVTYPEs(PriorVType, VType, Used);
 }
 
-bool RISCVCoalesceVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) {
+void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const {
   MachineInstr *NextMI = nullptr;
   // We can have arbitrary code in successors, so VL and VTYPE
   // must be considered demanded.
@@ -1742,8 +1708,6 @@ bool RISCVCoalesceVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) {
     LIS->RemoveMachineInstrFromMaps(*MI);
     MI->eraseFromParent();
   }
-
-  return !ToDelete.empty();
 }
 
 void RISCVInsertVSETVLI::insertReadVL(MachineBasicBlock &MBB) {
@@ -1833,6 +1797,15 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
   for (MachineBasicBlock &MBB : MF)
     insertReadVL(MBB);
 
+  // Now that all vsetvlis are explicit, go through and do block local
+  // DSE and peephole based demanded fields based transforms.  Note that
+  // this *must* be done outside the main dataflow so long as we allow
+  // any cross block analysis within the dataflow.  We can't have both
+  // demanded fields based mutation and non-local analysis in the
+  // dataflow at the same time without introducing inconsistencies.
+  for (MachineBasicBlock &MBB : MF)
+    coalesceVSETVLIs(MBB);
+
   BlockInfo.clear();
   return HaveVectorOp;
 }
@@ -1841,29 +1814,3 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
 FunctionPass *llvm::createRISCVInsertVSETVLIPass() {
   return new RISCVInsertVSETVLI();
 }
-
-// Now that all vsetvlis are explicit, go through and do block local
-// DSE and peephole based demanded fields based transforms.  Note that
-// this *must* be done outside the main dataflow so long as we allow
-// any cross block analysis within the dataflow.  We can't have both
-// demanded fields based mutation and non-local analysis in the
-// dataflow at the same time without introducing inconsistencies.
-bool RISCVCoalesceVSETVLI::runOnMachineFunction(MachineFunction &MF) {
-  // Skip if the vector extension is not enabled.
-  ST = &MF.getSubtarget<RISCVSubtarget>();
-  if (!ST->hasVInstructions())
-    return false;
-  TII = ST->getInstrInfo();
-  MRI = &MF.getRegInfo();
-  LIS = &getAnalysis<LiveIntervals>();
-
-  bool Changed = false;
-  for (MachineBasicBlock &MBB : MF)
-    Changed |= coalesceVSETVLIs(MBB);
-
-  return Changed;
-}
-
-FunctionPass *llvm::createRISCVCoalesceVSETVLIPass() {
-  return new RISCVCoalesceVSETVLI();
-}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index d9f8222669cab3..87ae2ee0d3791e 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -121,7 +121,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
   initializeRISCVExpandPseudoPass(*PR);
   initializeRISCVFoldMasksPass(*PR);
   initializeRISCVInsertVSETVLIPass(*PR);
-  initializeRISCVCoalesceVSETVLIPass(*PR);
   initializeRISCVInsertReadWriteCSRPass(*PR);
   initializeRISCVInsertWriteVXRMPass(*PR);
   initializeRISCVDAGToDAGISelPass(*PR);
@@ -396,7 +395,6 @@ bool RISCVPassConfig::addRegAssignAndRewriteFast() {
   addPass(createRVVRegAllocPass(false));
   if (EnableVSETVLIAfterRVVRegAlloc)
     addPass(createRISCVInsertVSETVLIPass());
-  addPass(createRISCVCoalesceVSETVLIPass());
   if (TM->getOptLevel() != CodeGenOptLevel::None &&
       EnableRISCVDeadRegisterElimination)
     addPass(createRISCVDeadRegisterDefinitionsPass());
@@ -408,7 +406,6 @@ bool RISCVPassConfig::addRegAssignAndRewriteOptimized() {
   addPass(createVirtRegRewriter(false));
   if (EnableVSETVLIAfterRVVRegAlloc)
     addPass(createRISCVInsertVSETVLIPass());
-  addPass(createRISCVCoalesceVSETVLIPass());
   if (TM->getOptLevel() != CodeGenOptLevel::None &&
       EnableRISCVDeadRegisterElimination)
     addPass(createRISCVDeadRegisterDefinitionsPass());
diff --git a/llvm/test/CodeGen/RISCV/O0-pipeline.ll b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
index e4abc93d1a8a19..ef7a8f2c7bbee5 100644
--- a/llvm/test/CodeGen/RISCV/O0-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
@@ -50,7 +50,6 @@
 ; CHECK-NEXT:       Slot index numbering
 ; CHECK-NEXT:       Live Interval Analysis
 ; CHECK-NEXT:       RISC-V Insert VSETVLI pass
-; CHECK-NEXT:       RISC-V Coalesce VSETVLI pass
 ; CHECK-NEXT:       Fast Register Allocator
 ; CHECK-NEXT:       Remove Redundant DEBUG_VALUE analysis
 ; CHECK-NEXT:       Fixup Statepoint Caller Saved
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 0528b00d408b20..1d1c5942aa8e96 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -142,7 +142,6 @@
 ; CHECK-NEXT:       Greedy Register Allocator
 ; CHECK-NEXT:       Virtual Register Rewriter
 ; CHECK-NEXT:       RISC-V Insert VSETVLI pass
-; CHECK-NEXT:       RISC-V Coalesce VSETVLI pass
 ; CHECK-NEXT:       RISC-V Dead register definitions
 ; CHECK-NEXT:       Virtual Register Map
 ; CHECK-NEXT:       Live Register Matrix
diff --git a/llvm/test/CodeGen/RISCV/rvv/coalesce-vsetvli.mir b/llvm/test/CodeGen/RISCV/rvv/coalesce-vsetvli.mir
deleted file mode 100644
index f888534ebc035d..00000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/coalesce-vsetvli.mir
+++ /dev/null
@@ -1,66 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-# RUN: llc %s -o - -mtriple=riscv64 -mattr=v -run-pass=riscv-coalesce-vsetvli -verify-machineinstrs | FileCheck %s
-
----
-name: dead_avl_addi
-tracksRegLiveness: true
-body:             |
-  bb.0:
-    ; CHECK-LABEL: name: dead_avl_addi
-    ; CHECK: $x0 = PseudoVSETIVLI 3, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
-    ; CHECK-NEXT: dead %x:gpr = PseudoVMV_X_S $noreg, 6 /* e64 */
-    ; CHECK-NEXT: $v0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6 /* e64 */, 0 /* tu, mu */
-    ; CHECK-NEXT: PseudoRET
-    %avl:gprnox0 = ADDI $x0, 42
-    dead $x0 = PseudoVSETVLI killed %avl, 216, implicit-def $vl, implicit-def $vtype
-    %x:gpr = PseudoVMV_X_S $noreg, 6
-    dead $x0 = PseudoVSETIVLI 3, 216, implicit-def $vl, implicit-def $vtype
-    $v0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6, 0
-    PseudoRET
-...
----
-name: dead_avl_nonvolatile_load
-tracksRegLiveness: true
-body:             |
-  bb.0:
-    liveins: $x1
-    ; CHECK-LABEL: name: dead_avl_nonvolatile_load
-    ; CHECK: liveins: $x1
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: %ptr:gpr = COPY $x1
-    ; CHECK-NEXT: dead %avl:gprnox0 = LW %ptr, 0 :: (dereferenceable load (s32))
-    ; CHECK-NEXT: $x0 = PseudoVSETIVLI 3, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
-    ; CHECK-NEXT: dead %x:gpr = PseudoVMV_X_S $noreg, 6 /* e64 */
-    ; CHECK-NEXT: $v0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6 /* e64 */, 0 /* tu, mu */
-    ; CHECK-NEXT: PseudoRET
-    %ptr:gpr = COPY $x1
-    %avl:gprnox0 = LW killed %ptr, 0 :: (dereferenceable load (s32))
-    dead $x0 = PseudoVSETVLI killed %avl, 216, implicit-def $vl, implicit-def $vtype
-    %x:gpr = PseudoVMV_X_S $noreg, 6
-    dead $x0 = PseudoVSETIVLI 3, 216, implicit-def $vl, implicit-def $vtype
-    $v0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6, 0
-    PseudoRET
-...
----
-name: dead_avl_volatile_load
-tracksRegLiveness: true
-body:             |
-  bb.0:
-    liveins: $x1
-    ; CHECK-LABEL: name: dead_avl_volatile_load
-    ; CHECK: liveins: $x1
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: %ptr:gpr = COPY $x1
-    ; CHECK-NEXT: dead %avl:gprnox0 = LW %ptr, 0 :: (volatile dereferenceable load (s32))
-    ; CHECK-NEXT: $x0 = PseudoVSETIVLI 3, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
-    ; CHECK-NEXT: dead %x:gpr = PseudoVMV_X_S $noreg, 6 /* e64 */
-    ; CHECK-NEXT: $v0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6 /* e64 */, 0 /* tu, mu */
-    ; CHECK-NEXT: PseudoRET
-    %ptr:gpr = COPY $x1
-    %avl:gprnox0 = LW killed %ptr, 0 :: (volatile dereferenceable load (s32))
-    dead $x0 = PseudoVSETVLI killed %avl, 216, implicit-def $vl, implicit-def $vtype
-    %x:gpr = PseudoVMV_X_S $noreg, 6
-    dead $x0 = PseudoVSETIVLI 3, 216, implicit-def $vl, implicit-def $vtype
-    $v0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6, 0
-    PseudoRET
-...
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
index 41a68ef9903e87..a4b374c8bb401e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc %s -o - -mtriple=riscv64 -mattr=v \
-# RUN:     -run-pass=riscv-insert-vsetvli,riscv-coalesce-vsetvli | FileCheck %s
+# RUN: llc %s -o - -mtriple=riscv64 -mattr=v -run-pass=riscv-insert-vsetvli \
+# RUN:     | FileCheck %s
 
 --- |
   source_filename = "vsetvli-insert.ll"
@@ -80,6 +80,18 @@
     ret void
   }
 
+  define void @coalesce_dead_avl_addi() {
+    ret void
+  }
+
+  define void @coalesce_dead_avl_nonvolatile_load() {
+    ret void
+  }
+
+  define void @coalesce_dead_avl_volatile_load() {
+    ret void
+  }
+
   declare <vscale x 1 x i64> @llvm.riscv.vadd.nxv1i64.nxv1i64.i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, i64) #1
 
   declare <vscale x 1 x i64> @llvm.riscv.vle.nxv1i64.i64(<vscale x 1 x i64>, ptr nocapture, i64) #4
@@ -501,3 +513,66 @@ body:             |
     %4:vr = PseudoVADD_VV_M1 undef $noreg, undef $noreg, undef $noreg, 3, 6, 0
     PseudoRET
 ...
+---
+name: coalesce_dead_avl_addi
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: coalesce_dead_avl_addi
+    ; CHECK: $x0 = PseudoVSETIVLI 3, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead %x:gpr = PseudoVMV_X_S $noreg, 6 /* e64 */, implicit $vtype
+    ; CHECK-NEXT: $v0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: PseudoRET
+    %avl:gprnox0 = ADDI $x0, 42
+    dead $x0 = PseudoVSETVLI killed %avl, 216, implicit-def $vl, implicit-def $vtype
+    %x:gpr = PseudoVMV_X_S $noreg, 6
+    dead $x0 = PseudoVSETIVLI 3, 216, implicit-def $vl, implicit-def $vtype
+    $v0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6, 0
+    PseudoRET
+...
+---
+name: coalesce_dead_avl_nonvolatile_load
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x1
+    ; CHECK-LABEL: name: coalesce_dead_avl_nonvolatile_load
+    ; CHECK: liveins: $x1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %ptr:gpr = COPY $x1
+    ; CHECK-NEXT: dead %avl:gprnox0 = LW %ptr, 0 :: (dereferenceable load (s32))
+    ; CHECK-NEXT: $x0 = PseudoVSETIVLI 3, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead %x:gpr = PseudoVMV_X_S $noreg, 6 /* e64 */, implicit $vtype
+    ; CHECK-NEXT: $v0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: PseudoRET
+    %ptr:gpr = COPY $x1
+    %avl:gprnox0 = LW killed %ptr, 0 :: (dereferenceable load (s32))
+    dead $x0 = PseudoVSETVLI killed %avl, 216, implicit-def $vl, implicit-def $vtype
+    %x:gpr = PseudoVMV_X_S $noreg, 6
+    dead $x0 = PseudoVSETIVLI 3, 216, implicit-def $vl, implicit-def $vtype
+    $v0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6, 0
+    PseudoRET
+...
+---
+name: coalesce_dead_avl_volatile_load
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x1
+    ; CHECK-LABEL: name: coalesce_dead_avl_volatile_load
+    ; CHECK: liveins: $x1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %ptr:gpr = COPY $x1
+    ; CHECK-NEXT: dead %avl:gprnox0 = LW %ptr, 0 :: (volatile dereferenceable load (s32))
+    ; CHECK-NEXT: $x0 = PseudoVSETIVLI 3, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead %x:gpr = PseudoVMV_X_S $noreg, 6 /* e64 */, implicit $vtype
+    ; CHECK-NEXT: $v0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: PseudoRET
+    %ptr:gpr = COPY $x1
+    %avl:gprnox0 = LW killed %ptr, 0 :: (volatile dereferenceable load (s32))
+    dead $x0 = PseudoVSETVLI killed %avl, 216, implicit-def $vl, implicit-def $vtype
+    %x:gpr = PseudoVMV_X_S $noreg, 6
+    dead $x0 = PseudoVSETIVLI 3, 216, implicit-def $vl, implicit-def $vtype
+    $v0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6, 0
+    PseudoRET
+...

From ec8fe598a94d2826f8e4f79367a5a45a6b32d284 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Wed, 29 May 2024 13:00:15 -0700
Subject: [PATCH 167/230] [RISCV] Move vnclipu patterns into DAGCombiner.
 (#93596)

I plan to add support for multiple layers of vnclipu. For example,
i32->i8 using 2 vnclipu instructions. First clipping to 65535, then
clipping to 255. Similar for signed vnclip.

This scales poorly if we need to add patterns with 2 or 3 truncates.
Instead, move the code to DAGCombiner with new ISD opcodes to represent
VNCLIP(U).

This patch just moves the existing patterns into DAG combine. Support
for multiple truncates will come as a follow-up. A similar patch series
will be made for the signed vnclip.
---
 .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.h |   9 ++
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  70 +++++++++++-
 llvm/lib/Target/RISCV/RISCVISelLowering.h     |   4 +
 .../Target/RISCV/RISCVInstrInfoVSDPatterns.td |   7 --
 .../Target/RISCV/RISCVInstrInfoVVLPatterns.td | 104 ++++++++++++++++--
 5 files changed, 174 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index 08f056f78979af..550904516ac8e8 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -373,6 +373,15 @@ inline static bool isValidRoundingMode(unsigned Mode) {
 }
 } // namespace RISCVFPRndMode
 
+namespace RISCVVXRndMode {
+enum RoundingMode {
+  RNU = 0,
+  RNE = 1,
+  RDN = 2,
+  ROD = 3,
+};
+} // namespace RISCVVXRndMode
+
 //===----------------------------------------------------------------------===//
 // Floating-point Immediates
 //
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f4da46f82a8108..0242cfe1785246 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5960,7 +5960,7 @@ static bool hasMergeOp(unsigned Opcode) {
          Opcode <= RISCVISD::LAST_RISCV_STRICTFP_OPCODE &&
          "not a RISC-V target specific op");
   static_assert(RISCVISD::LAST_VL_VECTOR_OP - RISCVISD::FIRST_VL_VECTOR_OP ==
-                    128 &&
+                    130 &&
                 RISCVISD::LAST_RISCV_STRICTFP_OPCODE -
                         ISD::FIRST_TARGET_STRICTFP_OPCODE ==
                     21 &&
@@ -5986,7 +5986,7 @@ static bool hasMaskOp(unsigned Opcode) {
          Opcode <= RISCVISD::LAST_RISCV_STRICTFP_OPCODE &&
          "not a RISC-V target specific op");
   static_assert(RISCVISD::LAST_VL_VECTOR_OP - RISCVISD::FIRST_VL_VECTOR_OP ==
-                    128 &&
+                    130 &&
                 RISCVISD::LAST_RISCV_STRICTFP_OPCODE -
                         ISD::FIRST_TARGET_STRICTFP_OPCODE ==
                     21 &&
@@ -16183,6 +16183,66 @@ static SDValue combineTruncOfSraSext(SDNode *N, SelectionDAG &DAG) {
   return DAG.getNode(ISD::SRA, SDLoc(N), N->getValueType(0), N00, SMin);
 }
 
+// Combine (truncate_vector_vl (umin X, C)) -> (vnclipu_vl X) if C is maximum
+// value for the truncated type.
+static SDValue combineTruncToVnclipu(SDNode *N, SelectionDAG &DAG,
+                                     const RISCVSubtarget &Subtarget) {
+  assert(N->getOpcode() == RISCVISD::TRUNCATE_VECTOR_VL);
+
+  MVT VT = N->getSimpleValueType(0);
+
+  SDValue Mask = N->getOperand(1);
+  SDValue VL = N->getOperand(2);
+
+  SDValue Src = N->getOperand(0);
+
+  // Src must be a UMIN or UMIN_VL.
+  if (Src.getOpcode() != ISD::UMIN &&
+      !(Src.getOpcode() == RISCVISD::UMIN_VL && Src.getOperand(2).isUndef() &&
+        Src.getOperand(3) == Mask && Src.getOperand(4) == VL))
+    return SDValue();
+
+  auto IsSplat = [&VL](SDValue Op, APInt &SplatVal) {
+    // Peek through conversion between fixed and scalable vectors.
+    if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
+        isNullConstant(Op.getOperand(2)) &&
+        Op.getOperand(1).getValueType().isFixedLengthVector() &&
+        Op.getOperand(1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+        Op.getOperand(1).getOperand(0).getValueType() == Op.getValueType() &&
+        isNullConstant(Op.getOperand(1).getOperand(1)))
+      Op = Op.getOperand(1).getOperand(0);
+
+    if (ISD::isConstantSplatVector(Op.getNode(), SplatVal))
+      return true;
+
+    if (Op.getOpcode() == RISCVISD::VMV_V_X_VL && Op.getOperand(0).isUndef() &&
+        Op.getOperand(2) == VL) {
+      if (auto *Op1 = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+        SplatVal =
+            Op1->getAPIntValue().sextOrTrunc(Op.getScalarValueSizeInBits());
+        return true;
+      }
+    }
+
+    return false;
+  };
+
+  APInt C;
+  if (!IsSplat(Src.getOperand(1), C))
+    return SDValue();
+
+  if (!C.isMask(VT.getScalarSizeInBits()))
+    return SDValue();
+
+  SDLoc DL(N);
+  // Rounding mode here is arbitrary since we aren't shifting out any bits.
+  return DAG.getNode(
+      RISCVISD::VNCLIPU_VL, DL, VT,
+      {Src.getOperand(0), DAG.getConstant(0, DL, VT), DAG.getUNDEF(VT), Mask,
+       DAG.getTargetConstant(RISCVVXRndMode::RNU, DL, Subtarget.getXLenVT()),
+       VL});
+}
+
 SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -16400,7 +16460,9 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
     }
     return SDValue();
   case RISCVISD::TRUNCATE_VECTOR_VL:
-    return combineTruncOfSraSext(N, DAG);
+    if (SDValue V = combineTruncOfSraSext(N, DAG))
+      return V;
+    return combineTruncToVnclipu(N, DAG, Subtarget);
   case ISD::TRUNCATE:
     return performTRUNCATECombine(N, DAG, Subtarget);
   case ISD::SELECT:
@@ -20019,6 +20081,8 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(UADDSAT_VL)
   NODE_NAME_CASE(SSUBSAT_VL)
   NODE_NAME_CASE(USUBSAT_VL)
+  NODE_NAME_CASE(VNCLIP_VL)
+  NODE_NAME_CASE(VNCLIPU_VL)
   NODE_NAME_CASE(FADD_VL)
   NODE_NAME_CASE(FSUB_VL)
   NODE_NAME_CASE(FMUL_VL)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 856ce06ba1c4f4..3b8eb3c88901a3 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -273,6 +273,10 @@ enum NodeType : unsigned {
   // Rounding averaging adds of unsigned integers.
   AVGCEILU_VL,
 
+  // Operands are (source, shift, merge, mask, roundmode, vl)
+  VNCLIPU_VL,
+  VNCLIP_VL,
+
   MULHS_VL,
   MULHU_VL,
   FADD_VL,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
index 66df24f2a458db..691f2052ab29d8 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -1196,13 +1196,6 @@ multiclass VPatTruncSatClipSDNode<VTypeInfo vti, VTypeInfo wti> {
       (!cast<Instruction>("PseudoVNCLIP_WI_"#vti.LMul.MX#"_MASK")
         (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs1, 0,
         (vti.Mask V0), 0, GPR:$vl, vti.Log2SEW, TA_MA)>;
-
-    def : Pat<(vti.Vector (riscv_trunc_vector_vl
-        (wti.Vector (umin (wti.Vector wti.RegClass:$rs1),
-          (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), uminval, (XLenVT srcvalue))))), (vti.Mask V0), VLOpFrag)),
-      (!cast<Instruction>("PseudoVNCLIPU_WI_"#vti.LMul.MX#"_MASK")
-        (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs1, 0,
-        (vti.Mask V0), 0, GPR:$vl, vti.Log2SEW, TA_MA)>;
   }
 }
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index 91f3abe22331ec..610a72dd02b388 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -36,6 +36,18 @@ def SDT_RISCVIntBinOp_VL : SDTypeProfile<1, 5, [SDTCisSameAs<0, 1>,
                                                 SDTCisSameNumEltsAs<0, 4>,
                                                 SDTCisVT<5, XLenVT>]>;
 
+// Input: (vector, vector/scalar, merge, mask, roundmode, vl)
+def SDT_RISCVVNBinOp_RM_VL : SDTypeProfile<1, 6, [SDTCisVec<0>, SDTCisInt<0>,
+                                                  SDTCisSameAs<0, 3>,
+                                                  SDTCisSameNumEltsAs<0, 1>,
+                                                  SDTCisVec<1>,
+                                                  SDTCisOpSmallerThanOp<2, 1>,
+                                                  SDTCisSameAs<0, 2>,
+                                                  SDTCisSameNumEltsAs<0, 4>,
+                                                  SDTCVecEltisVT<4, i1>,
+                                                  SDTCisVT<5, XLenVT>,
+                                                  SDTCisVT<6, XLenVT>]>;
+
 def SDT_RISCVFPUnOp_VL : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
                                               SDTCisVec<0>, SDTCisFP<0>,
                                               SDTCVecEltisVT<2, i1>,
@@ -120,6 +132,9 @@ def riscv_uaddsat_vl   : SDNode<"RISCVISD::UADDSAT_VL", SDT_RISCVIntBinOp_VL, [S
 def riscv_ssubsat_vl   : SDNode<"RISCVISD::SSUBSAT_VL", SDT_RISCVIntBinOp_VL>;
 def riscv_usubsat_vl   : SDNode<"RISCVISD::USUBSAT_VL", SDT_RISCVIntBinOp_VL>;
 
+def riscv_vnclipu_vl : SDNode<"RISCVISD::VNCLIPU_VL", SDT_RISCVVNBinOp_RM_VL>;
+def riscv_vnclip_vl : SDNode<"RISCVISD::VNCLIP_VL", SDT_RISCVVNBinOp_RM_VL>;
+
 def riscv_fadd_vl  : SDNode<"RISCVISD::FADD_VL",  SDT_RISCVFPBinOp_VL, [SDNPCommutative]>;
 def riscv_fsub_vl  : SDNode<"RISCVISD::FSUB_VL",  SDT_RISCVFPBinOp_VL>;
 def riscv_fmul_vl  : SDNode<"RISCVISD::FMUL_VL",  SDT_RISCVFPBinOp_VL, [SDNPCommutative]>;
@@ -635,6 +650,34 @@ class VPatBinaryVL_V<SDPatternOperator vop,
                    op2_reg_class:$rs2,
                    (mask_type V0), GPR:$vl, log2sew, TAIL_AGNOSTIC)>;
 
+multiclass VPatBinaryRM_VL_V<SDNode vop,
+                             string instruction_name,
+                             string suffix,
+                             ValueType result_type,
+                             ValueType op1_type,
+                             ValueType op2_type,
+                             ValueType mask_type,
+                             int sew,
+                             LMULInfo vlmul,
+                             VReg result_reg_class,
+                             VReg op1_reg_class,
+                             VReg op2_reg_class> {
+  def : Pat<(result_type (vop
+                         (op1_type op1_reg_class:$rs1),
+                         (op2_type op2_reg_class:$rs2),
+                         (result_type result_reg_class:$merge),
+                         (mask_type V0),
+                         (XLenVT timm:$roundmode),
+                         VLOpFrag)),
+        (!cast<Instruction>(instruction_name#"_"#suffix#"_"# vlmul.MX#"_MASK")
+                     result_reg_class:$merge,
+                     op1_reg_class:$rs1,
+                     op2_reg_class:$rs2,
+                     (mask_type V0),
+                     (XLenVT timm:$roundmode),
+                     GPR:$vl, sew, TAIL_AGNOSTIC)>;
+}
+
 class VPatBinaryVL_V_RM<SDPatternOperator vop,
                         string instruction_name,
                         string suffix,
@@ -795,6 +838,35 @@ class VPatBinaryVL_XI<SDPatternOperator vop,
                    xop_kind:$rs2,
                    (mask_type V0), GPR:$vl, log2sew, TAIL_AGNOSTIC)>;
 
+multiclass VPatBinaryRM_VL_XI<SDNode vop,
+                              string instruction_name,
+                              string suffix,
+                              ValueType result_type,
+                              ValueType vop1_type,
+                              ValueType vop2_type,
+                              ValueType mask_type,
+                              int sew,
+                              LMULInfo vlmul,
+                              VReg result_reg_class,
+                              VReg vop_reg_class,
+                              ComplexPattern SplatPatKind,
+                              DAGOperand xop_kind> {
+  def : Pat<(result_type (vop
+                     (vop1_type vop_reg_class:$rs1),
+                     (vop2_type (SplatPatKind (XLenVT xop_kind:$rs2))),
+                     (result_type result_reg_class:$merge),
+                     (mask_type V0),
+                     (XLenVT timm:$roundmode),
+                     VLOpFrag)),
+        (!cast<Instruction>(instruction_name#_#suffix#_# vlmul.MX#"_MASK")
+                     result_reg_class:$merge,
+                     vop_reg_class:$rs1,
+                     xop_kind:$rs2,
+                     (mask_type V0),
+                     (XLenVT timm:$roundmode),
+                     GPR:$vl, sew, TAIL_AGNOSTIC)>;
+}
+
 multiclass VPatBinaryVL_VV_VX<SDPatternOperator vop, string instruction_name,
                               list<VTypeInfo> vtilist = AllIntegerVectors,
                               bit isSEWAware = 0> {
@@ -893,6 +965,24 @@ multiclass VPatBinaryNVL_WV_WX_WI<SDPatternOperator vop, string instruction_name
   }
 }
 
+multiclass VPatBinaryRM_NVL_WV_WX_WI<SDNode vop, string instruction_name> {
+  foreach VtiToWti = AllWidenableIntVectors in {
+    defvar vti = VtiToWti.Vti;
+    defvar wti = VtiToWti.Wti;
+    defm : VPatBinaryRM_VL_V<vop, instruction_name, "WV",
+                             vti.Vector, wti.Vector, vti.Vector, vti.Mask,
+                             vti.Log2SEW, vti.LMul, vti.RegClass, wti.RegClass, vti.RegClass>;
+    defm : VPatBinaryRM_VL_XI<vop, instruction_name, "WX",
+                              vti.Vector, wti.Vector, vti.Vector, vti.Mask,
+                              vti.Log2SEW, vti.LMul, vti.RegClass, wti.RegClass, SplatPat, GPR>;
+    defm : VPatBinaryRM_VL_XI<vop, instruction_name, "WI",
+                              vti.Vector, wti.Vector, vti.Vector, vti.Mask,
+                              vti.Log2SEW, vti.LMul, vti.RegClass, wti.RegClass,
+                              !cast<ComplexPattern>(SplatPat#_#uimm5),
+                              uimm5>;
+  }
+}
+
 class VPatBinaryVL_VF<SDPatternOperator vop,
                       string instruction_name,
                       ValueType result_type,
@@ -2376,6 +2466,10 @@ defm : VPatAVGADDVL_VV_VX_RM<riscv_avgflooru_vl, 0b10, suffix="U">;
 defm : VPatAVGADDVL_VV_VX_RM<riscv_avgceils_vl, 0b00>;
 defm : VPatAVGADDVL_VV_VX_RM<riscv_avgceilu_vl, 0b00, suffix="U">;
 
+// 12.5. Vector Narrowing Fixed-Point Clip Instructions
+defm : VPatBinaryRM_NVL_WV_WX_WI<riscv_vnclip_vl, "PseudoVNCLIP">;
+defm : VPatBinaryRM_NVL_WV_WX_WI<riscv_vnclipu_vl, "PseudoVNCLIPU">;
+
 // 12.5. Vector Narrowing Fixed-Point Clip Instructions
 multiclass VPatTruncSatClipVL<VTypeInfo vti, VTypeInfo wti> {
   defvar sew = vti.SEW;
@@ -2410,16 +2504,6 @@ multiclass VPatTruncSatClipVL<VTypeInfo vti, VTypeInfo wti> {
       (!cast<Instruction>("PseudoVNCLIP_WI_"#vti.LMul.MX#"_MASK")
         (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs1, 0,
         (vti.Mask V0), 0, GPR:$vl, vti.Log2SEW, TA_MA)>;
-
-    def : Pat<(vti.Vector (riscv_trunc_vector_vl
-        (wti.Vector (riscv_umin_vl
-          (wti.Vector wti.RegClass:$rs1),
-          (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), uminval, (XLenVT srcvalue))),
-          (wti.Vector undef), (wti.Mask V0), VLOpFrag)),
-        (vti.Mask V0), VLOpFrag)),
-      (!cast<Instruction>("PseudoVNCLIPU_WI_"#vti.LMul.MX#"_MASK")
-        (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs1, 0,
-        (vti.Mask V0), 0, GPR:$vl, vti.Log2SEW, TA_MA)>;
   }
 }
 

From 9c4a716c1292096fcbdf415b63b7b0122b03310f Mon Sep 17 00:00:00 2001
From: Matheus Izvekov <mizvekov@gmail.com>
Date: Wed, 29 May 2024 17:02:15 -0300
Subject: [PATCH 168/230] [clang] Preserve Qualifiers and type sugar in
 TemplateNames (#93433)

This patch improves the preservation of qualifiers and reduces the
loss of type sugar in TemplateNames.

This problem is analogous to https://reviews.llvm.org/D112374 and this
patch takes a very similar approach to that patch, except the impact
here is much smaller.

When a TemplateName was written bare, without qualifications, we
wouldn't produce a QualifiedTemplate which could be used to disambiguate
it from a Canonical TemplateName. This had effects in the TemplateName
printer, which had workarounds to deal with this, and wouldn't print the
TemplateName as-written in most situations.

There are also some related fixes to help preserve this type sugar along
the way into diagnostics, so that this patch can be properly tested.

- Fix dropping the template keyword.
- Fix type deduction to preserve sugar in TST TemplateNames.
---
 clang/docs/ReleaseNotes.rst                   |  2 +
 clang/include/clang/AST/TemplateName.h        | 19 +++---
 clang/include/clang/Sema/Sema.h               |  3 +
 clang/lib/AST/ASTContext.cpp                  | 16 ++---
 clang/lib/AST/DeclTemplate.cpp                |  7 +-
 clang/lib/AST/ODRHash.cpp                     |  9 ++-
 clang/lib/AST/TemplateBase.cpp                |  2 +-
 clang/lib/AST/TemplateName.cpp                | 64 +++++++++----------
 clang/lib/AST/TextNodeDumper.cpp              |  4 +-
 clang/lib/AST/Type.cpp                        |  3 +-
 clang/lib/AST/TypePrinter.cpp                 |  4 +-
 clang/lib/Sema/SemaDecl.cpp                   | 15 ++---
 clang/lib/Sema/SemaDeclCXX.cpp                | 12 ++--
 clang/lib/Sema/SemaExpr.cpp                   |  4 +-
 clang/lib/Sema/SemaExprMember.cpp             |  3 +-
 clang/lib/Sema/SemaTemplate.cpp               | 25 +++++---
 clang/lib/Sema/SemaTemplateDeduction.cpp      | 62 +++++++++++++-----
 clang/lib/Sema/SemaType.cpp                   | 14 ++--
 clang/lib/Sema/TreeTransform.h                |  8 +--
 clang/test/AST/ast-dump-ctad-alias.cpp        |  6 +-
 clang/test/AST/ast-dump-decl.cpp              |  8 +--
 clang/test/AST/ast-dump-expr.cpp              |  2 +-
 clang/test/AST/ast-dump-template-decls.cpp    |  6 +-
 clang/test/AST/ast-dump-template-name.cpp     |  4 +-
 clang/test/AST/ast-dump-using-template.cpp    |  6 +-
 clang/test/CXX/drs/cwg1xx.cpp                 |  4 +-
 .../over.match.oper/p3-2a.cpp                 |  4 +-
 .../temp.deduct/temp.deduct.type/p9-0x.cpp    |  4 +-
 clang/test/Index/print-type.cpp               |  2 +-
 clang/test/OpenMP/declare_mapper_messages.cpp |  2 +-
 .../Parser/cxx-template-template-recovery.cpp |  4 +-
 .../cxx1y-variable-templates_in_class.cpp     | 10 +--
 clang/test/SemaTemplate/cwg2398.cpp           |  2 +-
 .../instantiate-requires-expr.cpp             |  4 +-
 .../nested-implicit-deduction-guides.cpp      |  2 +-
 clang/unittests/AST/TemplateNameTest.cpp      | 40 ++++++++++--
 .../map/map.cons/deduct.verify.cpp            | 24 +++----
 .../multimap/multimap.cons/deduct.verify.cpp  | 22 +++----
 .../multiset/multiset.cons/deduct.verify.cpp  | 10 +--
 .../set/set.cons/deduct.verify.cpp            | 10 +--
 .../priqueue.cons/deduct.verify.cpp           | 10 +--
 .../queue/queue.cons/deduct.verify.cpp        |  6 +-
 .../stack/stack.cons/deduct.verify.cpp        |  6 +-
 .../array/array.cons/deduct.verify.cpp        |  2 +-
 .../deque/deque.cons/deduct.verify.cpp        |  2 +-
 .../forwardlist.cons/deduct.verify.cpp        |  2 +-
 .../list/list.cons/deduct.verify.cpp          |  2 +-
 .../vector/vector.cons/deduct.verify.cpp      |  2 +-
 .../unord.map.cnstr/deduct.verify.cpp         | 16 ++---
 .../unord.multimap.cnstr/deduct.verify.cpp    | 16 ++---
 .../unord.multiset.cnstr/deduct.verify.cpp    | 16 ++---
 .../unord.set.cnstr/deduct.verify.cpp         | 16 ++---
 .../range.adaptors/range.join/ctad.verify.cpp |  2 +-
 .../re.regex.construct/deduct.verify.cpp      |  4 +-
 .../optional.object.ctor/deduct.verify.cpp    |  2 +-
 55 files changed, 316 insertions(+), 240 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index e1c6d55eeeacdf..44035f48cb3f97 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -819,6 +819,8 @@ Bug Fixes to AST Handling
 - Clang now properly preserves ``FoundDecls`` within a ``ConceptReference``. (#GH82628)
 - The presence of the ``typename`` keyword is now stored in ``TemplateTemplateParmDecl``.
 - Fixed malformed AST generated for anonymous union access in templates. (#GH90842)
+- Improved preservation of qualifiers and sugar in ``TemplateNames``, including
+  the ``template`` keyword.
 
 Miscellaneous Bug Fixes
 ^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/AST/TemplateName.h b/clang/include/clang/AST/TemplateName.h
index b7732e54ba1079..876be463c71d01 100644
--- a/clang/include/clang/AST/TemplateName.h
+++ b/clang/include/clang/AST/TemplateName.h
@@ -332,7 +332,7 @@ class TemplateName {
   /// unexpanded parameter pack (for C++0x variadic templates).
   bool containsUnexpandedParameterPack() const;
 
-  enum class Qualified { None, AsWritten, Fully };
+  enum class Qualified { None, AsWritten };
   /// Print the template name.
   ///
   /// \param OS the output stream to which the template name will be
@@ -417,17 +417,18 @@ inline TemplateName TemplateName::getUnderlying() const {
   return *this;
 }
 
-/// Represents a template name that was expressed as a
-/// qualified name.
+/// Represents a template name as written in source code.
 ///
-/// This kind of template name refers to a template name that was
+/// This kind of template name may refer to a template name that was
 /// preceded by a nested name specifier, e.g., \c std::vector. Here,
 /// the nested name specifier is "std::" and the template name is the
-/// declaration for "vector". The QualifiedTemplateName class is only
-/// used to provide "sugar" for template names that were expressed
-/// with a qualified name, and has no semantic meaning. In this
-/// manner, it is to TemplateName what ElaboratedType is to Type,
-/// providing extra syntactic sugar for downstream clients.
+/// declaration for "vector". It may also have been written with the
+/// 'template' keyword. The QualifiedTemplateName class is only
+/// used to provide "sugar" for template names, so that they can
+/// be differentiated from canonical template names, and has no
+/// semantic meaning. In this manner, it is to TemplateName what
+/// ElaboratedType is to Type, providing extra syntactic sugar
+/// for downstream clients.
 class QualifiedTemplateName : public llvm::FoldingSetNode {
   friend class ASTContext;
 
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index ec083f7cc09b7c..e6296868000c5f 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -8988,6 +8988,9 @@ class Sema final : public SemaBase {
                          const TemplateArgumentListInfo *TemplateArgs);
 
   void diagnoseMissingTemplateArguments(TemplateName Name, SourceLocation Loc);
+  void diagnoseMissingTemplateArguments(const CXXScopeSpec &SS,
+                                        bool TemplateKeyword, TemplateDecl *TD,
+                                        SourceLocation Loc);
 
   ExprResult BuildTemplateIdExpr(const CXXScopeSpec &SS,
                                  SourceLocation TemplateKWLoc, LookupResult &R,
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index a2398fef623ea2..06780ceba40746 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -5006,9 +5006,6 @@ ASTContext::getTemplateSpecializationType(TemplateName Template,
                                           QualType Underlying) const {
   assert(!Template.getAsDependentTemplateName() &&
          "No dependent template names here!");
-  // Look through qualified template names.
-  if (QualifiedTemplateName *QTN = Template.getAsQualifiedTemplateName())
-    Template = QTN->getUnderlyingTemplate();
 
   const auto *TD = Template.getAsTemplateDecl();
   bool IsTypeAlias = TD && TD->isTypeAlias();
@@ -5044,10 +5041,6 @@ QualType ASTContext::getCanonicalTemplateSpecializationType(
   assert(!Template.getAsDependentTemplateName() &&
          "No dependent template names here!");
 
-  // Look through qualified template names.
-  if (QualifiedTemplateName *QTN = Template.getAsQualifiedTemplateName())
-    Template = TemplateName(QTN->getUnderlyingTemplate());
-
   // Build the canonical template specialization type.
   TemplateName CanonTemplate = getCanonicalTemplateName(Template);
   bool AnyNonCanonArgs = false;
@@ -5262,10 +5255,12 @@ TemplateArgument ASTContext::getInjectedTemplateArg(NamedDecl *Param) {
     Arg = TemplateArgument(E);
   } else {
     auto *TTP = cast<TemplateTemplateParmDecl>(Param);
+    TemplateName Name = getQualifiedTemplateName(
+        nullptr, /*TemplateKeyword=*/false, TemplateName(TTP));
     if (TTP->isParameterPack())
-      Arg = TemplateArgument(TemplateName(TTP), std::optional<unsigned>());
+      Arg = TemplateArgument(Name, std::optional<unsigned>());
     else
-      Arg = TemplateArgument(TemplateName(TTP));
+      Arg = TemplateArgument(Name);
   }
 
   if (Param->isTemplateParameterPack())
@@ -9304,7 +9299,8 @@ TemplateName ASTContext::getAssumedTemplateName(DeclarationName Name) const {
 TemplateName ASTContext::getQualifiedTemplateName(NestedNameSpecifier *NNS,
                                                   bool TemplateKeyword,
                                                   TemplateName Template) const {
-  assert(NNS && "Missing nested-name-specifier in qualified template name");
+  assert(Template.getKind() == TemplateName::Template ||
+         Template.getKind() == TemplateName::UsingTemplate);
 
   // FIXME: Canonicalization?
   llvm::FoldingSetNodeID ID;
diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp
index 95ffd4784641fc..d952f7e181848b 100644
--- a/clang/lib/AST/DeclTemplate.cpp
+++ b/clang/lib/AST/DeclTemplate.cpp
@@ -627,9 +627,10 @@ ClassTemplateDecl::getInjectedClassNameSpecialization() {
   TemplateParameterList *Params = getTemplateParameters();
   SmallVector<TemplateArgument, 16> TemplateArgs;
   Context.getInjectedTemplateArgs(Params, TemplateArgs);
-  CommonPtr->InjectedClassNameType
-    = Context.getTemplateSpecializationType(TemplateName(this),
-                                            TemplateArgs);
+  TemplateName Name = Context.getQualifiedTemplateName(
+      /*NNS=*/nullptr, /*TemplateKeyword=*/false, TemplateName(this));
+  CommonPtr->InjectedClassNameType =
+      Context.getTemplateSpecializationType(Name, TemplateArgs);
   return CommonPtr->InjectedClassNameType;
 }
 
diff --git a/clang/lib/AST/ODRHash.cpp b/clang/lib/AST/ODRHash.cpp
index 246e56231539ae..1249531eab09fc 100644
--- a/clang/lib/AST/ODRHash.cpp
+++ b/clang/lib/AST/ODRHash.cpp
@@ -146,10 +146,17 @@ void ODRHash::AddTemplateName(TemplateName Name) {
   case TemplateName::Template:
     AddDecl(Name.getAsTemplateDecl());
     break;
+  case TemplateName::QualifiedTemplate: {
+    QualifiedTemplateName *QTN = Name.getAsQualifiedTemplateName();
+    if (NestedNameSpecifier *NNS = QTN->getQualifier())
+      AddNestedNameSpecifier(NNS);
+    AddBoolean(QTN->hasTemplateKeyword());
+    AddTemplateName(QTN->getUnderlyingTemplate());
+    break;
+  }
   // TODO: Support these cases.
   case TemplateName::OverloadedTemplate:
   case TemplateName::AssumedTemplate:
-  case TemplateName::QualifiedTemplate:
   case TemplateName::DependentTemplate:
   case TemplateName::SubstTemplateTemplateParm:
   case TemplateName::SubstTemplateTemplateParmPack:
diff --git a/clang/lib/AST/TemplateBase.cpp b/clang/lib/AST/TemplateBase.cpp
index b50daf5fbed6a7..6d3c843cfd29e0 100644
--- a/clang/lib/AST/TemplateBase.cpp
+++ b/clang/lib/AST/TemplateBase.cpp
@@ -552,7 +552,7 @@ void TemplateArgument::print(const PrintingPolicy &Policy, raw_ostream &Out,
       const auto *TTP = cast<TemplateTemplateParmDecl>(TD);
       Out << "template-parameter-" << TTP->getDepth() << "-" << TTP->getIndex();
     } else {
-      TN.print(Out, Policy, TemplateName::Qualified::Fully);
+      TN.print(Out, Policy);
     }
     break;
   }
diff --git a/clang/lib/AST/TemplateName.cpp b/clang/lib/AST/TemplateName.cpp
index 2f0e4181e94086..3aae998eceeb05 100644
--- a/clang/lib/AST/TemplateName.cpp
+++ b/clang/lib/AST/TemplateName.cpp
@@ -235,8 +235,8 @@ TemplateNameDependence TemplateName::getDependence() const {
   auto D = TemplateNameDependence::None;
   switch (getKind()) {
   case TemplateName::NameKind::QualifiedTemplate:
-    D |= toTemplateNameDependence(
-        getAsQualifiedTemplateName()->getQualifier()->getDependence());
+    if (NestedNameSpecifier *NNS = getAsQualifiedTemplateName()->getQualifier())
+      D |= toTemplateNameDependence(NNS->getDependence());
     break;
   case TemplateName::NameKind::DependentTemplate:
     D |= toTemplateNameDependence(
@@ -292,9 +292,8 @@ void TemplateName::Profile(llvm::FoldingSetNodeID &ID) {
 
 void TemplateName::print(raw_ostream &OS, const PrintingPolicy &Policy,
                          Qualified Qual) const {
-  auto Kind = getKind();
-  TemplateDecl *Template = nullptr;
-  if (Kind == TemplateName::Template || Kind == TemplateName::UsingTemplate) {
+  if (NameKind Kind = getKind();
+      Kind == TemplateName::Template || Kind == TemplateName::UsingTemplate) {
     // After `namespace ns { using std::vector }`, what is the fully-qualified
     // name of the UsingTemplateName `vector` within ns?
     //
@@ -304,46 +303,43 @@ void TemplateName::print(raw_ostream &OS, const PrintingPolicy &Policy,
     // Similar to the UsingType behavior, using declarations are used to import
     // names more often than to export them, thus using the original name is
     // most useful in this case.
-    Template = getAsTemplateDecl();
-  }
-
-  if (Template)
-    if (Policy.CleanUglifiedParameters &&
-        isa<TemplateTemplateParmDecl>(Template) && Template->getIdentifier())
-      OS << Template->getIdentifier()->deuglifiedName();
-    else if (Qual == Qualified::Fully &&
-             getDependence() !=
-                 TemplateNameDependenceScope::DependentInstantiation)
-      Template->printQualifiedName(OS, Policy);
-    else
+    TemplateDecl *Template = getAsTemplateDecl();
+    if (Qual == Qualified::None)
       OS << *Template;
-  else if (QualifiedTemplateName *QTN = getAsQualifiedTemplateName()) {
-    if (Qual == Qualified::Fully &&
-        getDependence() !=
-            TemplateNameDependenceScope::DependentInstantiation) {
-      QTN->getUnderlyingTemplate().getAsTemplateDecl()->printQualifiedName(
-          OS, Policy);
-      return;
-    }
-    if (Qual == Qualified::AsWritten)
-      QTN->getQualifier()->print(OS, Policy);
+    else
+      Template->printQualifiedName(OS, Policy);
+  } else if (QualifiedTemplateName *QTN = getAsQualifiedTemplateName()) {
+    if (NestedNameSpecifier *NNS = QTN->getQualifier();
+        Qual != Qualified::None && NNS)
+      NNS->print(OS, Policy);
     if (QTN->hasTemplateKeyword())
       OS << "template ";
-    OS << *QTN->getUnderlyingTemplate().getAsTemplateDecl();
+
+    TemplateName Underlying = QTN->getUnderlyingTemplate();
+    assert(Underlying.getKind() == TemplateName::Template ||
+           Underlying.getKind() == TemplateName::UsingTemplate);
+
+    TemplateDecl *UTD = Underlying.getAsTemplateDecl();
+    if (IdentifierInfo *II = UTD->getIdentifier();
+        Policy.CleanUglifiedParameters && II &&
+        isa<TemplateTemplateParmDecl>(UTD))
+      OS << II->deuglifiedName();
+    else
+      OS << *UTD;
   } else if (DependentTemplateName *DTN = getAsDependentTemplateName()) {
-    if (Qual == Qualified::AsWritten && DTN->getQualifier())
-      DTN->getQualifier()->print(OS, Policy);
+    if (NestedNameSpecifier *NNS = DTN->getQualifier())
+      NNS->print(OS, Policy);
     OS << "template ";
 
     if (DTN->isIdentifier())
       OS << DTN->getIdentifier()->getName();
     else
       OS << "operator " << getOperatorSpelling(DTN->getOperator());
-  } else if (SubstTemplateTemplateParmStorage *subst
-               = getAsSubstTemplateTemplateParm()) {
+  } else if (SubstTemplateTemplateParmStorage *subst =
+                 getAsSubstTemplateTemplateParm()) {
     subst->getReplacement().print(OS, Policy, Qual);
-  } else if (SubstTemplateTemplateParmPackStorage *SubstPack
-                                        = getAsSubstTemplateTemplateParmPack())
+  } else if (SubstTemplateTemplateParmPackStorage *SubstPack =
+                 getAsSubstTemplateTemplateParmPack())
     OS << *SubstPack->getParameterPack();
   else if (AssumedTemplateStorage *Assumed = getAsAssumedTemplateName()) {
     Assumed->getDeclName().print(OS, Policy);
diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp
index 627f8d3477d4e6..a0eedc71ea220b 100644
--- a/clang/lib/AST/TextNodeDumper.cpp
+++ b/clang/lib/AST/TextNodeDumper.cpp
@@ -1989,7 +1989,7 @@ void TextNodeDumper::VisitAutoType(const AutoType *T) {
 
 void TextNodeDumper::VisitDeducedTemplateSpecializationType(
     const DeducedTemplateSpecializationType *T) {
-  if (T->getTemplateName().getKind() == TemplateName::UsingTemplate)
+  if (T->getTemplateName().getAsUsingShadowDecl())
     OS << " using";
 }
 
@@ -1997,7 +1997,7 @@ void TextNodeDumper::VisitTemplateSpecializationType(
     const TemplateSpecializationType *T) {
   if (T->isTypeAlias())
     OS << " alias";
-  if (T->getTemplateName().getKind() == TemplateName::UsingTemplate)
+  if (T->getTemplateName().getAsUsingShadowDecl())
     OS << " using";
   OS << " ";
   T->getTemplateName().dump(OS);
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 04f105c1288721..2097b29b7e0b6d 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -4251,7 +4251,8 @@ TemplateSpecializationType::TemplateSpecializationType(
   assert((T.getKind() == TemplateName::Template ||
           T.getKind() == TemplateName::SubstTemplateTemplateParm ||
           T.getKind() == TemplateName::SubstTemplateTemplateParmPack ||
-          T.getKind() == TemplateName::UsingTemplate) &&
+          T.getKind() == TemplateName::UsingTemplate ||
+          T.getKind() == TemplateName::QualifiedTemplate) &&
          "Unexpected template name for TemplateSpecializationType");
 
   auto *TemplateArgs = reinterpret_cast<TemplateArgument *>(this + 1);
diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp
index 5ed56b367a46a7..58d01705d607b2 100644
--- a/clang/lib/AST/TypePrinter.cpp
+++ b/clang/lib/AST/TypePrinter.cpp
@@ -1586,14 +1586,14 @@ void TypePrinter::printTemplateId(const TemplateSpecializationType *T,
   IncludeStrongLifetimeRAII Strong(Policy);
 
   TemplateDecl *TD = T->getTemplateName().getAsTemplateDecl();
-  // FIXME: Null TD never excercised in test suite.
+  // FIXME: Null TD never exercised in test suite.
   if (FullyQualify && TD) {
     if (!Policy.SuppressScope)
       AppendScope(TD->getDeclContext(), OS, TD->getDeclName());
 
     OS << TD->getName();
   } else {
-    T->getTemplateName().print(OS, Policy);
+    T->getTemplateName().print(OS, Policy, TemplateName::Qualified::None);
   }
 
   DefaultTemplateArgsPolicyRAII TemplateArgs(Policy);
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 2a87b26f17a2b1..e29ddd81a3f889 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -538,8 +538,9 @@ ParsedType Sema::getTypeName(const IdentifierInfo &II, SourceLocation NameLoc,
   } else if (AllowDeducedTemplate) {
     if (auto *TD = getAsTypeTemplateDecl(IIDecl)) {
       assert(!FoundUsingShadow || FoundUsingShadow->getTargetDecl() == TD);
-      TemplateName Template =
-          FoundUsingShadow ? TemplateName(FoundUsingShadow) : TemplateName(TD);
+      TemplateName Template = Context.getQualifiedTemplateName(
+          SS ? SS->getScopeRep() : nullptr, /*TemplateKeyword=*/false,
+          FoundUsingShadow ? TemplateName(FoundUsingShadow) : TemplateName(TD));
       T = Context.getDeducedTemplateSpecializationType(Template, QualType(),
                                                        false);
       // Don't wrap in a further UsingType.
@@ -1137,12 +1138,10 @@ Sema::NameClassification Sema::ClassifyName(Scope *S, CXXScopeSpec &SS,
           dyn_cast<UsingShadowDecl>(*Result.begin());
       assert(!FoundUsingShadow ||
              TD == cast<TemplateDecl>(FoundUsingShadow->getTargetDecl()));
-      Template =
-          FoundUsingShadow ? TemplateName(FoundUsingShadow) : TemplateName(TD);
-      if (SS.isNotEmpty())
-        Template = Context.getQualifiedTemplateName(SS.getScopeRep(),
-                                                    /*TemplateKeyword=*/false,
-                                                    Template);
+      Template = Context.getQualifiedTemplateName(
+          SS.getScopeRep(),
+          /*TemplateKeyword=*/false,
+          FoundUsingShadow ? TemplateName(FoundUsingShadow) : TemplateName(TD));
     } else {
       // All results were non-template functions. This is a function template
       // name.
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 8ab429e2a136ef..631fd4e354927f 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -11547,12 +11547,12 @@ bool Sema::CheckDeductionGuideDeclarator(Declarator &D, QualType &R,
       TemplateName SpecifiedName = RetTST.getTypePtr()->getTemplateName();
       bool TemplateMatches =
           Context.hasSameTemplateName(SpecifiedName, GuidedTemplate);
-      auto TKind = SpecifiedName.getKind();
-      // A Using TemplateName can't actually be valid (either it's qualified, or
-      // we're in the wrong scope). But we have diagnosed these problems
-      // already.
-      bool SimplyWritten = TKind == TemplateName::Template ||
-                           TKind == TemplateName::UsingTemplate;
+
+      const QualifiedTemplateName *Qualifiers =
+          SpecifiedName.getAsQualifiedTemplateName();
+      assert(Qualifiers && "expected QualifiedTemplate");
+      bool SimplyWritten = !Qualifiers->hasTemplateKeyword() &&
+                           Qualifiers->getQualifier() == nullptr;
       if (SimplyWritten && TemplateMatches)
         AcceptableReturnType = true;
       else {
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index ded4f59833ac05..fb4154757775bc 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -3284,10 +3284,10 @@ ExprResult Sema::BuildDeclarationNameExpr(
     return CreateRecoveryExpr(NameInfo.getBeginLoc(), NameInfo.getEndLoc(), {});
   }
 
-  if (TemplateDecl *Template = dyn_cast<TemplateDecl>(D)) {
+  if (TemplateDecl *TD = dyn_cast<TemplateDecl>(D)) {
     // Specifically diagnose references to class templates that are missing
     // a template argument list.
-    diagnoseMissingTemplateArguments(TemplateName(Template), Loc);
+    diagnoseMissingTemplateArguments(SS, /*TemplateKeyword=*/false, TD, Loc);
     return ExprError();
   }
 
diff --git a/clang/lib/Sema/SemaExprMember.cpp b/clang/lib/Sema/SemaExprMember.cpp
index 9aa60204bf29de..3ae1af26d0096f 100644
--- a/clang/lib/Sema/SemaExprMember.cpp
+++ b/clang/lib/Sema/SemaExprMember.cpp
@@ -1194,7 +1194,8 @@ Sema::BuildMemberReferenceExpr(Expr *BaseExpr, QualType BaseExprType,
 
   if (VarTemplateDecl *VarTempl = dyn_cast<VarTemplateDecl>(MemberDecl)) {
     if (!TemplateArgs) {
-      diagnoseMissingTemplateArguments(TemplateName(VarTempl), MemberLoc);
+      diagnoseMissingTemplateArguments(
+          SS, /*TemplateKeyword=*/TemplateKWLoc.isValid(), VarTempl, MemberLoc);
       return ExprError();
     }
 
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 39e9dbed0c3e0b..3e3ed77de710e5 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -292,7 +292,7 @@ TemplateNameKind Sema::isTemplateName(Scope *S,
     Template =
         FoundUsingShadow ? TemplateName(FoundUsingShadow) : TemplateName(TD);
     assert(!FoundUsingShadow || FoundUsingShadow->getTargetDecl() == TD);
-    if (SS.isSet() && !SS.isInvalid()) {
+    if (!SS.isInvalid()) {
       NestedNameSpecifier *Qualifier = SS.getScopeRep();
       Template = Context.getQualifiedTemplateName(Qualifier, hasTemplateKeyword,
                                                   Template);
@@ -342,8 +342,11 @@ bool Sema::isDeductionGuideName(Scope *S, const IdentifierInfo &Name,
   if (!TD || !getAsTypeTemplateDecl(TD))
     return false;
 
-  if (Template)
-    *Template = TemplateTy::make(TemplateName(TD));
+  if (Template) {
+    TemplateName Name = Context.getQualifiedTemplateName(
+        SS.getScopeRep(), /*TemplateKeyword=*/false, TemplateName(TD));
+    *Template = TemplateTy::make(Name);
+  }
   return true;
 }
 
@@ -983,10 +986,6 @@ ParsedTemplateArgument Sema::ActOnTemplateTypeArgument(TypeResult ParsedType) {
 
     if (auto DTST = TL.getAs<DeducedTemplateSpecializationTypeLoc>()) {
       TemplateName Name = DTST.getTypePtr()->getTemplateName();
-      if (SS.isSet())
-        Name = Context.getQualifiedTemplateName(SS.getScopeRep(),
-                                                /*HasTemplateKeyword=*/false,
-                                                Name);
       ParsedTemplateArgument Result(SS, TemplateTy::make(Name),
                                     DTST.getTemplateNameLoc());
       if (EllipsisLoc.isValid())
@@ -5621,6 +5620,15 @@ void Sema::diagnoseMissingTemplateArguments(TemplateName Name,
   }
 }
 
+void Sema::diagnoseMissingTemplateArguments(const CXXScopeSpec &SS,
+                                            bool TemplateKeyword,
+                                            TemplateDecl *TD,
+                                            SourceLocation Loc) {
+  TemplateName Name = Context.getQualifiedTemplateName(
+      SS.getScopeRep(), TemplateKeyword, TemplateName(TD));
+  diagnoseMissingTemplateArguments(Name, Loc);
+}
+
 ExprResult
 Sema::CheckConceptTemplateId(const CXXScopeSpec &SS,
                              SourceLocation TemplateKWLoc,
@@ -5691,7 +5699,8 @@ ExprResult Sema::BuildTemplateIdExpr(const CXXScopeSpec &SS,
   // Non-function templates require a template argument list.
   if (auto *TD = R.getAsSingle<TemplateDecl>()) {
     if (!TemplateArgs && !isa<FunctionTemplateDecl>(TD)) {
-      diagnoseMissingTemplateArguments(TemplateName(TD), R.getNameLoc());
+      diagnoseMissingTemplateArguments(
+          SS, /*TemplateKeyword=*/TemplateKWLoc.isValid(), TD, R.getNameLoc());
       return ExprError();
     }
   }
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index f9ec34163e656b..440b8bc60eaab6 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -589,7 +589,6 @@ DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams,
     // arguments as defaults.
     if (auto *TempArg = dyn_cast_or_null<TemplateTemplateParmDecl>(
             Arg.getAsTemplateDecl())) {
-      assert(Arg.getKind() == TemplateName::Template);
       assert(!TempArg->isExpandedParameterPack());
 
       TemplateParameterList *As = TempArg->getTemplateParameters();
@@ -658,6 +657,18 @@ DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams,
 /// \returns the result of template argument deduction so far. Note that a
 /// "success" result means that template argument deduction has not yet failed,
 /// but it may still fail, later, for other reasons.
+
+static const TemplateSpecializationType *getLastTemplateSpecType(QualType QT) {
+  for (const Type *T = QT.getTypePtr(); /**/; /**/) {
+    const TemplateSpecializationType *TST =
+        T->getAs<TemplateSpecializationType>();
+    assert(TST && "Expected a TemplateSpecializationType");
+    if (!TST->isSugared())
+      return TST;
+    T = TST->desugar().getTypePtr();
+  }
+}
+
 static TemplateDeductionResult
 DeduceTemplateSpecArguments(Sema &S, TemplateParameterList *TemplateParams,
                             const QualType P, QualType A,
@@ -666,26 +677,35 @@ DeduceTemplateSpecArguments(Sema &S, TemplateParameterList *TemplateParams,
   QualType UP = P;
   if (const auto *IP = P->getAs<InjectedClassNameType>())
     UP = IP->getInjectedSpecializationType();
-  // FIXME: Try to preserve type sugar here, which is hard
-  // because of the unresolved template arguments.
-  const auto *TP = UP.getCanonicalType()->castAs<TemplateSpecializationType>();
+
+  assert(isa<TemplateSpecializationType>(UP.getCanonicalType()));
+  const TemplateSpecializationType *TP = ::getLastTemplateSpecType(UP);
   TemplateName TNP = TP->getTemplateName();
 
   // If the parameter is an alias template, there is nothing to deduce.
   if (const auto *TD = TNP.getAsTemplateDecl(); TD && TD->isTypeAlias())
     return TemplateDeductionResult::Success;
 
-  ArrayRef<TemplateArgument> PResolved = TP->template_arguments();
+  // FIXME: To preserve sugar, the TST needs to carry sugared resolved
+  // arguments.
+  ArrayRef<TemplateArgument> PResolved =
+      TP->getCanonicalTypeInternal()
+          ->castAs<TemplateSpecializationType>()
+          ->template_arguments();
 
   QualType UA = A;
+  std::optional<NestedNameSpecifier *> NNS;
   // Treat an injected-class-name as its underlying template-id.
-  if (const auto *Injected = A->getAs<InjectedClassNameType>())
+  if (const auto *Elaborated = A->getAs<ElaboratedType>()) {
+    NNS = Elaborated->getQualifier();
+  } else if (const auto *Injected = A->getAs<InjectedClassNameType>()) {
     UA = Injected->getInjectedSpecializationType();
+    NNS = nullptr;
+  }
 
   // Check whether the template argument is a dependent template-id.
-  // FIXME: Should not lose sugar here.
-  if (const auto *SA =
-          dyn_cast<TemplateSpecializationType>(UA.getCanonicalType())) {
+  if (isa<TemplateSpecializationType>(UA.getCanonicalType())) {
+    const TemplateSpecializationType *SA = ::getLastTemplateSpecType(UA);
     TemplateName TNA = SA->getTemplateName();
 
     // If the argument is an alias template, there is nothing to deduce.
@@ -698,11 +718,19 @@ DeduceTemplateSpecArguments(Sema &S, TemplateParameterList *TemplateParams,
                                     SA->template_arguments(), Deduced);
         Result != TemplateDeductionResult::Success)
       return Result;
+
+    // FIXME: To preserve sugar, the TST needs to carry sugared resolved
+    // arguments.
+    ArrayRef<TemplateArgument> AResolved =
+        SA->getCanonicalTypeInternal()
+            ->castAs<TemplateSpecializationType>()
+            ->template_arguments();
+
     // Perform template argument deduction on each template
     // argument. Ignore any missing/extra arguments, since they could be
     // filled in by default arguments.
-    return DeduceTemplateArguments(S, TemplateParams, PResolved,
-                                   SA->template_arguments(), Info, Deduced,
+    return DeduceTemplateArguments(S, TemplateParams, PResolved, AResolved,
+                                   Info, Deduced,
                                    /*NumberOfArgumentsMustMatch=*/false);
   }
 
@@ -718,11 +746,15 @@ DeduceTemplateSpecArguments(Sema &S, TemplateParameterList *TemplateParams,
     return TemplateDeductionResult::NonDeducedMismatch;
   }
 
+  TemplateName TNA = TemplateName(SA->getSpecializedTemplate());
+  if (NNS)
+    TNA = S.Context.getQualifiedTemplateName(
+        *NNS, false, TemplateName(SA->getSpecializedTemplate()));
+
   // Perform template argument deduction for the template name.
-  if (auto Result = DeduceTemplateArguments(
-          S, TemplateParams, TP->getTemplateName(),
-          TemplateName(SA->getSpecializedTemplate()), Info,
-          SA->getTemplateArgs().asArray(), Deduced);
+  if (auto Result =
+          DeduceTemplateArguments(S, TemplateParams, TNP, TNA, Info,
+                                  SA->getTemplateArgs().asArray(), Deduced);
       Result != TemplateDeductionResult::Success)
     return Result;
 
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index ef0b6b701a52c5..7cec82c7010280 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -6005,12 +6005,16 @@ namespace {
       DeclarationNameInfo DNI = DeclarationNameInfo(
           TL.getTypePtr()->getTypeConstraintConcept()->getDeclName(),
           TemplateId->TemplateNameLoc);
-      auto TN = TemplateId->Template.get();
+
+      NamedDecl *FoundDecl;
+      if (auto TN = TemplateId->Template.get();
+          UsingShadowDecl *USD = TN.getAsUsingShadowDecl())
+        FoundDecl = cast<NamedDecl>(USD);
+      else
+        FoundDecl = cast_if_present<NamedDecl>(TN.getAsTemplateDecl());
+
       auto *CR = ConceptReference::Create(
-          Context, NNS, TemplateId->TemplateKWLoc, DNI,
-          /*FoundDecl=*/TN.getKind() == TemplateName::NameKind::UsingTemplate
-              ? cast<NamedDecl>(TN.getAsUsingShadowDecl())
-              : cast_if_present<NamedDecl>(TN.getAsTemplateDecl()),
+          Context, NNS, TemplateId->TemplateKWLoc, DNI, FoundDecl,
           /*NamedDecl=*/TL.getTypePtr()->getTypeConstraintConcept(),
           ASTTemplateArgumentListInfo::Create(Context, TemplateArgsInfo));
       TL.setConceptReference(CR);
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 765e6177d202d1..efba99b85b0fb1 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -4605,6 +4605,7 @@ TreeTransform<Derived>::TransformTemplateName(CXXScopeSpec &SS,
                                             ObjectType, AllowInjectedClassName);
   }
 
+  // FIXME: Try to preserve more of the TemplateName.
   if (TemplateDecl *Template = Name.getAsTemplateDecl()) {
     TemplateDecl *TransTemplate
       = cast_or_null<TemplateDecl>(getDerived().TransformDecl(NameLoc,
@@ -4612,11 +4613,8 @@ TreeTransform<Derived>::TransformTemplateName(CXXScopeSpec &SS,
     if (!TransTemplate)
       return TemplateName();
 
-    if (!getDerived().AlwaysRebuild() &&
-        TransTemplate == Template)
-      return Name;
-
-    return TemplateName(TransTemplate);
+    return getDerived().RebuildTemplateName(SS, /*TemplateKeyword=*/false,
+                                            TransTemplate);
   }
 
   if (SubstTemplateTemplateParmPackStorage *SubstPack
diff --git a/clang/test/AST/ast-dump-ctad-alias.cpp b/clang/test/AST/ast-dump-ctad-alias.cpp
index 9382558393e4c0..08a3be5c6b754e 100644
--- a/clang/test/AST/ast-dump-ctad-alias.cpp
+++ b/clang/test/AST/ast-dump-ctad-alias.cpp
@@ -29,17 +29,17 @@ Out2<double>::AInner t(1.0);
 // CHECK:      |   `-FunctionTemplateDecl {{.*}} <deduction guide for AInner>
 // CHECK-NEXT: |     |-TemplateTypeParmDecl {{.*}} typename depth 0 index 0 Y
 // CHECK-NEXT: |     |-BinaryOperator {{.*}} '<dependent type>' '&&'
-// CHECK-NEXT: |     | |-UnresolvedLookupExpr {{.*}} '<dependent type>' lvalue (no ADL) = 'Concept' 
+// CHECK-NEXT: |     | |-UnresolvedLookupExpr {{.*}} '<dependent type>' lvalue (no ADL) = 'Concept'
 // CHECK-NEXT: |     | | |-TemplateArgument type 'int'
 // CHECK-NEXT: |     | | | `-BuiltinType {{.*}} 'int'
 // CHECK-NEXT: |     | | `-TemplateArgument type 'type-parameter-1-0'
 // CHECK-NEXT: |     | |   `-TemplateTypeParmType {{.*}} 'type-parameter-1-0' dependent depth 1 index 0
 // CHECK-NEXT: |     | `-TypeTraitExpr {{.*}} 'bool' __is_deducible
-// CHECK-NEXT: |     |   |-DeducedTemplateSpecializationType {{.*}} 'AInner' dependent
+// CHECK-NEXT: |     |   |-DeducedTemplateSpecializationType {{.*}} 'Out2<double>::AInner' dependent
 // CHECK-NEXT: |     |   `-ElaboratedType {{.*}} 'Inner<type-parameter-1-0>' sugar dependent
 // CHECK-NEXT: |     |     `-TemplateSpecializationType {{.*}} 'Inner<type-parameter-1-0>' dependent Inner
 // CHECK-NEXT: |     |       `-TemplateArgument type 'type-parameter-1-0'
-// CHECK-NEXT: |     |         `-SubstTemplateTypeParmType {{.*}} 'type-parameter-1-0' 
+// CHECK-NEXT: |     |         `-SubstTemplateTypeParmType {{.*}} 'type-parameter-1-0'
 // CHECK-NEXT: |     |           |-FunctionTemplate {{.*}} '<deduction guide for Inner>'
 // CHECK-NEXT: |     |           `-TemplateTypeParmType {{.*}} 'type-parameter-1-0' dependent depth 1 index 0
 // CHECK-NEXT: |     |-CXXDeductionGuideDecl {{.*}} <deduction guide for AInner> 'auto (type-parameter-0-0) -> Inner<type-parameter-0-0>'
diff --git a/clang/test/AST/ast-dump-decl.cpp b/clang/test/AST/ast-dump-decl.cpp
index b861ba8be15b50..e84241cee922f5 100644
--- a/clang/test/AST/ast-dump-decl.cpp
+++ b/clang/test/AST/ast-dump-decl.cpp
@@ -466,14 +466,14 @@ namespace testClassTemplateDecl {
 // CHECK:       ClassTemplateDecl 0x{{.+}} <{{.+}}:{{.*}}:3, col:68> col:68 TestTemplateTemplateDefaultType{{$}}
 // CHECK-NEXT:  |-TemplateTemplateParmDecl 0x{{.+}} <col:12, col:42> col:37 depth 0 index 0 TT{{$}}
 // CHECK-NEXT:  | |-TemplateTypeParmDecl 0x{{.+}} <col:21> col:29 typename depth 1 index 0{{$}}
-// CHECK-NEXT:  | `-TemplateArgument <col:42> template 'testClassTemplateDecl::TestClassTemplate'{{$}}
+// CHECK-NEXT:  | `-TemplateArgument <col:42> template 'TestClassTemplate':'testClassTemplateDecl::TestClassTemplate' qualified{{$}}
 // CHECK-NEXT:  |   `-ClassTemplateDecl 0x{{.+}} <line:{{.+}}:3, line:{{.+}}:3> line:{{.+}}:30 TestClassTemplate{{$}}
 // CHECK-NEXT:  `-CXXRecordDecl 0x{{.+}} <line:{{.*}}:61, col:68> col:68 struct TestTemplateTemplateDefaultType{{$}}
 
 // CHECK:       ClassTemplateDecl 0x{{.+}} prev 0x{{.+}} <{{.+}}:{{.*}}:3, col:82> col:48 TestTemplateTemplateDefaultType{{$}}
 // CHECK-NEXT:  |-TemplateTemplateParmDecl 0x{{.+}} <col:12, col:37> col:37 depth 0 index 0 TT{{$}}
 // CHECK-NEXT:  | |-TemplateTypeParmDecl 0x{{.+}} <col:21> col:29 typename depth 1 index 0{{$}}
-// CHECK-NEXT:  | `-TemplateArgument <line:{{.*}}:42> template 'testClassTemplateDecl::TestClassTemplate'{{$}}
+// CHECK-NEXT:  | `-TemplateArgument <line:{{.*}}:42> template 'TestClassTemplate':'testClassTemplateDecl::TestClassTemplate' qualified{{$}}
 // CHECK-NEXT:  |   |-inherited from TemplateTemplateParm 0x{{.+}} 'TT'{{$}}
 // CHECK-NEXT:  |   `-ClassTemplateDecl 0x{{.+}} <line:{{.+}}:3, line:{{.+}}:3> line:{{.+}}:30 TestClassTemplate
 // CHECK-NEXT:  `-CXXRecordDecl 0x{{.+}} prev 0x{{.+}} <line:{{.*}}:41, col:82> col:48 struct TestTemplateTemplateDefaultType definition{{$}}
@@ -685,7 +685,7 @@ namespace TestTemplateTemplateParmDecl {
 // CHECK:        FunctionTemplateDecl
 // CHECK-NEXT:     TemplateTemplateParmDecl{{.*}} T
 // CHECK-NEXT:       TemplateTypeParmDecl{{.*}} typename
-// CHECK-NEXT:       TemplateArgument{{.*}} template 'TestTemplateTemplateParmDecl::A'
+// CHECK-NEXT:       TemplateArgument{{.*}} template 'A':'TestTemplateTemplateParmDecl::A' qualified{{$}}
 // CHECK-NEXT:         ClassTemplateDecl {{.*}} A
 // CHECK-NEXT:     TemplateTemplateParmDecl{{.*}} ... U
 // CHECK-NEXT:       TemplateTypeParmDecl{{.*}} typename
@@ -718,7 +718,7 @@ namespace TestTemplateArgument {
   template<template<typename> class> class testTemplate { };
   template class testTemplate<A>;
   // CHECK:      ClassTemplateSpecializationDecl{{.*}} class testTemplate
-  // CHECK:        TemplateArgument{{.*}} 'TestTemplateArgument::A'
+  // CHECK:        TemplateArgument{{.*}} 'TestTemplateArgument::A'{{$}}
 
   template<template<typename> class ...T> class C {
     B<T...> testTemplateExpansion;
diff --git a/clang/test/AST/ast-dump-expr.cpp b/clang/test/AST/ast-dump-expr.cpp
index f9e9ee9d35dde6..5da025c229ea3d 100644
--- a/clang/test/AST/ast-dump-expr.cpp
+++ b/clang/test/AST/ast-dump-expr.cpp
@@ -233,7 +233,7 @@ void PostfixExpressions(S a, S *p, U<int> *r) {
   r->template U<int>::~U();
   // CHECK: CXXMemberCallExpr 0x{{[^ ]*}} <line:[[@LINE-1]]:3, col:26> 'void'
   // CHECK-NEXT: MemberExpr 0x{{[^ ]*}} <col:3, col:24> '<bound member function type>' ->~U 0x{{[^ ]*}}
-  // CHECK-NEXT: NestedNameSpecifier TypeSpecWithTemplate 'U<int>'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpecWithTemplate 'template U<int>':'U<int>'
   // CHECK-NEXT: ImplicitCastExpr
   // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} <col:3> 'U<int> *' lvalue ParmVar 0x{{[^ ]*}} 'r' 'U<int> *'
 
diff --git a/clang/test/AST/ast-dump-template-decls.cpp b/clang/test/AST/ast-dump-template-decls.cpp
index 37f6d8a0472d30..55bded4c77d4ba 100644
--- a/clang/test/AST/ast-dump-template-decls.cpp
+++ b/clang/test/AST/ast-dump-template-decls.cpp
@@ -116,7 +116,7 @@ template <class T> struct C {
 using type2 = typename C<int>::type1<void>;
 // CHECK:      TypeAliasDecl 0x{{[^ ]*}} <line:[[@LINE-1]]:1, col:42> col:7 type2 'typename C<int>::type1<void>':'void (int)'
 // CHECK-NEXT: ElaboratedType 0x{{[^ ]*}} 'typename C<int>::type1<void>' sugar
-// CHECK-NEXT: TemplateSpecializationType 0x{{[^ ]*}} 'type1<void>' sugar alias type1
+// CHECK-NEXT: TemplateSpecializationType 0x{{[^ ]*}} 'type1<void>' sugar alias C<int>::type1
 // CHECK-NEXT: TemplateArgument type 'void'
 // CHECK-NEXT: BuiltinType 0x{{[^ ]*}} 'void'
 // CHECK-NEXT: FunctionProtoType 0x{{[^ ]*}} 'void (int)' cdecl
@@ -149,7 +149,7 @@ template <typename... T> struct D {
   template <typename... U> using B = int(int (*...p)(T, U));
 };
 using t2 = D<float, char>::B<int, short>;
-// CHECK:      TemplateSpecializationType 0x{{[^ ]*}} 'B<int, short>' sugar alias B
+// CHECK:      TemplateSpecializationType 0x{{[^ ]*}} 'B<int, short>' sugar alias D<float, char>::B{{$}}
 // CHECK:      FunctionProtoType 0x{{[^ ]*}} 'int (int (*)(float, int), int (*)(char, short))' cdecl
 // CHECK:      FunctionProtoType 0x{{[^ ]*}} 'int (float, int)' cdecl
 // CHECK:      SubstTemplateTypeParmType 0x{{[^ ]*}} 'float' sugar typename depth 0 index 0 ... T pack_index 1
@@ -169,7 +169,7 @@ template<template<class C1, class C2 = A<C1>> class D1, class D2> using D = D1<D
 
 template<class E1, class E2> class E {};
 using test1 = D<E, int>;
-// CHECK:      TypeAliasDecl 0x{{[^ ]*}} <line:{{[1-9]+}}:1, col:23> col:7 test1 'D<subst_default_argument::E, int>':'subst_default_argument::E<int, subst_default_argument::A<int>>'
+// CHECK:      TypeAliasDecl 0x{{[^ ]*}} <line:{{[1-9]+}}:1, col:23> col:7 test1 'D<E, int>':'subst_default_argument::E<int, subst_default_argument::A<int>>'
 // CHECK:      TemplateSpecializationType 0x{{[^ ]*}} 'A<int>' sugar A
 // CHECK-NEXT: |-TemplateArgument type 'int'
 // CHECK-NEXT: | `-SubstTemplateTypeParmType 0x{{[^ ]*}} 'int' sugar class depth 0 index 1 D2
diff --git a/clang/test/AST/ast-dump-template-name.cpp b/clang/test/AST/ast-dump-template-name.cpp
index 39100711b60a13..7972e9f9e9b065 100644
--- a/clang/test/AST/ast-dump-template-name.cpp
+++ b/clang/test/AST/ast-dump-template-name.cpp
@@ -13,7 +13,7 @@ namespace qualified {
 // CHECK-NEXT: TypeAliasDecl
 // CHECK-NEXT: `-ElaboratedType
 // CHECK-NEXT:   `-TemplateSpecializationType
-// CHECK-NEXT:     |-TemplateArgument template 'qualified::foo::A' qualified{{$}}
+// CHECK-NEXT:     |-TemplateArgument template 'foo::A':'qualified::foo::A' qualified{{$}}
 // CHECK-NEXT:     | |-NestedNameSpecifier Namespace 0x{{.+}} 'foo'{{$}}
 // CHECK-NEXT:     | `-ClassTemplateDecl {{.+}} A{{$}}
 
@@ -27,7 +27,7 @@ namespace dependent {
 // CHECK-NEXT: TypeAliasDecl
 // CHECK-NEXT: `-ElaboratedType
 // CHECK-NEXT:   `-TemplateSpecializationType
-// CHECK-NEXT:     |-TemplateArgument template 'template X' dependent{{$}}
+// CHECK-NEXT:     |-TemplateArgument template 'T::template X':'type-parameter-0-0::template X' dependent{{$}}
 // CHECK-NEXT:     | `-NestedNameSpecifier TypeSpec 'T'{{$}}
 
 namespace subst {
diff --git a/clang/test/AST/ast-dump-using-template.cpp b/clang/test/AST/ast-dump-using-template.cpp
index 69b199fd0606c1..7731c2ad0231b0 100644
--- a/clang/test/AST/ast-dump-using-template.cpp
+++ b/clang/test/AST/ast-dump-using-template.cpp
@@ -26,9 +26,9 @@ using A = S<T>;
 template <template <typename> class T> class X {};
 using B = X<S>;
 // CHECK:      TypeAliasDecl
-// CHECK-NEXT: `-ElaboratedType {{.*}} 'X<ns::S>' sugar
-// CHECK-NEXT:   `-TemplateSpecializationType {{.*}} 'X<ns::S>' sugar X
-// CHECK-NEXT:     |-TemplateArgument template 'ns::S'
+// CHECK-NEXT: `-ElaboratedType {{.*}} 'X<S>' sugar
+// CHECK-NEXT:   `-TemplateSpecializationType {{.*}} 'X<S>' sugar X
+// CHECK-NEXT:     |-TemplateArgument template 'S'
 // CHECK-NEXT:     | |-UsingShadowDecl {{.*}} implicit ClassTemplate {{.*}} 'S'
 // CHECK-NEXT:     | `-target: ClassTemplateDecl {{.*}} S
 // CHECK-NEXT:     `-RecordType {{.*}} 'X<ns::S>'
diff --git a/clang/test/CXX/drs/cwg1xx.cpp b/clang/test/CXX/drs/cwg1xx.cpp
index a8f9b705a98660..6bc63760f8333f 100644
--- a/clang/test/CXX/drs/cwg1xx.cpp
+++ b/clang/test/CXX/drs/cwg1xx.cpp
@@ -518,7 +518,7 @@ namespace cwg136 { // cwg136: 3.4
   void q() {
     j(A(), A()); // ok, has default argument
   }
-  extern "C" void k(int, int, int, int); // #cwg136-k 
+  extern "C" void k(int, int, int, int); // #cwg136-k
   namespace NSA {
   struct A {
     friend void cwg136::k(int, int, int, int = 0);
@@ -1048,7 +1048,7 @@ namespace cwg176 { // cwg176: 3.1
     cwg176::X *p4; // #cwg176-p4
     // cxx98-14-error@#cwg176-p4 {{use of class template 'cwg176::X' requires template arguments}}
     //  cxx98-14-note@#cwg176-X {{template is declared here}}
-    // since-cxx17-error@#cwg176-p4 {{use of class template 'X' requires template arguments; argument deduction not allowed in non-static class member}}
+    // since-cxx17-error@#cwg176-p4 {{use of class template 'cwg176::X' requires template arguments; argument deduction not allowed in non-static class member}}
     //  since-cxx17-note@#cwg176-X {{template is declared here}}
   };
 }
diff --git a/clang/test/CXX/over/over.match/over.match.funcs/over.match.oper/p3-2a.cpp b/clang/test/CXX/over/over.match/over.match.funcs/over.match.oper/p3-2a.cpp
index 54dabb4be2c055..a574d31a0925ad 100644
--- a/clang/test/CXX/over/over.match/over.match.funcs/over.match.oper/p3-2a.cpp
+++ b/clang/test/CXX/over/over.match/over.match.funcs/over.match.oper/p3-2a.cpp
@@ -315,7 +315,7 @@ int a1 = 0 == A<1>(); // Should not find 2 as the requires clause does not match
 
 namespace static_operators {
 // Verify no crash.
-struct X { 
+struct X {
   bool operator ==(X const&); // expected-note {{ambiguity is between a regular call}}
                               // expected-note@-1 {{mark 'operator==' as const or add a matching 'operator!=' to resolve the ambiguity}}
   static bool operator !=(X const&, X const&); // expected-error {{overloaded 'operator!=' cannot be a static member function}}
@@ -474,7 +474,7 @@ namespace ns {
 template <class T> struct A {};
 template <class T> struct B : A<T> {};
 
-template <class T> bool operator==(B<T>, A<T>); // expected-note {{candidate template ignored: could not match 'B' against 'A'}}
+template <class T> bool operator==(B<T>, A<T>); // expected-note {{candidate template ignored: could not match 'B' against 'ns::A'}}
 template <class T> bool operator!=(B<T>, A<T>);
 }
 
diff --git a/clang/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.type/p9-0x.cpp b/clang/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.type/p9-0x.cpp
index 8f135b72546fff..51df1e0b14541b 100644
--- a/clang/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.type/p9-0x.cpp
+++ b/clang/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.type/p9-0x.cpp
@@ -3,7 +3,7 @@
 template<typename ...Types> struct tuple;
 template<unsigned> struct unsigned_c;
 
-template<typename T, typename U> 
+template<typename T, typename U>
 struct is_same {
   static const bool value = false;
 };
@@ -93,7 +93,7 @@ namespace DeduceNonTypeTemplateArgsInArray {
 }
 
 namespace DeduceWithDefaultArgs {
-  template<template<typename...> class Container> void f(Container<int>); // expected-note {{deduced type 'X<[...], (default) int>' of 1st parameter does not match adjusted type 'X<[...], double>' of argument [with Container = DeduceWithDefaultArgs::X]}}
+  template<template<typename...> class Container> void f(Container<int>); // expected-note {{deduced type 'X<[...], (default) int>' of 1st parameter does not match adjusted type 'X<[...], double>' of argument [with Container = X]}}
   template<typename, typename = int> struct X {};
   void g() {
     // OK, use default argument for the second template parameter.
diff --git a/clang/test/Index/print-type.cpp b/clang/test/Index/print-type.cpp
index db8559521e29d4..8c3d4c254964a2 100644
--- a/clang/test/Index/print-type.cpp
+++ b/clang/test/Index/print-type.cpp
@@ -132,7 +132,7 @@ inline namespace InlineNS {}
 // CHECK: TypedefDecl=OtherType:26:18 (Definition) [type=outer::inner::Bar::OtherType] [typekind=Typedef] [canonicaltype=double] [canonicaltypekind=Double] [isPOD=1]
 // CHECK: TypedefDecl=ArrayType:27:15 (Definition) [type=outer::inner::Bar::ArrayType] [typekind=Typedef] [canonicaltype=int[5]] [canonicaltypekind=ConstantArray] [isPOD=1]
 // CHECK: IntegerLiteral= [type=int] [typekind=Int] [isPOD=1]
-// CHECK: FieldDecl=baz:28:20 (Definition) [type=Baz<int, 1, outer::Foo>] [typekind=Elaborated] [templateargs/3= [type=int] [typekind=Int]] [canonicaltype=outer::Baz<int, 1, outer::Foo>] [canonicaltypekind=Record] [canonicaltemplateargs/3= [type=int] [typekind=Int]] [isPOD=1]
+// CHECK: FieldDecl=baz:28:20 (Definition) [type=Baz<int, 1, Foo>] [typekind=Elaborated] [templateargs/3= [type=int] [typekind=Int]] [canonicaltype=outer::Baz<int, 1, outer::Foo>] [canonicaltypekind=Record] [canonicaltemplateargs/3= [type=int] [typekind=Int]] [isPOD=1]
 // CHECK: TemplateRef=Baz:9:8 [type=] [typekind=Invalid] [isPOD=0]
 // CHECK: IntegerLiteral= [type=int] [typekind=Int] [isPOD=1]
 // CHECK: TemplateRef=Foo:4:8 [type=] [typekind=Invalid] [isPOD=0]
diff --git a/clang/test/OpenMP/declare_mapper_messages.cpp b/clang/test/OpenMP/declare_mapper_messages.cpp
index 95861612a076bd..f2101786f6ce02 100644
--- a/clang/test/OpenMP/declare_mapper_messages.cpp
+++ b/clang/test/OpenMP/declare_mapper_messages.cpp
@@ -46,7 +46,7 @@ class stack {                                                           // expec
 };
 
 #pragma omp declare mapper(default : N1::stack s) map(s.len)            // precxx17-error {{use of class template 'N1::stack' requires template arguments}} \
-                                                                           cxx17-error {{use of class template 'stack' requires template arguments; argument deduction not allowed in function prototype}}
+                                                                           cxx17-error {{use of class template 'N1::stack' requires template arguments; argument deduction not allowed in function prototype}}
 #pragma omp declare mapper(id1: N1::stack<int> s) map(s.data)
 #pragma omp declare mapper(default : S<int> s) map(s.len)               // expected-error {{no template named 'S'}}
 
diff --git a/clang/test/Parser/cxx-template-template-recovery.cpp b/clang/test/Parser/cxx-template-template-recovery.cpp
index 1230c86d924ff8..5700b160cd3640 100644
--- a/clang/test/Parser/cxx-template-template-recovery.cpp
+++ b/clang/test/Parser/cxx-template-template-recovery.cpp
@@ -29,9 +29,9 @@ static_assert(test<a::b::C2>); // expected-error {{too few template arguments fo
 static_assert(test<C3>); // expected-error {{too few template arguments for concept 'C3'}} \
                          // expected-note@#C3 {{here}}
 
-static_assert(test<a::V1>); // expected-error {{use of variable template 'V1' requires template arguments}} \
+static_assert(test<a::V1>); // expected-error {{use of variable template 'a::V1' requires template arguments}} \
                             // expected-note@#V1 {{here}}
-static_assert(test<a::b::V2>); // expected-error {{use of variable template 'V2' requires template arguments}} \
+static_assert(test<a::b::V2>); // expected-error {{use of variable template 'a::b::V2' requires template arguments}} \
                             // expected-note@#V2 {{here}}
 static_assert(test<V3>); // expected-error {{use of variable template 'V3' requires template arguments}} \
                          // expected-note@#V3 {{here}}
diff --git a/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp b/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp
index af121a8b75d512..f42c812a860d04 100644
--- a/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp
+++ b/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp
@@ -352,7 +352,7 @@ namespace ns2 {
   };
   template<class T> template<class U, T N, U M> T&& A<T>::Var = T(N + M);
   int *AV = &A<int>().Var<char, 5, 'A'>;
-  
+
 } //end ns2
 } // end ns member_access_is_ok
 
@@ -372,7 +372,7 @@ struct Something
     }
 };
 
-int main() { 
+int main() {
     Something<Value>{}.foo();
     return 0;
 }
@@ -384,16 +384,16 @@ namespace dependent_static_var_template {
   struct A {
     template<int = 0> static int n; // expected-note 2{{here}}
   };
-  int &r = A::template n; // expected-error {{use of variable template 'n' requires template arguments}}
+  int &r = A::template n; // expected-error {{use of variable template 'A::template n' requires template arguments}}
 
   template<typename T>
-  int &f() { return T::template n; } // expected-error {{use of variable template 'n' requires template arguments}}
+  int &f() { return T::template n; } // expected-error {{use of variable template 'A::template n' requires template arguments}}
   int &s = f<A>(); // expected-note {{instantiation of}}
 
   namespace B {
     template<int = 0> static int n; // expected-note {{here}}
   }
-  int &t = B::template n; // expected-error {{use of variable template 'n' requires template arguments}}
+  int &t = B::template n; // expected-error {{use of variable template 'B::template n' requires template arguments}}
 
   struct C {
     template <class T> static T G;
diff --git a/clang/test/SemaTemplate/cwg2398.cpp b/clang/test/SemaTemplate/cwg2398.cpp
index e3b5e575374d3f..45e74cce3a98c8 100644
--- a/clang/test/SemaTemplate/cwg2398.cpp
+++ b/clang/test/SemaTemplate/cwg2398.cpp
@@ -4,7 +4,7 @@
 namespace issue1 {
   template<class T, class U = T> class B {};
   template<template<class> class P, class T> void f(P<T>);
-  // new-note@-1 {{deduced type 'B<[...], (default) int>' of 1st parameter does not match adjusted type 'B<[...], float>' of argument [with P = issue1::B, T = int]}}
+  // new-note@-1 {{deduced type 'B<[...], (default) int>' of 1st parameter does not match adjusted type 'B<[...], float>' of argument [with P = B, T = int]}}
   // old-note@-2 2{{template template argument has different template parameters}}
 
   void g() {
diff --git a/clang/test/SemaTemplate/instantiate-requires-expr.cpp b/clang/test/SemaTemplate/instantiate-requires-expr.cpp
index ba82fc1313fc95..516708bf4c875a 100644
--- a/clang/test/SemaTemplate/instantiate-requires-expr.cpp
+++ b/clang/test/SemaTemplate/instantiate-requires-expr.cpp
@@ -72,8 +72,8 @@ namespace type_requirement {
 
   template<typename T> requires
   false_v<requires { typename T::template temp<T>; }>
-  // expected-note@-1 {{because 'false_v<requires { typename contains_template<int>::temp<contains_template<int> >; }>' evaluated to false}}
-  // expected-note@-2 {{because 'false_v<requires { typename contains_template<short>::temp<contains_template<short> >; }>' evaluated to false}}
+  // expected-note@-1 {{because 'false_v<requires { typename contains_template<int>::template temp<contains_template<int> >; }>' evaluated to false}}
+  // expected-note@-2 {{because 'false_v<requires { typename contains_template<short>::template temp<contains_template<short> >; }>' evaluated to false}}
   struct r2 {};
 
   using r2i1 = r2<contains_template<int>>; // expected-error{{constraints not satisfied for class template 'r2' [with T = type_requirement::contains_template<int>]}}
diff --git a/clang/test/SemaTemplate/nested-implicit-deduction-guides.cpp b/clang/test/SemaTemplate/nested-implicit-deduction-guides.cpp
index f289dc0452868b..a4ae046ac52741 100644
--- a/clang/test/SemaTemplate/nested-implicit-deduction-guides.cpp
+++ b/clang/test/SemaTemplate/nested-implicit-deduction-guides.cpp
@@ -79,7 +79,7 @@ nested_init_list<int>::B nil {1, 2};
 using NIL = decltype(nil);
 using NIL = nested_init_list<int>::B<int>;
 
-// expected-error@+1 {{no viable constructor or deduction guide for deduction of template arguments of 'concept_fail'}}
+// expected-error@+1 {{no viable constructor or deduction guide for deduction of template arguments of 'nested_init_list<int>::concept_fail'}}
 nested_init_list<int>::concept_fail nil_invalid{1, ""};
 // expected-note@#INIT_LIST_INNER_INVALID {{candidate template ignored: substitution failure [with F = const char *]: constraints not satisfied for class template 'concept_fail' [with F = const char *]}}
 // expected-note@#INIT_LIST_INNER_INVALID {{candidate function template not viable: requires 1 argument, but 2 were provided}}
diff --git a/clang/unittests/AST/TemplateNameTest.cpp b/clang/unittests/AST/TemplateNameTest.cpp
index fb9061053ea517..444ccfb5c9c811 100644
--- a/clang/unittests/AST/TemplateNameTest.cpp
+++ b/clang/unittests/AST/TemplateNameTest.cpp
@@ -24,6 +24,31 @@ std::string printTemplateName(TemplateName TN, const PrintingPolicy &Policy,
   return Out.str();
 }
 
+TEST(TemplateName, PrintTemplate) {
+  std::string Code = R"cpp(
+    namespace std {
+      template <typename> struct vector {};
+    }
+    template<template <typename> class T> class X;
+    using A = X<std::vector>;
+  )cpp";
+  auto AST = tooling::buildASTFromCode(Code);
+  ASTContext &Ctx = AST->getASTContext();
+  // Match the template argument vector in X<std::vector>.
+  auto MatchResults = match(templateArgumentLoc().bind("id"), Ctx);
+  const auto *Template = selectFirst<TemplateArgumentLoc>("id", MatchResults);
+  ASSERT_TRUE(Template);
+
+  TemplateName TN = Template->getArgument().getAsTemplate();
+  EXPECT_EQ(TN.getKind(), TemplateName::QualifiedTemplate);
+  EXPECT_EQ(printTemplateName(TN, Ctx.getPrintingPolicy(),
+                              TemplateName::Qualified::AsWritten),
+            "std::vector");
+  EXPECT_EQ(printTemplateName(TN, Ctx.getPrintingPolicy(),
+                              TemplateName::Qualified::None),
+            "vector");
+}
+
 TEST(TemplateName, PrintUsingTemplate) {
   std::string Code = R"cpp(
     namespace std {
@@ -44,12 +69,11 @@ TEST(TemplateName, PrintUsingTemplate) {
   ASSERT_TRUE(Template);
 
   TemplateName TN = Template->getArgument().getAsTemplate();
-  EXPECT_EQ(TN.getKind(), TemplateName::UsingTemplate);
-  EXPECT_EQ(TN.getAsUsingShadowDecl()->getTargetDecl(), TN.getAsTemplateDecl());
+  EXPECT_EQ(TN.getKind(), TemplateName::QualifiedTemplate);
+  UsingShadowDecl *USD = TN.getAsUsingShadowDecl();
+  EXPECT_TRUE(USD != nullptr);
+  EXPECT_EQ(USD->getTargetDecl(), TN.getAsTemplateDecl());
 
-  EXPECT_EQ(printTemplateName(TN, Ctx.getPrintingPolicy(),
-                              TemplateName::Qualified::Fully),
-            "std::vector");
   EXPECT_EQ(printTemplateName(TN, Ctx.getPrintingPolicy(),
                               TemplateName::Qualified::AsWritten),
             "vector");
@@ -102,7 +126,8 @@ TEST(TemplateName, UsingTemplate) {
   const auto *TST =
       MatchResults.front().getNodeAs<TemplateSpecializationType>("id");
   ASSERT_TRUE(TST);
-  EXPECT_EQ(TST->getTemplateName().getKind(), TemplateName::UsingTemplate);
+  EXPECT_EQ(TST->getTemplateName().getKind(), TemplateName::QualifiedTemplate);
+  EXPECT_TRUE(TST->getTemplateName().getAsUsingShadowDecl() != nullptr);
 
   AST = tooling::buildASTFromCodeWithArgs(R"cpp(
     namespace std {
@@ -120,7 +145,8 @@ TEST(TemplateName, UsingTemplate) {
   const auto *DTST =
       MatchResults.front().getNodeAs<DeducedTemplateSpecializationType>("id");
   ASSERT_TRUE(DTST);
-  EXPECT_EQ(DTST->getTemplateName().getKind(), TemplateName::UsingTemplate);
+  EXPECT_EQ(DTST->getTemplateName().getKind(), TemplateName::QualifiedTemplate);
+  EXPECT_TRUE(DTST->getTemplateName().getAsUsingShadowDecl() != nullptr);
 }
 
 } // namespace
diff --git a/libcxx/test/std/containers/associative/map/map.cons/deduct.verify.cpp b/libcxx/test/std/containers/associative/map/map.cons/deduct.verify.cpp
index 70e200cda324f0..b314e6fba69644 100644
--- a/libcxx/test/std/containers/associative/map/map.cons/deduct.verify.cpp
+++ b/libcxx/test/std/containers/associative/map/map.cons/deduct.verify.cpp
@@ -42,63 +42,63 @@ int main(int, char**)
 {
     {
         // cannot deduce Key and T from nothing
-        std::map m; // expected-error{{no viable constructor or deduction guide for deduction of template arguments of 'map'}}
+        std::map m; // expected-error-re {{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}map'}}
     }
     {
         // cannot deduce Key and T from just (Compare)
         std::map m(std::less<int>{});
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'map'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}map'}}
     }
     {
         // cannot deduce Key and T from just (Compare, Allocator)
         std::map m(std::less<int>{}, std::allocator<PC>{});
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'map'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}map'}}
     }
     {
         // cannot deduce Key and T from just (Allocator)
         std::map m(std::allocator<PC>{});
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'map'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}map'}}
     }
     {
         // refuse to rebind the allocator if Allocator::value_type is not exactly what we expect
         const P arr[] = { {1,1L}, {2,2L}, {3,3L} };
         std::map m(arr, arr + 3, std::allocator<P>());
-            // expected-error-re@map:* {{static assertion failed{{( due to requirement '.*')?}}{{.*}}Allocator::value_type must be same type as value_type}}
+            // expected-error-re@map:*{{static assertion failed{{( due to requirement '.*')?}}{{.*}}Allocator::value_type must be same type as value_type}}
     }
     {
         // cannot convert from some arbitrary unrelated type
         NotAnAllocator a;
-        std::map m(a); // expected-error{{no viable constructor or deduction guide for deduction of template arguments of 'map'}}
+        std::map m(a); // expected-error-re{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}map'}}
     }
     {
         // cannot deduce that the inner braced things should be std::pair and not something else
         std::map m{ {1,1L}, {2,2L}, {3,3L} };
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'map'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}map'}}
     }
     {
         // cannot deduce that the inner braced things should be std::pair and not something else
         std::map m({ {1,1L}, {2,2L}, {3,3L} }, std::less<int>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'map'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}map'}}
     }
     {
         // cannot deduce that the inner braced things should be std::pair and not something else
         std::map m({ {1,1L}, {2,2L}, {3,3L} }, std::less<int>(), std::allocator<PC>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'map'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}map'}}
     }
     {
         // cannot deduce that the inner braced things should be std::pair and not something else
         std::map m({ {1,1L}, {2,2L}, {3,3L} }, std::allocator<PC>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'map'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}map'}}
     }
     {
         // since we have parens, not braces, this deliberately does not find the initializer_list constructor
         std::map m(P{1,1L});
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'map'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}map'}}
     }
     {
         // since we have parens, not braces, this deliberately does not find the initializer_list constructor
         std::map m(PC{1,1L});
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'map'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}map'}}
     }
 
     return 0;
diff --git a/libcxx/test/std/containers/associative/multimap/multimap.cons/deduct.verify.cpp b/libcxx/test/std/containers/associative/multimap/multimap.cons/deduct.verify.cpp
index 1fda02638ef652..795ac192400317 100644
--- a/libcxx/test/std/containers/associative/multimap/multimap.cons/deduct.verify.cpp
+++ b/libcxx/test/std/containers/associative/multimap/multimap.cons/deduct.verify.cpp
@@ -42,22 +42,22 @@ int main(int, char**)
 {
     {
         // cannot deduce Key and T from nothing
-        std::multimap m; // expected-error{{no viable constructor or deduction guide for deduction of template arguments of 'multimap'}}
+        std::multimap m; // expected-error-re{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}multimap'}}
     }
     {
         // cannot deduce Key and T from just (Compare)
         std::multimap m(std::less<int>{});
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'multimap'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}multimap'}}
     }
     {
         // cannot deduce Key and T from just (Compare, Allocator)
         std::multimap m(std::less<int>{}, std::allocator<PC>{});
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'multimap'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}multimap'}}
     }
     {
         // cannot deduce Key and T from just (Allocator)
         std::multimap m(std::allocator<PC>{});
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'multimap'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}multimap'}}
     }
     {
         // refuse to rebind the allocator if Allocator::value_type is not exactly what we expect
@@ -68,37 +68,37 @@ int main(int, char**)
     {
         // cannot convert from some arbitrary unrelated type
         NotAnAllocator a;
-        std::multimap m(a); // expected-error{{no viable constructor or deduction guide for deduction of template arguments of 'multimap'}}
+        std::multimap m(a); // expected-error-re{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}multimap'}}
     }
     {
         // cannot deduce that the inner braced things should be std::pair and not something else
         std::multimap m{ {1,1L}, {2,2L}, {3,3L} };
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'multimap'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}multimap'}}
     }
     {
         // cannot deduce that the inner braced things should be std::pair and not something else
         std::multimap m({ {1,1L}, {2,2L}, {3,3L} }, std::less<int>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'multimap'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}multimap'}}
     }
     {
         // cannot deduce that the inner braced things should be std::pair and not something else
         std::multimap m({ {1,1L}, {2,2L}, {3,3L} }, std::less<int>(), std::allocator<PC>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'multimap'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}multimap'}}
     }
     {
         // cannot deduce that the inner braced things should be std::pair and not something else
         std::multimap m({ {1,1L}, {2,2L}, {3,3L} }, std::allocator<PC>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'multimap'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}multimap'}}
     }
     {
         // since we have parens, not braces, this deliberately does not find the initializer_list constructor
         std::multimap m(P{1,1L});
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'multimap'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}multimap'}}
     }
     {
         // since we have parens, not braces, this deliberately does not find the initializer_list constructor
         std::multimap m(PC{1,1L});
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'multimap'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}multimap'}}
     }
 
     return 0;
diff --git a/libcxx/test/std/containers/associative/multiset/multiset.cons/deduct.verify.cpp b/libcxx/test/std/containers/associative/multiset/multiset.cons/deduct.verify.cpp
index 48412e9f4c43b3..30dd08b0481557 100644
--- a/libcxx/test/std/containers/associative/multiset/multiset.cons/deduct.verify.cpp
+++ b/libcxx/test/std/containers/associative/multiset/multiset.cons/deduct.verify.cpp
@@ -40,29 +40,29 @@ int main(int, char **) {
   {
     // cannot deduce Key from nothing
     std::multiset s;
-    // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'multiset'}}
+    // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}multiset'}}
   }
   {
     // cannot deduce Key from just (Compare)
     std::multiset s(std::less<int>{});
-    // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'multiset'}}
+    // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}multiset'}}
   }
   {
     // cannot deduce Key from just (Compare, Allocator)
     std::multiset s(std::less<int>{}, std::allocator<int>{});
-    // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'multiset'}}
+    // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}multiset'}}
   }
   {
     // cannot deduce Key from multiset(Allocator)
     std::multiset s(std::allocator<int>{});
-    // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'multiset'}}
+    // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}multiset'}}
   }
   {
     // since we have parens, not braces, this deliberately does not find the
     // initializer_list constructor
     NotAnAllocator a;
     std::multiset s(a);
-    // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'multiset'}}
+    // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}multiset'}}
   }
 
   return 0;
diff --git a/libcxx/test/std/containers/associative/set/set.cons/deduct.verify.cpp b/libcxx/test/std/containers/associative/set/set.cons/deduct.verify.cpp
index 02d2528870e54f..6e250f8fa21e68 100644
--- a/libcxx/test/std/containers/associative/set/set.cons/deduct.verify.cpp
+++ b/libcxx/test/std/containers/associative/set/set.cons/deduct.verify.cpp
@@ -40,29 +40,29 @@ int main(int, char **) {
   {
     // cannot deduce Key from nothing
     std::set s;
-    // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'set'}}
+    // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}set'}}
   }
   {
     // cannot deduce Key from just (Compare)
     std::set s(std::less<int>{});
-    // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'set'}}
+    // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}set'}}
   }
   {
     // cannot deduce Key from just (Compare, Allocator)
     std::set s(std::less<int>{}, std::allocator<int>{});
-    // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'set'}}
+    // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}set'}}
   }
   {
     // cannot deduce Key from just (Allocator)
     std::set s(std::allocator<int>{});
-    // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'set'}}
+    // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}set'}}
   }
   {
     // since we have parens, not braces, this deliberately does not find the
     // initializer_list constructor
     NotAnAllocator a;
     std::set s(a);
-    // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'set'}}
+    // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}set'}}
   }
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/priority.queue/priqueue.cons/deduct.verify.cpp b/libcxx/test/std/containers/container.adaptors/priority.queue/priqueue.cons/deduct.verify.cpp
index 7dd0e256d7e7a2..73487597ca56eb 100644
--- a/libcxx/test/std/containers/container.adaptors/priority.queue/priqueue.cons/deduct.verify.cpp
+++ b/libcxx/test/std/containers/container.adaptors/priority.queue/priqueue.cons/deduct.verify.cpp
@@ -22,32 +22,32 @@ int main(int, char**)
     {
 //  queue(Compare, Container, const Alloc);
 //  The '45' is not an allocator
-    std::priority_queue pri(std::greater<int>(), std::deque<int>({1,2,3}), 45);  // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'priority_queue'}}
+    std::priority_queue pri(std::greater<int>(), std::deque<int>({1,2,3}), 45);  // expected-error-re {{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}priority_queue'}}
     }
 
     {
 //  queue(const queue&, const Alloc&);
 //  The '45' is not an allocator
     std::priority_queue<int> source;
-    std::priority_queue pri(source, 45);  // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'priority_queue'}}
+    std::priority_queue pri(source, 45);  // expected-error-re {{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}priority_queue'}}
     }
 
     {
 //  priority_queue(Iter, Iter, Comp)
 //  int is not an iterator
-    std::priority_queue pri(15, 17, std::greater<double>());  // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'priority_queue'}}
+    std::priority_queue pri(15, 17, std::greater<double>());  // expected-error-re {{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}priority_queue'}}
     }
 
     {
 //  priority_queue(Iter, Iter, Comp, Container)
 //  float is not an iterator
-    std::priority_queue pri(23.f, 2.f, std::greater<float>(), std::deque<float>());   // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'priority_queue'}}
+    std::priority_queue pri(23.f, 2.f, std::greater<float>(), std::deque<float>());   // expected-error-re {{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}priority_queue'}}
     }
 
 //  Test the implicit deduction guides
     {
 //  priority_queue (allocator &)
-    std::priority_queue pri((std::allocator<int>()));  // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'priority_queue'}}
+    std::priority_queue pri((std::allocator<int>()));  // expected-error-re {{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}priority_queue'}}
 //  Note: The extra parens are necessary, since otherwise clang decides it is a function declaration.
 //  Also, we can't use {} instead of parens, because that constructs a
 //      stack<allocator<int>, allocator<allocator<int>>>
diff --git a/libcxx/test/std/containers/container.adaptors/queue/queue.cons/deduct.verify.cpp b/libcxx/test/std/containers/container.adaptors/queue/queue.cons/deduct.verify.cpp
index 301acca8c67c62..f85b2cbeb249cf 100644
--- a/libcxx/test/std/containers/container.adaptors/queue/queue.cons/deduct.verify.cpp
+++ b/libcxx/test/std/containers/container.adaptors/queue/queue.cons/deduct.verify.cpp
@@ -22,20 +22,20 @@ int main(int, char**)
     {
 //  queue(const Container&, const Alloc&);
 //  The '45' is not an allocator
-    std::queue que(std::list<int>{1,2,3}, 45);  // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'queue'}}
+    std::queue que(std::list<int>{1,2,3}, 45);  // expected-error-re {{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}queue'}}
     }
 
     {
 //  queue(const queue&, const Alloc&);
 //  The '45' is not an allocator
     std::queue<int> source;
-    std::queue que(source, 45);  // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'queue'}}
+    std::queue que(source, 45);  // expected-error-re {{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}queue'}}
     }
 
 //  Test the implicit deduction guides
     {
 //  queue (allocator &)
-    std::queue que((std::allocator<int>()));  // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'queue'}}
+    std::queue que((std::allocator<int>()));  // expected-error-re {{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}queue'}}
 //  Note: The extra parens are necessary, since otherwise clang decides it is a function declaration.
 //  Also, we can't use {} instead of parens, because that constructs a
 //      stack<allocator<int>, allocator<allocator<int>>>
diff --git a/libcxx/test/std/containers/container.adaptors/stack/stack.cons/deduct.verify.cpp b/libcxx/test/std/containers/container.adaptors/stack/stack.cons/deduct.verify.cpp
index 55296f4122335f..390df86dd0f5ab 100644
--- a/libcxx/test/std/containers/container.adaptors/stack/stack.cons/deduct.verify.cpp
+++ b/libcxx/test/std/containers/container.adaptors/stack/stack.cons/deduct.verify.cpp
@@ -28,20 +28,20 @@ int main(int, char**)
     {
 //  stack(const Container&, const Alloc&);
 //  The '45' is not an allocator
-    std::stack stk(std::list<int>({1,2,3}), 45);  // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'stack'}}
+    std::stack stk(std::list<int>({1,2,3}), 45);  // expected-error-re {{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}stack'}}
     }
 
     {
 //  stack(const stack&, const Alloc&);
 //  The '45' is not an allocator
     std::stack<int> source;
-    std::stack stk(source, 45);  // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'stack'}}
+    std::stack stk(source, 45);  // expected-error-re {{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}stack'}}
     }
 
 //  Test the implicit deduction guides
     {
 //  stack (allocator &)
-    std::stack stk((std::allocator<int>()));  // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'stack'}}
+    std::stack stk((std::allocator<int>()));  // expected-error-re {{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}stack'}}
 //  Note: The extra parens are necessary, since otherwise clang decides it is a function declaration.
 //  Also, we can't use {} instead of parens, because that constructs a
 //      stack<allocator<int>, allocator<allocator<int>>>
diff --git a/libcxx/test/std/containers/sequences/array/array.cons/deduct.verify.cpp b/libcxx/test/std/containers/sequences/array/array.cons/deduct.verify.cpp
index 1a654e195c7b6c..f59b761fad9c9d 100644
--- a/libcxx/test/std/containers/sequences/array/array.cons/deduct.verify.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.cons/deduct.verify.cpp
@@ -24,7 +24,7 @@
 int main(int, char**)
 {
     {
-    std::array arr{1,2,3L}; // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'array'}}
+    std::array arr{1,2,3L}; // expected-error-re {{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}array'}}
     }
 
   return 0;
diff --git a/libcxx/test/std/containers/sequences/deque/deque.cons/deduct.verify.cpp b/libcxx/test/std/containers/sequences/deque/deque.cons/deduct.verify.cpp
index 044669aaec822b..f65e230112d90d 100644
--- a/libcxx/test/std/containers/sequences/deque/deque.cons/deduct.verify.cpp
+++ b/libcxx/test/std/containers/sequences/deque/deque.cons/deduct.verify.cpp
@@ -29,7 +29,7 @@ int main(int, char**)
 //  Test the implicit deduction guides
     {
 //  deque (allocator &)
-    std::deque deq((std::allocator<int>()));  // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'deque'}}
+    std::deque deq((std::allocator<int>()));  // expected-error-re {{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}deque'}}
 //  Note: The extra parens are necessary, since otherwise clang decides it is a function declaration.
 //  Also, we can't use {} instead of parens, because that constructs a
 //      deque<allocator<int>, allocator<allocator<int>>>
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/deduct.verify.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/deduct.verify.cpp
index 47c1cdcea0e406..b3c3f73270f325 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/deduct.verify.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/deduct.verify.cpp
@@ -29,7 +29,7 @@ int main(int, char**)
 //  Test the implicit deduction guides
     {
 //  forward_list (allocator &)
-    std::forward_list fwl((std::allocator<int>()));  // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'forward_list'}}
+    std::forward_list fwl((std::allocator<int>()));  // expected-error-re {{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}forward_list'}}
 //  Note: The extra parens are necessary, since otherwise clang decides it is a function declaration.
 //  Also, we can't use {} instead of parens, because that constructs a
 //      forward_list<allocator<int>, allocator<allocator<int>>>
diff --git a/libcxx/test/std/containers/sequences/list/list.cons/deduct.verify.cpp b/libcxx/test/std/containers/sequences/list/list.cons/deduct.verify.cpp
index 96d14514456c5a..370cd38612ab0b 100644
--- a/libcxx/test/std/containers/sequences/list/list.cons/deduct.verify.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.cons/deduct.verify.cpp
@@ -29,7 +29,7 @@ int main(int, char**)
 //  Test the implicit deduction guides
     {
 //  list (allocator &)
-    std::list lst((std::allocator<int>()));  // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'list'}}
+    std::list lst((std::allocator<int>()));  // expected-error-re {{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}list'}}
 //  Note: The extra parens are necessary, since otherwise clang decides it is a function declaration.
 //  Also, we can't use {} instead of parens, because that constructs a
 //      deque<allocator<int>, allocator<allocator<int>>>
diff --git a/libcxx/test/std/containers/sequences/vector/vector.cons/deduct.verify.cpp b/libcxx/test/std/containers/sequences/vector/vector.cons/deduct.verify.cpp
index 2b2242e240a2cd..a6fc763050ec65 100644
--- a/libcxx/test/std/containers/sequences/vector/vector.cons/deduct.verify.cpp
+++ b/libcxx/test/std/containers/sequences/vector/vector.cons/deduct.verify.cpp
@@ -25,7 +25,7 @@ int main(int, char**) {
   //  Test the implicit deduction guides
   {
     //  vector (allocator &)
-    // expected-error@+1 {{no viable constructor or deduction guide for deduction of template arguments of 'vector'}}
+    // expected-error-re@+1 {{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}vector'}}
     std::vector vec(std::allocator< int>{});
   }
 
diff --git a/libcxx/test/std/containers/unord/unord.map/unord.map.cnstr/deduct.verify.cpp b/libcxx/test/std/containers/unord/unord.map/unord.map.cnstr/deduct.verify.cpp
index aa7a0580750f70..dc0ffd26813d92 100644
--- a/libcxx/test/std/containers/unord/unord.map/unord.map.cnstr/deduct.verify.cpp
+++ b/libcxx/test/std/containers/unord/unord.map/unord.map.cnstr/deduct.verify.cpp
@@ -64,41 +64,41 @@ int main(int, char**)
     using P = std::pair<const int, int>;
     {
         // cannot deduce Key from nothing
-        std::unordered_map m; // expected-error{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_map'}}
+        std::unordered_map m; // expected-error-re{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_map'}}
     }
     {
         // cannot deduce Key from just (Size)
-        std::unordered_map m(42); // expected-error{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_map'}}
+        std::unordered_map m(42); // expected-error-re{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_map'}}
     }
     {
         // cannot deduce Key from just (Size, Hash)
         std::unordered_map m(42, std::hash<int>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_map'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_map'}}
     }
     {
         // cannot deduce Key from just (Size, Hash, Pred)
         std::unordered_map m(42, std::hash<int>(), std::equal_to<int>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_map'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_map'}}
     }
     {
         // cannot deduce Key from just (Size, Hash, Pred, Allocator)
         std::unordered_map m(42, std::hash<int>(), std::equal_to<int>(), std::allocator<P>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_map'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_map'}}
     }
     {
         // cannot deduce Key from just (Allocator)
         std::unordered_map m(std::allocator<P>{});
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_map'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_map'}}
     }
     {
         // cannot deduce Key from just (Size, Allocator)
         std::unordered_map m(42, std::allocator<P>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_map'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_map'}}
     }
     {
         // cannot deduce Key from just (Size, Hash, Allocator)
         std::unordered_map m(42, std::hash<int>(), std::allocator<P>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_map'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_map'}}
     }
 
   return 0;
diff --git a/libcxx/test/std/containers/unord/unord.multimap/unord.multimap.cnstr/deduct.verify.cpp b/libcxx/test/std/containers/unord/unord.multimap/unord.multimap.cnstr/deduct.verify.cpp
index 5e8db678b6e20c..efcbbbce0e3027 100644
--- a/libcxx/test/std/containers/unord/unord.multimap/unord.multimap.cnstr/deduct.verify.cpp
+++ b/libcxx/test/std/containers/unord/unord.multimap/unord.multimap.cnstr/deduct.verify.cpp
@@ -64,41 +64,41 @@ int main(int, char**)
     using P = std::pair<const int, int>;
     {
         // cannot deduce Key from nothing
-        std::unordered_multimap m; // expected-error{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_multimap'}}
+        std::unordered_multimap m; // expected-error-re{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_multimap'}}
     }
     {
         // cannot deduce Key from just (Size)
-        std::unordered_multimap m(42); // expected-error{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_multimap'}}
+        std::unordered_multimap m(42); // expected-error-re{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_multimap'}}
     }
     {
         // cannot deduce Key from just (Size, Hash)
         std::unordered_multimap m(42, std::hash<int>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_multimap'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_multimap'}}
     }
     {
         // cannot deduce Key from just (Size, Hash, Pred)
         std::unordered_multimap m(42, std::hash<int>(), std::equal_to<int>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_multimap'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_multimap'}}
     }
     {
         // cannot deduce Key from just (Size, Hash, Pred, Allocator)
         std::unordered_multimap m(42, std::hash<int>(), std::equal_to<int>(), std::allocator<P>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_multimap'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_multimap'}}
     }
     {
         // cannot deduce Key from just (Allocator)
         std::unordered_multimap m(std::allocator<P>{});
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_multimap'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_multimap'}}
     }
     {
         // cannot deduce Key from just (Size, Allocator)
         std::unordered_multimap m(42, std::allocator<P>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_multimap'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_multimap'}}
     }
     {
         // cannot deduce Key from just (Size, Hash, Allocator)
         std::unordered_multimap m(42, std::hash<int>(), std::allocator<P>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_multimap'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_multimap'}}
     }
 
   return 0;
diff --git a/libcxx/test/std/containers/unord/unord.multiset/unord.multiset.cnstr/deduct.verify.cpp b/libcxx/test/std/containers/unord/unord.multiset/unord.multiset.cnstr/deduct.verify.cpp
index 16dd268f4b08a7..8fd7d1d6c20efa 100644
--- a/libcxx/test/std/containers/unord/unord.multiset/unord.multiset.cnstr/deduct.verify.cpp
+++ b/libcxx/test/std/containers/unord/unord.multiset/unord.multiset.cnstr/deduct.verify.cpp
@@ -55,42 +55,42 @@ int main(int, char**)
     {
         // cannot deduce Key from nothing
         std::unordered_multiset s;
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_multiset'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_multiset'}}
     }
     {
         // cannot deduce Key from just (Size)
         std::unordered_multiset s(42);
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_multiset'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_multiset'}}
     }
     {
         // cannot deduce Key from just (Size, Hash)
         std::unordered_multiset s(42, std::hash<int>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_multiset'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_multiset'}}
     }
     {
         // cannot deduce Key from just (Size, Hash, Pred)
         std::unordered_multiset s(42, std::hash<int>(), std::equal_to<>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_multiset'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_multiset'}}
     }
     {
         // cannot deduce Key from just (Size, Hash, Pred, Allocator)
         std::unordered_multiset s(42, std::hash<int>(), std::equal_to<>(), std::allocator<int>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_multiset'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_multiset'}}
     }
     {
         // cannot deduce Key from just (Allocator)
         std::unordered_multiset s(std::allocator<int>{});
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_multiset'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_multiset'}}
     }
     {
         // cannot deduce Key from just (Size, Allocator)
         std::unordered_multiset s(42, std::allocator<int>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_multiset'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_multiset'}}
     }
     {
         // cannot deduce Key from just (Size, Hash, Allocator)
         std::unordered_multiset s(42, std::hash<short>(), std::allocator<int>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_multiset'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_multiset'}}
     }
 
     return 0;
diff --git a/libcxx/test/std/containers/unord/unord.set/unord.set.cnstr/deduct.verify.cpp b/libcxx/test/std/containers/unord/unord.set/unord.set.cnstr/deduct.verify.cpp
index d6082810216df8..26e5f9ae6ce430 100644
--- a/libcxx/test/std/containers/unord/unord.set/unord.set.cnstr/deduct.verify.cpp
+++ b/libcxx/test/std/containers/unord/unord.set/unord.set.cnstr/deduct.verify.cpp
@@ -55,42 +55,42 @@ int main(int, char**)
     {
         // cannot deduce Key from nothing
         std::unordered_set s;
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_set'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_set'}}
     }
     {
         // cannot deduce Key from just (Size)
         std::unordered_set s(42);
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_set'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_set'}}
     }
     {
         // cannot deduce Key from just (Size, Hash)
         std::unordered_set s(42, std::hash<int>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_set'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_set'}}
     }
     {
         // cannot deduce Key from just (Size, Hash, Pred)
         std::unordered_set s(42, std::hash<int>(), std::equal_to<>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_set'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_set'}}
     }
     {
         // cannot deduce Key from just (Size, Hash, Pred, Allocator)
         std::unordered_set s(42, std::hash<int>(), std::equal_to<>(), std::allocator<int>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_set'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_set'}}
     }
     {
         // cannot deduce Key from just (Allocator)
         std::unordered_set s(std::allocator<int>{});
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_set'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_set'}}
     }
     {
         // cannot deduce Key from just (Size, Allocator)
         std::unordered_set s(42, std::allocator<int>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_set'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_set'}}
     }
     {
         // cannot deduce Key from just (Size, Hash, Allocator)
         std::unordered_set s(42, std::hash<short>(), std::allocator<int>());
-            // expected-error@-1{{no viable constructor or deduction guide for deduction of template arguments of 'unordered_set'}}
+            // expected-error-re@-1{{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}unordered_set'}}
     }
 
     return 0;
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join/ctad.verify.cpp b/libcxx/test/std/ranges/range.adaptors/range.join/ctad.verify.cpp
index 2c6eea500580d6..de0c108e98336f 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.join/ctad.verify.cpp
+++ b/libcxx/test/std/ranges/range.adaptors/range.join/ctad.verify.cpp
@@ -27,5 +27,5 @@ struct Range {
 
 void testExplicitCTAD() {
   Range<Range<int>> r;
-  std::ranges::join_view v = r; // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'join_view'}}
+  std::ranges::join_view v = r; // expected-error-re {{no viable constructor or deduction guide for deduction of template arguments of '{{(std::ranges::)?}}join_view'}}
 }
diff --git a/libcxx/test/std/re/re.regex/re.regex.construct/deduct.verify.cpp b/libcxx/test/std/re/re.regex/re.regex.construct/deduct.verify.cpp
index 593dd9d0ec51d9..8ef87cfc6468f5 100644
--- a/libcxx/test/std/re/re.regex/re.regex.construct/deduct.verify.cpp
+++ b/libcxx/test/std/re/re.regex/re.regex.construct/deduct.verify.cpp
@@ -27,13 +27,13 @@ int main(int, char**)
     {
     // basic_regex(ForwardIterator, ForwardIterator)
     // <int> is not an iterator
-    std::basic_regex re(23, 34);   // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'basic_regex'}}
+    std::basic_regex re(23, 34);   // expected-error-re {{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}basic_regex'}}
     }
 
     {
     // basic_regex(ForwardIterator, ForwardIterator, flag_type)
     // <double> is not an iterator
-    std::basic_regex re(23.0, 34.0, std::regex_constants::basic);   // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'basic_regex'}}
+    std::basic_regex re(23.0, 34.0, std::regex_constants::basic);   // expected-error-re {{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}basic_regex'}}
     }
 
     return 0;
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/deduct.verify.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/deduct.verify.cpp
index 0c887c683a8186..364f9b2e955f0c 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/deduct.verify.cpp
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/deduct.verify.cpp
@@ -25,7 +25,7 @@ int main(int, char**)
 //  Test the implicit deduction guides
     {
 //  optional()
-    std::optional opt;   // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'optional'}}
+    std::optional opt;   // expected-error-re {{no viable constructor or deduction guide for deduction of template arguments of '{{(std::)?}}optional'}}
     }
 
     {

From f68fdb84e1809caf42dd22b94145f06974429091 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 29 May 2024 22:02:27 +0200
Subject: [PATCH 169/230] DAG: Fix losing flags on select when expanding
 select_cc (#93662)

This was only preserving the flags on the setcc, not the new select.
This was missing presumably due to getSelect not having a flags argument
until recently. Avoids regressions in a future commit.
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index bfc2273c9425cf..51f2cf9017f85e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4159,7 +4159,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
              "expanded.");
       EVT CCVT = getSetCCResultType(CmpVT);
       SDValue Cond = DAG.getNode(ISD::SETCC, dl, CCVT, Tmp1, Tmp2, CC, Node->getFlags());
-      Results.push_back(DAG.getSelect(dl, VT, Cond, Tmp3, Tmp4));
+      Results.push_back(
+          DAG.getSelect(dl, VT, Cond, Tmp3, Tmp4, Node->getFlags()));
       break;
     }
 

From 4e67f45168b6ba95864285ba7f0ee313b084bdfb Mon Sep 17 00:00:00 2001
From: Florian Mayer <fmayer@google.com>
Date: Wed, 29 May 2024 11:25:11 -0700
Subject: [PATCH 170/230] Reapply "[MTE] add stack frame history buffer"

In the reverted change, the order of the IR was dependent on the host
compiler, because we inserted instructions in arguments to functions.
Fix that, and also fix another problem with the test.

This reverts commit 3313f28897a87ec313ec0b52ef71c14d3b9ff652.
---
 .../Target/AArch64/AArch64FrameLowering.cpp   |  3 +-
 .../Target/AArch64/AArch64StackTagging.cpp    | 64 ++++++++++++++++-
 .../CodeGen/AArch64/stack-tagging-prologue.ll | 69 +++++++++++++++++++
 3 files changed, 134 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index dc7759367687b7..cd532671f50189 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2500,7 +2500,8 @@ AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
   return resolveFrameIndexReference(
       MF, FI, FrameReg,
       /*PreferFP=*/
-      MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
+      MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress) ||
+          MF.getFunction().hasFnAttribute(Attribute::SanitizeMemTag),
       /*ForSimm=*/false);
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index aabc5d5d22e2d3..fa0bb7b93e3bdd 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -11,6 +11,7 @@
 #include "AArch64InstrInfo.h"
 #include "AArch64Subtarget.h"
 #include "AArch64TargetMachine.h"
+#include "llvm/ADT/APInt.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -21,6 +22,7 @@
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/StackSafetyAnalysis.h"
+#include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -82,6 +84,26 @@ static cl::opt<size_t> ClMaxLifetimes(
     cl::desc("How many lifetime ends to handle for a single alloca."),
     cl::Optional);
 
+// Mode for selecting how to insert frame record info into the stack ring
+// buffer.
+enum RecordStackHistoryMode {
+  // Do not record frame record info.
+  none,
+
+  // Insert instructions into the prologue for storing into the stack ring
+  // buffer directly.
+  instr,
+};
+
+static cl::opt<RecordStackHistoryMode> ClRecordStackHistory(
+    "stack-tagging-record-stack-history",
+    cl::desc("Record stack frames with tagged allocations in a thread-local "
+             "ring buffer"),
+    cl::values(clEnumVal(none, "Do not record stack ring history"),
+               clEnumVal(instr, "Insert instructions into the prologue for "
+                                "storing into the stack ring buffer")),
+    cl::Hidden, cl::init(none));
+
 static const Align kTagGranuleSize = Align(16);
 
 namespace {
@@ -309,6 +331,7 @@ class AArch64StackTagging : public FunctionPass {
                                    uint64_t Size, InitializerBuilder &IB);
 
   Instruction *insertBaseTaggedPointer(
+      const Module &M,
       const MapVector<AllocaInst *, memtag::AllocaInfo> &Allocas,
       const DominatorTree *DT);
   bool runOnFunction(Function &F) override;
@@ -437,6 +460,7 @@ void AArch64StackTagging::untagAlloca(AllocaInst *AI, Instruction *InsertBefore,
 }
 
 Instruction *AArch64StackTagging::insertBaseTaggedPointer(
+    const Module &M,
     const MapVector<AllocaInst *, memtag::AllocaInfo> &AllocasToInstrument,
     const DominatorTree *DT) {
   BasicBlock *PrologueBB = nullptr;
@@ -458,6 +482,41 @@ Instruction *AArch64StackTagging::insertBaseTaggedPointer(
   Instruction *Base =
       IRB.CreateCall(IRG_SP, {Constant::getNullValue(IRB.getInt64Ty())});
   Base->setName("basetag");
+  auto TargetTriple = Triple(M.getTargetTriple());
+  // This is not a stable ABI for now, so only allow in dev builds with API
+  // level 10000.
+  // The ThreadLong format is the same as with HWASan, but the entries for
+  // stack MTE take two slots (16 bytes).
+  if (ClRecordStackHistory == instr && TargetTriple.isAndroid() &&
+      TargetTriple.isAArch64() && !TargetTriple.isAndroidVersionLT(10000) &&
+      !AllocasToInstrument.empty()) {
+    constexpr int StackMteSlot = -3;
+    constexpr uint64_t TagMask = 0xFULL << 56;
+
+    auto *IntptrTy = IRB.getIntPtrTy(M.getDataLayout());
+    Value *SlotPtr = memtag::getAndroidSlotPtr(IRB, StackMteSlot);
+    auto *ThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr);
+    Value *FP = memtag::getFP(IRB);
+    Value *Tag = IRB.CreateAnd(IRB.CreatePtrToInt(Base, IntptrTy), TagMask);
+    Value *TaggedFP = IRB.CreateOr(FP, Tag);
+    Value *PC = memtag::getPC(TargetTriple, IRB);
+    Value *RecordPtr = IRB.CreateIntToPtr(ThreadLong, IRB.getPtrTy(0));
+    IRB.CreateStore(PC, RecordPtr);
+    IRB.CreateStore(TaggedFP, IRB.CreateConstGEP1_64(IntptrTy, RecordPtr, 1));
+    // Update the ring buffer. Top byte of ThreadLong defines the size of the
+    // buffer in pages, it must be a power of two, and the start of the buffer
+    // must be aligned by twice that much. Therefore wrap around of the ring
+    // buffer is simply Addr &= ~((ThreadLong >> 56) << 12).
+    // The use of AShr instead of LShr is due to
+    //   https://bugs.llvm.org/show_bug.cgi?id=39030
+    // Runtime library makes sure not to use the highest bit.
+    Value *WrapMask = IRB.CreateXor(
+        IRB.CreateShl(IRB.CreateAShr(ThreadLong, 56), 12, "", true, true),
+        ConstantInt::get(IntptrTy, (uint64_t)-1));
+    Value *ThreadLongNew = IRB.CreateAnd(
+        IRB.CreateAdd(ThreadLong, ConstantInt::get(IntptrTy, 16)), WrapMask);
+    IRB.CreateStore(ThreadLongNew, SlotPtr);
+  }
   return Base;
 }
 
@@ -513,7 +572,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
   SetTagFunc =
       Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag);
 
-  Instruction *Base = insertBaseTaggedPointer(SInfo.AllocasToInstrument, DT);
+  Instruction *Base =
+      insertBaseTaggedPointer(*Fn.getParent(), SInfo.AllocasToInstrument, DT);
 
   int NextTag = 0;
   for (auto &I : SInfo.AllocasToInstrument) {
@@ -575,6 +635,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
       for (auto *II : Info.LifetimeEnd)
         II->eraseFromParent();
     }
+
+    memtag::annotateDebugRecords(Info, static_cast<unsigned long>(Tag));
   }
 
   // If we have instrumented at least one alloca, all unrecognized lifetime
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll b/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll
new file mode 100644
index 00000000000000..26a0aa614c98b8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll
@@ -0,0 +1,69 @@
+; RUN: opt < %s -aarch64-stack-tagging -stack-tagging-use-stack-safety=0 -S -o - | FileCheck %s --check-prefixes=CHECK
+; RUN: opt < %s -aarch64-stack-tagging -stack-tagging-use-stack-safety=0 -S -stack-tagging-record-stack-history=instr -o - | FileCheck %s --check-prefixes=INSTR
+; RUN llc -mattr=+mte -stack-tagging-use-stack-safety=0 -stack-tagging-record-stack-history=instr %s -o - | FileCheck %s --check-prefixes=ASMINSTR
+
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android10000"
+
+declare void @use8(ptr)
+declare void @use32(ptr)
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
+
+define dso_local void @noUse32(ptr) sanitize_memtag {
+entry:
+  ret void
+}
+
+define void @OneVar() sanitize_memtag {
+entry:
+  %x = alloca i32, align 4
+  call void @use32(ptr %x)
+  ret void
+}
+
+; CHECK-LABEL: define void @OneVar(
+; CHECK:  [[BASE:%.*]] = call ptr @llvm.aarch64.irg.sp(i64 0)
+; CHECK:  [[X:%.*]] = alloca { i32, [12 x i8] }, align 16
+; CHECK:  [[TX:%.*]] = call ptr @llvm.aarch64.tagp.{{.*}}(ptr [[X]], ptr [[BASE]], i64 0)
+; CHECK:  ret void
+
+; INSTR-LABEL: define void @OneVar(
+; INSTR:  [[BASE:%.*]] = call ptr @llvm.aarch64.irg.sp(i64 0)
+; INSTR:  [[TLS:%.*]] = call ptr @llvm.thread.pointer()
+; INSTR:  [[TLS_SLOT:%.*]] = getelementptr i8, ptr [[TLS]], i32 -24
+; INSTR:  [[TLS_VALUE:%.*]] = load i64, ptr %1, align 8
+; INSTR:  [[FP:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
+; INSTR:  [[FP_INT:%.*]] = ptrtoint ptr [[FP]] to i64
+; INSTR:  [[BASE_INT:%.*]] = ptrtoint ptr [[BASE]] to i64
+; INSTR:  [[BASE_TAG:%.*]] = and i64 [[BASE_INT]], 1080863910568919040
+; INSTR:  [[TAGGED_FP:%.*]] = or i64 [[FP_INT]], [[BASE_TAG]]
+; INSTR:  [[PC:%.*]] = call i64 @llvm.read_register.i64(metadata !0)
+; INSTR:  [[TLS_VALUE_PTR:%.*]] = inttoptr i64 [[TLS_VALUE]] to ptr
+; INSTR:  store i64 [[PC]], ptr [[TLS_VALUE_PTR]], align 8
+; INSTR:  [[SECOND_SLOT:%.*]] = getelementptr i64, ptr [[TLS_VALUE_PTR]], i64 1
+; INSTR:  store i64 [[TAGGED_FP]], ptr [[SECOND_SLOT]], align 8
+; INSTR:  [[SIZE_IN_PAGES:%.*]] = ashr i64 [[TLS_VALUE]], 56
+; INSTR:  [[WRAP_MASK_INTERMEDIARY:%.*]] = shl nuw nsw i64 [[SIZE_IN_PAGES]], 12
+; INSTR:  [[WRAP_MASK:%.*]] = xor i64 [[WRAP_MASK_INTERMEDIARY]], -1
+; INSTR:  [[NEXT_TLS_VALUE_BEFORE_WRAP:%.*]] = add i64 [[TLS_VALUE]], 16
+; INSTR:  [[NEXT_TLS_VALUE:%.*]] = and i64 [[NEXT_TLS_VALUE_BEFORE_WRAP]], [[WRAP_MASK]]
+; INSTR:  store i64 [[NEXT_TLS_VALUE]], ptr [[TLS_SLOT]], align 8
+; INSTR:  [[X:%.*]] = alloca { i32, [12 x i8] }, align 16
+; INSTR:  [[TX:%.*]] = call ptr @llvm.aarch64.tagp.{{.*}}(ptr [[X]], ptr [[BASE]], i64 0)
+; INSTR:  [[PC:!.*]] = !{!"pc"}
+
+; ASMINSTR-LABEL: OneVar:
+; ASMINSTR:  mrs	[[TLS:x.*]], TPIDR_EL0
+; ASMINSTR:  irg	[[BASE:x.*]], sp
+; ASMINSTR:  adr	[[PC:x.*]], #0
+; ASMINSTR:  ldur	[[TLS_SLOT:x.*]], [[[TLS]], #-24]
+; ASMINSTR:  and	[[SP_TAG:x.*]], [[BASE]], #0xf00000000000000
+; ASMINSTR:  orr	[[TAGGED_FP]], x29, [[SP_TAG]]
+; ASMINSTR:  asr	[[TLS_SIZE:x.*]], [[TLS_SLOT]], #56
+; ASMINSTR:  add	[[NEXT_TLS_VALUE_BEFORE_WRAP:x.*]], [[TLS_SLOT]], #16
+; ASMINSTR:  stp	[[PC]], [[TAGGED_FP]], [[[TLS_SLOT]]]
+; ASMINSTR:  bic	[[NEXT_TLS_VALUE:x.*]], [[NEXT_TLS_VALUE_BEFORE_WRAP]], [[TLS_SIZE]], lsl #12
+; ASMINSTR:  stur	[[NEXT_TLS_VALUE]], [[[TLS]], #-24]
+; ASMINSTR:  stg	[[BASE]], [[[BASE]]]

From 428b9be6484404183f51de08d2503570bade2287 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com>
Date: Wed, 29 May 2024 13:05:44 -0700
Subject: [PATCH 171/230] [mlir] Align num elements type to LLVM ArrayType
 (#93230)

MLIR LLVMArrayType is using `unsigned` for the number of elements while
LLVM ArrayType is using `uint64_t`
https://github.com/llvm/llvm-project/blob/4ae896fe979b7db501cabde4b6b3504478958682/llvm/include/llvm/IR/DerivedTypes.h#L377

This leads to silent truncation when we use it for globals in flang.

```
program test
  integer(8), parameter :: large = 2**30
  real,  dimension(large) :: bigarray
  common /c/ bigarray
  bigarray(999) = 666
end
```

The above program would result in a segfault since the global would be
of size 0 because of the silent truncation.

```
fir.global common @c_(dense<0> : vector<4294967296xi8>) : !fir.array<4294967296xi8>
```
became
```
llvm.mlir.global common @c_(dense<0> : vector<4294967296xi8>) {addr_space = 0 : i32} : !llvm.array<0 x i8>
```

This patch updates the definition of MLIR ArrayType to take `uint64_t`
as argument of the number of elements to be compatible with LLVM.
---
 flang/test/Fir/convert-to-llvm.fir            |  6 +++
 mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td |  4 +-
 mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp      |  6 +--
 mlir/lib/Target/LLVMIR/ModuleTranslation.cpp  | 39 ++++++++++++++++++-
 mlir/test/Target/LLVMIR/llvmir.mlir           |  5 +++
 5 files changed, 53 insertions(+), 7 deletions(-)

diff --git a/flang/test/Fir/convert-to-llvm.fir b/flang/test/Fir/convert-to-llvm.fir
index 70cb0443e9a645..369d4bd3029bcc 100644
--- a/flang/test/Fir/convert-to-llvm.fir
+++ b/flang/test/Fir/convert-to-llvm.fir
@@ -2699,3 +2699,9 @@ func.func @coordinate_array_unknown_size_1d(%arg0: !fir.ptr<!fir.array<? x i32>>
 // CHECK:           %[[VAL_2:.*]] = llvm.getelementptr %[[VAL_0]]{{\[}}%[[VAL_1]]] : (!llvm.ptr, i64) -> !llvm.ptr, i32
 // CHECK:           llvm.return
 // CHECK:         }
+
+// -----
+
+fir.global common @c_(dense<0> : vector<4294967296xi8>) : !fir.array<4294967296xi8>
+
+// CHECK: llvm.mlir.global common @c_(dense<0> : vector<4294967296xi8>) {addr_space = 0 : i32} : !llvm.array<4294967296 x i8>
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td
index b7176aa93ff1f7..8f9c2f2f8a0b44 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td
@@ -40,7 +40,7 @@ def LLVMArrayType : LLVMType<"LLVMArray", "array", [
     ```
   }];
 
-  let parameters = (ins "Type":$elementType, "unsigned":$numElements);
+  let parameters = (ins "Type":$elementType, "uint64_t":$numElements);
   let assemblyFormat = [{
     `<` $numElements `x` custom<PrettyLLVMType>($elementType) `>`
   }];
@@ -49,7 +49,7 @@ def LLVMArrayType : LLVMType<"LLVMArray", "array", [
 
   let builders = [
     TypeBuilderWithInferredContext<(ins "Type":$elementType,
-                                        "unsigned":$numElements)>
+                                        "uint64_t":$numElements)>
   ];
 
   let extraClassDeclaration = [{
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
index ad1dc4a36b82b8..cf3f38b7101307 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
@@ -154,14 +154,14 @@ bool LLVMArrayType::isValidElementType(Type type) {
       type);
 }
 
-LLVMArrayType LLVMArrayType::get(Type elementType, unsigned numElements) {
+LLVMArrayType LLVMArrayType::get(Type elementType, uint64_t numElements) {
   assert(elementType && "expected non-null subtype");
   return Base::get(elementType.getContext(), elementType, numElements);
 }
 
 LLVMArrayType
 LLVMArrayType::getChecked(function_ref<InFlightDiagnostic()> emitError,
-                          Type elementType, unsigned numElements) {
+                          Type elementType, uint64_t numElements) {
   assert(elementType && "expected non-null subtype");
   return Base::getChecked(emitError, elementType.getContext(), elementType,
                           numElements);
@@ -169,7 +169,7 @@ LLVMArrayType::getChecked(function_ref<InFlightDiagnostic()> emitError,
 
 LogicalResult
 LLVMArrayType::verify(function_ref<InFlightDiagnostic()> emitError,
-                      Type elementType, unsigned numElements) {
+                      Type elementType, uint64_t numElements) {
   if (!isValidElementType(elementType))
     return emitError() << "invalid array element type: " << elementType;
   return success();
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index 1ec0736ec08bfe..176821f82434d7 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -632,8 +632,43 @@ llvm::Constant *mlir::LLVM::detail::getLLVMConstant(
           llvm::ElementCount::get(numElements, /*Scalable=*/isScalable), child);
     if (llvmType->isArrayTy()) {
       auto *arrayType = llvm::ArrayType::get(elementType, numElements);
-      SmallVector<llvm::Constant *, 8> constants(numElements, child);
-      return llvm::ConstantArray::get(arrayType, constants);
+      if (child->isZeroValue()) {
+        return llvm::ConstantAggregateZero::get(arrayType);
+      } else {
+        if (llvm::ConstantDataSequential::isElementTypeCompatible(
+                elementType)) {
+          // TODO: Handle all compatible types. This code only handles integer.
+          if (llvm::IntegerType *iTy =
+                  dyn_cast<llvm::IntegerType>(elementType)) {
+            if (llvm::ConstantInt *ci = dyn_cast<llvm::ConstantInt>(child)) {
+              if (ci->getBitWidth() == 8) {
+                SmallVector<int8_t> constants(numElements, ci->getZExtValue());
+                return llvm::ConstantDataArray::get(elementType->getContext(),
+                                                    constants);
+              }
+              if (ci->getBitWidth() == 16) {
+                SmallVector<int16_t> constants(numElements, ci->getZExtValue());
+                return llvm::ConstantDataArray::get(elementType->getContext(),
+                                                    constants);
+              }
+              if (ci->getBitWidth() == 32) {
+                SmallVector<int32_t> constants(numElements, ci->getZExtValue());
+                return llvm::ConstantDataArray::get(elementType->getContext(),
+                                                    constants);
+              }
+              if (ci->getBitWidth() == 64) {
+                SmallVector<int64_t> constants(numElements, ci->getZExtValue());
+                return llvm::ConstantDataArray::get(elementType->getContext(),
+                                                    constants);
+              }
+            }
+          }
+        }
+        // std::vector is used here to accommodate large number of elements that
+        // exceed SmallVector capacity.
+        std::vector<llvm::Constant *> constants(numElements, child);
+        return llvm::ConstantArray::get(arrayType, constants);
+      }
     }
   }
 
diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir
index 97f37939551d83..41a7eec1d8dfc2 100644
--- a/mlir/test/Target/LLVMIR/llvmir.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir.mlir
@@ -2396,3 +2396,8 @@ llvm.func @zeroinit_complex_local_aggregate() {
 llvm.linker_options ["/DEFAULTLIB:", "libcmt"]
 //CHECK: ![[MD1]] = !{!"/DEFAULTLIB:", !"libcmtd"}
 llvm.linker_options ["/DEFAULTLIB:", "libcmtd"]
+
+// -----
+
+// CHECK: @big_ = common global [4294967296 x i8] zeroinitializer
+llvm.mlir.global common @big_(dense<0> : vector<4294967296xi8>) {addr_space = 0 : i32} : !llvm.array<4294967296 x i8>

From 4b4d36654d8056546b177b3d04c352ba0b16d7ea Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Wed, 29 May 2024 18:48:27 +0100
Subject: [PATCH 172/230] [RISCV] Store only VNInfo val no in VSETVLIInfo. NFC

The VNInfo id (called val no elsewhere it seems) and register are enough
to uniquely identify AVL values, so try to store as little state as
possible.

This may also allow us to use dummy val nos in an upcoming patch when we
don't have LiveIntervals.
---
 llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 32 +++++++++++---------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 1a4f34b2d2215a..a63ce613c1f330 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -504,7 +504,7 @@ DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) {
 class VSETVLIInfo {
   struct AVLDef {
     // Every AVLDef should have a VNInfo.
-    const VNInfo *ValNo;
+    unsigned ValNo;
     Register DefReg;
   };
   union {
@@ -543,9 +543,9 @@ class VSETVLIInfo {
   void setUnknown() { State = Unknown; }
   bool isUnknown() const { return State == Unknown; }
 
-  void setAVLRegDef(const VNInfo *VNInfo, Register AVLReg) {
-    assert(VNInfo && AVLReg.isVirtual());
-    AVLRegDef.ValNo = VNInfo;
+  void setAVLRegDef(unsigned ValNo, Register AVLReg) {
+    assert(AVLReg.isVirtual());
+    AVLRegDef.ValNo = ValNo;
     AVLRegDef.DefReg = AVLReg;
     State = AVLIsReg;
   }
@@ -571,7 +571,7 @@ class VSETVLIInfo {
     assert(hasAVLImm());
     return AVLImm;
   }
-  const VNInfo *getAVLVNInfo() const {
+  unsigned getAVLValNo() const {
     assert(hasAVLReg());
     return AVLRegDef.ValNo;
   }
@@ -580,8 +580,10 @@ class VSETVLIInfo {
   // boundary slot.
   const MachineInstr *getAVLDefMI(const LiveIntervals *LIS) const {
     assert(hasAVLReg());
-    auto *MI = LIS->getInstructionFromIndex(getAVLVNInfo()->def);
-    assert(!(getAVLVNInfo()->isPHIDef() && MI));
+    const VNInfo *VNI =
+        LIS->getInterval(getAVLReg()).getValNumInfo(getAVLValNo());
+    auto *MI = LIS->getInstructionFromIndex(VNI->def);
+    assert(!(VNI->isPHIDef() && MI));
     return MI;
   }
 
@@ -590,7 +592,7 @@ class VSETVLIInfo {
     if (Info.isUnknown())
       setUnknown();
     else if (Info.hasAVLReg())
-      setAVLRegDef(Info.getAVLVNInfo(), Info.getAVLReg());
+      setAVLRegDef(Info.getAVLValNo(), Info.getAVLReg());
     else if (Info.hasAVLVLMAX())
       setAVLVLMAX();
     else if (Info.hasAVLIgnored())
@@ -629,7 +631,7 @@ class VSETVLIInfo {
 
   bool hasSameAVL(const VSETVLIInfo &Other) const {
     if (hasAVLReg() && Other.hasAVLReg())
-      return getAVLVNInfo()->id == Other.getAVLVNInfo()->id &&
+      return getAVLValNo() == Other.getAVLValNo() &&
              getAVLReg() == Other.getAVLReg();
 
     if (hasAVLImm() && Other.hasAVLImm())
@@ -927,7 +929,7 @@ static VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI,
     if (AVLReg == RISCV::X0)
       NewInfo.setAVLVLMAX();
     else if (VNInfo *VNI = getVNInfoFromReg(AVLReg, MI, LIS))
-      NewInfo.setAVLRegDef(VNI, AVLReg);
+      NewInfo.setAVLRegDef(VNI->id, AVLReg);
     else {
       assert(MI.getOperand(1).isUndef());
       NewInfo.setAVLIgnored();
@@ -1003,7 +1005,7 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
       else
         InstrInfo.setAVLImm(Imm);
     } else if (VNInfo *VNI = getVNInfoFromReg(VLOp.getReg(), MI, LIS)) {
-      InstrInfo.setAVLRegDef(VNI, VLOp.getReg());
+      InstrInfo.setAVLRegDef(VNI->id, VLOp.getReg());
     } else {
       assert(VLOp.isUndef());
       InstrInfo.setAVLIgnored();
@@ -1255,7 +1257,7 @@ void RISCVInsertVSETVLI::transferAfter(VSETVLIInfo &Info,
     auto &LI = LIS->getInterval(MI.getOperand(1).getReg());
     SlotIndex SI = LIS->getSlotIndexes()->getInstructionIndex(MI).getRegSlot();
     VNInfo *VNI = LI.getVNInfoAt(SI);
-    Info.setAVLRegDef(VNI, MI.getOperand(1).getReg());
+    Info.setAVLRegDef(VNI->id, MI.getOperand(1).getReg());
     return;
   }
 
@@ -1350,7 +1352,8 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
     return true;
 
   // We need the AVL to have been produced by a PHI node in this basic block.
-  const VNInfo *Valno = Require.getAVLVNInfo();
+  const VNInfo *Valno = LIS->getInterval(Require.getAVLReg())
+                            .getValNumInfo(Require.getAVLValNo());
   if (!Valno->isPHIDef() || LIS->getMBBFromIndex(Valno->def) != &MBB)
     return true;
 
@@ -1514,7 +1517,8 @@ void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) {
   // we need to prove the value is available at the point we're going
   // to insert the vsetvli at.
   if (AvailableInfo.hasAVLReg()) {
-    SlotIndex SI = AvailableInfo.getAVLVNInfo()->def;
+    const LiveInterval &LI = LIS->getInterval(AvailableInfo.getAVLReg());
+    SlotIndex SI = LI.getValNumInfo(AvailableInfo.getAVLValNo())->def;
     // This is an inline dominance check which covers the case of
     // UnavailablePred being the preheader of a loop.
     if (LIS->getMBBFromIndex(SI) != UnavailablePred)

From 3255752cbdd73595605439ad66aa4273a444af17 Mon Sep 17 00:00:00 2001
From: Mingming Liu <mingmingl@google.com>
Date: Wed, 29 May 2024 13:17:19 -0700
Subject: [PATCH 173/230] [Github]Update PGO with more filepaths (#93720)

- `llvm/**/ProfileData/**/*` intends to cover `llvm/include/llvm/ProfileData/` and `llvm/lib/ProfileData/`
- `llvm/**/SampleProfile*` intends to cover a bunch of SamplePGO files and their headers. For example,
    `SampleProfile.cpp`, `SampleProfileMatcher.cpp`, `SampleProfileProbe.cpp`
- `llvm/**/CodeGen/MIRSampleProfile*` intends to cover MIRSampleProfile.cpp and its header.
- `llvm/test/Transforms/SampleProfile/**/*` intends to cover unit tests.
---
 .github/new-prs-labeler.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml
index a57ba28faf160b..0aa7f761ee0ab3 100644
--- a/.github/new-prs-labeler.yml
+++ b/.github/new-prs-labeler.yml
@@ -54,6 +54,9 @@ llvm-lit:
   - llvm/utils/lit/**/*
 
 PGO:
+  - llvm/**/ProfileData/**/*
+  - llvm/**/SampleProfile*
+  - llvm/**/CodeGen/MIRSampleProfile*
   - llvm/lib/Transforms/Instrumentation/CGProfile.cpp
   - llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
   - llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -62,9 +65,9 @@ PGO:
   - llvm/lib/Transforms/Instrumentation/ValueProfile*
   - llvm/test/Instrumentation/InstrProfiling/**/*
   - llvm/test/Transforms/PGOProfile/**/*
+  - llvm/test/Transforms/SampleProfile/**/*
   - llvm/**/llvm-profdata/**/*
   - llvm/**/llvm-profgen/**/*
-  - llvm/unittests/ProfileData/**/*
 
 vectorization:
   - llvm/lib/Transforms/Vectorize/**/*

From 3fcf36363d0dc18f9782538897c13ff27e23d7a9 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Wed, 29 May 2024 21:10:28 +0100
Subject: [PATCH 174/230] [RISCV] Make some static functions in
 RISCVInsertVSETVLI methods. NFC

So we don't have to thread through some common arguments, and to allow
some methods to access state in an upcoming patch.
---
 llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 48 +++++++++++---------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index a63ce613c1f330..f350644d4512e6 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -902,7 +902,14 @@ class RISCVInsertVSETVLI : public MachineFunctionPass {
   void emitVSETVLIs(MachineBasicBlock &MBB);
   void doPRE(MachineBasicBlock &MBB);
   void insertReadVL(MachineBasicBlock &MBB);
+
+  bool canMutatePriorConfig(const MachineInstr &PrevMI, const MachineInstr &MI,
+                            const DemandedFields &Used) const;
   void coalesceVSETVLIs(MachineBasicBlock &MBB) const;
+
+  VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI) const;
+  VSETVLIInfo computeInfoForInstr(const MachineInstr &MI,
+                                  uint64_t TSFlags) const;
 };
 
 } // end anonymous namespace
@@ -915,8 +922,8 @@ INITIALIZE_PASS(RISCVInsertVSETVLI, DEBUG_TYPE, RISCV_INSERT_VSETVLI_NAME,
 
 // Return a VSETVLIInfo representing the changes made by this VSETVLI or
 // VSETIVLI instruction.
-static VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI,
-                                     const LiveIntervals *LIS) {
+VSETVLIInfo
+RISCVInsertVSETVLI::getInfoForVSETVLI(const MachineInstr &MI) const {
   VSETVLIInfo NewInfo;
   if (MI.getOpcode() == RISCV::PseudoVSETIVLI) {
     NewInfo.setAVLImm(MI.getOperand(1).getImm());
@@ -950,9 +957,8 @@ static unsigned computeVLMAX(unsigned VLEN, unsigned SEW,
   return VLEN/SEW;
 }
 
-static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
-                                       const RISCVSubtarget &ST,
-                                       const LiveIntervals *LIS) {
+VSETVLIInfo RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI,
+                                                    uint64_t TSFlags) const {
   VSETVLIInfo InstrInfo;
 
   bool TailAgnostic = true;
@@ -996,8 +1002,8 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
       if (Imm == RISCV::VLMaxSentinel) {
         // If we know the exact VLEN, see if we can use the constant encoding
         // for the VLMAX instead.  This reduces register pressure slightly.
-        const unsigned VLMAX = computeVLMAX(ST.getRealMaxVLen(), SEW, VLMul);
-        if (ST.getRealMinVLen() == ST.getRealMaxVLen() && VLMAX <= 31)
+        const unsigned VLMAX = computeVLMAX(ST->getRealMaxVLen(), SEW, VLMul);
+        if (ST->getRealMinVLen() == ST->getRealMaxVLen() && VLMAX <= 31)
           InstrInfo.setAVLImm(VLMAX);
         else
           InstrInfo.setAVLVLMAX();
@@ -1031,7 +1037,7 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
   if (InstrInfo.hasAVLReg()) {
     if (const MachineInstr *DefMI = InstrInfo.getAVLDefMI(LIS);
         DefMI && isVectorConfigInstr(*DefMI)) {
-      VSETVLIInfo DefInstrInfo = getInfoForVSETVLI(*DefMI, LIS);
+      VSETVLIInfo DefInstrInfo = getInfoForVSETVLI(*DefMI);
       if (DefInstrInfo.hasSameVLMAX(InstrInfo) &&
           (DefInstrInfo.hasAVLImm() || DefInstrInfo.hasAVLVLMAX()))
         InstrInfo.setAVL(DefInstrInfo);
@@ -1072,7 +1078,7 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
     if (Info.hasSameVLMAX(PrevInfo) && Info.hasAVLReg()) {
       if (const MachineInstr *DefMI = Info.getAVLDefMI(LIS);
           DefMI && isVectorConfigInstr(*DefMI)) {
-        VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI, LIS);
+        VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI);
         if (DefInfo.hasSameAVL(PrevInfo) && DefInfo.hasSameVLMAX(PrevInfo)) {
           auto MI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0))
                         .addReg(RISCV::X0, RegState::Define | RegState::Dead)
@@ -1160,7 +1166,7 @@ bool RISCVInsertVSETVLI::needVSETVLI(const DemandedFields &Used,
   if (Require.hasAVLReg() && CurInfo.hasCompatibleVTYPE(Used, Require)) {
     if (const MachineInstr *DefMI = Require.getAVLDefMI(LIS);
         DefMI && isVectorConfigInstr(*DefMI)) {
-      VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI, LIS);
+      VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI);
       if (DefInfo.hasSameAVL(CurInfo) && DefInfo.hasSameVLMAX(CurInfo))
         return false;
     }
@@ -1198,7 +1204,7 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
 
   DemandedFields Demanded = getDemanded(MI, ST);
 
-  const VSETVLIInfo NewInfo = computeInfoForInstr(MI, TSFlags, *ST, LIS);
+  const VSETVLIInfo NewInfo = computeInfoForInstr(MI, TSFlags);
   assert(NewInfo.isValid() && !NewInfo.isUnknown());
   if (Info.isValid() && !needVSETVLI(Demanded, NewInfo, Info))
     return;
@@ -1247,7 +1253,7 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
 void RISCVInsertVSETVLI::transferAfter(VSETVLIInfo &Info,
                                        const MachineInstr &MI) const {
   if (isVectorConfigInstr(MI)) {
-    Info = getInfoForVSETVLI(MI, LIS);
+    Info = getInfoForVSETVLI(MI);
     return;
   }
 
@@ -1372,7 +1378,7 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
 
     // We found a VSET(I)VLI make sure it matches the output of the
     // predecessor block.
-    VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI, LIS);
+    VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI);
     if (DefInfo != PBBExit)
       return true;
 
@@ -1583,11 +1589,9 @@ void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) {
 
 // Return true if we can mutate PrevMI to match MI without changing any the
 // fields which would be observed.
-static bool canMutatePriorConfig(const MachineInstr &PrevMI,
-                                 const MachineInstr &MI,
-                                 const DemandedFields &Used,
-                                 const MachineRegisterInfo &MRI,
-                                 const LiveIntervals *LIS) {
+bool RISCVInsertVSETVLI::canMutatePriorConfig(
+    const MachineInstr &PrevMI, const MachineInstr &MI,
+    const DemandedFields &Used) const {
   // If the VL values aren't equal, return false if either a) the former is
   // demanded, or b) we can't rewrite the former to be the later for
   // implementation reasons.
@@ -1598,8 +1602,8 @@ static bool canMutatePriorConfig(const MachineInstr &PrevMI,
     if (Used.VLZeroness) {
       if (isVLPreservingConfig(PrevMI))
         return false;
-      if (!getInfoForVSETVLI(PrevMI, LIS)
-               .hasEquallyZeroAVL(getInfoForVSETVLI(MI, LIS), LIS))
+      if (!getInfoForVSETVLI(PrevMI).hasEquallyZeroAVL(getInfoForVSETVLI(MI),
+                                                       LIS))
         return false;
     }
 
@@ -1609,7 +1613,7 @@ static bool canMutatePriorConfig(const MachineInstr &PrevMI,
     // If the AVL is a register, we need to make sure MI's AVL dominates PrevMI.
     // For now just check that PrevMI uses the same virtual register.
     if (AVL.isReg() && AVL.getReg() != RISCV::X0 &&
-        (!MRI.hasOneDef(AVL.getReg()) || !PrevAVL.isReg() ||
+        (!MRI->hasOneDef(AVL.getReg()) || !PrevAVL.isReg() ||
          PrevAVL.getReg() != AVL.getReg()))
       return false;
   }
@@ -1649,7 +1653,7 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const {
         continue;
       }
 
-      if (canMutatePriorConfig(MI, *NextMI, Used, *MRI, LIS)) {
+      if (canMutatePriorConfig(MI, *NextMI, Used)) {
         if (!isVLPreservingConfig(*NextMI)) {
           Register DefReg = NextMI->getOperand(0).getReg();
 

From 87e8ce376771f8c88a12776544cd81ec5a4993fb Mon Sep 17 00:00:00 2001
From: PiJoules <6019989+PiJoules@users.noreply.github.com>
Date: Wed, 29 May 2024 13:27:44 -0700
Subject: [PATCH 175/230] [llvm] Re-use original global name in
 RelLookupTableConverter (#93626)

Previously, the reltable we created was named "reltable." + FuncName, which
can result in multiple tables named "reltable." + FuncName + ".{number}" if
we substitute multiple tables in a function. Since we replace the
original global, it is easier to just take over the original
global's name. Functionally, this doesn't change the IR emitted, just
global names.

This is a subset of PR 93355 that I'm breaking into multiple patches.
---
 .../Utils/RelLookupTableConverter.cpp         |  8 +--
 .../RelLookupTableConverter/X86/opaque-ptr.ll |  2 +-
 .../X86/relative_lookup_table.ll              | 60 +++++++++----------
 3 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp
index ea628d7c3d7d6b..6e84965370b248 100644
--- a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp
+++ b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp
@@ -100,10 +100,10 @@ static GlobalVariable *createRelLookupTable(Function &Func,
       ArrayType::get(Type::getInt32Ty(M.getContext()), NumElts);
 
   GlobalVariable *RelLookupTable = new GlobalVariable(
-    M, IntArrayTy, LookupTable.isConstant(), LookupTable.getLinkage(),
-    nullptr, "reltable." + Func.getName(), &LookupTable,
-    LookupTable.getThreadLocalMode(), LookupTable.getAddressSpace(),
-    LookupTable.isExternallyInitialized());
+      M, IntArrayTy, LookupTable.isConstant(), LookupTable.getLinkage(),
+      nullptr, LookupTable.getName() + ".rel", &LookupTable,
+      LookupTable.getThreadLocalMode(), LookupTable.getAddressSpace(),
+      LookupTable.isExternallyInitialized());
 
   uint64_t Idx = 0;
   SmallVector<Constant *, 64> RelLookupTableContents(NumElts);
diff --git a/llvm/test/Transforms/RelLookupTableConverter/X86/opaque-ptr.ll b/llvm/test/Transforms/RelLookupTableConverter/X86/opaque-ptr.ll
index b60f447a567741..d1b6757b6db1c9 100644
--- a/llvm/test/Transforms/RelLookupTableConverter/X86/opaque-ptr.ll
+++ b/llvm/test/Transforms/RelLookupTableConverter/X86/opaque-ptr.ll
@@ -15,7 +15,7 @@ target triple = "x86_64-unknown-linux-gnu"
 define ptr @test(i32 %cond) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:    [[RELTABLE_SHIFT:%.*]] = shl i32 [[COND:%.*]], 2
-; CHECK-NEXT:    [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i32(ptr @reltable.test, i32 [[RELTABLE_SHIFT]])
+; CHECK-NEXT:    [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i32(ptr @table1.rel, i32 [[RELTABLE_SHIFT]])
 ; CHECK-NEXT:    ret ptr [[RELTABLE_INTRINSIC]]
 ;
   %switch.gep = getelementptr inbounds [3 x ptr], ptr @table1, i32 0, i32 %cond
diff --git a/llvm/test/Transforms/RelLookupTableConverter/X86/relative_lookup_table.ll b/llvm/test/Transforms/RelLookupTableConverter/X86/relative_lookup_table.ll
index 9e433e9a903553..827dc61121909f 100644
--- a/llvm/test/Transforms/RelLookupTableConverter/X86/relative_lookup_table.ll
+++ b/llvm/test/Transforms/RelLookupTableConverter/X86/relative_lookup_table.ll
@@ -73,51 +73,51 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: @switch.table.external_linkage = private unnamed_addr constant [3 x ptr] [ptr @a1, ptr @b1, ptr @c1], align
 
 ; Lookup table check for integer pointers that have internal linkage
-; CHECK: @reltable.internal_linkage = private unnamed_addr constant [3 x i32]
+; CHECK: @switch.table.internal_linkage.rel = private unnamed_addr constant [3 x i32]
 ; CHECK-SAME: [
-; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @a2 to i64), i64 ptrtoint (ptr @reltable.internal_linkage to i64)) to i32),
-; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @b2 to i64), i64 ptrtoint (ptr @reltable.internal_linkage to i64)) to i32),
-; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @c2 to i64), i64 ptrtoint (ptr @reltable.internal_linkage to i64)) to i32)
+; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @a2 to i64), i64 ptrtoint (ptr @switch.table.internal_linkage.rel to i64)) to i32),
+; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @b2 to i64), i64 ptrtoint (ptr @switch.table.internal_linkage.rel to i64)) to i32),
+; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @c2 to i64), i64 ptrtoint (ptr @switch.table.internal_linkage.rel to i64)) to i32)
 ; CHECK-SAME: ], align 4
 
 ; Relative switch lookup table for strings
-; CHECK: @reltable.string_table = private unnamed_addr constant [3 x i32]
+; CHECK: @switch.table.string_table.rel = private unnamed_addr constant [3 x i32]
 ; CHECK-SAME: [
-; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str to i64), i64 ptrtoint (ptr @reltable.string_table to i64)) to i32),
-; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str.1 to i64), i64 ptrtoint (ptr @reltable.string_table to i64)) to i32),
-; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str.2 to i64), i64 ptrtoint (ptr @reltable.string_table to i64)) to i32)
+; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str to i64), i64 ptrtoint (ptr @switch.table.string_table.rel to i64)) to i32),
+; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str.1 to i64), i64 ptrtoint (ptr @switch.table.string_table.rel to i64)) to i32),
+; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str.2 to i64), i64 ptrtoint (ptr @switch.table.string_table.rel to i64)) to i32)
 ; CHECK-SAME: ], align 4
 
 ; Relative switch lookup table for strings with holes, where holes are filled with relative offset to default values
-; CHECK: @reltable.string_table_holes = private unnamed_addr constant [4 x i32]
+; CHECK: @switch.table.string_table_holes.rel = private unnamed_addr constant [4 x i32]
 ; CHECK-SAME: [
-; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str to i64), i64 ptrtoint (ptr @reltable.string_table_holes to i64)) to i32),
-; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str.3 to i64), i64 ptrtoint (ptr @reltable.string_table_holes to i64)) to i32),
-; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str.2 to i64), i64 ptrtoint (ptr @reltable.string_table_holes to i64)) to i32),
-; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str.4 to i64), i64 ptrtoint (ptr @reltable.string_table_holes to i64)) to i32)
+; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str to i64), i64 ptrtoint (ptr @switch.table.string_table_holes.rel to i64)) to i32),
+; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str.3 to i64), i64 ptrtoint (ptr @switch.table.string_table_holes.rel to i64)) to i32),
+; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str.2 to i64), i64 ptrtoint (ptr @switch.table.string_table_holes.rel to i64)) to i32),
+; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str.4 to i64), i64 ptrtoint (ptr @switch.table.string_table_holes.rel to i64)) to i32)
 ; CHECK-SAME: ], align 4
 
 ; Single value check
-; CHECK: @reltable.single_value = private unnamed_addr constant [3 x i32]
+; CHECK: @switch.table.single_value.rel = private unnamed_addr constant [3 x i32]
 ; CHECK-SAME: [
-; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str to i64), i64 ptrtoint (ptr @reltable.single_value to i64)) to i32),
-; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str.1 to i64), i64 ptrtoint (ptr @reltable.single_value to i64)) to i32),
-; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str.2 to i64), i64 ptrtoint (ptr @reltable.single_value to i64)) to i32)
+; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str to i64), i64 ptrtoint (ptr @switch.table.single_value.rel to i64)) to i32),
+; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str.1 to i64), i64 ptrtoint (ptr @switch.table.single_value.rel to i64)) to i32),
+; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str.2 to i64), i64 ptrtoint (ptr @switch.table.single_value.rel to i64)) to i32)
 ; CHECK-SAME: ], align 4
 ;
 
 ; Relative lookup table for the loop hoist check test
-; CHECK: @reltable.loop_hoist = internal unnamed_addr constant [2 x i32]
+; CHECK: @table.rel = internal unnamed_addr constant [2 x i32]
 ; CHECK-SAME: [
-; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str.8 to i64), i64 ptrtoint (ptr @reltable.loop_hoist to i64)) to i32),
-; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str.9 to i64), i64 ptrtoint (ptr @reltable.loop_hoist to i64)) to i32)
+; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str.8 to i64), i64 ptrtoint (ptr @table.rel to i64)) to i32),
+; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str.9 to i64), i64 ptrtoint (ptr @table.rel to i64)) to i32)
 ; CHECK-SAME: ], align 4
 
 ; Relative look up table for the test where gep is not immediately followed by a load check
-; CHECK: @reltable.gep_is_not_imm_followed_by_load = internal unnamed_addr constant [2 x i32]
+; CHECK: @table2.rel = internal unnamed_addr constant [2 x i32]
 ; CHECK-SAME: [
-; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str.8 to i64), i64 ptrtoint (ptr @reltable.gep_is_not_imm_followed_by_load to i64)) to i32),
-; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str.9 to i64), i64 ptrtoint (ptr @reltable.gep_is_not_imm_followed_by_load to i64)) to i32)
+; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str.8 to i64), i64 ptrtoint (ptr @table2.rel to i64)) to i32),
+; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (ptr @.str.9 to i64), i64 ptrtoint (ptr @table2.rel to i64)) to i32)
 ; CHECK-SAME: ], align 4
 
 ; Lookup table check for integer pointers that have external linkage
@@ -154,7 +154,7 @@ define ptr @internal_linkage(i32 %cond) {
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[SWITCH_LOOKUP:%.*]], label [[RETURN:%.*]]
 ; CHECK:       switch.lookup:
 ; CHECK-NEXT:    [[RELTABLE_SHIFT:%.*]] = shl i32 %cond, 2
-; CHECK-NEXT:    [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i32(ptr @reltable.internal_linkage, i32 [[RELTABLE_SHIFT]])
+; CHECK-NEXT:    [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i32(ptr @switch.table.internal_linkage.rel, i32 [[RELTABLE_SHIFT]])
 ; CHECK-NEXT:    ret ptr [[RELTABLE_INTRINSIC]]
 ; CHECK:       return:
 ; CHECK-NEXT:    ret ptr @d2
@@ -180,7 +180,7 @@ define ptr @string_table(i32 %cond) {
   ; CHECK-NEXT:    br i1 [[TMP0]], label [[SWITCH_LOOKUP:%.*]], label [[RETURN:%.*]]
   ; CHECK:       switch.lookup:
   ; CHECK-NEXT:    [[RELTABLE_SHIFT:%.*]] = shl i32 %cond, 2
-  ; CHECK-NEXT:    [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i32(ptr @reltable.string_table, i32 [[RELTABLE_SHIFT]])
+  ; CHECK-NEXT:    [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i32(ptr @switch.table.string_table.rel, i32 [[RELTABLE_SHIFT]])
   ; CHECK-NEXT:    ret ptr [[RELTABLE_INTRINSIC]]
   ; CHECK:       return:
   ; CHECK-NEXT:    ret ptr @.str.3
@@ -206,7 +206,7 @@ define ptr @string_table_holes(i32 %cond) {
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[SWITCH_LOOKUP:%.*]], label [[RETURN:%.*]]
 ; CHECK:       switch.lookup:
 ; CHECK-NEXT:    [[RELTABLE_SHIFT:%.*]] = shl i32 [[COND]], 2
-; CHECK-NEXT:    [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i32(ptr @reltable.string_table_holes, i32 [[RELTABLE_SHIFT]])
+; CHECK-NEXT:    [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i32(ptr @switch.table.string_table_holes.rel, i32 [[RELTABLE_SHIFT]])
 ; CHECK-NEXT:    ret ptr [[RELTABLE_INTRINSIC]]
 ; CHECK:       return:
 ; CHECK-NEXT:    ret ptr @.str.3
@@ -235,7 +235,7 @@ define void @single_value(i32 %cond)  {
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[SWITCH_LOOKUP:%.*]], label [[RETURN:%.*]]
 ; CHECK:       switch.lookup:
 ; CHECK-NEXT:    [[RELTABLE_SHIFT:%.*]] = shl i32 [[COND]], 2
-; CHECK-NEXT:    [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i32(ptr @reltable.single_value, i32 [[RELTABLE_SHIFT]])
+; CHECK-NEXT:    [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i32(ptr @switch.table.single_value.rel, i32 [[RELTABLE_SHIFT]])
 ; CHECK:       sw.epilog:
 ; CHECK-NEXT:   [[STR1:%.*]] = phi ptr [ @.str.5, %entry ], [ @.str.7, %switch.lookup ]
 ; CHECK-NEXT:   [[STR2:%.*]] = phi ptr [ @.str.6, %entry ], [ [[RELTABLE_INTRINSIC]], [[SWITCH_LOOKUP]] ]
@@ -265,7 +265,7 @@ define ptr @user_defined_lookup_table(i32 %cond)  {
 ; CHECK:       cond.false:
 ; CHECK-NEXT:    [[IDX_PROM:%.*]] = sext i32 [[COND]] to i64
 ; CHECK-NEXT:    [[RELTABLE_SHIFT:%.*]] = shl i64 [[IDX_PROM]], 2
-; CHECK-NEXT:    [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i64(ptr @reltable.user_defined_lookup_table, i64 [[RELTABLE_SHIFT]])
+; CHECK-NEXT:    [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i64(ptr @user_defined_lookup_table.table.rel, i64 [[RELTABLE_SHIFT]])
 ; CHECK-NEXT:    br label %cond.end
 ; CHECK:       cond.end:
 ; CHECK-NEXT:    [[COND1:%.*]] = phi ptr [ [[RELTABLE_INTRINSIC]], %cond.false ], [ @.str.3, %entry ]
@@ -296,7 +296,7 @@ define ptr @loop_hoist(i32 %x) {
 ; CHECK-NEXT:    [[RELTABLE_SHIFT:%.*]] = shl i32 [[X:%.*]], 2
 ; CHECK-NEXT:    br i1 [[TMP0]], label %if.done, label %if.false
 ; CHECK:       if.false:
-; CHECK-NEXT:    [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i32(ptr @reltable.loop_hoist, i32 [[RELTABLE_SHIFT]])
+; CHECK-NEXT:    [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i32(ptr @table.rel, i32 [[RELTABLE_SHIFT]])
 ; CHECK-NEXT:    br label %if.done
 ; CHECK:       if.done:
 ; CHECK-NEXT:    [[TMP2:%.*]] = phi ptr [ @.str.10, %entry ], [ [[RELTABLE_INTRINSIC]], %if.false ]
@@ -327,7 +327,7 @@ define ptr @gep_is_not_imm_followed_by_load(i32 %x) {
 ; CHECK:       entry:
 ; CHECK-NEXT:    [[RELTABLE_SHIFT:%.*]] = shl i32 [[X:%.*]], 2
 ; CHECK-NEXT:    call void @may_not_return()
-; CHECK-NEXT:    [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i32(ptr @reltable.gep_is_not_imm_followed_by_load, i32 [[RELTABLE_SHIFT]])
+; CHECK-NEXT:    [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i32(ptr @table2.rel, i32 [[RELTABLE_SHIFT]])
 ; CHECK-NEXT:    ret ptr [[RELTABLE_INTRINSIC]]
 ;
 entry:

From 025394fa0dd3d0c20cc755f79ed521b85e5d7943 Mon Sep 17 00:00:00 2001
From: PiJoules <6019989+PiJoules@users.noreply.github.com>
Date: Wed, 29 May 2024 13:28:32 -0700
Subject: [PATCH 176/230] Reapply "[lld] Support thumb PLTs" (#93631) (#93644)

This reverts commit 7832769d329ead264aff238c06dce086b3a74922.

This was previously reverted due to a test failure on the windows builder. I
think this was because we didn't specify the triple and assumed windows.
The other tests use the full triple specifying linux, so we follow suit
here.

---

We are using PLTs for cortex-m33 which only supports thumb. More
specifically, this is for a very restricted use case. There's no MMU so
there's no sharing of virtual addresses between two processes, but this
is fine. The MCU is used for running [chre
nanoapps](https://android.googlesource.com/platform/system/chre/+/HEAD/doc/nanoapp_overview.md)
for android. Each nanoapp is a shared library (but effectively acts as
an executable containing a test suite) that is loaded and run on the MCU
one binary at a time and there's only one process running at a time, so
we ensure that the same text segment cannot be shared by two different
running executables. GNU LD supports thumb PLTs but we want to migrate
to a clang toolchain and use LLD, so thumb PLTs are needed.
---
 lld/ELF/Arch/ARM.cpp                 | 176 +++++++++++++++++++--------
 lld/ELF/Config.h                     |   1 +
 lld/ELF/InputFiles.cpp               |  12 ++
 lld/test/ELF/armv8-thumb-plt-reloc.s | 126 +++++++++++++++++++
 4 files changed, 262 insertions(+), 53 deletions(-)
 create mode 100644 lld/test/ELF/armv8-thumb-plt-reloc.s

diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp
index 687f9499009d5e..3e0efe540e1bf1 100644
--- a/lld/ELF/Arch/ARM.cpp
+++ b/lld/ELF/Arch/ARM.cpp
@@ -231,36 +231,71 @@ static void writePltHeaderLong(uint8_t *buf) {
 // The default PLT header requires the .got.plt to be within 128 Mb of the
 // .plt in the positive direction.
 void ARM::writePltHeader(uint8_t *buf) const {
-  // Use a similar sequence to that in writePlt(), the difference is the calling
-  // conventions mean we use lr instead of ip. The PLT entry is responsible for
-  // saving lr on the stack, the dynamic loader is responsible for reloading
-  // it.
-  const uint32_t pltData[] = {
-      0xe52de004, // L1: str lr, [sp,#-4]!
-      0xe28fe600, //     add lr, pc,  #0x0NN00000 &(.got.plt - L1 - 4)
-      0xe28eea00, //     add lr, lr,  #0x000NN000 &(.got.plt - L1 - 4)
-      0xe5bef000, //     ldr pc, [lr, #0x00000NNN] &(.got.plt -L1 - 4)
-  };
-
-  uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 4;
-  if (!llvm::isUInt<27>(offset)) {
-    // We cannot encode the Offset, use the long form.
-    writePltHeaderLong(buf);
-    return;
+  if (config->armThumbPLTs) {
+    // The instruction sequence for thumb:
+    //
+    // 0: b500          push    {lr}
+    // 2: f8df e008     ldr.w   lr, [pc, #0x8]          @ 0xe <func+0xe>
+    // 6: 44fe          add     lr, pc
+    // 8: f85e ff08     ldr     pc, [lr, #8]!
+    // e:               .word   .got.plt - .plt - 16
+    //
+    // At 0x8, we want to jump to .got.plt, the -16 accounts for 8 bytes from
+    // `pc` in the add instruction and 8 bytes for the `lr` adjustment.
+    //
+    uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 16;
+    assert(llvm::isUInt<32>(offset) && "This should always fit into a 32-bit offset");
+    write16(buf + 0, 0xb500);
+    // Split into two halves to support endianness correctly.
+    write16(buf + 2, 0xf8df);
+    write16(buf + 4, 0xe008);
+    write16(buf + 6, 0x44fe);
+    // Split into two halves to support endianness correctly.
+    write16(buf + 8, 0xf85e);
+    write16(buf + 10, 0xff08);
+    write32(buf + 12, offset);
+
+    memcpy(buf + 16, trapInstr.data(), 4);  // Pad to 32-byte boundary
+    memcpy(buf + 20, trapInstr.data(), 4);
+    memcpy(buf + 24, trapInstr.data(), 4);
+    memcpy(buf + 28, trapInstr.data(), 4);
+  } else {
+    // Use a similar sequence to that in writePlt(), the difference is the
+    // calling conventions mean we use lr instead of ip. The PLT entry is
+    // responsible for saving lr on the stack, the dynamic loader is responsible
+    // for reloading it.
+    const uint32_t pltData[] = {
+        0xe52de004, // L1: str lr, [sp,#-4]!
+        0xe28fe600, //     add lr, pc,  #0x0NN00000 &(.got.plt - L1 - 4)
+        0xe28eea00, //     add lr, lr,  #0x000NN000 &(.got.plt - L1 - 4)
+        0xe5bef000, //     ldr pc, [lr, #0x00000NNN] &(.got.plt -L1 - 4)
+    };
+
+    uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 4;
+    if (!llvm::isUInt<27>(offset)) {
+      // We cannot encode the Offset, use the long form.
+      writePltHeaderLong(buf);
+      return;
+    }
+    write32(buf + 0, pltData[0]);
+    write32(buf + 4, pltData[1] | ((offset >> 20) & 0xff));
+    write32(buf + 8, pltData[2] | ((offset >> 12) & 0xff));
+    write32(buf + 12, pltData[3] | (offset & 0xfff));
+    memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary
+    memcpy(buf + 20, trapInstr.data(), 4);
+    memcpy(buf + 24, trapInstr.data(), 4);
+    memcpy(buf + 28, trapInstr.data(), 4);
   }
-  write32(buf + 0, pltData[0]);
-  write32(buf + 4, pltData[1] | ((offset >> 20) & 0xff));
-  write32(buf + 8, pltData[2] | ((offset >> 12) & 0xff));
-  write32(buf + 12, pltData[3] | (offset & 0xfff));
-  memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary
-  memcpy(buf + 20, trapInstr.data(), 4);
-  memcpy(buf + 24, trapInstr.data(), 4);
-  memcpy(buf + 28, trapInstr.data(), 4);
 }
 
 void ARM::addPltHeaderSymbols(InputSection &isec) const {
-  addSyntheticLocal("$a", STT_NOTYPE, 0, 0, isec);
-  addSyntheticLocal("$d", STT_NOTYPE, 16, 0, isec);
+  if (config->armThumbPLTs) {
+    addSyntheticLocal("$t", STT_NOTYPE, 0, 0, isec);
+    addSyntheticLocal("$d", STT_NOTYPE, 12, 0, isec);
+  } else {
+    addSyntheticLocal("$a", STT_NOTYPE, 0, 0, isec);
+    addSyntheticLocal("$d", STT_NOTYPE, 16, 0, isec);
+  }
 }
 
 // Long form PLT entries that do not have any restrictions on the displacement
@@ -279,32 +314,65 @@ static void writePltLong(uint8_t *buf, uint64_t gotPltEntryAddr,
 // .plt in the positive direction.
 void ARM::writePlt(uint8_t *buf, const Symbol &sym,
                    uint64_t pltEntryAddr) const {
-  // The PLT entry is similar to the example given in Appendix A of ELF for
-  // the Arm Architecture. Instead of using the Group Relocations to find the
-  // optimal rotation for the 8-bit immediate used in the add instructions we
-  // hard code the most compact rotations for simplicity. This saves a load
-  // instruction over the long plt sequences.
-  const uint32_t pltData[] = {
-      0xe28fc600, // L1: add ip, pc,  #0x0NN00000  Offset(&(.got.plt) - L1 - 8
-      0xe28cca00, //     add ip, ip,  #0x000NN000  Offset(&(.got.plt) - L1 - 8
-      0xe5bcf000, //     ldr pc, [ip, #0x00000NNN] Offset(&(.got.plt) - L1 - 8
-  };
 
-  uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 8;
-  if (!llvm::isUInt<27>(offset)) {
-    // We cannot encode the Offset, use the long form.
-    writePltLong(buf, sym.getGotPltVA(), pltEntryAddr);
-    return;
+  if (!config->armThumbPLTs) {
+    uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 8;
+
+    // The PLT entry is similar to the example given in Appendix A of ELF for
+    // the Arm Architecture. Instead of using the Group Relocations to find the
+    // optimal rotation for the 8-bit immediate used in the add instructions we
+    // hard code the most compact rotations for simplicity. This saves a load
+    // instruction over the long plt sequences.
+    const uint32_t pltData[] = {
+        0xe28fc600, // L1: add ip, pc,  #0x0NN00000  Offset(&(.got.plt) - L1 - 8
+        0xe28cca00, //     add ip, ip,  #0x000NN000  Offset(&(.got.plt) - L1 - 8
+        0xe5bcf000, //     ldr pc, [ip, #0x00000NNN] Offset(&(.got.plt) - L1 - 8
+    };
+    if (!llvm::isUInt<27>(offset)) {
+      // We cannot encode the Offset, use the long form.
+      writePltLong(buf, sym.getGotPltVA(), pltEntryAddr);
+      return;
+    }
+    write32(buf + 0, pltData[0] | ((offset >> 20) & 0xff));
+    write32(buf + 4, pltData[1] | ((offset >> 12) & 0xff));
+    write32(buf + 8, pltData[2] | (offset & 0xfff));
+    memcpy(buf + 12, trapInstr.data(), 4); // Pad to 16-byte boundary
+  } else {
+    uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 12;
+    assert(llvm::isUInt<32>(offset) && "This should always fit into a 32-bit offset");
+
+    // A PLT entry will be:
+    //
+    //       movw ip, #<lower 16 bits>
+    //       movt ip, #<upper 16 bits>
+    //       add ip, pc
+    //   L1: ldr.w pc, [ip]
+    //       b L1
+    //
+    // where ip = r12 = 0xc
+
+    // movw ip, #<lower 16 bits>
+    write16(buf + 2, 0x0c00); // use `ip`
+    relocateNoSym(buf, R_ARM_THM_MOVW_ABS_NC, offset);
+
+    // movt ip, #<upper 16 bits>
+    write16(buf + 6, 0x0c00); // use `ip`
+    relocateNoSym(buf + 4, R_ARM_THM_MOVT_ABS, offset);
+
+    write16(buf + 8, 0x44fc);       // add ip, pc
+    write16(buf + 10, 0xf8dc);      // ldr.w   pc, [ip] (bottom half)
+    write16(buf + 12, 0xf000);      // ldr.w   pc, [ip] (upper half)
+    write16(buf + 14, 0xe7fc);      // Branch to previous instruction
   }
-  write32(buf + 0, pltData[0] | ((offset >> 20) & 0xff));
-  write32(buf + 4, pltData[1] | ((offset >> 12) & 0xff));
-  write32(buf + 8, pltData[2] | (offset & 0xfff));
-  memcpy(buf + 12, trapInstr.data(), 4); // Pad to 16-byte boundary
 }
 
 void ARM::addPltSymbols(InputSection &isec, uint64_t off) const {
-  addSyntheticLocal("$a", STT_NOTYPE, off, 0, isec);
-  addSyntheticLocal("$d", STT_NOTYPE, off + 12, 0, isec);
+  if (config->armThumbPLTs) {
+    addSyntheticLocal("$t", STT_NOTYPE, off, 0, isec);
+  } else {
+    addSyntheticLocal("$a", STT_NOTYPE, off, 0, isec);
+    addSyntheticLocal("$d", STT_NOTYPE, off + 12, 0, isec);
+  }
 }
 
 bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file,
@@ -325,6 +393,8 @@ bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file,
   case R_ARM_JUMP24:
     // Source is ARM, all PLT entries are ARM so no interworking required.
     // Otherwise we need to interwork if STT_FUNC Symbol has bit 0 set (Thumb).
+    assert(!config->armThumbPLTs &&
+           "If the source is ARM, we should not need Thumb PLTs");
     if (s.isFunc() && expr == R_PC && (s.getVA() & 1))
       return true;
     [[fallthrough]];
@@ -335,9 +405,9 @@ bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file,
   }
   case R_ARM_THM_JUMP19:
   case R_ARM_THM_JUMP24:
-    // Source is Thumb, all PLT entries are ARM so interworking is required.
+    // Source is Thumb; when all PLT entries are ARM, interworking is required.
     // Otherwise we need to interwork if STT_FUNC Symbol has bit 0 clear (ARM).
-    if (expr == R_PLT_PC || (s.isFunc() && (s.getVA() & 1) == 0))
+    if ((expr == R_PLT_PC && !config->armThumbPLTs) || (s.isFunc() && (s.getVA() & 1) == 0))
       return true;
     [[fallthrough]];
   case R_ARM_THM_CALL: {
@@ -547,7 +617,6 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const {
     // STT_FUNC we choose whether to write a BL or BLX depending on the
     // value of bit 0 of Val. With bit 0 == 1 denoting Thumb. If the symbol is
     // not of type STT_FUNC then we must preserve the original instruction.
-    // PLT entries are always ARM state so we know we don't need to interwork.
     assert(rel.sym); // R_ARM_CALL is always reached via relocate().
     bool bit0Thumb = val & 1;
     bool isBlx = (read32(loc) & 0xfe000000) == 0xfa000000;
@@ -606,12 +675,13 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const {
     // PLT entries are always ARM state so we know we need to interwork.
     assert(rel.sym); // R_ARM_THM_CALL is always reached via relocate().
     bool bit0Thumb = val & 1;
+    bool useThumb = bit0Thumb || config->armThumbPLTs;
     bool isBlx = (read16(loc + 2) & 0x1000) == 0;
     // lld 10.0 and before always used bit0Thumb when deciding to write a BLX
-    // even when type not STT_FUNC. PLT entries generated by LLD are always ARM.
-    if (!rel.sym->isFunc() && !rel.sym->isInPlt() && isBlx == bit0Thumb)
+    // even when type not STT_FUNC.
+    if (!rel.sym->isFunc() && !rel.sym->isInPlt() && isBlx == useThumb)
       stateChangeWarning(loc, rel.type, *rel.sym);
-    if (rel.sym->isFunc() || rel.sym->isInPlt() ? !bit0Thumb : isBlx) {
+    if ((rel.sym->isFunc() || rel.sym->isInPlt()) ? !useThumb : isBlx) {
       // We are writing a BLX. Ensure BLX destination is 4-byte aligned. As
       // the BLX instruction may only be two byte aligned. This must be done
       // before overflow check.
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index f0dfe7f377de0e..883c4a2f84294c 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -217,6 +217,7 @@ struct Config {
   bool allowMultipleDefinition;
   bool fatLTOObjects;
   bool androidPackDynRelocs = false;
+  bool armThumbPLTs = false;
   bool armHasBlx = false;
   bool armHasMovtMovw = false;
   bool armJ1J2BranchEncoding = false;
diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp
index 1f496026d3ae20..d760dddcf5ec5c 100644
--- a/lld/ELF/InputFiles.cpp
+++ b/lld/ELF/InputFiles.cpp
@@ -194,6 +194,18 @@ static void updateSupportedARMFeatures(const ARMAttributeParser &attributes) {
   if (arch >= ARMBuildAttrs::CPUArch::v8_M_Base &&
       profile == ARMBuildAttrs::MicroControllerProfile)
     config->armCMSESupport = true;
+
+  // The thumb PLT entries require Thumb2 which can be used on multiple archs.
+  // For now, let's limit it to ones where ARM isn't available and we know we
+  // have Thumb2.
+  std::optional<unsigned> armISA =
+      attributes.getAttributeValue(ARMBuildAttrs::ARM_ISA_use);
+  std::optional<unsigned> thumb =
+      attributes.getAttributeValue(ARMBuildAttrs::THUMB_ISA_use);
+  bool noArmISA = !armISA || *armISA == ARMBuildAttrs::Not_Allowed;
+  bool hasThumb2 = thumb && *thumb >= ARMBuildAttrs::AllowThumb32;
+  if (noArmISA && hasThumb2)
+    config->armThumbPLTs = true;
 }
 
 InputFile::InputFile(Kind k, MemoryBufferRef m)
diff --git a/lld/test/ELF/armv8-thumb-plt-reloc.s b/lld/test/ELF/armv8-thumb-plt-reloc.s
new file mode 100644
index 00000000000000..5b6e4b5fdd139b
--- /dev/null
+++ b/lld/test/ELF/armv8-thumb-plt-reloc.s
@@ -0,0 +1,126 @@
+// REQUIRES: arm
+// RUN: llvm-mc -filetype=obj -arm-add-build-attributes -triple=thumbv8-none-linux-gnueabi --arch=thumb --mcpu=cortex-m33 %p/Inputs/arm-plt-reloc.s -o %t1.o
+// RUN: llvm-mc -filetype=obj -arm-add-build-attributes -triple=thumbv8-none-linux-gnueabi --arch=thumb --mcpu=cortex-m33 %s -o %t2.o
+// RUN: ld.lld %t1.o %t2.o -o %t
+// RUN: llvm-objdump --no-print-imm-hex -d %t | FileCheck %s
+// RUN: ld.lld -shared %t1.o %t2.o -o %t.so
+// RUN: llvm-objdump --no-print-imm-hex -d %t.so | FileCheck --check-prefix=DSO %s
+// RUN: llvm-readelf -S -r %t.so | FileCheck -check-prefix=DSOREL %s
+
+// RUN: llvm-mc -filetype=obj -arm-add-build-attributes -triple=thumbv8-none-linux-gnueabi --arch=thumbeb --mcpu=cortex-m33 %p/Inputs/arm-plt-reloc.s -o %t1.be.o
+// RUN: llvm-mc -filetype=obj -arm-add-build-attributes -triple=thumbv8-none-linux-gnueabi --arch=thumbeb --mcpu=cortex-m33 %s -o %t2.be.o
+// RUN: ld.lld %t1.be.o %t2.be.o -o %t.be
+// RUN: llvm-objdump --no-print-imm-hex -d %t.be | FileCheck %s
+// RUN: ld.lld -shared %t1.be.o %t2.be.o -o %t.so.be
+// RUN: llvm-objdump --no-print-imm-hex -d %t.so.be | FileCheck --check-prefix=DSO %s
+// RUN: llvm-readelf -S -r %t.so.be | FileCheck -check-prefix=DSOREL %s
+
+// RUN: ld.lld --be8 %t1.be.o %t2.be.o -o %t.be
+// RUN: llvm-objdump --no-print-imm-hex -d %t.be | FileCheck %s
+// RUN: ld.lld --be8 -shared %t1.be.o %t2.be.o -o %t.so.be
+// RUN: llvm-objdump --no-print-imm-hex -d %t.so.be | FileCheck --check-prefix=DSO %s
+// RUN: llvm-readelf -S -r %t.so.be | FileCheck -check-prefix=DSOREL %s
+
+/// Test PLT entry generation
+ .text
+ .align 2
+ .globl _start
+ .type  _start,%function
+_start:
+ bl func1
+ bl func2
+ bl func3
+ b.w func1
+ b.w func2
+ b.w func3
+ beq.w func1
+ beq.w func2
+ beq.w func3
+
+/// Executable, expect no PLT
+// CHECK: Disassembly of section .text:
+// CHECK-EMPTY:
+// CHECK-NEXT: <func1>:
+// CHECK-NEXT:   bx      lr
+// CHECK: <func2>:
+// CHECK-NEXT:   bx      lr
+// CHECK: <func3>:
+// CHECK-NEXT:   bx      lr
+// CHECK-NEXT:   d4d4 
+// CHECK: <_start>:
+// CHECK-NEXT:   bl      {{.*}} <func1>
+// CHECK-NEXT:   bl      {{.*}} <func2>
+// CHECK-NEXT:   bl      {{.*}} <func3>
+// CHECK-NEXT:   b.w     {{.*}} <func1>
+// CHECK-NEXT:   b.w     {{.*}} <func2>
+// CHECK-NEXT:   b.w     {{.*}} <func3>
+// CHECK-NEXT:   beq.w	 {{.*}} <func1>
+// CHECK-NEXT:   beq.w	 {{.*}} <func2>
+// CHECK-NEXT:   beq.w	 {{.*}} <func3>
+
+// DSO: Disassembly of section .text:
+// DSO-EMPTY:
+// DSO-NEXT: <func1>:
+// DSO-NEXT:     bx      lr
+// DSO: <func2>:
+// DSO-NEXT:     bx      lr
+// DSO: <func3>:
+// DSO-NEXT:     bx      lr
+// DSO-NEXT:     d4d4 
+// DSO: <_start>:
+/// 0x10260 = PLT func1
+// DSO-NEXT:     bl     0x10260
+/// 0x10270 = PLT func2
+// DSO-NEXT:     bl     0x10270
+/// 0x10280 = PLT func3
+// DSO-NEXT:     bl     0x10280
+/// 0x10260 = PLT func1
+// DSO-NEXT:     b.w    0x10260
+/// 0x10270 = PLT func2
+// DSO-NEXT:     b.w    0x10270
+/// 0x10280 = PLT func3
+// DSO-NEXT:     b.w    0x10280
+/// 0x10260 = PLT func1
+// DSO-NEXT:     beq.w	 0x10260
+/// 0x10270 = PLT func2
+// DSO-NEXT:     beq.w	 0x10270
+/// 0x10280 = PLT func3
+// DSO-NEXT:     beq.w	 0x10280
+// DSO: Disassembly of section .plt:
+// DSO-EMPTY:
+// DSO-NEXT: 10240 <.plt>:
+// DSO-NEXT:     push    {lr}
+// DSO-NEXT:     ldr.w   lr, [pc, #8]
+// DSO-NEXT:     add     lr, pc
+// DSO-NEXT:     ldr     pc, [lr, #8]!
+/// 0x20098 = .got.plt (0x302E8) - pc (0x10248 = .plt + 8) - 8
+// DSO-NEXT:     .word   0x00020098
+// DSO-NEXT:     .word   0xd4d4d4d4
+// DSO-NEXT:     .word   0xd4d4d4d4
+// DSO-NEXT:     .word   0xd4d4d4d4
+// DSO-NEXT:     .word   0xd4d4d4d4
+
+/// 136 + 2 << 16 + 0x1026c = 0x302f4 = got entry 1
+// DSO-NEXT:     10260:       f240 0c88     movw    r12, #136
+// DSO-NEXT:                  f2c0 0c02     movt    r12, #2
+// DSO-NEXT:                  44fc          add     r12, pc
+// DSO-NEXT:                  f8dc f000     ldr.w   pc, [r12]
+// DSO-NEXT:                  e7fc          b       0x1026a
+/// 124 + 2 << 16 + 0x1027c = 0x302f8 = got entry 2
+// DSO-NEXT:     10270:       f240 0c7c     movw    r12, #124
+// DSO-NEXT:                  f2c0 0c02     movt    r12, #2
+// DSO-NEXT:                  44fc          add     r12, pc
+// DSO-NEXT:                  f8dc f000     ldr.w   pc, [r12]
+// DSO-NEXT:                  e7fc          b       0x1027a
+/// 112 + 2 << 16 + 0x1028c = 0x302fc = got entry 3
+// DSO-NEXT:     10280:       f240 0c70     movw    r12, #112
+// DSO-NEXT:                  f2c0 0c02     movt    r12, #2
+// DSO-NEXT:                  44fc          add     r12, pc
+// DSO-NEXT:                  f8dc f000     ldr.w   pc, [r12]
+// DSO-NEXT:                  e7fc          b       0x1028a
+
+// DSOREL: .got.plt PROGBITS 000302e8 {{.*}} 000018 00  WA  0   0  4
+// DSOREL: Relocation section '.rel.plt'
+// DSOREL: 000302f4 {{.*}} R_ARM_JUMP_SLOT {{.*}} func1
+// DSOREL: 000302f8 {{.*}} R_ARM_JUMP_SLOT {{.*}} func2
+// DSOREL: 000302fc {{.*}} R_ARM_JUMP_SLOT {{.*}} func3

From 5bec47c1ef6468ea1e9b24fc7126424760306615 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Wed, 29 May 2024 14:29:01 -0600
Subject: [PATCH 177/230] Revert "[mlir][spirv] Add integration test for
 `vector.interleave` and `vector.shuffle`" (#93732)

Reverts llvm/llvm-project#93595

This broke the gcc-7 bot.
---
 .../Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp  |  2 -
 .../mlir-vulkan-runner/vector-interleave.mlir | 53 -------------------
 .../mlir-vulkan-runner/vector-shuffle.mlir    | 53 -------------------
 3 files changed, 108 deletions(-)
 delete mode 100644 mlir/test/mlir-vulkan-runner/vector-interleave.mlir
 delete mode 100644 mlir/test/mlir-vulkan-runner/vector-shuffle.mlir

diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
index 53e73ec0d81bf0..1d1db913e3df23 100644
--- a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
+++ b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
@@ -18,7 +18,6 @@
 #include "mlir/Conversion/GPUToSPIRV/GPUToSPIRV.h"
 #include "mlir/Conversion/MemRefToSPIRV/MemRefToSPIRV.h"
 #include "mlir/Conversion/SCFToSPIRV/SCFToSPIRV.h"
-#include "mlir/Conversion/VectorToSPIRV/VectorToSPIRV.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
@@ -133,7 +132,6 @@ void GPUToSPIRVPass::runOnOperation() {
     mlir::arith::populateArithToSPIRVPatterns(typeConverter, patterns);
     populateMemRefToSPIRVPatterns(typeConverter, patterns);
     populateFuncToSPIRVPatterns(typeConverter, patterns);
-    populateVectorToSPIRVPatterns(typeConverter, patterns);
 
     if (failed(applyFullConversion(gpuModule, *target, std::move(patterns))))
       return signalPassFailure();
diff --git a/mlir/test/mlir-vulkan-runner/vector-interleave.mlir b/mlir/test/mlir-vulkan-runner/vector-interleave.mlir
deleted file mode 100644
index 2f5c319e2f5c5d..00000000000000
--- a/mlir/test/mlir-vulkan-runner/vector-interleave.mlir
+++ /dev/null
@@ -1,53 +0,0 @@
-// RUN: mlir-vulkan-runner %s \
-// RUN:  --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils \
-// RUN:  --entry-point-result=void | FileCheck %s
-
-// CHECK: [0, 2, 1, 3]
-module attributes {
-  gpu.container_module,
-  spirv.target_env = #spirv.target_env<
-    #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]>, #spirv.resource_limits<>>
-} {
-  gpu.module @kernels {
-    gpu.func @kernel_vector_interleave(%arg0 : memref<2xi32>, %arg1 : memref<2xi32>, %arg2 : memref<4xi32>)
-      kernel attributes { spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [1, 1, 1]>} {
-      %c0 = arith.constant 0 : index
-      %vec0 = vector.load %arg0[%c0] : memref<2xi32>, vector<2xi32>
-      %vec1 = vector.load %arg1[%c0] : memref<2xi32>, vector<2xi32>
-      %result = vector.interleave %vec0, %vec1 : vector<2xi32> -> vector<4xi32>
-      vector.store %result, %arg2[%c0] : memref<4xi32>, vector<4xi32>
-      gpu.return
-    }
-  }
-
-  func.func @main() {
-    // Allocate 3 buffers.
-    %buf0 = memref.alloc() : memref<2xi32>
-    %buf1 = memref.alloc() : memref<2xi32>
-    %buf2 = memref.alloc() : memref<4xi32>
-    
-    %idx0 = arith.constant 0 : index
-    %idx1 = arith.constant 1 : index
-    %idx4 = arith.constant 4 : index
-
-    // Initialize input buffer.
-    %buf0_vals = arith.constant dense<[0, 1]> : vector<2xi32>
-    %buf1_vals = arith.constant dense<[2, 3]> : vector<2xi32>
-    vector.store %buf0_vals, %buf0[%idx0] : memref<2xi32>, vector<2xi32>
-    vector.store %buf1_vals, %buf1[%idx0] : memref<2xi32>, vector<2xi32>
-
-    // Initialize output buffer.
-    %value0 = arith.constant 0 : i32
-    %buf3 = memref.cast %buf2 : memref<4xi32> to memref<?xi32>
-    call @fillResource1DInt(%buf3, %value0) : (memref<?xi32>, i32) -> ()
-
-    gpu.launch_func @kernels::@kernel_vector_interleave
-        blocks in (%idx4, %idx1, %idx1) threads in (%idx1, %idx1, %idx1)
-        args(%buf0 : memref<2xi32>, %buf1 : memref<2xi32>, %buf2 : memref<4xi32>)
-    %buf4 = memref.cast %buf3 : memref<?xi32> to memref<*xi32>
-    call @printMemrefI32(%buf4) : (memref<*xi32>) -> ()
-    return
-  }
-  func.func private @fillResource1DInt(%0 : memref<?xi32>, %1 : i32)
-  func.func private @printMemrefI32(%ptr : memref<*xi32>)
-}
diff --git a/mlir/test/mlir-vulkan-runner/vector-shuffle.mlir b/mlir/test/mlir-vulkan-runner/vector-shuffle.mlir
deleted file mode 100644
index e29e054ccd46be..00000000000000
--- a/mlir/test/mlir-vulkan-runner/vector-shuffle.mlir
+++ /dev/null
@@ -1,53 +0,0 @@
-// RUN: mlir-vulkan-runner %s \
-// RUN:  --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils \
-// RUN:  --entry-point-result=void | FileCheck %s
-
-// CHECK: [2, 1, 3]
-module attributes {
-  gpu.container_module,
-  spirv.target_env = #spirv.target_env<
-    #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]>, #spirv.resource_limits<>>
-} {
-  gpu.module @kernels {
-    gpu.func @kernel_vector_shuffle(%arg0 : memref<2xi32>, %arg1 : memref<2xi32>, %arg2 : memref<3xi32>)
-      kernel attributes { spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [1, 1, 1]>} {
-      %c0 = arith.constant 0 : index
-      %vec0 = vector.load %arg0[%c0] : memref<2xi32>, vector<2xi32>
-      %vec1 = vector.load %arg1[%c0] : memref<2xi32>, vector<2xi32>
-      %result = vector.shuffle %vec0, %vec1[2, 1, 3] : vector<2xi32>, vector<2xi32>
-      vector.store %result, %arg2[%c0] : memref<3xi32>, vector<3xi32>
-      gpu.return
-    }
-  }
-
-  func.func @main() {
-    // Allocate 3 buffers.
-    %buf0 = memref.alloc() : memref<2xi32>
-    %buf1 = memref.alloc() : memref<2xi32>
-    %buf2 = memref.alloc() : memref<3xi32>
-    
-    %idx0 = arith.constant 0 : index
-    %idx1 = arith.constant 1 : index
-    %idx4 = arith.constant 4 : index
-
-    // Initialize input buffer
-    %buf0_vals = arith.constant dense<[0, 1]> : vector<2xi32>
-    %buf1_vals = arith.constant dense<[2, 3]> : vector<2xi32>
-    vector.store %buf0_vals, %buf0[%idx0] : memref<2xi32>, vector<2xi32>
-    vector.store %buf1_vals, %buf1[%idx0] : memref<2xi32>, vector<2xi32>
-
-    // Initialize output buffer.
-    %value0 = arith.constant 0 : i32
-    %buf3 = memref.cast %buf2 : memref<3xi32> to memref<?xi32>
-    call @fillResource1DInt(%buf3, %value0) : (memref<?xi32>, i32) -> ()
-
-    gpu.launch_func @kernels::@kernel_vector_shuffle
-        blocks in (%idx4, %idx1, %idx1) threads in (%idx1, %idx1, %idx1)
-        args(%buf0 : memref<2xi32>, %buf1 : memref<2xi32>, %buf2 : memref<3xi32>)
-    %buf4 = memref.cast %buf3 : memref<?xi32> to memref<*xi32>
-    call @printMemrefI32(%buf4) : (memref<*xi32>) -> ()
-    return
-  }
-  func.func private @fillResource1DInt(%0 : memref<?xi32>, %1 : i32)
-  func.func private @printMemrefI32(%ptr : memref<*xi32>)
-}

From 0d37e5c61797d6075f02daf28d1b16561299bc69 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Wed, 29 May 2024 21:28:44 +0100
Subject: [PATCH 178/230] [RISCV] Remove redundant TSFlags parameter in
 RISCVInsertVSETVLI. NFC

---
 llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index f350644d4512e6..4c57eecd8465dc 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -908,8 +908,7 @@ class RISCVInsertVSETVLI : public MachineFunctionPass {
   void coalesceVSETVLIs(MachineBasicBlock &MBB) const;
 
   VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI) const;
-  VSETVLIInfo computeInfoForInstr(const MachineInstr &MI,
-                                  uint64_t TSFlags) const;
+  VSETVLIInfo computeInfoForInstr(const MachineInstr &MI) const;
 };
 
 } // end anonymous namespace
@@ -957,9 +956,10 @@ static unsigned computeVLMAX(unsigned VLEN, unsigned SEW,
   return VLEN/SEW;
 }
 
-VSETVLIInfo RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI,
-                                                    uint64_t TSFlags) const {
+VSETVLIInfo
+RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const {
   VSETVLIInfo InstrInfo;
+  const uint64_t TSFlags = MI.getDesc().TSFlags;
 
   bool TailAgnostic = true;
   bool MaskAgnostic = true;
@@ -1198,13 +1198,12 @@ static VSETVLIInfo adjustIncoming(VSETVLIInfo PrevInfo, VSETVLIInfo NewInfo,
 // legal for MI, but may not be the state requested by MI.
 void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
                                         const MachineInstr &MI) const {
-  uint64_t TSFlags = MI.getDesc().TSFlags;
-  if (!RISCVII::hasSEWOp(TSFlags))
+  if (!RISCVII::hasSEWOp(MI.getDesc().TSFlags))
     return;
 
   DemandedFields Demanded = getDemanded(MI, ST);
 
-  const VSETVLIInfo NewInfo = computeInfoForInstr(MI, TSFlags);
+  const VSETVLIInfo NewInfo = computeInfoForInstr(MI);
   assert(NewInfo.isValid() && !NewInfo.isUnknown());
   if (Info.isValid() && !needVSETVLI(Demanded, NewInfo, Info))
     return;

From b98bce50ac545e63494335141ff4cc3e39bcb4af Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Wed, 29 May 2024 13:35:59 -0700
Subject: [PATCH 179/230] [nfc] clang-format llvm/Bitcode/BitcodeWriter.h

Odd indenting.
---
 llvm/include/llvm/Bitcode/BitcodeWriter.h | 234 +++++++++++-----------
 1 file changed, 117 insertions(+), 117 deletions(-)

diff --git a/llvm/include/llvm/Bitcode/BitcodeWriter.h b/llvm/include/llvm/Bitcode/BitcodeWriter.h
index a343f0e0576318..d1f9d57b6db6af 100644
--- a/llvm/include/llvm/Bitcode/BitcodeWriter.h
+++ b/llvm/include/llvm/Bitcode/BitcodeWriter.h
@@ -29,87 +29,46 @@ class BitstreamWriter;
 class Module;
 class raw_ostream;
 
-  class BitcodeWriter {
-    SmallVectorImpl<char> &Buffer;
-    std::unique_ptr<BitstreamWriter> Stream;
-
-    StringTableBuilder StrtabBuilder{StringTableBuilder::RAW};
-
-    // Owns any strings created by the irsymtab writer until we create the
-    // string table.
-    BumpPtrAllocator Alloc;
-
-    bool WroteStrtab = false, WroteSymtab = false;
-
-    void writeBlob(unsigned Block, unsigned Record, StringRef Blob);
-
-    std::vector<Module *> Mods;
-
-  public:
-    /// Create a BitcodeWriter that writes to Buffer.
-    BitcodeWriter(SmallVectorImpl<char> &Buffer, raw_fd_stream *FS = nullptr);
-
-    ~BitcodeWriter();
-
-    /// Attempt to write a symbol table to the bitcode file. This must be called
-    /// at most once after all modules have been written.
-    ///
-    /// A reader does not require a symbol table to interpret a bitcode file;
-    /// the symbol table is needed only to improve link-time performance. So
-    /// this function may decide not to write a symbol table. It may so decide
-    /// if, for example, the target is unregistered or the IR is malformed.
-    void writeSymtab();
-
-    /// Write the bitcode file's string table. This must be called exactly once
-    /// after all modules and the optional symbol table have been written.
-    void writeStrtab();
-
-    /// Copy the string table for another module into this bitcode file. This
-    /// should be called after copying the module itself into the bitcode file.
-    void copyStrtab(StringRef Strtab);
-
-    /// Write the specified module to the buffer specified at construction time.
-    ///
-    /// If \c ShouldPreserveUseListOrder, encode the use-list order for each \a
-    /// Value in \c M.  These will be reconstructed exactly when \a M is
-    /// deserialized.
-    ///
-    /// If \c Index is supplied, the bitcode will contain the summary index
-    /// (currently for use in ThinLTO optimization).
-    ///
-    /// \p GenerateHash enables hashing the Module and including the hash in the
-    /// bitcode (currently for use in ThinLTO incremental build).
-    ///
-    /// If \p ModHash is non-null, when GenerateHash is true, the resulting
-    /// hash is written into ModHash. When GenerateHash is false, that value
-    /// is used as the hash instead of computing from the generated bitcode.
-    /// Can be used to produce the same module hash for a minimized bitcode
-    /// used just for the thin link as in the regular full bitcode that will
-    /// be used in the backend.
-    void writeModule(const Module &M, bool ShouldPreserveUseListOrder = false,
-                     const ModuleSummaryIndex *Index = nullptr,
-                     bool GenerateHash = false, ModuleHash *ModHash = nullptr);
-
-    /// Write the specified thin link bitcode file (i.e., the minimized bitcode
-    /// file) to the buffer specified at construction time. The thin link
-    /// bitcode file is used for thin link, and it only contains the necessary
-    /// information for thin link.
-    ///
-    /// ModHash is for use in ThinLTO incremental build, generated while the
-    /// IR bitcode file writing.
-    void writeThinLinkBitcode(const Module &M, const ModuleSummaryIndex &Index,
-                              const ModuleHash &ModHash);
-
-    void writeIndex(
-        const ModuleSummaryIndex *Index,
-        const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex,
-        const GVSummaryPtrSet *DecSummaries);
-  };
-
-  /// Write the specified module to the specified raw output stream.
+class BitcodeWriter {
+  SmallVectorImpl<char> &Buffer;
+  std::unique_ptr<BitstreamWriter> Stream;
+
+  StringTableBuilder StrtabBuilder{StringTableBuilder::RAW};
+
+  // Owns any strings created by the irsymtab writer until we create the
+  // string table.
+  BumpPtrAllocator Alloc;
+
+  bool WroteStrtab = false, WroteSymtab = false;
+
+  void writeBlob(unsigned Block, unsigned Record, StringRef Blob);
+
+  std::vector<Module *> Mods;
+
+public:
+  /// Create a BitcodeWriter that writes to Buffer.
+  BitcodeWriter(SmallVectorImpl<char> &Buffer, raw_fd_stream *FS = nullptr);
+
+  ~BitcodeWriter();
+
+  /// Attempt to write a symbol table to the bitcode file. This must be called
+  /// at most once after all modules have been written.
   ///
-  /// For streams where it matters, the given stream should be in "binary"
-  /// mode.
+  /// A reader does not require a symbol table to interpret a bitcode file;
+  /// the symbol table is needed only to improve link-time performance. So
+  /// this function may decide not to write a symbol table. It may so decide
+  /// if, for example, the target is unregistered or the IR is malformed.
+  void writeSymtab();
+
+  /// Write the bitcode file's string table. This must be called exactly once
+  /// after all modules and the optional symbol table have been written.
+  void writeStrtab();
+
+  /// Copy the string table for another module into this bitcode file. This
+  /// should be called after copying the module itself into the bitcode file.
+  void copyStrtab(StringRef Strtab);
+
+  /// Write the specified module to the buffer specified at construction time.
   ///
   /// If \c ShouldPreserveUseListOrder, encode the use-list order for each \a
   /// Value in \c M.  These will be reconstructed exactly when \a M is
@@ -127,46 +86,87 @@ class raw_ostream;
   /// Can be used to produce the same module hash for a minimized bitcode
   /// used just for the thin link as in the regular full bitcode that will
   /// be used in the backend.
-  void WriteBitcodeToFile(const Module &M, raw_ostream &Out,
-                          bool ShouldPreserveUseListOrder = false,
-                          const ModuleSummaryIndex *Index = nullptr,
-                          bool GenerateHash = false,
-                          ModuleHash *ModHash = nullptr);
+  void writeModule(const Module &M, bool ShouldPreserveUseListOrder = false,
+                   const ModuleSummaryIndex *Index = nullptr,
+                   bool GenerateHash = false, ModuleHash *ModHash = nullptr);
 
   /// Write the specified thin link bitcode file (i.e., the minimized bitcode
-  /// file) to the given raw output stream, where it will be written in a new
-  /// bitcode block. The thin link bitcode file is used for thin link, and it
-  /// only contains the necessary information for thin link.
+  /// file) to the buffer specified at construction time. The thin link
+  /// bitcode file is used for thin link, and it only contains the necessary
+  /// information for thin link.
   ///
-  /// ModHash is for use in ThinLTO incremental build, generated while the IR
-  /// bitcode file writing.
-  void writeThinLinkBitcodeToFile(const Module &M, raw_ostream &Out,
-                                  const ModuleSummaryIndex &Index,
-                                  const ModuleHash &ModHash);
-
-  /// Write the specified module summary index to the given raw output stream,
-  /// where it will be written in a new bitcode block. This is used when
-  /// writing the combined index file for ThinLTO. When writing a subset of the
-  /// index for a distributed backend, provide the \p ModuleToSummariesForIndex
-  /// map. \p DecSummaries specifies the set of summaries for which the
-  /// corresponding value should be imported as a declaration (prototype).
-  void writeIndexToFile(const ModuleSummaryIndex &Index, raw_ostream &Out,
-                        const std::map<std::string, GVSummaryMapTy>
-                            *ModuleToSummariesForIndex = nullptr,
-                        const GVSummaryPtrSet *DecSummaries = nullptr);
-
-  /// If EmbedBitcode is set, save a copy of the llvm IR as data in the
-  ///  __LLVM,__bitcode section (.llvmbc on non-MacOS).
-  /// If available, pass the serialized module via the Buf parameter. If not,
-  /// pass an empty (default-initialized) MemoryBufferRef, and the serialization
-  /// will be handled by this API. The same behavior happens if the provided Buf
-  /// is not bitcode (i.e. if it's invalid data or even textual LLVM assembly).
-  /// If EmbedCmdline is set, the command line is also exported in
-  /// the corresponding section (__LLVM,_cmdline / .llvmcmd) - even if CmdArgs
-  /// were empty.
-  void embedBitcodeInModule(Module &M, MemoryBufferRef Buf, bool EmbedBitcode,
-                            bool EmbedCmdline,
-                            const std::vector<uint8_t> &CmdArgs);
+  /// ModHash is for use in ThinLTO incremental build, generated while the
+  /// IR bitcode file writing.
+  void writeThinLinkBitcode(const Module &M, const ModuleSummaryIndex &Index,
+                            const ModuleHash &ModHash);
+
+  void writeIndex(
+      const ModuleSummaryIndex *Index,
+      const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex,
+      const GVSummaryPtrSet *DecSummaries);
+};
+
+/// Write the specified module to the specified raw output stream.
+///
+/// For streams where it matters, the given stream should be in "binary"
+/// mode.
+///
+/// If \c ShouldPreserveUseListOrder, encode the use-list order for each \a
+/// Value in \c M.  These will be reconstructed exactly when \a M is
+/// deserialized.
+///
+/// If \c Index is supplied, the bitcode will contain the summary index
+/// (currently for use in ThinLTO optimization).
+///
+/// \p GenerateHash enables hashing the Module and including the hash in the
+/// bitcode (currently for use in ThinLTO incremental build).
+///
+/// If \p ModHash is non-null, when GenerateHash is true, the resulting
+/// hash is written into ModHash. When GenerateHash is false, that value
+/// is used as the hash instead of computing from the generated bitcode.
+/// Can be used to produce the same module hash for a minimized bitcode
+/// used just for the thin link as in the regular full bitcode that will
+/// be used in the backend.
+void WriteBitcodeToFile(const Module &M, raw_ostream &Out,
+                        bool ShouldPreserveUseListOrder = false,
+                        const ModuleSummaryIndex *Index = nullptr,
+                        bool GenerateHash = false,
+                        ModuleHash *ModHash = nullptr);
+
+/// Write the specified thin link bitcode file (i.e., the minimized bitcode
+/// file) to the given raw output stream, where it will be written in a new
+/// bitcode block. The thin link bitcode file is used for thin link, and it
+/// only contains the necessary information for thin link.
+///
+/// ModHash is for use in ThinLTO incremental build, generated while the IR
+/// bitcode file writing.
+void writeThinLinkBitcodeToFile(const Module &M, raw_ostream &Out,
+                                const ModuleSummaryIndex &Index,
+                                const ModuleHash &ModHash);
+
+/// Write the specified module summary index to the given raw output stream,
+/// where it will be written in a new bitcode block. This is used when
+/// writing the combined index file for ThinLTO. When writing a subset of the
+/// index for a distributed backend, provide the \p ModuleToSummariesForIndex
+/// map. \p DecSummaries specifies the set of summaries for which the
+/// corresponding value should be imported as a declaration (prototype).
+void writeIndexToFile(const ModuleSummaryIndex &Index, raw_ostream &Out,
+                      const std::map<std::string, GVSummaryMapTy>
+                          *ModuleToSummariesForIndex = nullptr,
+                      const GVSummaryPtrSet *DecSummaries = nullptr);
+
+/// If EmbedBitcode is set, save a copy of the llvm IR as data in the
+///  __LLVM,__bitcode section (.llvmbc on non-MacOS).
+/// If available, pass the serialized module via the Buf parameter. If not,
+/// pass an empty (default-initialized) MemoryBufferRef, and the serialization
+/// will be handled by this API. The same behavior happens if the provided Buf
+/// is not bitcode (i.e. if it's invalid data or even textual LLVM assembly).
+/// If EmbedCmdline is set, the command line is also exported in
+/// the corresponding section (__LLVM,_cmdline / .llvmcmd) - even if CmdArgs
+/// were empty.
+void embedBitcodeInModule(Module &M, MemoryBufferRef Buf, bool EmbedBitcode,
+                          bool EmbedCmdline,
+                          const std::vector<uint8_t> &CmdArgs);
 
 } // end namespace llvm
 

From b74f50a26980233c6bdf3bcacea1473a4a5e0d42 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Wed, 29 May 2024 13:36:57 -0700
Subject: [PATCH 180/230] [LAA] Store reference to SymbolicStrides in
 MemoryDepChecker (NFC).

This reduces the need for explicitly passing it through multiple layers
of function calls.
---
 .../llvm/Analysis/LoopAccessAnalysis.h        | 10 ++++++---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      | 22 +++++++++----------
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index c22e1d470f380c..acb3e1406032f5 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -182,8 +182,9 @@ class MemoryDepChecker {
   };
 
   MemoryDepChecker(PredicatedScalarEvolution &PSE, const Loop *L,
+                   const DenseMap<Value *, const SCEV *> &SymbolicStrides,
                    unsigned MaxTargetVectorWidthInBits)
-      : PSE(PSE), InnermostLoop(L),
+      : PSE(PSE), InnermostLoop(L), SymbolicStrides(SymbolicStrides),
         MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits) {}
 
   /// Register the location (instructions are given increasing numbers)
@@ -198,7 +199,6 @@ class MemoryDepChecker {
   ///
   /// Only checks sets with elements in \p CheckDeps.
   bool areDepsSafe(DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
-                   const DenseMap<Value *, const SCEV *> &Strides,
                    const DenseMap<Value *, SmallVector<const Value *, 16>>
                        &UnderlyingObjects);
 
@@ -278,6 +278,10 @@ class MemoryDepChecker {
   PredicatedScalarEvolution &PSE;
   const Loop *InnermostLoop;
 
+  /// Reference to map of pointer values to
+  /// their stride symbols, if they have a symbolic stride.
+  const DenseMap<Value *, const SCEV *> &SymbolicStrides;
+
   /// Maps access locations (ptr, read/write) to program order.
   DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses;
 
@@ -336,7 +340,7 @@ class MemoryDepChecker {
   /// Otherwise, this function returns true signaling a possible dependence.
   Dependence::DepType
   isDependent(const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
-              unsigned BIdx, const DenseMap<Value *, const SCEV *> &Strides,
+              unsigned BIdx,
               const DenseMap<Value *, SmallVector<const Value *, 16>>
                   &UnderlyingObjects);
 
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index bd4c2a35ebf2cb..ab77e35cf6bd50 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2025,7 +2025,7 @@ getDependenceDistanceStrideAndSize(
 
 MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
     const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
-    unsigned BIdx, const DenseMap<Value *, const SCEV *> &Strides,
+    unsigned BIdx,
     const DenseMap<Value *, SmallVector<const Value *, 16>>
         &UnderlyingObjects) {
   assert(AIdx < BIdx && "Must pass arguments in program order");
@@ -2033,8 +2033,8 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
   // Get the dependence distance, stride, type size and what access writes for
   // the dependence between A and B.
   auto Res = getDependenceDistanceStrideAndSize(
-      A, InstMap[AIdx], B, InstMap[BIdx], Strides, UnderlyingObjects, PSE,
-      InnermostLoop);
+      A, InstMap[AIdx], B, InstMap[BIdx], SymbolicStrides, UnderlyingObjects,
+      PSE, InnermostLoop);
   if (std::holds_alternative<Dependence::DepType>(Res))
     return std::get<Dependence::DepType>(Res);
 
@@ -2269,7 +2269,6 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
 
 bool MemoryDepChecker::areDepsSafe(
     DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
-    const DenseMap<Value *, const SCEV *> &Strides,
     const DenseMap<Value *, SmallVector<const Value *, 16>>
         &UnderlyingObjects) {
 
@@ -2314,9 +2313,8 @@ bool MemoryDepChecker::areDepsSafe(
             if (*I1 > *I2)
               std::swap(A, B);
 
-            Dependence::DepType Type =
-                isDependent(*A.first, A.second, *B.first, B.second, Strides,
-                            UnderlyingObjects);
+            Dependence::DepType Type = isDependent(*A.first, A.second, *B.first,
+                                                   B.second, UnderlyingObjects);
             mergeInStatus(Dependence::isSafeForVectorization(Type));
 
             // Gather dependences unless we accumulated MaxDependences
@@ -2674,9 +2672,9 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
   CanVecMem = true;
   if (Accesses.isDependencyCheckNeeded()) {
     LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
-    CanVecMem = DepChecker->areDepsSafe(
-        DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides,
-        Accesses.getUnderlyingObjects());
+    CanVecMem = DepChecker->areDepsSafe(DependentAccesses,
+                                        Accesses.getDependenciesToCheck(),
+                                        Accesses.getUnderlyingObjects());
 
     if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) {
       LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
@@ -3066,8 +3064,8 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
     if (ScalableWidth.isNonZero())
       MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max();
   }
-  DepChecker =
-      std::make_unique<MemoryDepChecker>(*PSE, L, MaxTargetVectorWidthInBits);
+  DepChecker = std::make_unique<MemoryDepChecker>(*PSE, L, SymbolicStrides,
+                                                  MaxTargetVectorWidthInBits);
   PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
   if (canAnalyzeLoop())
     analyzeLoop(AA, LI, TLI, DT);

From 6595e7fa1b5588f860aa057aac47c43623169584 Mon Sep 17 00:00:00 2001
From: gulfemsavrun <gulfem@google.com>
Date: Wed, 29 May 2024 13:56:37 -0700
Subject: [PATCH 181/230] Revert "[lldb][lldb-dap] Cleanup breakpoint filters."
 (#93739)

Reverts llvm/llvm-project#87550 because it broke `TestDAP*` lldb tests.

https://luci-milo.appspot.com/ui/p/fuchsia/builders/toolchain.ci/clang-linux-x64-rbe/b8746585790559468897/overview
---
 lldb/include/lldb/API/SBDebugger.h    |  2 --
 lldb/include/lldb/Symbol/TypeSystem.h |  1 -
 lldb/source/API/SBDebugger.cpp        |  4 ---
 lldb/source/Symbol/TypeSystem.cpp     | 11 --------
 lldb/tools/lldb-dap/DAP.cpp           | 39 +++++++--------------------
 lldb/tools/lldb-dap/DAP.h             |  4 +--
 lldb/tools/lldb-dap/lldb-dap.cpp      |  6 ++---
 7 files changed, 13 insertions(+), 54 deletions(-)

diff --git a/lldb/include/lldb/API/SBDebugger.h b/lldb/include/lldb/API/SBDebugger.h
index 84ea9c0f772e16..af19b1faf3bf51 100644
--- a/lldb/include/lldb/API/SBDebugger.h
+++ b/lldb/include/lldb/API/SBDebugger.h
@@ -57,8 +57,6 @@ class LLDB_API SBDebugger {
 
   static const char *GetBroadcasterClass();
 
-  static bool SupportsLanguage(lldb::LanguageType language);
-
   lldb::SBBroadcaster GetBroadcaster();
 
   /// Get progress data from a SBEvent whose type is eBroadcastBitProgress.
diff --git a/lldb/include/lldb/Symbol/TypeSystem.h b/lldb/include/lldb/Symbol/TypeSystem.h
index 7d48f9b316138c..b4025c173a1861 100644
--- a/lldb/include/lldb/Symbol/TypeSystem.h
+++ b/lldb/include/lldb/Symbol/TypeSystem.h
@@ -209,7 +209,6 @@ class TypeSystem : public PluginInterface,
   // TypeSystems can support more than one language
   virtual bool SupportsLanguage(lldb::LanguageType language) = 0;
 
-  static bool SupportsLanguageStatic(lldb::LanguageType language);
   // Type Completion
 
   virtual bool GetCompleteType(lldb::opaque_compiler_type_t type) = 0;
diff --git a/lldb/source/API/SBDebugger.cpp b/lldb/source/API/SBDebugger.cpp
index 29da7d33dd80b8..7ef0d6efd4aaa5 100644
--- a/lldb/source/API/SBDebugger.cpp
+++ b/lldb/source/API/SBDebugger.cpp
@@ -1742,7 +1742,3 @@ bool SBDebugger::InterruptRequested()   {
     return m_opaque_sp->InterruptRequested();
   return false;
 }
-
-bool SBDebugger::SupportsLanguage(lldb::LanguageType language) {
-  return TypeSystem::SupportsLanguageStatic(language);
-}
diff --git a/lldb/source/Symbol/TypeSystem.cpp b/lldb/source/Symbol/TypeSystem.cpp
index 5d56d9b1829dac..4956f10a0b0a73 100644
--- a/lldb/source/Symbol/TypeSystem.cpp
+++ b/lldb/source/Symbol/TypeSystem.cpp
@@ -335,14 +335,3 @@ TypeSystemMap::GetTypeSystemForLanguage(lldb::LanguageType language,
   }
   return GetTypeSystemForLanguage(language);
 }
-
-bool TypeSystem::SupportsLanguageStatic(lldb::LanguageType language) {
-  if (language == eLanguageTypeUnknown)
-    return false;
-
-  LanguageSet languages =
-      PluginManager::GetAllTypeSystemSupportedLanguagesForTypes();
-  if (languages.Empty())
-    return false;
-  return languages[language];
-}
diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp
index 807d27c2c869d9..d419f821999e6c 100644
--- a/lldb/tools/lldb-dap/DAP.cpp
+++ b/lldb/tools/lldb-dap/DAP.cpp
@@ -32,7 +32,14 @@ namespace lldb_dap {
 DAP g_dap;
 
 DAP::DAP()
-    : broadcaster("lldb-dap"), exception_breakpoints(),
+    : broadcaster("lldb-dap"),
+      exception_breakpoints(
+          {{"cpp_catch", "C++ Catch", lldb::eLanguageTypeC_plus_plus},
+           {"cpp_throw", "C++ Throw", lldb::eLanguageTypeC_plus_plus},
+           {"objc_catch", "Objective-C Catch", lldb::eLanguageTypeObjC},
+           {"objc_throw", "Objective-C Throw", lldb::eLanguageTypeObjC},
+           {"swift_catch", "Swift Catch", lldb::eLanguageTypeSwift},
+           {"swift_throw", "Swift Throw", lldb::eLanguageTypeSwift}}),
       focus_tid(LLDB_INVALID_THREAD_ID), stop_at_entry(false), is_attach(false),
       enable_auto_variable_summaries(false),
       enable_synthetic_child_debugging(false),
@@ -58,32 +65,8 @@ DAP::DAP()
 
 DAP::~DAP() = default;
 
-void DAP::PopulateExceptionBreakpoints() {
-  exception_breakpoints = {};
-  if (debugger.SupportsLanguage(lldb::eLanguageTypeC_plus_plus)) {
-    exception_breakpoints->emplace_back("cpp_catch", "C++ Catch",
-                                        lldb::eLanguageTypeC_plus_plus);
-    exception_breakpoints->emplace_back("cpp_throw", "C++ Throw",
-                                        lldb::eLanguageTypeC_plus_plus);
-  }
-  if (debugger.SupportsLanguage(lldb::eLanguageTypeObjC)) {
-    exception_breakpoints->emplace_back("objc_catch", "Objective-C Catch",
-                                        lldb::eLanguageTypeObjC);
-    exception_breakpoints->emplace_back("objc_throw", "Objective-C Throw",
-                                        lldb::eLanguageTypeObjC);
-  }
-  if (debugger.SupportsLanguage(lldb::eLanguageTypeSwift)) {
-    exception_breakpoints->emplace_back("swift_catch", "Swift Catch",
-                                        lldb::eLanguageTypeSwift);
-    exception_breakpoints->emplace_back("swift_throw", "Swift Throw",
-                                        lldb::eLanguageTypeSwift);
-  }
-}
-
 ExceptionBreakpoint *DAP::GetExceptionBreakpoint(const std::string &filter) {
-  assert(exception_breakpoints.has_value() &&
-         "PopulateExceptionBreakpoints must be called first");
-  for (auto &bp : *exception_breakpoints) {
+  for (auto &bp : exception_breakpoints) {
     if (bp.filter == filter)
       return &bp;
   }
@@ -91,9 +74,7 @@ ExceptionBreakpoint *DAP::GetExceptionBreakpoint(const std::string &filter) {
 }
 
 ExceptionBreakpoint *DAP::GetExceptionBreakpoint(const lldb::break_id_t bp_id) {
-  assert(exception_breakpoints.has_value() &&
-         "PopulateExceptionBreakpoints must be called first");
-  for (auto &bp : *exception_breakpoints) {
+  for (auto &bp : exception_breakpoints) {
     if (bp.bp.GetID() == bp_id)
       return &bp;
   }
diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h
index d114b886a15970..a88ee3e1dec6bc 100644
--- a/lldb/tools/lldb-dap/DAP.h
+++ b/lldb/tools/lldb-dap/DAP.h
@@ -156,7 +156,7 @@ struct DAP {
   std::unique_ptr<std::ofstream> log;
   llvm::StringMap<SourceBreakpointMap> source_breakpoints;
   FunctionBreakpointMap function_breakpoints;
-  std::optional<std::vector<ExceptionBreakpoint>> exception_breakpoints;
+  std::vector<ExceptionBreakpoint> exception_breakpoints;
   std::vector<std::string> init_commands;
   std::vector<std::string> pre_run_commands;
   std::vector<std::string> post_run_commands;
@@ -228,8 +228,6 @@ struct DAP {
 
   llvm::json::Value CreateTopLevelScopes();
 
-  void PopulateExceptionBreakpoints();
-
   /// \return
   ///   Attempt to determine if an expression is a variable expression or
   ///   lldb command using a hueristic based on the first term of the
diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp
index 470c9f84c6a203..7746afb6cbbf38 100644
--- a/lldb/tools/lldb-dap/lldb-dap.cpp
+++ b/lldb/tools/lldb-dap/lldb-dap.cpp
@@ -16,7 +16,6 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
-#include <optional>
 #include <sys/stat.h>
 #include <sys/types.h>
 #if defined(_WIN32)
@@ -1587,7 +1586,6 @@ void request_initialize(const llvm::json::Object &request) {
   bool source_init_file = GetBoolean(arguments, "sourceInitFile", true);
 
   g_dap.debugger = lldb::SBDebugger::Create(source_init_file, log_cb, nullptr);
-  g_dap.PopulateExceptionBreakpoints();
   auto cmd = g_dap.debugger.GetCommandInterpreter().AddMultiwordCommand(
       "lldb-dap", "Commands for managing lldb-dap.");
   if (GetBoolean(arguments, "supportsStartDebuggingRequest", false)) {
@@ -1623,7 +1621,7 @@ void request_initialize(const llvm::json::Object &request) {
   body.try_emplace("supportsEvaluateForHovers", true);
   // Available filters or options for the setExceptionBreakpoints request.
   llvm::json::Array filters;
-  for (const auto &exc_bp : *g_dap.exception_breakpoints) {
+  for (const auto &exc_bp : g_dap.exception_breakpoints) {
     filters.emplace_back(CreateExceptionBreakpointFilter(exc_bp));
   }
   body.try_emplace("exceptionBreakpointFilters", std::move(filters));
@@ -2478,7 +2476,7 @@ void request_setExceptionBreakpoints(const llvm::json::Object &request) {
   // Keep a list of any exception breakpoint filter names that weren't set
   // so we can clear any exception breakpoints if needed.
   std::set<std::string> unset_filters;
-  for (const auto &bp : *g_dap.exception_breakpoints)
+  for (const auto &bp : g_dap.exception_breakpoints)
     unset_filters.insert(bp.filter);
 
   for (const auto &value : *filters) {

From 495bc3cf23dedadd6d633cf9600b7c1f8ac87bb4 Mon Sep 17 00:00:00 2001
From: "S. Bharadwaj Yadavalli" <Bharadwaj.Yadavalli@microsoft.com>
Date: Wed, 29 May 2024 16:58:13 -0400
Subject: [PATCH 182/230] [DirectX][DXIL] Design document for TableGen Spec of
 DXIL Operations (#85170)

Add an initial design document for TableGen specification of DXIL Operations.
---
 llvm/docs/DirectX/DXILOpTableGenDesign.rst | 160 +++++++++++++++++++++
 1 file changed, 160 insertions(+)
 create mode 100644 llvm/docs/DirectX/DXILOpTableGenDesign.rst

diff --git a/llvm/docs/DirectX/DXILOpTableGenDesign.rst b/llvm/docs/DirectX/DXILOpTableGenDesign.rst
new file mode 100644
index 00000000000000..2b1e0901b50614
--- /dev/null
+++ b/llvm/docs/DirectX/DXILOpTableGenDesign.rst
@@ -0,0 +1,160 @@
+==============================================================
+Specification of DXIL Operations using TableGen Representation
+==============================================================
+.. contents::
+   :local:
+
+.. toctree::
+   :hidden:
+
+Introduction
+============
+
+`DirectXShaderCompiler <https://github.com/microsoft/DirectXShaderCompiler>`_
+encapsulates, among other information, various DXIL Operations in
+`hctdb.py <https://github.com/microsoft/DirectXShaderCompiler/blob/main/utils/hct/hctdb.py>`_.
+DXIL Operations are represented in one of the following `two ways
+<https://github.com/microsoft/DirectXShaderCompiler/blob/130877392c263888ef06bab768856d3dab1f1c9a/docs/DXIL.rst#L1978>`_:
+
+#. Using LLVM instructions.
+#. Using LLVM External functions. These are represented in LLVM IR as follows:
+   * "Standard" LLVM intrinsics (e.g., ``llvm.sin.*``) and
+   * HLSL intrinsics (defined as LLVM intrinsics in ``llvm/include/llvm/IR/IntrinsicsDirectX.td``, e.g., ``llvm.dx.*``)
+
+   These are collectively referred to as `LLVM Intrinsics` in this note.
+
+Following is the complete list of properties of DXIL Ops with the corresponding field name
+as used in ``hctdb.py``. A DXIL Op is represented by a set of associated properties. These
+are categorized into two groups - viz., those that are (1) consumed in DXIL backend passes
+and (2) consumed in other usage scenarios such as validation, DXIL reader, etc.
+
+A. Properties consumed in DXIL backend passes
+
+   1. Name of operation (``dxil_op``)
+   2. The generic or HLSL-specific intrinsic that maps to the operation (``llvm_name``).
+   3. Unique Integer ID (``dxil_opid``)
+   4. Operation Class signifying the name and function signature of the operation (``dxil_class``).
+      This string is an integral part of the DXIL Op function name and is constructed in
+      the format ``dx.op.<class-name>.<overload-type>``. The DXIL validator checks for any
+      deviation from this format in each DXIL Op call.
+   5. List of valid overload types for the operation (``oload_types``).
+   6. Required DXIL Version with support for the operation.
+   7. A string that documents the operation (``doc``) - This is not strictly necessary but is included
+      for readability and documentation of the operation.
+
+B. Properties consumed in other usage scenarios
+
+   1. Required minimum Shader Model (``shader_model``).
+   2. Minimum shader model required with translation by linker (``shader_model_translated``)
+   3.  List of shader stages applicable to (``shader_stages``), empty for all.
+   4.  Memory access attributes of the operation (``fn_attr``).
+   5.  Boolean attributes of operation to indicate if it
+
+       * is some kind of a derivative (``is_derivative``)
+       * requires gradient calculation (``is_gradient``)
+       * is a sampler feedback (``is_feedback``)
+       * requires in-wave, cross-lane functionality (``is_wave``)
+       * requires that all of its inputs are uniform across the wave (``requires_uniform_inputs``).
+       * is a barrier operation (``is_barrier``).
+
+Motivation
+==========
+
+DXIL backend passes depend on various properties of DXIL Operations. For example, ``DXILLowering``
+pass will need information such as the DXIL operation an LLVM intrinsic is to be lowered to,
+along with valid overload and parameter types etc. The TableGen file -
+``llvm/lib/Target/DirectX/DXIL.td`` - is used to represent DXIL Operations
+by specifying their properties listed above. ``DXIL.td`` is designed to be the single source
+of reference of DXIL Operations for DXIL backend implementation in ``llvm-project`` repo -
+analogous to ``hctdb.py`` for ``DirectXShaderCompiler`` repo. It needs to have rich
+representation capabilities that TableGen backends (such as ``DXILEmitter``) can rely on.
+Additionally, the DXIL Op specification should be easy to read and comprehend.
+
+This note focuses on specification of the set of properties consumed by DXIL backend
+passes identified above in category A. Any of the properties from category B are expected to be
+included as deemed necessary during implementation.
+
+Design
+======
+
+1. Each DXIL Operation is represented as a TableGen record. The name of each of the records
+   signifies the operation name.
+2. The LLVM Intrinsic that maps to the operation is represented using ``Intrinsic::*``.
+3. The unique operation id is represented by an integer.
+4. DXIL Operation Class is represented as follows
+
+   .. code-block::
+
+        // Abstraction of DXIL Operation class.
+        // It encapsulates an associated function signature viz.,
+        // returnTy(param1Ty, param2Ty, ...) represented as a list of LLVMTypes.
+        // DXIL Ops that belong to a DXILOpClass record the signature of that DXILOpClass
+
+        class DXILOpClass<list<LLVMType> OpSig> {
+          list<LLVMType> OpSignature = OpSig;
+        }
+
+   Concrete operation classes, such as ``unary`` are defined inheriting from ``DXILOpClass``.
+5. Valid overload types are represented as a list of ``LLVMType``.
+6. Concrete records of DXIL versions are defined by inheriting from the class
+
+   .. code-block::
+
+        // Abstract class to represent major and minor version values
+        class Version<int major, int minor> {
+          int Major = major;
+          int Minor = minor;
+        }
+
+7. A documentation string for the operation.
+
+
+A DXIL Operation is represented by the following TableGen class by encapsulating the various
+TableGen representations of its properties described above.
+
+.. code-block::
+
+  // Abstraction of DXIL Operation
+  class DXILOpPropertiesBase {
+    int OpCode = 0;                           // Opcode of DXIL Operation
+    DXILOpClass OpClass = UnknownOpClass;     // Class of DXIL Operation.
+    Intrinsic LLVMIntrinsic = ?;              // LLVM Intrinsic DXIL Operation maps to
+    list<LLVMType> OpOverloadTypes = ?;       // Valid overload type
+                                              // of DXIL Operation
+    Version DXILVer = ?;                      // Min DXIL version
+    string Doc = "";                          // A short description of the operation
+  }
+
+
+The following convenience class, definitions of ``unary`` and ``DXVer1_0`` are used to
+illustrate the definitions of ``Sin`` and ``Cos`` operations:
+
+  .. code-block::
+
+      class DXILOpProperties<int opCode,
+                    Intrinsic intrinsic,
+                    list<LLVMType> overloadTypes,
+                    string doc> : DXILOpPropertiesBase {
+        int OpCode = opCode;
+        Intrinsic LLVMIntrinsic = intrinsic;
+        list<LLVMType> OpOverloadTypes = overloadTypes;
+        string Doc = doc;
+      }
+
+      def unary : DXILOpClass<[llvm_any_ty, LLVMMatchType<0>]>;
+      def DXVer1_0 : Version<1, 0>;
+
+      let OpClass = unary, DXILVer = DXVer1_0 in {
+        def Cos  : DXILOpProperties<12, int_cos, [llvm_half_ty, llvm_float_ty],
+                                   "Returns cosine(theta) for theta in radians.">;
+        def Sin  : DXILOpProperties<13, int_sin, [llvm_half_ty, llvm_float_ty],
+                                   "Returns sine(theta) for theta in radians.">;
+      }
+
+Summary
+=======
+
+This note sketches the design of a readable and maintainable TableGen specification of
+DXIL Ops in ``DXIL.td`` intended to serve as a single source of reference for TableGen
+backends (such as ``DXILEmitter``) that generate C++ representations used in DXIL
+backend passes.

From bfabc958c7c0d7ddc15f23383d9da836e8c6093f Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Wed, 29 May 2024 14:09:28 -0700
Subject: [PATCH 183/230] [DebugInfo] Add flag to only emit referenced member
 functions (#87018)

Complete C++ type information can be quite expensive - and there's
limited value in representing every member function, even those that
can't be called (we don't do similarly for every non-member function
anyway). So add a flag to opt out of this behavior for experimenting
with this more terse behavior.

I think Sony already does this by default, so perhaps with a change to
the defaults, Sony can migrate to this rather than a downstream patch.

This breaks current debuggers in some expected ways - but those
breakages are visible without this feature too. Consider member function
template instantiations - they can't be consistently enumerated in every
translation unit:

a.h:
```
struct t1 {
  template <int i>
  static int f1() {
    return i;
  }
};
namespace ns {
template <int i>
int f1() {
  return i;
}
}  // namespace ns
```
a.cpp:
```
void f1() {
  t1::f1<0>();
  ns::f1<0>();
}
```
b.cpp:
```
void f1();
int main() {
  f1();
  t1::f1<1>();
  ns::f1<1>();
}
```
```
(gdb) p ns::f1<0>()
$1 = 0
(gdb) p ns::f1<1>()
$2 = 1
(gdb) p t1::f1<0>()
Couldn't find method t1::f1<0>
(gdb) p t1::f1<1>()
$3 = 1
(gdb) s
f1 () at a.cpp:3
3         t1::f1<0>();
(gdb) p t1::f1<0>()
$4 = 0
(gdb) p t1::f1<1>()
Couldn't find method t1::f1<1>
(gdb)
```

(other similar non-canonical features are implicit special members
(copy/move ctor/assignment operator, default ctor) and nested types (eg:
pimpl idiom, where the nested type is declared-but-not-defined in one
TU, and defined in another TU))

lldb can't parse the template expressions above, so I'm not sure how to
test it there, but I'd guess it has similar problems. (See
https://stackoverflow.com/questions/64602475/how-to-print-value-returned-by-template-member-function-in-gdb-lldb-debugging
so... I guess that's just totally not supported in lldb, how
unfortunate. And implicit special members are instantiated implicitly by
lldb, so missing those doesn't tickle the same issue.)

Some very rudimentary numbers for a clang debug build:
.debug_info section size:
-g: 476MiB
-g -fdebug-types-section: 357MiB
-g -gomit-unreferenced-methods: 340MiB

It also means a major reduction in .debug_str size: -fdebug-types-section
doesn't reduce string usage (so the first two configurations have the same
.debug_str size, 247MiB), while this flag brings it down to 175MiB.

So for total clang binary size (I don't have a quick "debug section size
reduction" on-hand): 1.45 (no type units) GiB -> 1.34 -> 1.22, so it
saves about 120MiB of binary size.

Also open to any riffing on the flag name for sure.

@probinson - would this be an accurate upstreaming of your internal
handling/would you use this functionality? If it wouldn't be useful to
you, it's maybe not worth adding upstream yet - not sure we'll use it at
Google, but if it was useful to you folks and meant other folks could
test with it it seemed maybe useful.

Original Differential Revision: https://reviews.llvm.org/D152017
---
 clang/include/clang/Basic/DebugOptions.def        |  2 ++
 clang/include/clang/Driver/Options.td             |  4 ++++
 clang/lib/CodeGen/CGDebugInfo.cpp                 |  2 +-
 clang/lib/Driver/ToolChains/Clang.cpp             | 15 +++++++++++++++
 .../CodeGenCXX/debug-info-incomplete-types.cpp    | 12 ++++++++++++
 clang/test/Driver/debug-options.c                 |  8 ++++++++
 6 files changed, 42 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CodeGenCXX/debug-info-incomplete-types.cpp

diff --git a/clang/include/clang/Basic/DebugOptions.def b/clang/include/clang/Basic/DebugOptions.def
index b94f6aef9ac60b..bc96d5dfdf890b 100644
--- a/clang/include/clang/Basic/DebugOptions.def
+++ b/clang/include/clang/Basic/DebugOptions.def
@@ -68,6 +68,8 @@ BENIGN_DEBUGOPT(NoInlineLineTables, 1, 0) ///< Whether debug info should contain
                                           ///< inline line tables.
 
 DEBUGOPT(DebugStrictDwarf, 1, 1) ///< Whether or not to use strict DWARF info.
+DEBUGOPT(DebugOmitUnreferencedMethods, 1, 0) ///< Omit unreferenced member
+					     ///< functions in type debug info.
 
 /// Control the Assignment Tracking debug info feature.
 BENIGN_ENUM_DEBUGOPT(AssignmentTrackingMode, AssignmentTrackingOpts, 2,
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 4119e69c85540e..f64d7c60783e9d 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4345,6 +4345,10 @@ defm strict_dwarf : BoolOption<"g", "strict-dwarf",
           "the specified version, avoiding features from later versions.">,
   NegFlag<SetFalse>, BothFlags<[], [ClangOption, CLOption, DXCOption]>>,
   Group<g_flags_Group>;
+defm omit_unreferenced_methods : BoolGOption<"omit-unreferenced-methods",
+  CodeGenOpts<"DebugOmitUnreferencedMethods">, DefaultFalse,
+  NegFlag<SetFalse>,
+  PosFlag<SetTrue, [], [CC1Option]>, BothFlags<[], [ClangOption, CLOption, DXCOption]>>;
 defm column_info : BoolOption<"g", "column-info",
   CodeGenOpts<"DebugColumnInfo">, DefaultTrue,
   NegFlag<SetFalse, [], [ClangOption, CC1Option]>,
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index fac278f0e20a43..1713f7065e7a20 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -2836,7 +2836,7 @@ CGDebugInfo::CreateTypeDefinition(const RecordType *Ty) {
 
   // Collect data fields (including static variables and any initializers).
   CollectRecordFields(RD, DefUnit, EltTys, FwdDecl);
-  if (CXXDecl)
+  if (CXXDecl && !CGM.getCodeGenOpts().DebugOmitUnreferencedMethods)
     CollectCXXMemberFunctions(CXXDecl, DefUnit, EltTys, FwdDecl);
 
   LexicalBlockStack.pop_back();
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 97e451cfe2acb4..4e1c52462e5842 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -45,6 +45,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/BinaryFormat/Magic.h"
 #include "llvm/Config/llvm-config.h"
+#include "llvm/Frontend/Debug/Options.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/CodeGen.h"
@@ -4642,6 +4643,7 @@ renderDebugOptions(const ToolChain &TC, const Driver &D, const llvm::Triple &T,
   Args.addOptInFlag(CmdArgs, options::OPT_fforce_dwarf_frame,
                     options::OPT_fno_force_dwarf_frame);
 
+  bool EnableTypeUnits = false;
   if (Args.hasFlag(options::OPT_fdebug_types_section,
                    options::OPT_fno_debug_types_section, false)) {
     if (!(T.isOSBinFormatELF() || T.isOSBinFormatWasm())) {
@@ -4652,11 +4654,24 @@ renderDebugOptions(const ToolChain &TC, const Driver &D, const llvm::Triple &T,
     } else if (checkDebugInfoOption(
                    Args.getLastArg(options::OPT_fdebug_types_section), Args, D,
                    TC)) {
+      EnableTypeUnits = true;
       CmdArgs.push_back("-mllvm");
       CmdArgs.push_back("-generate-type-units");
     }
   }
 
+  if (const Arg *A =
+          Args.getLastArg(options::OPT_gomit_unreferenced_methods,
+                          options::OPT_gno_omit_unreferenced_methods))
+    (void)checkDebugInfoOption(A, Args, D, TC);
+  if (Args.hasFlag(options::OPT_gomit_unreferenced_methods,
+                   options::OPT_gno_omit_unreferenced_methods, false) &&
+      (DebugInfoKind == llvm::codegenoptions::DebugInfoConstructor ||
+       DebugInfoKind == llvm::codegenoptions::LimitedDebugInfo) &&
+      !EnableTypeUnits) {
+    CmdArgs.push_back("-gomit-unreferenced-methods");
+  }
+
   // To avoid join/split of directory+filename, the integrated assembler prefers
   // the directory form of .file on all DWARF versions. GNU as doesn't allow the
   // form before DWARF v5.
diff --git a/clang/test/CodeGenCXX/debug-info-incomplete-types.cpp b/clang/test/CodeGenCXX/debug-info-incomplete-types.cpp
new file mode 100644
index 00000000000000..0bf59233b4e2eb
--- /dev/null
+++ b/clang/test/CodeGenCXX/debug-info-incomplete-types.cpp
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -debug-info-kind=limited -gomit-unreferenced-methods %s -emit-llvm -o - | FileCheck %s
+
+struct t1 {
+  void f1();
+  void f2();
+};
+
+void t1::f1() { }
+
+// CHECK: distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t1"
+// CHECK-SAME: elements: [[ELEMENTS:![0-9]+]]
+// CHECK: [[ELEMENTS]] = !{}
diff --git a/clang/test/Driver/debug-options.c b/clang/test/Driver/debug-options.c
index 7d061410a229f0..b09238d7b6bb66 100644
--- a/clang/test/Driver/debug-options.c
+++ b/clang/test/Driver/debug-options.c
@@ -242,6 +242,11 @@
 // RUN: %clang -### -c %s 2>&1 | FileCheck -check-prefix=NORNGBSE %s
 // RUN: %clang -### -c -fdebug-ranges-base-address -fno-debug-ranges-base-address %s 2>&1 | FileCheck -check-prefix=NORNGBSE %s
 //
+// RUN: %clang -### -c -gomit-unreferenced-methods %s 2>&1 | FileCheck -check-prefix=INCTYPES %s
+// RUN: %clang -### -c %s 2>&1 | FileCheck -check-prefix=NOINCTYPES %s
+// RUN: %clang -### -c -gomit-unreferenced-methods -fdebug-types-section %s 2>&1 | FileCheck -check-prefix=NOINCTYPES %s
+// RUN: %clang -### -c -gomit-unreferenced-methods -fstandalone-debug %s 2>&1 | FileCheck -check-prefix=NOINCTYPES %s
+//
 // RUN: %clang -### -c -glldb %s 2>&1 | FileCheck -check-prefix=NOPUB %s
 // RUN: %clang -### -c -glldb -gno-pubnames %s 2>&1 | FileCheck -check-prefix=NOPUB %s
 //
@@ -381,6 +386,9 @@
 // RNGBSE: -fdebug-ranges-base-address
 // NORNGBSE-NOT: -fdebug-ranges-base-address
 //
+// INCTYPES: -gomit-unreferenced-methods
+// NOINCTYPES-NOT: -gomit-unreferenced-methods
+//
 // GARANGE-DAG: -generate-arange-section
 //
 // FDTS: "-mllvm" "-generate-type-units"

From 1880a7bf18f8bf6497eddeda5cea49b507413f3a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Wed, 29 May 2024 14:18:03 -0700
Subject: [PATCH 184/230] [LAA] Move getDependenceDistanceStrideAndSize to
 MemoryDepChecker (NFC).

This avoids unnecessarily passing a number of parameters, and avoids
needing to add extra parameters in the future.
---
 .../llvm/Analysis/LoopAccessAnalysis.h        | 29 +++++++++++++
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      | 43 +++++--------------
 2 files changed, 39 insertions(+), 33 deletions(-)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index acb3e1406032f5..69afe7079aa163 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -355,6 +355,35 @@ class MemoryDepChecker {
   /// either PossiblySafeWithRtChecks or Unsafe and from
   /// PossiblySafeWithRtChecks to Unsafe.
   void mergeInStatus(VectorizationSafetyStatus S);
+
+  struct DepDistanceStrideAndSizeInfo {
+    const SCEV *Dist;
+    uint64_t StrideA;
+    uint64_t StrideB;
+    uint64_t TypeByteSize;
+    bool AIsWrite;
+    bool BIsWrite;
+
+    DepDistanceStrideAndSizeInfo(const SCEV *Dist, uint64_t StrideA,
+                                 uint64_t StrideB, uint64_t TypeByteSize,
+                                 bool AIsWrite, bool BIsWrite)
+        : Dist(Dist), StrideA(StrideA), StrideB(StrideB),
+          TypeByteSize(TypeByteSize), AIsWrite(AIsWrite), BIsWrite(BIsWrite) {}
+  };
+
+  /// Get the dependence distance, strides, type size and whether it is a write
+  /// for the dependence between A and B. Returns a DepType, if we can prove
+  /// there's no dependence or the analysis fails. Outlined to lambda to limit
+  /// the scope of various temporary variables, like A/BPtr, StrideA/BPtr and
+  /// others. Returns either the dependence result, if it could already be
+  /// determined, or a struct containing (Distance, Stride, TypeSize, AIsWrite,
+  /// BIsWrite).
+  std::variant<Dependence::DepType, DepDistanceStrideAndSizeInfo>
+  getDependenceDistanceStrideAndSize(
+      const MemAccessInfo &A, Instruction *AInst, const MemAccessInfo &B,
+      Instruction *BInst,
+      const DenseMap<Value *, SmallVector<const Value *, 16>>
+          &UnderlyingObjects);
 };
 
 class RuntimePointerChecking;
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index ab77e35cf6bd50..13005cb8335d1a 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1903,37 +1903,13 @@ isLoopVariantIndirectAddress(ArrayRef<const Value *> UnderlyingObjects,
   });
 }
 
-namespace {
-struct DepDistanceStrideAndSizeInfo {
-  const SCEV *Dist;
-  uint64_t StrideA;
-  uint64_t StrideB;
-  uint64_t TypeByteSize;
-  bool AIsWrite;
-  bool BIsWrite;
-
-  DepDistanceStrideAndSizeInfo(const SCEV *Dist, uint64_t StrideA,
-                               uint64_t StrideB, uint64_t TypeByteSize,
-                               bool AIsWrite, bool BIsWrite)
-      : Dist(Dist), StrideA(StrideA), StrideB(StrideB),
-        TypeByteSize(TypeByteSize), AIsWrite(AIsWrite), BIsWrite(BIsWrite) {}
-};
-} // namespace
-
-// Get the dependence distance, strides, type size and whether it is a write for
-// the dependence between A and B. Returns a DepType, if we can prove there's
-// no dependence or the analysis fails. Outlined to lambda to limit he scope
-// of various temporary variables, like A/BPtr, StrideA/BPtr and others.
-// Returns either the dependence result, if it could already be determined, or a
-// struct containing (Distance, Stride, TypeSize, AIsWrite, BIsWrite).
-static std::variant<MemoryDepChecker::Dependence::DepType,
-                    DepDistanceStrideAndSizeInfo>
-getDependenceDistanceStrideAndSize(
+std::variant<MemoryDepChecker::Dependence::DepType,
+             MemoryDepChecker::DepDistanceStrideAndSizeInfo>
+MemoryDepChecker::getDependenceDistanceStrideAndSize(
     const AccessAnalysis::MemAccessInfo &A, Instruction *AInst,
     const AccessAnalysis::MemAccessInfo &B, Instruction *BInst,
-    const DenseMap<Value *, const SCEV *> &Strides,
-    const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects,
-    PredicatedScalarEvolution &PSE, const Loop *InnermostLoop) {
+    const DenseMap<Value *, SmallVector<const Value *, 16>>
+        &UnderlyingObjects) {
   auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
   auto &SE = *PSE.getSE();
   auto [APtr, AIsWrite] = A;
@@ -1952,9 +1928,11 @@ getDependenceDistanceStrideAndSize(
     return MemoryDepChecker::Dependence::Unknown;
 
   int64_t StrideAPtr =
-      getPtrStride(PSE, ATy, APtr, InnermostLoop, Strides, true).value_or(0);
+      getPtrStride(PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true)
+          .value_or(0);
   int64_t StrideBPtr =
-      getPtrStride(PSE, BTy, BPtr, InnermostLoop, Strides, true).value_or(0);
+      getPtrStride(PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true)
+          .value_or(0);
 
   const SCEV *Src = PSE.getSCEV(APtr);
   const SCEV *Sink = PSE.getSCEV(BPtr);
@@ -2033,8 +2011,7 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
   // Get the dependence distance, stride, type size and what access writes for
   // the dependence between A and B.
   auto Res = getDependenceDistanceStrideAndSize(
-      A, InstMap[AIdx], B, InstMap[BIdx], SymbolicStrides, UnderlyingObjects,
-      PSE, InnermostLoop);
+      A, InstMap[AIdx], B, InstMap[BIdx], UnderlyingObjects);
   if (std::holds_alternative<Dependence::DepType>(Res))
     return std::get<Dependence::DepType>(Res);
 

From c6c08eee37bada190bd1aa4593c88a5e2c8cdaac Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Wed, 29 May 2024 14:21:06 -0700
Subject: [PATCH 185/230] [lldb] Remove setupterm workaround on macOS (#93714)

Remove setupterm workaround on macOS which caused issues after the
removal of the terminfo dependency. There's a comment that explains why
the workaround is present, but neither Jim nor I were able to reproduce
the issue by setting TERM to vt100.
---
 lldb/source/Host/common/Editline.cpp | 43 ----------------------------
 1 file changed, 43 deletions(-)

diff --git a/lldb/source/Host/common/Editline.cpp b/lldb/source/Host/common/Editline.cpp
index ed61aecc23b9b0..561ec228cdb23f 100644
--- a/lldb/source/Host/common/Editline.cpp
+++ b/lldb/source/Host/common/Editline.cpp
@@ -31,20 +31,6 @@
 using namespace lldb_private;
 using namespace lldb_private::line_editor;
 
-// Workaround for what looks like an OS X-specific issue, but other platforms
-// may benefit from something similar if issues arise.  The libedit library
-// doesn't explicitly initialize the curses termcap library, which it gets away
-// with until TERM is set to VT100 where it stumbles over an implementation
-// assumption that may not exist on other platforms.  The setupterm() function
-// would normally require headers that don't work gracefully in this context,
-// so the function declaration has been hoisted here.
-#if defined(__APPLE__)
-extern "C" {
-int setupterm(char *term, int fildes, int *errret);
-}
-#define USE_SETUPTERM_WORKAROUND
-#endif
-
 // Editline uses careful cursor management to achieve the illusion of editing a
 // multi-line block of text with a single line editor.  Preserving this
 // illusion requires fairly careful management of cursor state.  Read and
@@ -1402,35 +1388,6 @@ Editline::Editline(const char *editline_name, FILE *input_file,
   // Get a shared history instance
   m_editor_name = (editline_name == nullptr) ? "lldb-tmp" : editline_name;
   m_history_sp = EditlineHistory::GetHistory(m_editor_name);
-
-#ifdef USE_SETUPTERM_WORKAROUND
-  if (m_output_file) {
-    const int term_fd = fileno(m_output_file);
-    if (term_fd != -1) {
-      static std::recursive_mutex *g_init_terminal_fds_mutex_ptr = nullptr;
-      static std::set<int> *g_init_terminal_fds_ptr = nullptr;
-      static llvm::once_flag g_once_flag;
-      llvm::call_once(g_once_flag, [&]() {
-        g_init_terminal_fds_mutex_ptr =
-            new std::recursive_mutex(); // NOTE: Leak to avoid C++ destructor
-                                        // chain issues
-        g_init_terminal_fds_ptr = new std::set<int>(); // NOTE: Leak to avoid
-                                                       // C++ destructor chain
-                                                       // issues
-      });
-
-      // We must make sure to initialize the terminal a given file descriptor
-      // only once. If we do this multiple times, we start leaking memory.
-      std::lock_guard<std::recursive_mutex> guard(
-          *g_init_terminal_fds_mutex_ptr);
-      if (g_init_terminal_fds_ptr->find(term_fd) ==
-          g_init_terminal_fds_ptr->end()) {
-        g_init_terminal_fds_ptr->insert(term_fd);
-        setupterm((char *)0, term_fd, (int *)0);
-      }
-    }
-  }
-#endif
 }
 
 Editline::~Editline() {

From 058d4295939998923c78df80d0ea3b82ed899aa1 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 29 May 2024 14:46:07 -0700
Subject: [PATCH 186/230] [Analysis] Fix a build error regarding std::variant

This patch fixes:

  llvm/include/llvm/Analysis/LoopAccessAnalysis.h:381:8: error: no
  template named 'variant' in namespace 'std'
---
 llvm/include/llvm/Analysis/LoopAccessAnalysis.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 69afe7079aa163..b9f385f4c4b8fa 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -19,6 +19,7 @@
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include <optional>
+#include <variant>
 
 namespace llvm {
 

From 34b14cc4f88b5e3d757f2ab20c19387178056567 Mon Sep 17 00:00:00 2001
From: Igor Kudrin <ikudrin@accesssoftek.com>
Date: Wed, 29 May 2024 14:53:29 -0700
Subject: [PATCH 187/230] [lld][ELF] Suppress `--orphan-handling=error/warn`
 without `SECTIONS` (#93630)

Without a linker script, `--orphan-handling=error` or `=warn` reports
all input sections, including even well-known sections like `.text`,
`.bss`, `.dynamic`, or `.symtab`. However, in this case, no sections
should be considered orphans because they all are placed with the same
default rules. This patch suppresses errors/warnings for placing orphan
sections if no linker script with the `SECTIONS` command is provided.

The proposed behavior matches GNU gold. GNU ld in the same scenario only
reports sections that are not in its default linker script, thus, it
avoids complaining about `.text` and similar.
---
 lld/ELF/LinkerScript.cpp                  | 3 ++-
 lld/test/ELF/linkerscript/orphan-report.s | 7 +++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp
index 3ba59c112b8a8b..94a39034d629fb 100644
--- a/lld/ELF/LinkerScript.cpp
+++ b/lld/ELF/LinkerScript.cpp
@@ -936,7 +936,8 @@ void LinkerScript::addOrphanSections() {
 
 void LinkerScript::diagnoseOrphanHandling() const {
   llvm::TimeTraceScope timeScope("Diagnose orphan sections");
-  if (config->orphanHandling == OrphanHandlingPolicy::Place)
+  if (config->orphanHandling == OrphanHandlingPolicy::Place ||
+      !hasSectionsCommand)
     return;
   for (const InputSectionBase *sec : orphanSections) {
     // .relro_padding is inserted before DATA_SEGMENT_RELRO_END, if present,
diff --git a/lld/test/ELF/linkerscript/orphan-report.s b/lld/test/ELF/linkerscript/orphan-report.s
index 3dca23267ec648..eedcc4eff81683 100644
--- a/lld/test/ELF/linkerscript/orphan-report.s
+++ b/lld/test/ELF/linkerscript/orphan-report.s
@@ -11,6 +11,13 @@
 # RUN: ld.lld -shared --orphan-handling=place -o %t.out --script %t.script \
 # RUN:   %t.o 2>&1 -verbose  -error-limit=0 | FileCheck %s --check-prefix=DEFAULT
 
+## Check --orphan-handling=error or =warn do not report errors if no linker
+## script is used.
+# RUN: ld.lld -shared -orphan-handling=error -o /dev/null %t.o 2>&1 | count 0
+# RUN: ld.lld -shared -orphan-handling=warn -o /dev/null %t.o 2>&1 | count 0
+# RUN: ld.lld -r -orphan-handling=error -o /dev/null %t.o 2>&1 | count 0
+# RUN: ld.lld -r -orphan-handling=warn -o /dev/null %t.o 2>&1 | count 0
+
 ## Check --orphan-handling=error reports errors about orphans.
 # RUN: not ld.lld --orphan-handling=error -o /dev/null -T %t.script \
 # RUN:   %t.o 2>&1 | FileCheck %s --check-prefixes=COMMON,SYMTAB

From 3bdc90e3ff4c9a18caeb3e6ad40fa5d15bbf9d5e Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Wed, 29 May 2024 14:56:43 -0700
Subject: [PATCH 188/230] [ELF] adjustOutputSections: update sortRank. NFC

... as flags have changed. This allows us to revisit the
`osd->osec.hasInputSections` condition in `getRankProximity` (originally
introduced as `Sec->Live` in https://reviews.llvm.org/D61197).
---
 lld/ELF/LinkerScript.cpp | 4 +++-
 lld/ELF/Writer.cpp       | 2 +-
 lld/ELF/Writer.h         | 1 +
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp
index 94a39034d629fb..5062124ae6767e 100644
--- a/lld/ELF/LinkerScript.cpp
+++ b/lld/ELF/LinkerScript.cpp
@@ -1249,9 +1249,11 @@ void LinkerScript::adjustOutputSections() {
 
     // We do not want to keep any special flags for output section
     // in case it is empty.
-    if (isEmpty)
+    if (isEmpty) {
       sec->flags =
           flags & ((sec->nonAlloc ? 0 : (uint64_t)SHF_ALLOC) | SHF_WRITE);
+      sec->sortRank = getSectionRank(*sec);
+    }
 
     // The code below may remove empty output sections. We should save the
     // specified program headers (if exist) and propagate them to subsequent
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index fe2e1900520a49..d2cc6d8ff5f2cb 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -630,7 +630,7 @@ enum RankFlags {
   RF_BSS = 1 << 7,
 };
 
-static unsigned getSectionRank(OutputSection &osec) {
+unsigned elf::getSectionRank(OutputSection &osec) {
   unsigned rank = osec.partition * RF_PARTITION;
 
   // We want to put section specified by -T option first, so we
diff --git a/lld/ELF/Writer.h b/lld/ELF/Writer.h
index 7aa06dbcb131ad..e3787987aca75c 100644
--- a/lld/ELF/Writer.h
+++ b/lld/ELF/Writer.h
@@ -46,6 +46,7 @@ struct PhdrEntry {
 
 void addReservedSymbols();
 bool includeInSymtab(const Symbol &b);
+unsigned getSectionRank(OutputSection &osec);
 
 template <class ELFT> uint32_t calcMipsEFlags();
 

From aeccfee348c717165541d8d895b9b0cdfe31415c Mon Sep 17 00:00:00 2001
From: William Junda Huang <williamjhuang@google.com>
Date: Wed, 29 May 2024 18:04:11 -0400
Subject: [PATCH 189/230] Add option to generate additional debug info for
 expression dereferencing pointer to pointers. (#81545)

Such an expression does not correspond to a variable in the source code
and thus does not have a debug location. When the user collects perf data
on the program, if the intermediate memory load instruction is sampled, it
cannot be attributed to any variable/class member, which causes the
sampling results to be under-counted.
This patch adds an option `-fdebug_info_for_pointer_type` to generate a
pseudo variable and its debug info for an intermediate expression with
pointer dereferencing, so that perf data collected on the instruction of
that expression can be attributed to the correct class member.

This is a prototype so comments are needed.
---
 clang/lib/CodeGen/CGDebugInfo.cpp             |  84 ++++++++++++
 clang/lib/CodeGen/CGDebugInfo.h               |   6 +
 clang/lib/CodeGen/CGExprScalar.cpp            |  21 ++-
 .../test/CodeGenCXX/debug-info-ptr-to-ptr.cpp | 120 ++++++++++++++++++
 4 files changed, 230 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CodeGenCXX/debug-info-ptr-to-ptr.cpp

diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 1713f7065e7a20..5f6f911c7a6d69 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -5737,6 +5737,90 @@ void CGDebugInfo::EmitExternalVariable(llvm::GlobalVariable *Var,
   Var->addDebugInfo(GVE);
 }
 
+void CGDebugInfo::EmitPseudoVariable(CGBuilderTy &Builder,
+                                     llvm::Instruction *Value, QualType Ty) {
+  // Only when -g2 or above is specified, debug info for variables will be
+  // generated.
+  if (CGM.getCodeGenOpts().getDebugInfo() <=
+      llvm::codegenoptions::DebugLineTablesOnly)
+    return;
+
+  llvm::DebugLoc SaveDebugLoc = Builder.getCurrentDebugLocation();
+  if (!SaveDebugLoc.get())
+    return;
+
+  llvm::DIFile *Unit = SaveDebugLoc->getFile();
+  llvm::DIType *Type = getOrCreateType(Ty, Unit);
+
+  // Check if Value is already a declared variable and has debug info, in this
+  // case we have nothing to do. Clang emits declared variable as alloca, and
+  // it is loaded upon use, so we identify such pattern here.
+  if (llvm::LoadInst *Load = dyn_cast<llvm::LoadInst>(Value)) {
+    llvm::Value *Var = Load->getPointerOperand();
+    if (llvm::Metadata *MDValue = llvm::ValueAsMetadata::getIfExists(Var)) {
+      if (llvm::Value *DbgValue = llvm::MetadataAsValue::getIfExists(
+              CGM.getLLVMContext(), MDValue)) {
+        for (llvm::User *U : DbgValue->users()) {
+          if (llvm::CallInst *DbgDeclare = dyn_cast<llvm::CallInst>(U)) {
+            if (DbgDeclare->getCalledFunction()->getIntrinsicID() ==
+                    llvm::Intrinsic::dbg_declare &&
+                DbgDeclare->getArgOperand(0) == DbgValue) {
+              // There can be implicit type cast applied on a variable if it is
+              // an opaque ptr, in this case its debug info may not match the
+              // actual type of object being used as in the next instruction, so
+              // we will need to emit a pseudo variable for type-casted value.
+              llvm::DILocalVariable *MDNode = cast<llvm::DILocalVariable>(
+                  cast<llvm::MetadataAsValue>(DbgDeclare->getOperand(1))
+                      ->getMetadata());
+              if (MDNode->getType() == Type)
+                return;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Find the correct location to insert a sequence of instructions to
+  // materialize Value on the stack.
+  auto SaveInsertionPoint = Builder.saveIP();
+  if (llvm::InvokeInst *Invoke = dyn_cast<llvm::InvokeInst>(Value))
+    Builder.SetInsertPoint(Invoke->getNormalDest()->begin());
+  else if (llvm::Instruction *Next = Value->getIterator()->getNextNode())
+    Builder.SetInsertPoint(Next);
+  else
+    Builder.SetInsertPoint(Value->getParent());
+  llvm::DebugLoc DL = Value->getDebugLoc();
+  if (DL.get())
+    Builder.SetCurrentDebugLocation(DL);
+  else if (!Builder.getCurrentDebugLocation().get())
+    Builder.SetCurrentDebugLocation(SaveDebugLoc);
+
+  llvm::AllocaInst *PseudoVar = Builder.CreateAlloca(Value->getType());
+  Address PseudoVarAddr(PseudoVar, Value->getType(),
+                        CharUnits::fromQuantity(PseudoVar->getAlign()));
+  llvm::LoadInst *Load = Builder.CreateLoad(PseudoVarAddr);
+  Value->replaceAllUsesWith(Load);
+  Builder.SetInsertPoint(Load);
+  Builder.CreateStore(Value, PseudoVarAddr);
+
+  // Emit debug info for materialized Value.
+  unsigned Line = Builder.getCurrentDebugLocation().getLine();
+  unsigned Column = Builder.getCurrentDebugLocation().getCol();
+  llvm::DILocalVariable *D = DBuilder.createAutoVariable(
+      LexicalBlockStack.back(), "", nullptr, 0, Type, false,
+      llvm::DINode::FlagArtificial);
+  llvm::DILocation *DIL =
+      llvm::DILocation::get(CGM.getLLVMContext(), Line, Column,
+                            LexicalBlockStack.back(), CurInlinedAt);
+  SmallVector<uint64_t> Expr;
+  DBuilder.insertDeclare(PseudoVar, D, DBuilder.createExpression(Expr), DIL,
+                         Load);
+
+  Builder.restoreIP(SaveInsertionPoint);
+  Builder.SetCurrentDebugLocation(SaveDebugLoc);
+}
+
 void CGDebugInfo::EmitGlobalAlias(const llvm::GlobalValue *GV,
                                   const GlobalDecl GD) {
 
diff --git a/clang/lib/CodeGen/CGDebugInfo.h b/clang/lib/CodeGen/CGDebugInfo.h
index d6db4d711366ac..614316f3fc7fd8 100644
--- a/clang/lib/CodeGen/CGDebugInfo.h
+++ b/clang/lib/CodeGen/CGDebugInfo.h
@@ -529,6 +529,12 @@ class CGDebugInfo {
   /// Emit information about an external variable.
   void EmitExternalVariable(llvm::GlobalVariable *GV, const VarDecl *Decl);
 
+  /// Emit a pseudo variable and debug info for an intermediate value if it does
+  /// not correspond to a variable in the source code, so that a profiler can
+  /// track more accurate usage of certain instructions of interest.
+  void EmitPseudoVariable(CGBuilderTy &Builder, llvm::Instruction *Value,
+                          QualType Ty);
+
   /// Emit information about global variable alias.
   void EmitGlobalAlias(const llvm::GlobalValue *GV, const GlobalDecl Decl);
 
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 1b144c178ce960..58f0a3113b4f81 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -1937,7 +1937,26 @@ Value *ScalarExprEmitter::VisitMemberExpr(MemberExpr *E) {
     }
   }
 
-  return EmitLoadOfLValue(E);
+  llvm::Value *Result = EmitLoadOfLValue(E);
+
+  // If -fdebug-info-for-profiling is specified, emit a pseudo variable and its
+  // debug info for the pointer, even if there is no variable associated with
+  // the pointer's expression.
+  if (CGF.CGM.getCodeGenOpts().DebugInfoForProfiling && CGF.getDebugInfo()) {
+    if (llvm::LoadInst *Load = dyn_cast<llvm::LoadInst>(Result)) {
+      if (llvm::GetElementPtrInst *GEP =
+              dyn_cast<llvm::GetElementPtrInst>(Load->getPointerOperand())) {
+        if (llvm::Instruction *Pointer =
+                dyn_cast<llvm::Instruction>(GEP->getPointerOperand())) {
+          QualType Ty = E->getBase()->getType();
+          if (!E->isArrow())
+            Ty = CGF.getContext().getPointerType(Ty);
+          CGF.getDebugInfo()->EmitPseudoVariable(Builder, Pointer, Ty);
+        }
+      }
+    }
+  }
+  return Result;
 }
 
 Value *ScalarExprEmitter::VisitArraySubscriptExpr(ArraySubscriptExpr *E) {
diff --git a/clang/test/CodeGenCXX/debug-info-ptr-to-ptr.cpp b/clang/test/CodeGenCXX/debug-info-ptr-to-ptr.cpp
new file mode 100644
index 00000000000000..8e465a1febf7ce
--- /dev/null
+++ b/clang/test/CodeGenCXX/debug-info-ptr-to-ptr.cpp
@@ -0,0 +1,120 @@
+// Test debug info for intermediate value of a chained pointer dereferencing
+// expression when the flag -fdebug-info-for-profiling is enabled.
+// RUN: %clang_cc1 %s -fdebug-info-for-profiling -debug-info-kind=constructor -S -emit-llvm -o - | FileCheck %s
+
+class A {
+public:
+  int i;
+  char c;
+  void *p;
+  int arr[3];
+};
+
+class B {
+public:
+  A* a;
+};
+
+class C {
+public:
+  B* b;
+  A* a;
+  A arr[10];
+};
+
+// CHECK-LABEL: define dso_local noundef i32 @{{.*}}func1{{.*}}(
+// CHECK:         [[A_ADDR:%.*]] = getelementptr inbounds %class.B, ptr {{%.*}}, i32 0, i32 0, !dbg [[DBG1:![0-9]+]]
+// CHECK-NEXT:    [[A:%.*]] = load ptr, ptr [[A_ADDR]], align {{.*}}, !dbg [[DBG1]]
+// CHECK-NEXT:    [[PSEUDO1:%.*]] = alloca ptr, align {{.*}}, !dbg [[DBG1]]
+// CHECK-NEXT:    store ptr [[A]], ptr [[PSEUDO1]], align {{.*}}, !dbg [[DBG1]]
+// CHECK-NEXT:    call void @llvm.dbg.declare(metadata ptr [[PSEUDO1]], metadata [[META1:![0-9]+]], metadata !DIExpression()), !dbg [[DBG1]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PSEUDO1]], align {{.*}}, !dbg [[DBG1]]
+// CHECK-NEXT:    {{%.*}} = getelementptr inbounds %class.A, ptr [[TMP1]], i32 0, i32 0,
+int func1(B *b) {
+  return b->a->i;
+}
+
+// Should generate a pseudo variable when pointer is type-casted.
+// CHECK-LABEL: define dso_local noundef ptr @{{.*}}func2{{.*}}(
+// CHECK:         call void @llvm.dbg.declare(metadata ptr [[B_ADDR:%.*]], metadata [[META2:![0-9]+]], metadata !DIExpression())
+// CHECK-NEXT:    [[B:%.*]] = load ptr, ptr [[B_ADDR]],
+// CHECK-NEXT:    [[PSEUDO1:%.*]] = alloca ptr,
+// CHECK-NEXT:    store ptr [[B]], ptr [[PSEUDO1]],
+// CHECK-NEXT:    call void @llvm.dbg.declare(metadata ptr [[PSEUDO1]], metadata [[META3:![0-9]+]], metadata !DIExpression())
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PSEUDO1]],
+// CHECK-NEXT:    {{%.*}} = getelementptr inbounds %class.B, ptr [[TMP1]], i32 0,
+A* func2(void *b) {
+  return ((B*)b)->a;
+}
+
+// Should not generate pseudo variable in this case.
+// CHECK-LABEL: define dso_local noundef i32 @{{.*}}func3{{.*}}(
+// CHECK:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR:%.*]], metadata [[META4:![0-9]+]], metadata !DIExpression())
+// CHECK:    call void @llvm.dbg.declare(metadata ptr [[LOCAL1:%.*]], metadata [[META5:![0-9]+]], metadata !DIExpression())
+// CHECK-NOT: call void @llvm.dbg.declare(metadata ptr
+int func3(B *b) {
+  A *local1 = b->a;
+  return local1->i;
+}
+
+// CHECK-LABEL: define dso_local noundef signext i8 @{{.*}}func4{{.*}}(
+// CHECK:         [[A_ADDR:%.*]] = getelementptr inbounds %class.C, ptr {{%.*}}, i32 0, i32 1
+// CHECK-NEXT:    [[A:%.*]] = load ptr, ptr [[A_ADDR]],
+// CHECK-NEXT:    [[PSEUDO1:%.*]] = alloca ptr,
+// CHECK-NEXT:    store ptr [[A]], ptr [[PSEUDO1]],
+// CHECK-NEXT:    call void @llvm.dbg.declare(metadata ptr [[PSEUDO1]], metadata [[META6:![0-9]+]], metadata !DIExpression())
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PSEUDO1]],
+// CHECK-NEXT:    {{%.*}} = getelementptr inbounds %class.A, ptr [[TMP1]], i32 0, i32 0,
+// CHECK:         [[CALL:%.*]] = call noundef ptr @{{.*}}foo{{.*}}(
+// CHECK-NEXT:    [[PSEUDO2:%.*]] = alloca ptr,
+// CHECK-NEXT:    store ptr [[CALL]], ptr [[PSEUDO2]]
+// CHECK-NEXT:    call void @llvm.dbg.declare(metadata ptr [[PSEUDO2]], metadata [[META6]], metadata !DIExpression())
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[PSEUDO2]]
+// CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds %class.A, ptr [[TMP2]], i32 0, i32 1
+char func4(C *c) {
+  extern A* foo(int x);
+  return foo(c->a->i)->c;
+}
+
+// CHECK-LABEL: define dso_local noundef signext i8 @{{.*}}func5{{.*}}(
+// CHECK:         call void @llvm.dbg.declare(metadata ptr {{%.*}}, metadata [[META7:![0-9]+]], metadata !DIExpression())
+// CHECK:         call void @llvm.dbg.declare(metadata ptr {{%.*}}, metadata [[META8:![0-9]+]], metadata !DIExpression())
+// CHECK:         [[A_ADDR:%.*]] = getelementptr inbounds %class.A, ptr {{%.*}}, i64 {{%.*}},
+// CHECK-NEXT:    [[PSEUDO1:%.*]] = alloca ptr,
+// CHECK-NEXT:    store ptr [[A_ADDR]], ptr [[PSEUDO1]],
+// CHECK-NEXT:    call void @llvm.dbg.declare(metadata ptr [[PSEUDO1]], metadata [[META9:![0-9]+]], metadata !DIExpression())
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PSEUDO1]],
+// CHECK-NEXT:    {{%.*}} = getelementptr inbounds %class.A, ptr [[TMP1]], i32 0, i32 1,
+char func5(void *arr, int n) {
+  return ((A*)arr)[n].c;
+}
+
+// CHECK-LABEL: define dso_local noundef i32 @{{.*}}func6{{.*}}(
+// CHECK:         call void @llvm.dbg.declare(metadata ptr {{%.*}}, metadata [[META10:![0-9]+]], metadata !DIExpression())
+// CHECK:         call void @llvm.dbg.declare(metadata ptr {{%.*}}, metadata [[META11:![0-9]+]], metadata !DIExpression())
+int func6(B &b) {
+  return reinterpret_cast<A&>(b).i;
+}
+
+// CHECK-DAG: [[META_A:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_class_type, name: "A",
+// CHECK-DAG: [[META_AP:![0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[META_A]],
+// CHECK-DAG: [[META_B:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_class_type, name: "B",
+// CHECK-DAG: [[META_BP:![0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[META_B]],
+// CHECK-DAG: [[META_C:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_class_type, name: "C",
+// CHECK-DAG: [[META_CP:![0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[META_C]],
+// CHECK-DAG: [[META_VP:![0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null,
+// CHECK-DAG: [[META_I32:![0-9]+]] = !DIBasicType(name: "int", size: 32,
+// CHECK-DAG: [[META_BR:![0-9]+]] = !DIDerivedType(tag: DW_TAG_reference_type, baseType: [[META_B]],
+
+// CHECK-DAG: [[DBG1]] = !DILocation(line: 34, column: 13,
+// CHECK-DAG: [[META1]] = !DILocalVariable(scope: {{.*}}, type: [[META_AP]], flags: DIFlagArtificial)
+// CHECK-DAG: [[META2]] = !DILocalVariable(name: "b", arg: 1, scope: {{.*}}, file: {{.*}}, line: 46, type: [[META_VP]])
+// CHECK-DAG: [[META3]] = !DILocalVariable(scope: {{.*}}, type: [[META_BP]], flags: DIFlagArtificial)
+// CHECK-DAG: [[META4]] = !DILocalVariable(name: "b", arg: 1, scope: {{.*}}, file: {{.*}}, line: 55, type: [[META_BP]])
+// CHECK-DAG: [[META5]] = !DILocalVariable(name: "local1", scope: {{.*}}, file: {{.*}}, line: 56, type: [[META_AP]])
+// CHECK-DAG: [[META6]] = !DILocalVariable(scope: {{.*}}, type: [[META_AP]], flags: DIFlagArtificial)
+// CHECK-DAG: [[META7]] = !DILocalVariable(name: "arr", arg: 1, scope: {{.*}}, file: {{.*}}, line: 88, type: [[META_VP]])
+// CHECK-DAG: [[META8]] = !DILocalVariable(name: "n", arg: 2, scope: {{.*}}, file: {{.*}}, line: 88, type: [[META_I32]])
+// CHECK-DAG: [[META9]] = !DILocalVariable(scope: {{.*}}, type: [[META_AP]], flags: DIFlagArtificial)
+// CHECK-DAG: [[META10]] = !DILocalVariable(name: "b", arg: 1, scope: {{.*}}, file: {{.*}}, line: 95, type: [[META_BR]])
+// CHECK-DAG: [[META11]] = !DILocalVariable(scope: {{.*}}, type: [[META_AP]], flags: DIFlagArtificial)

From c3e6bd0b09965df218f86ebb6f1e59a570937c5d Mon Sep 17 00:00:00 2001
From: Florian Mayer <fmayer@google.com>
Date: Wed, 29 May 2024 15:26:18 -0700
Subject: [PATCH 190/230] [NFC] [MTE] remove unused functions from test

---
 llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll b/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll
index 26a0aa614c98b8..91dcffd77ce4b0 100644
--- a/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll
+++ b/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll
@@ -6,15 +6,7 @@
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-android10000"
 
-declare void @use8(ptr)
 declare void @use32(ptr)
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
-
-define dso_local void @noUse32(ptr) sanitize_memtag {
-entry:
-  ret void
-}
 
 define void @OneVar() sanitize_memtag {
 entry:

From e4b424afc4fbfe31ea1876114b4e9232efbf2297 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Wed, 29 May 2024 16:27:06 -0600
Subject: [PATCH 191/230] [CI] Disable Flang from pre-commit tests when Flang
 files are not touched on Windows Only (#93729)

Flang triggers some OOMs on Windows CI right now. This is disruptive to
MLIR and LLVM changes that don't touch Flang, so we disable building
Flang, on Windows only, for PRs that don't touch Flang. The testing on
Linux is unchanged, and the post-merge Windows testing still fully
covers Flang.
---
 .ci/generate-buildkite-pipeline-premerge | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/.ci/generate-buildkite-pipeline-premerge b/.ci/generate-buildkite-pipeline-premerge
index 3ed5eb96eceb58..bb7d2117e277fb 100755
--- a/.ci/generate-buildkite-pipeline-premerge
+++ b/.ci/generate-buildkite-pipeline-premerge
@@ -53,7 +53,10 @@ echo "Directories modified:" >&2
 echo "$modified_dirs" >&2
 
 function compute-projects-to-test() {
+  isForWindows=$1
+  shift
   projects=${@}
+  echo "isForWindows : $isForWindows ; projects: $projects " >&2
   for project in ${projects}; do
     echo "${project}"
     case ${project} in
@@ -63,9 +66,13 @@ function compute-projects-to-test() {
       done
     ;;
     llvm)
-      for p in bolt clang clang-tools-extra flang lld lldb mlir polly; do
+      for p in bolt clang clang-tools-extra lld lldb mlir polly; do
         echo $p
       done
+      # Flang is not stable in Windows CI at the moment
+      if [[ $isForWindows == 0 ]]; then
+        echo flang
+      fi
     ;;
     clang)
       for p in clang-tools-extra compiler-rt lldb cross-project-tests; do
@@ -76,7 +83,10 @@ function compute-projects-to-test() {
       echo libc
     ;;
     mlir)
-      echo flang
+      # Flang is not stable in Windows CI at the moment
+      if [[ $isForWindows == 0 ]]; then
+        echo flang
+      fi
     ;;
     *)
       # Nothing to do
@@ -241,7 +251,7 @@ fi
 all_projects="bolt clang clang-tools-extra compiler-rt cross-project-tests flang libc libclc lld lldb llvm mlir openmp polly pstl"
 modified_projects="$(keep-modified-projects ${all_projects})"
 
-linux_projects_to_test=$(exclude-linux $(compute-projects-to-test ${modified_projects}))
+linux_projects_to_test=$(exclude-linux $(compute-projects-to-test 0 ${modified_projects}))
 linux_check_targets=$(check-targets ${linux_projects_to_test} | sort | uniq)
 linux_projects=$(add-dependencies ${linux_projects_to_test} | sort | uniq)
 
@@ -249,7 +259,7 @@ linux_runtimes_to_test=$(compute-runtimes-to-test ${linux_projects_to_test})
 linux_runtime_check_targets=$(check-targets ${linux_runtimes_to_test} | sort | uniq)
 linux_runtimes=$(echo ${linux_runtimes_to_test} | sort | uniq)
 
-windows_projects_to_test=$(exclude-windows $(compute-projects-to-test ${modified_projects}))
+windows_projects_to_test=$(exclude-windows $(compute-projects-to-test 1 ${modified_projects}))
 windows_check_targets=$(check-targets ${windows_projects_to_test} | sort | uniq)
 windows_projects=$(add-dependencies ${windows_projects_to_test} | sort | uniq)
 

From 66b9785670a7909d8301afdeac27991c9236f346 Mon Sep 17 00:00:00 2001
From: Florian Mayer <fmayer@google.com>
Date: Wed, 29 May 2024 15:28:06 -0700
Subject: [PATCH 192/230] [NFC] [MTE] make test more robust by not hardcoding
 %1

---
 llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll b/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll
index 91dcffd77ce4b0..40bc8bdd70703a 100644
--- a/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll
+++ b/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll
@@ -25,7 +25,7 @@ entry:
 ; INSTR:  [[BASE:%.*]] = call ptr @llvm.aarch64.irg.sp(i64 0)
 ; INSTR:  [[TLS:%.*]] = call ptr @llvm.thread.pointer()
 ; INSTR:  [[TLS_SLOT:%.*]] = getelementptr i8, ptr [[TLS]], i32 -24
-; INSTR:  [[TLS_VALUE:%.*]] = load i64, ptr %1, align 8
+; INSTR:  [[TLS_VALUE:%.*]] = load i64, ptr [[TLS_SLOT]], align 8
 ; INSTR:  [[FP:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
 ; INSTR:  [[FP_INT:%.*]] = ptrtoint ptr [[FP]] to i64
 ; INSTR:  [[BASE_INT:%.*]] = ptrtoint ptr [[BASE]] to i64

From 89129201fe4b825b2f6a0f3c9da8651603078d29 Mon Sep 17 00:00:00 2001
From: Shubham Sandeep Rastogi <srastogi22@apple.com>
Date: Wed, 29 May 2024 15:36:33 -0700
Subject: [PATCH 193/230] [NFC] Move DIExpressionCursor to DebugInfoMetadata.h
 (#69768)

This is an NFC patch to move DIExpressionCursor to DebugInfoMetadata.h, so
that it can be used by classes in that header file.

Specifically, I want to use DIExpressionCursor in a subsequent patch:
https://github.com/llvm/llvm-project/pull/71718
---
 llvm/include/llvm/IR/DebugInfoMetadata.h      | 61 +++++++++++++++++++
 llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h | 61 -------------------
 2 files changed, 61 insertions(+), 61 deletions(-)

diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h
index 42291d45da2bef..a1c554677f8bf2 100644
--- a/llvm/include/llvm/IR/DebugInfoMetadata.h
+++ b/llvm/include/llvm/IR/DebugInfoMetadata.h
@@ -3150,6 +3150,67 @@ template <> struct DenseMapInfo<DIExpression::FragmentInfo> {
   static bool isEqual(const FragInfo &A, const FragInfo &B) { return A == B; }
 };
 
+/// Holds a DIExpression and keeps track of how many operands have been consumed
+/// so far.
+class DIExpressionCursor {
+  DIExpression::expr_op_iterator Start, End;
+
+public:
+  DIExpressionCursor(const DIExpression *Expr) {
+    if (!Expr) {
+      assert(Start == End);
+      return;
+    }
+    Start = Expr->expr_op_begin();
+    End = Expr->expr_op_end();
+  }
+
+  DIExpressionCursor(ArrayRef<uint64_t> Expr)
+      : Start(Expr.begin()), End(Expr.end()) {}
+
+  DIExpressionCursor(const DIExpressionCursor &) = default;
+
+  /// Consume one operation.
+  std::optional<DIExpression::ExprOperand> take() {
+    if (Start == End)
+      return std::nullopt;
+    return *(Start++);
+  }
+
+  /// Consume N operations.
+  void consume(unsigned N) { std::advance(Start, N); }
+
+  /// Return the current operation.
+  std::optional<DIExpression::ExprOperand> peek() const {
+    if (Start == End)
+      return std::nullopt;
+    return *(Start);
+  }
+
+  /// Return the next operation.
+  std::optional<DIExpression::ExprOperand> peekNext() const {
+    if (Start == End)
+      return std::nullopt;
+
+    auto Next = Start.getNext();
+    if (Next == End)
+      return std::nullopt;
+
+    return *Next;
+  }
+
+  /// Determine whether there are any operations left in this expression.
+  operator bool() const { return Start != End; }
+
+  DIExpression::expr_op_iterator begin() const { return Start; }
+  DIExpression::expr_op_iterator end() const { return End; }
+
+  /// Retrieve the fragment information, if any.
+  std::optional<DIExpression::FragmentInfo> getFragmentInfo() const {
+    return DIExpression::getFragmentInfo(Start, End);
+  }
+};
+
 /// Global variables.
 ///
 /// TODO: Remove DisplayName.  It's always equal to Name.
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
index 667a9efc6f6c04..4daa78b15b8e29 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
@@ -31,67 +31,6 @@ class DIELoc;
 class TargetRegisterInfo;
 class MachineLocation;
 
-/// Holds a DIExpression and keeps track of how many operands have been consumed
-/// so far.
-class DIExpressionCursor {
-  DIExpression::expr_op_iterator Start, End;
-
-public:
-  DIExpressionCursor(const DIExpression *Expr) {
-    if (!Expr) {
-      assert(Start == End);
-      return;
-    }
-    Start = Expr->expr_op_begin();
-    End = Expr->expr_op_end();
-  }
-
-  DIExpressionCursor(ArrayRef<uint64_t> Expr)
-      : Start(Expr.begin()), End(Expr.end()) {}
-
-  DIExpressionCursor(const DIExpressionCursor &) = default;
-
-  /// Consume one operation.
-  std::optional<DIExpression::ExprOperand> take() {
-    if (Start == End)
-      return std::nullopt;
-    return *(Start++);
-  }
-
-  /// Consume N operations.
-  void consume(unsigned N) { std::advance(Start, N); }
-
-  /// Return the current operation.
-  std::optional<DIExpression::ExprOperand> peek() const {
-    if (Start == End)
-      return std::nullopt;
-    return *(Start);
-  }
-
-  /// Return the next operation.
-  std::optional<DIExpression::ExprOperand> peekNext() const {
-    if (Start == End)
-      return std::nullopt;
-
-    auto Next = Start.getNext();
-    if (Next == End)
-      return std::nullopt;
-
-    return *Next;
-  }
-
-  /// Determine whether there are any operations left in this expression.
-  operator bool() const { return Start != End; }
-
-  DIExpression::expr_op_iterator begin() const { return Start; }
-  DIExpression::expr_op_iterator end() const { return End; }
-
-  /// Retrieve the fragment information, if any.
-  std::optional<DIExpression::FragmentInfo> getFragmentInfo() const {
-    return DIExpression::getFragmentInfo(Start, End);
-  }
-};
-
 /// Base class containing the logic for constructing DWARF expressions
 /// independently of whether they are emitted into a DIE or into a .debug_loc
 /// entry.

From a3f9066e99f3685b4f2271f54ba73210396c00b4 Mon Sep 17 00:00:00 2001
From: Shubham Sandeep Rastogi <srastogi22@apple.com>
Date: Wed, 29 May 2024 15:42:31 -0700
Subject: [PATCH 194/230] Add functions peekNextN(unsigned) and
 assignNewExpr(ArrayRef<uint64_t>) to DIExpressionCursor (#71717)

This commit adds two functions to the DIExpressionCursor class.

`peekNextN(unsigned)` works like `peekNext`, but lets you peek at the
operation N positions ahead of the current one.

`assignNewExpr(ArrayRef<uint64_t>)` lets you assign a new expression to
the same DIExpressionCursor object.

This is part of a stack of patches; it comes after
https://github.com/llvm/llvm-project/pull/69768
---
 llvm/include/llvm/IR/DebugInfoMetadata.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h
index a1c554677f8bf2..555bd623ad9ef7 100644
--- a/llvm/include/llvm/IR/DebugInfoMetadata.h
+++ b/llvm/include/llvm/IR/DebugInfoMetadata.h
@@ -3199,6 +3199,23 @@ class DIExpressionCursor {
     return *Next;
   }
 
+  std::optional<DIExpression::ExprOperand> peekNextN(unsigned N) const {
+    if (Start == End)
+      return std::nullopt;
+    DIExpression::expr_op_iterator Nth = Start;
+    for (unsigned I = 0; I < N; I++) {
+      Nth = Nth.getNext();
+      if (Nth == End)
+        return std::nullopt;
+    }
+    return *Nth;
+  }
+
+  void assignNewExpr(ArrayRef<uint64_t> Expr) {
+    this->Start = DIExpression::expr_op_iterator(Expr.begin());
+    this->End = DIExpression::expr_op_iterator(Expr.end());
+  }
+
   /// Determine whether there are any operations left in this expression.
   operator bool() const { return Start != End; }
 

From 11d7203c1d2f44085e105b8d4d726f2589f62f40 Mon Sep 17 00:00:00 2001
From: Evgenii Kudriashov <evgenii.kudriashov@intel.com>
Date: Thu, 30 May 2024 01:53:43 +0300
Subject: [PATCH 195/230] [X86][GlobalISel] Enable G_BUILD_VECTOR and
 G_CONSTANT_POOL (#92844)

* Add support for G_LOAD from G_CONSTANT_POOL on X86 and X64
* Add X86GlobalBaseRegPass to handle base register initialization for
X86.
* Fix vector type legalization for G_STORE and G_LOAD as well as enable
scalarization for them.
* Custom lower G_BUILD_VECTOR into G_LOAD from G_CONSTANT_POOL.
---
 .../X86/GISel/X86InstructionSelector.cpp      |  29 +++-
 .../lib/Target/X86/GISel/X86LegalizerInfo.cpp | 104 ++++++++++++-
 llvm/lib/Target/X86/GISel/X86LegalizerInfo.h  |   7 +
 llvm/lib/Target/X86/X86TargetMachine.cpp      |   3 +
 llvm/test/CodeGen/X86/isel-buildvector-avx.ll | 112 ++++++++++++++
 llvm/test/CodeGen/X86/isel-buildvector-sse.ll | 143 ++++++++++++++++++
 .../test/CodeGen/X86/isel-buildvector-sse2.ll |  71 +++++++++
 7 files changed, 459 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/isel-buildvector-avx.ll
 create mode 100644 llvm/test/CodeGen/X86/isel-buildvector-sse.ll
 create mode 100644 llvm/test/CodeGen/X86/isel-buildvector-sse2.ll

diff --git a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp
index 9be3812300af12..303783ea3fd223 100644
--- a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp
+++ b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp
@@ -548,7 +548,7 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
   unsigned Opc = I.getOpcode();
 
   assert((Opc == TargetOpcode::G_STORE || Opc == TargetOpcode::G_LOAD) &&
-         "unexpected instruction");
+         "Only G_STORE and G_LOAD are expected for selection");
 
   const Register DefReg = I.getOperand(0).getReg();
   LLT Ty = MRI.getType(DefReg);
@@ -576,11 +576,32 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
   if (NewOpc == Opc)
     return false;
 
-  X86AddressMode AM;
-  X86SelectAddress(*MRI.getVRegDef(I.getOperand(1).getReg()), MRI, AM);
-
   I.setDesc(TII.get(NewOpc));
   MachineInstrBuilder MIB(MF, I);
+  const MachineInstr *Ptr = MRI.getVRegDef(I.getOperand(1).getReg());
+
+  if (Ptr->getOpcode() == TargetOpcode::G_CONSTANT_POOL) {
+    assert(Opc == TargetOpcode::G_LOAD &&
+           "Only G_LOAD from constant pool is expected");
+    // TODO: Need a separate move for Large model
+    if (TM.getCodeModel() == CodeModel::Large)
+      return false;
+
+    unsigned char OpFlag = STI.classifyLocalReference(nullptr);
+    unsigned PICBase = 0;
+    if (OpFlag == X86II::MO_GOTOFF)
+      PICBase = TII.getGlobalBaseReg(&MF);
+    else if (STI.is64Bit())
+      PICBase = X86::RIP;
+
+    I.removeOperand(1);
+    addConstantPoolReference(MIB, Ptr->getOperand(1).getIndex(), PICBase,
+                             OpFlag);
+    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+  }
+
+  X86AddressMode AM;
+  X86SelectAddress(*Ptr, MRI, AM);
   if (Opc == TargetOpcode::G_LOAD) {
     I.removeOperand(1);
     addFullAddress(MIB, AM);
diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
index 07041cc5b0491c..dd8ecf6ef7fc76 100644
--- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
+++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
@@ -13,7 +13,10 @@
 #include "X86LegalizerInfo.h"
 #include "X86Subtarget.h"
 #include "X86TargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -71,6 +74,11 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
   const LLT v16s32 = LLT::fixed_vector(16, 32);
   const LLT v8s64 = LLT::fixed_vector(8, 64);
 
+  const LLT s8MaxVector = HasAVX512 ? v64s8 : HasAVX ? v32s8 : v16s8;
+  const LLT s16MaxVector = HasAVX512 ? v32s16 : HasAVX ? v16s16 : v8s16;
+  const LLT s32MaxVector = HasAVX512 ? v16s32 : HasAVX ? v8s32 : v4s32;
+  const LLT s64MaxVector = HasAVX512 ? v8s64 : HasAVX ? v4s64 : v2s64;
+
   // todo: AVX512 bool vector predicate types
 
   // implicit/constants
@@ -338,6 +346,8 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
 
   getActionDefinitionsBuilder(G_INTTOPTR).legalFor({{p0, sMaxScalar}});
 
+  getActionDefinitionsBuilder(G_CONSTANT_POOL).legalFor({p0});
+
   getActionDefinitionsBuilder(G_PTR_ADD)
       .legalIf([=](const LegalityQuery &Query) -> bool {
         return typePairInSet(0, 1, {{p0, s32}})(Query) ||
@@ -368,9 +378,10 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
                                        {s64, p0, s64, 1},
                                        {v2s32, p0, v2s32, 1}});
     if (HasSSE1)
+      Action.legalForTypesWithMemDesc({{v4s32, p0, v4s32, 1}});
+    if (HasSSE2)
       Action.legalForTypesWithMemDesc({{v16s8, p0, v16s8, 1},
                                        {v8s16, p0, v8s16, 1},
-                                       {v4s32, p0, v4s32, 1},
                                        {v2s64, p0, v2s64, 1},
                                        {v2p0, p0, v2p0, 1}});
     if (HasAVX)
@@ -384,7 +395,9 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
                                        {v32s16, p0, v32s16, 1},
                                        {v16s32, p0, v16s32, 1},
                                        {v8s64, p0, v8s64, 1}});
-    Action.widenScalarToNextPow2(0, /*Min=*/8).clampScalar(0, s8, sMaxScalar);
+    Action.widenScalarToNextPow2(0, /*Min=*/8)
+        .clampScalar(0, s8, sMaxScalar)
+        .scalarize(0);
   }
 
   for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) {
@@ -406,10 +419,11 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
           (Query.Opcode == G_ANYEXT && Query.Types[0] == s128) ||
           (Is64Bit && Query.Types[0] == s64);
       })
-    .widenScalarToNextPow2(0, /*Min=*/8)
-    .clampScalar(0, s8, sMaxScalar)
-    .widenScalarToNextPow2(1, /*Min=*/8)
-    .clampScalar(1, s8, sMaxScalar);
+      .widenScalarToNextPow2(0, /*Min=*/8)
+      .clampScalar(0, s8, sMaxScalar)
+      .widenScalarToNextPow2(1, /*Min=*/8)
+      .clampScalar(1, s8, sMaxScalar)
+      .scalarize(0);
 
   getActionDefinitionsBuilder(G_SEXT_INREG).lower();
 
@@ -484,6 +498,19 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
       .widenScalarToNextPow2(1);
 
   // vector ops
+  getActionDefinitionsBuilder(G_BUILD_VECTOR)
+      .customIf([=](const LegalityQuery &Query) {
+        return (HasSSE1 && typeInSet(0, {v4s32})(Query)) ||
+               (HasSSE2 && typeInSet(0, {v2s64, v8s16, v16s8})(Query)) ||
+               (HasAVX && typeInSet(0, {v4s64, v8s32, v16s16, v32s8})(Query)) ||
+               (HasAVX512 && typeInSet(0, {v8s64, v16s32, v32s16, v64s8})(Query));
+      })
+      .clampNumElements(0, v16s8, s8MaxVector)
+      .clampNumElements(0, v8s16, s16MaxVector)
+      .clampNumElements(0, v4s32, s32MaxVector)
+      .clampNumElements(0, v2s64, s64MaxVector)
+      .moreElementsToNextPow2(0);
+
   getActionDefinitionsBuilder({G_EXTRACT, G_INSERT})
       .legalIf([=](const LegalityQuery &Query) {
         unsigned SubIdx = Query.Opcode == G_EXTRACT ? 0 : 1;
@@ -552,6 +579,71 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
   verify(*STI.getInstrInfo());
 }
 
+bool X86LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI,
+                                      LostDebugLocObserver &LocObserver) const {
+  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
+  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+  switch (MI.getOpcode()) {
+  default:
+    // Only G_BUILD_VECTOR is handled below; return false for anything else.
+    return false;
+  case TargetOpcode::G_BUILD_VECTOR:
+    return legalizeBuildVector(MI, MRI, Helper);
+  }
+  llvm_unreachable("expected switch to return");
+}
+
+bool X86LegalizerInfo::legalizeBuildVector(MachineInstr &MI,
+                                           MachineRegisterInfo &MRI,
+                                           LegalizerHelper &Helper) const {
+  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
+  const auto &BuildVector = cast<GBuildVector>(MI);
+  Register Dst = BuildVector.getReg(0);
+  LLT DstTy = MRI.getType(Dst);
+  MachineFunction &MF = MIRBuilder.getMF();
+  LLVMContext &Ctx = MF.getFunction().getContext();
+  uint64_t DstTySize = DstTy.getScalarSizeInBits();
+
+  SmallVector<Constant *, 4> CstIdxs;
+  for (unsigned i = 0; i < BuildVector.getNumSources(); ++i) {
+    Register Source = BuildVector.getSourceReg(i);
+
+    auto ValueAndReg = getIConstantVRegValWithLookThrough(Source, MRI);
+    if (ValueAndReg) {
+      CstIdxs.emplace_back(ConstantInt::get(Ctx, ValueAndReg->Value));
+      continue;
+    }
+
+    auto FPValueAndReg = getFConstantVRegValWithLookThrough(Source, MRI);
+    if (FPValueAndReg) {
+      CstIdxs.emplace_back(ConstantFP::get(Ctx, FPValueAndReg->Value));
+      continue;
+    }
+
+    if (getOpcodeDef<GImplicitDef>(Source, MRI)) {
+      CstIdxs.emplace_back(UndefValue::get(Type::getIntNTy(Ctx, DstTySize)));
+      continue;
+    }
+    return false;
+  }
+
+  Constant *ConstVal = ConstantVector::get(CstIdxs);
+
+  const DataLayout &DL = MIRBuilder.getDataLayout();
+  unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
+  Align Alignment(DL.getABITypeAlign(ConstVal->getType()));
+  auto Addr = MIRBuilder.buildConstantPool(
+      LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace)),
+      MF.getConstantPool()->getConstantPoolIndex(ConstVal, Alignment));
+  MachineMemOperand *MMO =
+      MF.getMachineMemOperand(MachinePointerInfo::getConstantPool(MF),
+                              MachineMemOperand::MOLoad, DstTy, Alignment);
+
+  MIRBuilder.buildLoad(Dst, Addr, *MMO);
+  MI.eraseFromParent();
+  return true;
+}
+
 bool X86LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                          MachineInstr &MI) const {
   return true;
diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h
index 12134f7b00f1cf..229a58986903d4 100644
--- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h
+++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h
@@ -30,8 +30,15 @@ class X86LegalizerInfo : public LegalizerInfo {
 public:
   X86LegalizerInfo(const X86Subtarget &STI, const X86TargetMachine &TM);
 
+  bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI,
+                      LostDebugLocObserver &LocObserver) const override;
+
   bool legalizeIntrinsic(LegalizerHelper &Helper,
                          MachineInstr &MI) const override;
+
+private:
+  bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI,
+                           LegalizerHelper &Helper) const;
 };
 } // namespace llvm
 #endif
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 86b456019c4e56..ab59cf8a309a16 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -505,6 +505,9 @@ bool X86PassConfig::addRegBankSelect() {
 
 bool X86PassConfig::addGlobalInstructionSelect() {
   addPass(new InstructionSelect(getOptLevel()));
+  // Add GlobalBaseReg in case there are no SelectionDAG passes afterwards
+  if (isGlobalISelAbortEnabled())
+    addPass(createX86GlobalBaseRegPass());
   return false;
 }
 
diff --git a/llvm/test/CodeGen/X86/isel-buildvector-avx.ll b/llvm/test/CodeGen/X86/isel-buildvector-avx.ll
new file mode 100644
index 00000000000000..91abfff2a34246
--- /dev/null
+++ b/llvm/test/CodeGen/X86/isel-buildvector-avx.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx  %s -o - | FileCheck %s --check-prefixes AVX-ALL,AVX
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -fast-isel %s -o - | FileCheck %s --check-prefixes AVX-ALL,AVX
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -global-isel-abort=1 %s -o - | FileCheck %s --check-prefixes AVX-ALL,AVX
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f  %s -o - | FileCheck %s --check-prefixes AVX-ALL,AVX512
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -fast-isel %s -o - | FileCheck %s --check-prefixes AVX-ALL,AVX512
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -global-isel-abort=1 %s -o - | FileCheck %s --check-prefixes AVX-ALL,AVX512
+
+;
+; 256 bit vectors
+;
+
+define <32 x i8> @test_vector_v32i8() {
+; AVX-ALL-LABEL: test_vector_v32i8:
+; AVX-ALL:       # %bb.0:
+; AVX-ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [99,2,7,77,56,5,48,73,36,63,68,13,59,34,36,117,43,11,61,97,104,113,46,89,42,12,97,41,73,7,55,73]
+; AVX-ALL-NEXT:    retq
+  ret <32 x i8> <i8 99, i8 2, i8 7, i8 77, i8 56, i8 5, i8 48, i8 73, i8 36, i8 63, i8 68, i8 13, i8 59, i8 34, i8 36, i8 117, i8 43, i8 11, i8 61, i8 97, i8 104, i8 113, i8 46, i8 89, i8 42, i8 12, i8 97, i8 41, i8 73, i8 7, i8 55, i8 73>
+}
+
+define <16 x i16> @test_vector_v16i16() {
+; AVX-ALL-LABEL: test_vector_v16i16:
+; AVX-ALL:       # %bb.0:
+; AVX-ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [2415,4748,23790,5373,22059,21582,12346,30507,9170,21469,12631,24765,31001,26396,24951,27843]
+; AVX-ALL-NEXT:    retq
+  ret <16 x i16> <i16 2415, i16 4748, i16 23790, i16 5373, i16 22059, i16 21582, i16 12346, i16 30507, i16 9170, i16 21469, i16 12631, i16 24765, i16 31001, i16 26396, i16 24951, i16 27843>
+}
+
+define <5 x float> @test_vector_v5f32() {
+; AVX-ALL-LABEL: test_vector_v5f32:
+; AVX-ALL:       # %bb.0:
+; AVX-ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [6.135E+3,2.179E+4,2.8365E+4,6.641E+3,2.6535E+4,u,u,u]
+; AVX-ALL-NEXT:    retq
+  ret <5 x float> <float 6135., float 21790., float 28365., float 6641., float 26535.>
+}
+
+define <8 x float> @test_vector_v8f32() {
+; AVX-ALL-LABEL: test_vector_v8f32:
+; AVX-ALL:       # %bb.0:
+; AVX-ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [6.135E+3,2.179E+4,2.8365E+4,6.641E+3,2.6535E+4,2.1447E+4,1.9619E+4,1.1916E+4]
+; AVX-ALL-NEXT:    retq
+  ret <8 x float> <float 6135., float 21790., float 28365., float 6641., float 26535., float 21447., float 19619., float 11916.>
+}
+
+define <4 x i64> @test_vector_v4i64() {
+; AVX-ALL-LABEL: test_vector_v4i64:
+; AVX-ALL:       # %bb.0:
+; AVX-ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [23430,24650,1,12]
+; AVX-ALL-NEXT:    retq
+  ret <4 x i64> <i64 23430, i64 24650, i64 1, i64 12>
+}
+
+;
+; 512 bit vectors
+;
+
+define <64 x i8> @test_vector_v64i8() {
+; AVX-X64-LABEL: test_vector_v64i8:
+; AVX-LABEL: test_vector_v64i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,84,15,13,66,11,70,102,12,82,111,109,61,15,70,8,110,17,35,102,57,111,119,61,112,47,3,34,65,126,55,37]
+; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [9,100,124,46,65,75,68,70,120,109,125,21,98,121,127,13,119,64,2,0,9,79,10,78,53,81,37,95,99,79,114,3]
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_vector_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovaps {{.*#+}} zmm0 = [0,84,15,13,66,11,70,102,12,82,111,109,61,15,70,8,110,17,35,102,57,111,119,61,112,47,3,34,65,126,55,37,9,100,124,46,65,75,68,70,120,109,125,21,98,121,127,13,119,64,2,0,9,79,10,78,53,81,37,95,99,79,114,3]
+; AVX512-NEXT:    retq
+  ret <64 x i8> <i8 0, i8 84, i8 15, i8 13, i8 66, i8 11, i8 70, i8 102, i8 12, i8 82, i8 111, i8 109, i8 61, i8 15, i8 70, i8 8, i8 110, i8 17, i8 35, i8 102, i8 57, i8 111, i8 119, i8 61, i8 112, i8 47, i8 3, i8 34, i8 65, i8 126, i8 55, i8 37, i8 9, i8 100, i8 124, i8 46, i8 65, i8 75, i8 68, i8 70, i8 120, i8 109, i8 125, i8 21, i8 98, i8 121, i8 127, i8 13, i8 119, i8 64, i8 2, i8 0, i8 9, i8 79, i8 10, i8 78, i8 53, i8 81, i8 37, i8 95, i8 99, i8 79, i8 114, i8 3>
+}
+
+define <32 x i16> @test_vector_v32i16() {
+; AVX-LABEL: test_vector_v32i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [30901,2280,10793,13893,17914,6183,27317,29748,27420,12395,13504,18229,14700,11550,24714,26203]
+; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [23668,3198,27016,12020,31057,19311,16505,24461,28451,19446,23816,10995,17209,5831,27666,21680]
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_vector_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovaps {{.*#+}} zmm0 = [30901,2280,10793,13893,17914,6183,27317,29748,27420,12395,13504,18229,14700,11550,24714,26203,23668,3198,27016,12020,31057,19311,16505,24461,28451,19446,23816,10995,17209,5831,27666,21680]
+; AVX512-NEXT:    retq
+  ret <32 x i16> <i16 30901, i16 2280, i16 10793, i16 13893, i16 17914, i16 6183, i16 27317, i16 29748, i16 27420, i16 12395, i16 13504, i16 18229, i16 14700, i16 11550, i16 24714, i16 26203, i16 23668, i16 3198, i16 27016, i16 12020, i16 31057, i16 19311, i16 16505, i16 24461, i16 28451, i16 19446, i16 23816, i16 10995, i16 17209, i16 5831, i16 27666, i16 21680>
+}
+
+define <16 x i32> @test_vector_v16i32() {
+; AVX-LABEL: test_vector_v16i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [867316,75798,646113,495494,920699,901516,613751,811205]
+; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [778508,933022,441446,241046,364018,527717,71828,337100]
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_vector_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovaps {{.*#+}} zmm0 = [867316,75798,646113,495494,920699,901516,613751,811205,778508,933022,441446,241046,364018,527717,71828,337100]
+; AVX512-NEXT:    retq
+  ret <16 x i32> <i32 867316, i32 75798, i32 646113, i32 495494, i32 920699, i32 901516, i32 613751, i32 811205, i32 778508, i32 933022, i32 441446, i32 241046, i32 364018, i32 527717, i32 71828, i32 337100>
+}
+
+define <8 x double> @test_vector_v8f64() {
+; AVX-LABEL: test_vector_v8f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [6.1349999999999998E+0,2.1789999999999998E+0,2.8365E+0,6.641E+0]
+; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [2.6535000000000002E+0,2.1446999999999998E+0,1.9619E+0,1.1916E+0]
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_vector_v8f64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovaps {{.*#+}} zmm0 = [6.1349999999999998E+0,2.1789999999999998E+0,2.8365E+0,6.641E+0,2.6535000000000002E+0,2.1446999999999998E+0,1.9619E+0,1.1916E+0]
+; AVX512-NEXT:    retq
+  ret <8 x double> <double 6.135, double 2.1790, double 2.8365, double 6.641, double 2.6535, double 2.1447, double 1.9619, double 1.1916>
+}
diff --git a/llvm/test/CodeGen/X86/isel-buildvector-sse.ll b/llvm/test/CodeGen/X86/isel-buildvector-sse.ll
new file mode 100644
index 00000000000000..5b96d57cf019bc
--- /dev/null
+++ b/llvm/test/CodeGen/X86/isel-buildvector-sse.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sse,-sse2  %s -o - | FileCheck %s --check-prefixes SSE-X64
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sse,-sse2 -fast-isel %s -o - | FileCheck %s --check-prefixes SSE-X64
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sse,-sse2 -global-isel -global-isel-abort=1 %s -o - | FileCheck %s --check-prefixes SSE-X64-GISEL
+; RUN: llc -mtriple=i686-linux-gnu -mattr=+sse,-sse2  %s -o - | FileCheck %s --check-prefixes SSE-X86
+; RUN: llc -mtriple=i686-linux-gnu -mattr=+sse,-sse2 -fast-isel %s -o - | FileCheck %s --check-prefixes SSE-X86
+; RUN: llc -mtriple=i686-linux-gnu -mattr=+sse,-sse2 -global-isel -global-isel-abort=1 %s -o - | FileCheck %s --check-prefixes SSE-X86-GISEL
+
+define <8 x i32> @test_vector_v8i32() {
+; SSE-X64-LABEL: test_vector_v8i32:
+; SSE-X64:       # %bb.0:
+; SSE-X64-NEXT:    movq %rdi, %rax
+; SSE-X64-NEXT:    movabsq $3043555126665690671, %rcx # imm = 0x2A3CE143233A3E2F
+; SSE-X64-NEXT:    movq %rcx, 24(%rdi)
+; SSE-X64-NEXT:    movabsq $-2720818644236378031, %rcx # imm = 0xDA3DB5DBCC07E051
+; SSE-X64-NEXT:    movq %rcx, 16(%rdi)
+; SSE-X64-NEXT:    movabsq $3043545045377446960, %rcx # imm = 0x2A3CD817E79F7430
+; SSE-X64-NEXT:    movq %rcx, 8(%rdi)
+; SSE-X64-NEXT:    movabsq $-2715530310134355376, %rcx # imm = 0xDA507F9207A2AA50
+; SSE-X64-NEXT:    movq %rcx, (%rdi)
+; SSE-X64-NEXT:    retq
+;
+; SSE-X64-GISEL-LABEL: test_vector_v8i32:
+; SSE-X64-GISEL:       # %bb.0:
+; SSE-X64-GISEL-NEXT:    movl $128100944, %eax # imm = 0x7A2AA50
+; SSE-X64-GISEL-NEXT:    movl $-632258670, %ecx # imm = 0xDA507F92
+; SSE-X64-GISEL-NEXT:    movl $-408980432, %edx # imm = 0xE79F7430
+; SSE-X64-GISEL-NEXT:    movl $708630551, %esi # imm = 0x2A3CD817
+; SSE-X64-GISEL-NEXT:    movl $-871899055, %r8d # imm = 0xCC07E051
+; SSE-X64-GISEL-NEXT:    movl $-633489957, %r9d # imm = 0xDA3DB5DB
+; SSE-X64-GISEL-NEXT:    movl $591019567, %r10d # imm = 0x233A3E2F
+; SSE-X64-GISEL-NEXT:    movl $708632899, %r11d # imm = 0x2A3CE143
+; SSE-X64-GISEL-NEXT:    movl %eax, (%rdi)
+; SSE-X64-GISEL-NEXT:    movl %ecx, 4(%rdi)
+; SSE-X64-GISEL-NEXT:    movl %edx, 8(%rdi)
+; SSE-X64-GISEL-NEXT:    movl %esi, 12(%rdi)
+; SSE-X64-GISEL-NEXT:    movl %r8d, 16(%rdi)
+; SSE-X64-GISEL-NEXT:    movl %r9d, 20(%rdi)
+; SSE-X64-GISEL-NEXT:    movl %r10d, 24(%rdi)
+; SSE-X64-GISEL-NEXT:    movl %r11d, 28(%rdi)
+; SSE-X64-GISEL-NEXT:    retq
+;
+; SSE-X86-LABEL: test_vector_v8i32:
+; SSE-X86:       # %bb.0:
+; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-X86-NEXT:    movl $708632899, 28(%eax) # imm = 0x2A3CE143
+; SSE-X86-NEXT:    movl $591019567, 24(%eax) # imm = 0x233A3E2F
+; SSE-X86-NEXT:    movl $-633489957, 20(%eax) # imm = 0xDA3DB5DB
+; SSE-X86-NEXT:    movl $-871899055, 16(%eax) # imm = 0xCC07E051
+; SSE-X86-NEXT:    movl $708630551, 12(%eax) # imm = 0x2A3CD817
+; SSE-X86-NEXT:    movl $-408980432, 8(%eax) # imm = 0xE79F7430
+; SSE-X86-NEXT:    movl $-632258670, 4(%eax) # imm = 0xDA507F92
+; SSE-X86-NEXT:    movl $128100944, (%eax) # imm = 0x7A2AA50
+; SSE-X86-NEXT:    retl $4
+;
+; SSE-X86-GISEL-LABEL: test_vector_v8i32:
+; SSE-X86-GISEL:       # %bb.0:
+; SSE-X86-GISEL-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-X86-GISEL-NEXT:    movl $128100944, %ecx # imm = 0x7A2AA50
+; SSE-X86-GISEL-NEXT:    movl %ecx, (%eax)
+; SSE-X86-GISEL-NEXT:    movl $-632258670, %ecx # imm = 0xDA507F92
+; SSE-X86-GISEL-NEXT:    movl %ecx, 4(%eax)
+; SSE-X86-GISEL-NEXT:    movl $-408980432, %ecx # imm = 0xE79F7430
+; SSE-X86-GISEL-NEXT:    movl %ecx, 8(%eax)
+; SSE-X86-GISEL-NEXT:    movl $708630551, %ecx # imm = 0x2A3CD817
+; SSE-X86-GISEL-NEXT:    movl %ecx, 12(%eax)
+; SSE-X86-GISEL-NEXT:    movl $-871899055, %ecx # imm = 0xCC07E051
+; SSE-X86-GISEL-NEXT:    movl %ecx, 16(%eax)
+; SSE-X86-GISEL-NEXT:    movl $-633489957, %ecx # imm = 0xDA3DB5DB
+; SSE-X86-GISEL-NEXT:    movl %ecx, 20(%eax)
+; SSE-X86-GISEL-NEXT:    movl $591019567, %ecx # imm = 0x233A3E2F
+; SSE-X86-GISEL-NEXT:    movl %ecx, 24(%eax)
+; SSE-X86-GISEL-NEXT:    movl $708632899, %ecx # imm = 0x2A3CE143
+; SSE-X86-GISEL-NEXT:    movl %ecx, 28(%eax)
+; SSE-X86-GISEL-NEXT:    retl
+  ret <8 x i32> <i32 128100944, i32 3662708626, i32 3885986864, i32 708630551, i32 -871899055, i32 3661477339, i32 4885986863, i32 708632899>
+}
+
+define <4 x i32> @test_vector_v4i32() {
+; SSE-X64-LABEL: test_vector_v4i32:
+; SSE-X64:       # %bb.0:
+; SSE-X64-NEXT:    movq %rdi, %rax
+; SSE-X64-NEXT:    movabsq $3043545045377446960, %rcx # imm = 0x2A3CD817E79F7430
+; SSE-X64-NEXT:    movq %rcx, 8(%rdi)
+; SSE-X64-NEXT:    movabsq $-2715530310134355376, %rcx # imm = 0xDA507F9207A2AA50
+; SSE-X64-NEXT:    movq %rcx, (%rdi)
+; SSE-X64-NEXT:    retq
+;
+; SSE-X64-GISEL-LABEL: test_vector_v4i32:
+; SSE-X64-GISEL:       # %bb.0:
+; SSE-X64-GISEL-NEXT:    movaps {{.*#+}} xmm0 = [128100944,3662708626,3885986864,708630551]
+; SSE-X64-GISEL-NEXT:    movaps %xmm0, (%rdi)
+; SSE-X64-GISEL-NEXT:    retq
+;
+; SSE-X86-LABEL: test_vector_v4i32:
+; SSE-X86:       # %bb.0:
+; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-X86-NEXT:    movl $708630551, 12(%eax) # imm = 0x2A3CD817
+; SSE-X86-NEXT:    movl $-408980432, 8(%eax) # imm = 0xE79F7430
+; SSE-X86-NEXT:    movl $-632258670, 4(%eax) # imm = 0xDA507F92
+; SSE-X86-NEXT:    movl $128100944, (%eax) # imm = 0x7A2AA50
+; SSE-X86-NEXT:    retl $4
+;
+; SSE-X86-GISEL-LABEL: test_vector_v4i32:
+; SSE-X86-GISEL:       # %bb.0:
+; SSE-X86-GISEL-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-X86-GISEL-NEXT:    movaps {{.*#+}} xmm0 = [128100944,3662708626,3885986864,708630551]
+; SSE-X86-GISEL-NEXT:    movaps %xmm0, (%eax)
+; SSE-X86-GISEL-NEXT:    retl
+  ret <4 x i32> <i32 128100944, i32 3662708626, i32 3885986864, i32 708630551>
+}
+
+define <3 x i32> @test_vector_v3i32() {
+; SSE-X64-LABEL: test_vector_v3i32:
+; SSE-X64:       # %bb.0:
+; SSE-X64-NEXT:    movl $128100944, %eax # imm = 0x7A2AA50
+; SSE-X64-NEXT:    movl $-632258670, %edx # imm = 0xDA507F92
+; SSE-X64-NEXT:    movl $-408980432, %ecx # imm = 0xE79F7430
+; SSE-X64-NEXT:    retq
+;
+; SSE-X64-GISEL-LABEL: test_vector_v3i32:
+; SSE-X64-GISEL:       # %bb.0:
+; SSE-X64-GISEL-NEXT:    movl $128100944, %eax # imm = 0x7A2AA50
+; SSE-X64-GISEL-NEXT:    movl $-632258670, %edx # imm = 0xDA507F92
+; SSE-X64-GISEL-NEXT:    movl $-408980432, %ecx # imm = 0xE79F7430
+; SSE-X64-GISEL-NEXT:    retq
+;
+; SSE-X86-LABEL: test_vector_v3i32:
+; SSE-X86:       # %bb.0:
+; SSE-X86-NEXT:    movl $128100944, %eax # imm = 0x7A2AA50
+; SSE-X86-NEXT:    movl $-632258670, %edx # imm = 0xDA507F92
+; SSE-X86-NEXT:    movl $-408980432, %ecx # imm = 0xE79F7430
+; SSE-X86-NEXT:    retl
+;
+; SSE-X86-GISEL-LABEL: test_vector_v3i32:
+; SSE-X86-GISEL:       # %bb.0:
+; SSE-X86-GISEL-NEXT:    movl $128100944, %eax # imm = 0x7A2AA50
+; SSE-X86-GISEL-NEXT:    movl $-632258670, %edx # imm = 0xDA507F92
+; SSE-X86-GISEL-NEXT:    movl $-408980432, %ecx # imm = 0xE79F7430
+; SSE-X86-GISEL-NEXT:    retl
+  ret <3 x i32> <i32 128100944, i32 3662708626, i32 3885986864>
+}
+
diff --git a/llvm/test/CodeGen/X86/isel-buildvector-sse2.ll b/llvm/test/CodeGen/X86/isel-buildvector-sse2.ll
new file mode 100644
index 00000000000000..88e0ede0d4b6fe
--- /dev/null
+++ b/llvm/test/CodeGen/X86/isel-buildvector-sse2.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sse2  %s -o - | FileCheck %s --check-prefixes SSE2,SSE2-X64
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sse2 -fast-isel %s -o - | FileCheck %s --check-prefixes SSE2,SSE2-X64
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sse2 -global-isel -global-isel-abort=1 %s -o - | FileCheck %s --check-prefixes SSE2,SSE2-GISEL
+
+;
+; 128 bit vectors
+;
+
+define <7 x i8> @test_vector_v7i8() {
+; SSE2-X64-LABEL: test_vector_v7i8:
+; SSE2-X64:       # %bb.0:
+; SSE2-X64-NEXT:    movq %rdi, %rax
+; SSE2-X64-NEXT:    movl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx
+; SSE2-X64-NEXT:    movl %ecx, (%rdi)
+; SSE2-X64-NEXT:    movb $63, 6(%rdi)
+; SSE2-X64-NEXT:    movw $10775, 4(%rdi) # imm = 0x2A17
+; SSE2-X64-NEXT:    retq
+;
+; SSE2-GISEL-LABEL: test_vector_v7i8:
+; SSE2-GISEL:       # %bb.0:
+; SSE2-GISEL-NEXT:    movb $4, %al
+; SSE2-GISEL-NEXT:    movb $8, %cl
+; SSE2-GISEL-NEXT:    movb $15, %dl
+; SSE2-GISEL-NEXT:    movb $16, %sil
+; SSE2-GISEL-NEXT:    movb $23, %r8b
+; SSE2-GISEL-NEXT:    movb $42, %r9b
+; SSE2-GISEL-NEXT:    movb $63, %r10b
+; SSE2-GISEL-NEXT:    movb %al, (%rdi)
+; SSE2-GISEL-NEXT:    movb %cl, 1(%rdi)
+; SSE2-GISEL-NEXT:    movb %dl, 2(%rdi)
+; SSE2-GISEL-NEXT:    movb %sil, 3(%rdi)
+; SSE2-GISEL-NEXT:    movb %r8b, 4(%rdi)
+; SSE2-GISEL-NEXT:    movb %r9b, 5(%rdi)
+; SSE2-GISEL-NEXT:    movb %r10b, 6(%rdi)
+; SSE2-GISEL-NEXT:    retq
+  ret <7 x i8> <i8 4, i8 8, i8 15, i8 16, i8 23, i8 42, i8 63>
+}
+
+define <16 x i8> @test_vector_v16i8() {
+; SSE2-LABEL: test_vector_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [4,8,15,16,23,42,63,70,92,105,123,133,157,160,174,180]
+; SSE2-NEXT:    retq
+  ret <16 x i8> <i8 4, i8 8, i8 15, i8 16, i8 23, i8 42, i8 63, i8 70, i8 92, i8 105, i8 123, i8 133, i8 157, i8 160, i8 174, i8 180>
+}
+
+define <8 x i16> @test_vector_v8i16() {
+; SSE2-LABEL: test_vector_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [4,15,23,63,92,123,157,174]
+; SSE2-NEXT:    retq
+  ret <8 x i16> <i16 4, i16 15, i16 23, i16 63, i16 92, i16 123, i16 157, i16 174>
+}
+
+define <4 x float> @test_vector_v4f32() {
+; SSE2-LABEL: test_vector_v4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [u,3.6627E+5,9.86864E+5,7.0851E+4]
+; SSE2-NEXT:    retq
+  ret <4 x float> <float undef, float 366270.0, float 986864.0, float 70851.0>
+}
+
+define <2 x i64> @test_vector_v4i64() {
+; SSE2-LABEL: test_vector_v4i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [9406487659005566976,9903695591611287552]
+; SSE2-NEXT:    retq
+  ret <2 x i64> <i64 9406487659005566976, i64 9903695591611287552>
+}
+

From 6e7b45c55b4c3299c1160d49f7ad721c62e8e4eb Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86@yahoo.com>
Date: Wed, 29 May 2024 15:55:58 -0700
Subject: [PATCH 196/230] [AMDGPU][MC] Support tfe operand in image_atomic
 instructions (#92469)

Currently, if an image_atomic instruction has the 'tfe' operand, the
llvm-mc assembler generally rejects it. The only exception is when
dmask is 0x1 and the instruction is not image_atomic_cmpswap (e.g.,
image_atomic_add v[5:6], v252, s[8:15] dmask:0x1 tfe). This patch fixes
this problem and allows tfe to be specified in image_atomic
instructions.

---------

Co-authored-by: Jun Wang <jun.wang7@amd.com>
---
 llvm/lib/Target/AMDGPU/MIMGInstructions.td    |   4 +
 llvm/test/MC/AMDGPU/gfx10_asm_mimg.s          |  91 ++++++++++++
 llvm/test/MC/AMDGPU/gfx11_asm_mimg.s          |  91 ++++++++++++
 llvm/test/MC/AMDGPU/mimg.s                    | 129 ++++++++++++++++++
 .../AMDGPU/gfx8_mimg_features.txt             |   2 +-
 5 files changed, 316 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 24f9a6e375baaf..e1468bf850cd79 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -1101,6 +1101,10 @@ multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0,
       defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_64, VGPR_32), 1, isFP, renamed>;
       let VDataDwords = !if(isCmpSwap, 4, 2) in
       defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_128, VReg_64), 0, isFP, renamed>;
+      let VDataDwords = !if(isCmpSwap, 2, 2) in
+      defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, VReg_96, 0, isFP, renamed>;
+      let VDataDwords = !if(isCmpSwap, 4, 4) in
+      defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, VReg_160, 0, isFP, renamed>;
     }
   } // End IsAtomicRet = 1
 }
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_mimg.s b/llvm/test/MC/AMDGPU/gfx10_asm_mimg.s
index 7b137289aa8176..6039e4abf5d960 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_mimg.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_mimg.s
@@ -654,3 +654,94 @@ image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dma
 
 image_sample_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10: image_sample_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; encoding: [0x08,0x0f,0x88,0xf0,0x00,0x00,0x40,0x40]
+
+; Test dmask + tfe for image_atomic instructions
+image_atomic_add v0, v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX10: image_atomic_add v0, v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x01,0x44,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_add v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_add v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x01,0x45,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_add v[0:1], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D
+; GFX10: image_atomic_add v[0:1], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x03,0x44,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_add v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_add v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x03,0x45,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_swap v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_swap v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x01,0x3d,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_swap v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_swap v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x03,0x3d,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_sub v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_sub v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x01,0x49,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_sub v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_sub v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x03,0x49,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_smin v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_smin v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x01,0x51,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_smin v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_smin v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x03,0x51,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_umin v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_umin v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x01,0x55,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_umin v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_umin v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x03,0x55,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_smax v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_smax v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x01,0x59,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_smax v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_smax v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x03,0x59,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_umax v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_umax v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x01,0x5d,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_umax v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_umax v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x03,0x5d,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_and v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_and v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x01,0x61,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_and v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_and v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x03,0x61,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_or v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_or v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x01,0x65,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_or v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_or v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x03,0x65,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_xor v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_xor v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x01,0x69,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_xor v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_xor v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x03,0x69,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_inc v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_inc v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x01,0x6d,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_inc v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_inc v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x03,0x6d,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_dec v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_dec v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x01,0x71,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_dec v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_dec v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x03,0x71,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_cmpswap v[0:1], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D
+; GFX10: image_atomic_cmpswap v[0:1], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x03,0x40,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_cmpswap v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_cmpswap v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x03,0x41,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_cmpswap v[0:3], v[10:11], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10: image_atomic_cmpswap v[0:3], v[10:11], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x40,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_cmpswap v[0:4], v[10:11], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D tfe
+; GFX10: image_atomic_cmpswap v[0:4], v[10:11], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x0f,0x41,0xf0,0x0a,0x00,0x04,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_mimg.s b/llvm/test/MC/AMDGPU/gfx11_asm_mimg.s
index 6d467dfa1d8e18..88deaeff19fa3a 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_mimg.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_mimg.s
@@ -5603,3 +5603,94 @@ image_store_pck v1, v[2:3], s[96:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_MSAA unorm a1
 
 image_store_pck v255, v[254:255], ttmp[8:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_MSAA unorm glc slc dlc a16 lwe
 // GFX11: [0x98,0x74,0x21,0xf0,0xfe,0xff,0x5d,0x00]
+
+; Test dmask + tfe for image_atomic instructions
+image_atomic_add v0, v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D
+// GFX11: [0x04,0x01,0x30,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_add v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x01,0x30,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_add v[0:1], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D
+// GFX11: [0x04,0x03,0x30,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_add v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x03,0x30,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_swap v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x01,0x28,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_swap v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x03,0x28,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_sub v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x01,0x34,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_sub v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x03,0x34,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_smin v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x01,0x38,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_smin v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x03,0x38,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_umin v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x01,0x3c,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_umin v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x03,0x3c,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_smax v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x01,0x40,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_smax v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x03,0x40,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_umax v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x01,0x44,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_umax v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x03,0x44,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_and v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x01,0x48,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_and v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x03,0x48,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_or v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x01,0x4c,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_or v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x03,0x4c,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_xor v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x01,0x50,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_xor v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x03,0x50,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_inc v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x01,0x54,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_inc v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x03,0x54,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_dec v[0:1], v[10:11], s[16:23] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x01,0x58,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_dec v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x03,0x58,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_cmpswap v[0:1], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D
+// GFX11: [0x04,0x03,0x2c,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_cmpswap v[0:2], v[10:11], s[16:23] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x03,0x2c,0xf0,0x0a,0x00,0x24,0x00]
+
+image_atomic_cmpswap v[0:3], v[10:11], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D
+// GFX11: [0x04,0x0f,0x2c,0xf0,0x0a,0x00,0x04,0x00]
+
+image_atomic_cmpswap v[0:4], v[10:11], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D tfe
+// GFX11: [0x04,0x0f,0x2c,0xf0,0x0a,0x00,0x24,0x00]
diff --git a/llvm/test/MC/AMDGPU/mimg.s b/llvm/test/MC/AMDGPU/mimg.s
index 38927b40f33475..29e402d9496f16 100644
--- a/llvm/test/MC/AMDGPU/mimg.s
+++ b/llvm/test/MC/AMDGPU/mimg.s
@@ -439,6 +439,135 @@ image_atomic_cmpswap v[4:7], v[192:195], s[28:35] dmask:0xf unorm glc
 // SICI:  image_atomic_cmpswap v[4:7], v[192:195], s[28:35] dmask:0xf unorm glc ; encoding: [0x00,0x3f,0x40,0xf0,0xc0,0x04,0x07,0x00]
 // GFX89: image_atomic_cmpswap v[4:7], v[192:195], s[28:35] dmask:0xf unorm glc ; encoding: [0x00,0x3f,0x44,0xf0,0xc0,0x04,0x07,0x00]
 
+; Test dmask + tfe for image_atomic instructions
+image_atomic_add v4, v10, s[8:15] dmask:0x1
+// SICI:  image_atomic_add v4, v10, s[8:15] dmask:0x1 ; encoding: [0x00,0x01,0x44,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_add v4, v10, s[8:15] dmask:0x1 ; encoding: [0x00,0x01,0x48,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_add v[4:5], v10, s[8:15] dmask:0x1 tfe
+// SICI:  image_atomic_add v[4:5], v10, s[8:15] dmask:0x1 tfe ; encoding: [0x00,0x01,0x45,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_add v[4:5], v10, s[8:15] dmask:0x1 tfe ; encoding: [0x00,0x01,0x49,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_add v[4:5], v10, s[8:15] dmask:0x3
+// SICI:  image_atomic_add v[4:5], v10, s[8:15] dmask:0x3 ; encoding: [0x00,0x03,0x44,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_add v[4:5], v10, s[8:15] dmask:0x3 ; encoding: [0x00,0x03,0x48,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_add v[4:6], v10, s[8:15] dmask:0x3 tfe
+// SICI:  image_atomic_add v[4:6], v10, s[8:15] dmask:0x3 tfe ; encoding: [0x00,0x03,0x45,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_add v[4:6], v10, s[8:15] dmask:0x3 tfe ; encoding: [0x00,0x03,0x49,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_swap v4, v[192:195], s[28:35] dmask:0x1
+// SICI:  image_atomic_swap v4, v[192:195], s[28:35] dmask:0x1 ; encoding: [0x00,0x01,0x3c,0xf0,0xc0,0x04,0x07,0x00]
+// GFX89: image_atomic_swap v4, v[192:195], s[28:35] dmask:0x1 ; encoding: [0x00,0x01,0x40,0xf0,0xc0,0x04,0x07,0x00]
+
+image_atomic_swap v[4:5], v[192:195], s[28:35] dmask:0x1 tfe
+// SICI:  image_atomic_swap v[4:5], v[192:195], s[28:35] dmask:0x1 tfe ; encoding: [0x00,0x01,0x3d,0xf0,0xc0,0x04,0x07,0x00]
+// GFX89: image_atomic_swap v[4:5], v[192:195], s[28:35] dmask:0x1 tfe ; encoding: [0x00,0x01,0x41,0xf0,0xc0,0x04,0x07,0x00]
+
+image_atomic_swap v[4:5], v[192:195], s[28:35] dmask:0x3
+// SICI:  image_atomic_swap v[4:5], v[192:195], s[28:35] dmask:0x3 ; encoding: [0x00,0x03,0x3c,0xf0,0xc0,0x04,0x07,0x00]
+// GFX89: image_atomic_swap v[4:5], v[192:195], s[28:35] dmask:0x3 ; encoding: [0x00,0x03,0x40,0xf0,0xc0,0x04,0x07,0x00]
+
+image_atomic_swap v[4:6], v[192:195], s[28:35] dmask:0x3 tfe
+// SICI:  image_atomic_swap v[4:6], v[192:195], s[28:35] dmask:0x3 tfe ; encoding: [0x00,0x03,0x3d,0xf0,0xc0,0x04,0x07,0x00]
+// GFX89: image_atomic_swap v[4:6], v[192:195], s[28:35] dmask:0x3 tfe ; encoding: [0x00,0x03,0x41,0xf0,0xc0,0x04,0x07,0x00]
+
+image_atomic_sub v[4:5], v10, s[8:15] dmask:0x1 tfe
+// SICI:  image_atomic_sub v[4:5], v10, s[8:15] dmask:0x1 tfe ; encoding: [0x00,0x01,0x49,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_sub v[4:5], v10, s[8:15] dmask:0x1 tfe ; encoding: [0x00,0x01,0x4d,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_sub v[4:6], v10, s[8:15] dmask:0x3 tfe
+// SICI:  image_atomic_sub v[4:6], v10, s[8:15] dmask:0x3 tfe ; encoding: [0x00,0x03,0x49,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_sub v[4:6], v10, s[8:15] dmask:0x3 tfe ; encoding: [0x00,0x03,0x4d,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_smin v[4:5], v10, s[8:15] dmask:0x1 tfe
+// SICI:  image_atomic_smin v[4:5], v10, s[8:15] dmask:0x1 tfe ; encoding: [0x00,0x01,0x51,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_smin v[4:5], v10, s[8:15] dmask:0x1 tfe ; encoding: [0x00,0x01,0x51,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_smin v[4:6], v10, s[8:15] dmask:0x3 tfe
+// SICI:  image_atomic_smin v[4:6], v10, s[8:15] dmask:0x3 tfe ; encoding: [0x00,0x03,0x51,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_smin v[4:6], v10, s[8:15] dmask:0x3 tfe ; encoding: [0x00,0x03,0x51,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_umin v[4:5], v10, s[8:15] dmask:0x1 tfe
+// SICI:  image_atomic_umin v[4:5], v10, s[8:15] dmask:0x1 tfe ; encoding: [0x00,0x01,0x55,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_umin v[4:5], v10, s[8:15] dmask:0x1 tfe ; encoding: [0x00,0x01,0x55,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_umin v[4:6], v10, s[8:15] dmask:0x3 tfe
+// SICI:  image_atomic_umin v[4:6], v10, s[8:15] dmask:0x3 tfe ; encoding: [0x00,0x03,0x55,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_umin v[4:6], v10, s[8:15] dmask:0x3 tfe ; encoding: [0x00,0x03,0x55,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_smax v[4:5], v10, s[8:15] dmask:0x1 tfe
+// SICI:  image_atomic_smax v[4:5], v10, s[8:15] dmask:0x1 tfe ; encoding: [0x00,0x01,0x59,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_smax v[4:5], v10, s[8:15] dmask:0x1 tfe ; encoding: [0x00,0x01,0x59,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_smax v[4:6], v10, s[8:15] dmask:0x3 tfe
+// SICI:  image_atomic_smax v[4:6], v10, s[8:15] dmask:0x3 tfe ; encoding: [0x00,0x03,0x59,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_smax v[4:6], v10, s[8:15] dmask:0x3 tfe ; encoding: [0x00,0x03,0x59,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_umax v[4:5], v10, s[8:15] dmask:0x1 tfe
+// SICI:  image_atomic_umax v[4:5], v10, s[8:15] dmask:0x1 tfe ; encoding: [0x00,0x01,0x5d,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_umax v[4:5], v10, s[8:15] dmask:0x1 tfe ; encoding: [0x00,0x01,0x5d,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_umax v[4:6], v10, s[8:15] dmask:0x3 tfe
+// SICI:  image_atomic_umax v[4:6], v10, s[8:15] dmask:0x3 tfe ; encoding: [0x00,0x03,0x5d,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_umax v[4:6], v10, s[8:15] dmask:0x3 tfe ; encoding: [0x00,0x03,0x5d,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_and v[4:5], v10, s[8:15] dmask:0x1 tfe
+// SICI:  image_atomic_and v[4:5], v10, s[8:15] dmask:0x1 tfe ; encoding: [0x00,0x01,0x61,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_and v[4:5], v10, s[8:15] dmask:0x1 tfe ; encoding: [0x00,0x01,0x61,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_and v[4:6], v10, s[8:15] dmask:0x3 tfe
+// SICI:  image_atomic_and v[4:6], v10, s[8:15] dmask:0x3 tfe ; encoding: [0x00,0x03,0x61,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_and v[4:6], v10, s[8:15] dmask:0x3 tfe ; encoding: [0x00,0x03,0x61,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_or v[4:5], v10, s[8:15] dmask:0x1 tfe
+// SICI:  image_atomic_or v[4:5], v10, s[8:15] dmask:0x1 tfe ; encoding: [0x00,0x01,0x65,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_or v[4:5], v10, s[8:15] dmask:0x1 tfe ; encoding: [0x00,0x01,0x65,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_or v[4:6], v10, s[8:15] dmask:0x3 tfe
+// SICI:  image_atomic_or v[4:6], v10, s[8:15] dmask:0x3 tfe ; encoding: [0x00,0x03,0x65,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_or v[4:6], v10, s[8:15] dmask:0x3 tfe ; encoding: [0x00,0x03,0x65,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_xor v[4:5], v10, s[8:15] dmask:0x1 tfe
+// SICI:  image_atomic_xor v[4:5], v10, s[8:15] dmask:0x1 tfe ; encoding: [0x00,0x01,0x69,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_xor v[4:5], v10, s[8:15] dmask:0x1 tfe ; encoding: [0x00,0x01,0x69,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_xor v[4:6], v10, s[8:15] dmask:0x3 tfe
+// SICI:  image_atomic_xor v[4:6], v10, s[8:15] dmask:0x3 tfe ; encoding: [0x00,0x03,0x69,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_xor v[4:6], v10, s[8:15] dmask:0x3 tfe ; encoding: [0x00,0x03,0x69,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_inc v[4:5], v10, s[8:15] dmask:0x1 tfe
+// SICI:  image_atomic_inc v[4:5], v10, s[8:15] dmask:0x1 tfe ; encoding: [0x00,0x01,0x6d,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_inc v[4:5], v10, s[8:15] dmask:0x1 tfe ; encoding: [0x00,0x01,0x6d,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_inc v[4:6], v10, s[8:15] dmask:0x3 tfe
+// SICI:  image_atomic_inc v[4:6], v10, s[8:15] dmask:0x3 tfe ; encoding: [0x00,0x03,0x6d,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_inc v[4:6], v10, s[8:15] dmask:0x3 tfe ; encoding: [0x00,0x03,0x6d,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_dec v[4:5], v10, s[8:15] dmask:0x1 tfe
+// SICI:  image_atomic_dec v[4:5], v10, s[8:15] dmask:0x1 tfe ; encoding: [0x00,0x01,0x71,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_dec v[4:5], v10, s[8:15] dmask:0x1 tfe ; encoding: [0x00,0x01,0x71,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_dec v[4:6], v10, s[8:15] dmask:0x3 tfe
+// SICI:  image_atomic_dec v[4:6], v10, s[8:15] dmask:0x3 tfe ; encoding: [0x00,0x03,0x71,0xf0,0x0a,0x04,0x02,0x00]
+// GFX89: image_atomic_dec v[4:6], v10, s[8:15] dmask:0x3 tfe ; encoding: [0x00,0x03,0x71,0xf0,0x0a,0x04,0x02,0x00]
+
+image_atomic_cmpswap v[4:5], v[192:195], s[28:35] dmask:0x3
+// SICI:  image_atomic_cmpswap v[4:5], v[192:195], s[28:35] dmask:0x3 ; encoding: [0x00,0x03,0x40,0xf0,0xc0,0x04,0x07,0x00]
+// GFX89: image_atomic_cmpswap v[4:5], v[192:195], s[28:35] dmask:0x3 ; encoding: [0x00,0x03,0x44,0xf0,0xc0,0x04,0x07,0x00]
+
+image_atomic_cmpswap v[4:6], v[192:195], s[28:35] dmask:0x3 tfe
+// SICI:  image_atomic_cmpswap v[4:6], v[192:195], s[28:35] dmask:0x3 tfe ; encoding: [0x00,0x03,0x41,0xf0,0xc0,0x04,0x07,0x00]
+// GFX89: image_atomic_cmpswap v[4:6], v[192:195], s[28:35] dmask:0x3 tfe ; encoding: [0x00,0x03,0x45,0xf0,0xc0,0x04,0x07,0x00]
+
+image_atomic_cmpswap v[4:7], v[192:195], s[28:35] dmask:0xf
+// SICI:  image_atomic_cmpswap v[4:7], v[192:195], s[28:35] dmask:0xf ; encoding: [0x00,0x0f,0x40,0xf0,0xc0,0x04,0x07,0x00]
+// GFX89: image_atomic_cmpswap v[4:7], v[192:195], s[28:35] dmask:0xf ; encoding: [0x00,0x0f,0x44,0xf0,0xc0,0x04,0x07,0x00]
+
+image_atomic_cmpswap v[4:8], v[192:195], s[28:35] dmask:0xf tfe
+// SICI:  image_atomic_cmpswap v[4:8], v[192:195], s[28:35] dmask:0xf tfe ; encoding: [0x00,0x0f,0x41,0xf0,0xc0,0x04,0x07,0x00]
+// GFX89: image_atomic_cmpswap v[4:8], v[192:195], s[28:35] dmask:0xf tfe ; encoding: [0x00,0x0f,0x45,0xf0,0xc0,0x04,0x07,0x00]
+
 // FIXME: This test is incorrect because r128 assumes a 128-bit SRSRC.
 image_atomic_add v10, v6, s[8:15] dmask:0x1 r128
 // SICI: image_atomic_add v10, v6, s[8:15] dmask:0x1 r128 ; encoding: [0x00,0x81,0x44,0xf0,0x06,0x0a,0x02,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx8_mimg_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx8_mimg_features.txt
index 292af1850db863..0a5bafc55f4d43 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx8_mimg_features.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx8_mimg_features.txt
@@ -195,7 +195,7 @@
 # VI: image_atomic_add v5, v1, s[8:15] dmask:0x7 unorm ; encoding: [0x00,0x17,0x48,0xf0,0x01,0x05,0x02,0x00]
 0x00,0x17,0x48,0xf0,0x01,0x05,0x02,0x00
 
-# VI: image_atomic_add v5, v1, s[8:15] dmask:0xf unorm ; encoding: [0x00,0x1f,0x48,0xf0,0x01,0x05,0x02,0x00]
+# VI: image_atomic_add v[5:9], v1, s[8:15] dmask:0xf unorm ; encoding: [0x00,0x1f,0x48,0xf0,0x01,0x05,0x02,0x00]
 0x00,0x1f,0x48,0xf0,0x01,0x05,0x02,0x00
 
 # VI: image_atomic_cmpswap v[5:6], v1, s[8:15] unorm ; encoding: [0x00,0x10,0x44,0xf0,0x01,0x05,0x02,0x00]

From e06e680a97d28dc95d31952a0b200add75941496 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Wed, 29 May 2024 16:06:14 -0700
Subject: [PATCH 197/230] [RISCV] Replace duplicate trunc-sat-clip tests with
 more interesting tests. NFC (#93737)

For each pair of types, we had 3 identical tests using umin with the
unsigned max value.

This patch replaces two of them with smin+smax cases that can be
implemented with a signed vmax followed by a vnclipu.
---
 .../RISCV/rvv/fixed-vectors-trunc-sat-clip.ll | 118 ++++++++++++------
 .../RISCV/rvv/trunc-sat-clip-sdnode.ll        | 118 ++++++++++++------
 2 files changed, 160 insertions(+), 76 deletions(-)

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-sat-clip.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-sat-clip.ll
index a0d4718e9e851c..414b23ffb582ab 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-sat-clip.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-sat-clip.ll
@@ -98,33 +98,45 @@ define void @trunc_sat_u8u16_notopt(ptr %x, ptr %y) {
   ret void
 }
 
+; FIXME: This can be a signed vmax followed by vnclipu.
 define void @trunc_sat_u8u16_maxmin(ptr %x, ptr %y) {
 ; CHECK-LABEL: trunc_sat_u8u16_maxmin:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vnclipu.wi v8, v8, 0
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    li a0, 255
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
 ; CHECK-NEXT:    vse8.v v8, (a1)
 ; CHECK-NEXT:    ret
   %1 = load <4 x i16>, ptr %x, align 16
-  %2 = tail call <4 x i16> @llvm.umin.v4i16(<4 x i16> %1, <4 x i16> <i16 255, i16 255, i16 255, i16 255>)
-  %3 = trunc <4 x i16> %2 to <4 x i8>
-  store <4 x i8> %3, ptr %y, align 8
+  %2 = tail call <4 x i16> @llvm.smax.v4i16(<4 x i16> %1, <4 x i16> zeroinitializer)
+  %3 = tail call <4 x i16> @llvm.smin.v4i16(<4 x i16> %2, <4 x i16> <i16 255, i16 255, i16 255, i16 255>)
+  %4 = trunc <4 x i16> %3 to <4 x i8>
+  store <4 x i8> %4, ptr %y, align 8
   ret void
 }
 
+; FIXME: This can be a signed vmax followed by vnclipu.
 define void @trunc_sat_u8u16_minmax(ptr %x, ptr %y) {
 ; CHECK-LABEL: trunc_sat_u8u16_minmax:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vnclipu.wi v8, v8, 0
+; CHECK-NEXT:    li a0, 255
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
 ; CHECK-NEXT:    vse8.v v8, (a1)
 ; CHECK-NEXT:    ret
   %1 = load <4 x i16>, ptr %x, align 16
-  %2 = tail call <4 x i16> @llvm.umin.v4i16(<4 x i16> %1, <4 x i16> <i16 255, i16 255, i16 255, i16 255>)
-  %3 = trunc <4 x i16> %2 to <4 x i8>
-  store <4 x i8> %3, ptr %y, align 8
+  %2 = tail call <4 x i16> @llvm.smin.v4i16(<4 x i16> %1, <4 x i16> <i16 255, i16 255, i16 255, i16 255>)
+  %3 = tail call <4 x i16> @llvm.smax.v4i16(<4 x i16> %2, <4 x i16> zeroinitializer)
+  %4 = trunc <4 x i16> %3 to <4 x i8>
+  store <4 x i8> %4, ptr %y, align 8
   ret void
 }
 
@@ -217,33 +229,49 @@ define void @trunc_sat_u16u32_min(ptr %x, ptr %y) {
   ret void
 }
 
-define void @trunc_sat_u16u32_minmax(ptr %x, ptr %y) {
-; CHECK-LABEL: trunc_sat_u16u32_minmax:
+; FIXME: This can be a signed vmax followed by vnclipu.
+define void @trunc_sat_u16u32_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u16u32_maxmin:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vnclipu.wi v8, v8, 0
+; CHECK-NEXT:    li a0, 1
+; CHECK-NEXT:    vmax.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
 ; CHECK-NEXT:    vse16.v v8, (a1)
 ; CHECK-NEXT:    ret
-  %1 = load <4 x i32>, ptr %x, align 32
-  %2 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %1, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>)
-  %3 = trunc <4 x i32> %2 to <4 x i16>
-  store <4 x i16> %3, ptr %y, align 16
+  %1 = load <4 x i32>, ptr %x, align 16
+  %2 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+  %3 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %2, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>)
+  %4 = trunc <4 x i32> %3 to <4 x i16>
+  store <4 x i16> %4, ptr %y, align 8
   ret void
 }
 
-define void @trunc_sat_u16u32_maxmin(ptr %x, ptr %y) {
-; CHECK-LABEL: trunc_sat_u16u32_maxmin:
+; FIXME: This can be a signed vmax followed by vnclipu.
+define void @trunc_sat_u16u32_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u16u32_minmax:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vnclipu.wi v8, v8, 0
+; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 50
+; CHECK-NEXT:    vmax.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
 ; CHECK-NEXT:    vse16.v v8, (a1)
 ; CHECK-NEXT:    ret
-  %1 = load <4 x i32>, ptr %x, align 32
-  %2 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %1, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>)
-  %3 = trunc <4 x i32> %2 to <4 x i16>
-  store <4 x i16> %3, ptr %y, align 16
+  %1 = load <4 x i32>, ptr %x, align 16
+  %2 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %1, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>)
+  %3 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %2, <4 x i32> <i32 50, i32 50, i32 50, i32 50>)
+  %4 = trunc <4 x i32> %3 to <4 x i16>
+  store <4 x i16> %4, ptr %y, align 8
   ret void
 }
 
@@ -339,32 +367,46 @@ define void @trunc_sat_u32u64_min(ptr %x, ptr %y) {
 }
 
 
+; FIXME: This can be a signed vmax followed by vnclipu.
 define void @trunc_sat_u32u64_maxmin(ptr %x, ptr %y) {
 ; CHECK-LABEL: trunc_sat_u32u64_maxmin:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vnclipu.wi v10, v8, 0
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    li a0, -1
+; CHECK-NEXT:    srli a0, a0, 32
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v8, 0
 ; CHECK-NEXT:    vse32.v v10, (a1)
 ; CHECK-NEXT:    ret
-  %1 = load <4 x i64>, ptr %x, align 64
-  %2 = tail call <4 x i64> @llvm.umin.v4i64(<4 x i64> %1, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>)
-  %3 = trunc <4 x i64> %2 to <4 x i32>
-  store <4 x i32> %3, ptr %y, align 32
+  %1 = load <4 x i64>, ptr %x, align 16
+  %2 = tail call <4 x i64> @llvm.smax.v4i64(<4 x i64> %1, <4 x i64> zeroinitializer)
+  %3 = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %2, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>)
+  %4 = trunc <4 x i64> %3 to <4 x i32>
+  store <4 x i32> %4, ptr %y, align 8
   ret void
 }
 
+; FIXME: This can be a signed vmax followed by vnclipu.
 define void @trunc_sat_u32u64_minmax(ptr %x, ptr %y) {
 ; CHECK-LABEL: trunc_sat_u32u64_minmax:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vnclipu.wi v10, v8, 0
+; CHECK-NEXT:    li a0, -1
+; CHECK-NEXT:    srli a0, a0, 32
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v8, 0
 ; CHECK-NEXT:    vse32.v v10, (a1)
 ; CHECK-NEXT:    ret
-  %1 = load <4 x i64>, ptr %x, align 64
-  %2 = tail call <4 x i64> @llvm.umin.v4i64(<4 x i64> %1, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>)
-  %3 = trunc <4 x i64> %2 to <4 x i32>
-  store <4 x i32> %3, ptr %y, align 32
+  %1 = load <4 x i64>, ptr %x, align 16
+  %2 = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %1, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>)
+  %3 = tail call <4 x i64> @llvm.smax.v4i64(<4 x i64> %2, <4 x i64> zeroinitializer)
+  %4 = trunc <4 x i64> %3 to <4 x i32>
+  store <4 x i32> %4, ptr %y, align 8
   ret void
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip-sdnode.ll
index 7886eb162cd702..fcb49c21871919 100644
--- a/llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip-sdnode.ll
@@ -98,33 +98,45 @@ define void @trunc_sat_u8u16_notopt(ptr %x, ptr %y) {
   ret void
 }
 
+; FIXME: This can be a signed vmax followed by vnclipu.
 define void @trunc_sat_u8u16_maxmin(ptr %x, ptr %y) {
 ; CHECK-LABEL: trunc_sat_u8u16_maxmin:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vl1re16.v v8, (a0)
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vnclipu.wi v8, v8, 0
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    li a0, 255
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
 ; CHECK-NEXT:    vse8.v v8, (a1)
 ; CHECK-NEXT:    ret
   %1 = load <vscale x 4 x i16>, ptr %x, align 16
-  %2 = tail call <vscale x 4 x i16> @llvm.umin.v4i16(<vscale x 4 x i16> %1, <vscale x 4 x i16> splat (i16 255))
-  %3 = trunc <vscale x 4 x i16> %2 to <vscale x 4 x i8>
-  store <vscale x 4 x i8> %3, ptr %y, align 8
+  %2 = tail call <vscale x 4 x i16> @llvm.smax.v4i16(<vscale x 4 x i16> %1, <vscale x 4 x i16> splat (i16 0))
+  %3 = tail call <vscale x 4 x i16> @llvm.smin.v4i16(<vscale x 4 x i16> %2, <vscale x 4 x i16> splat (i16 255))
+  %4 = trunc <vscale x 4 x i16> %3 to <vscale x 4 x i8>
+  store <vscale x 4 x i8> %4, ptr %y, align 8
   ret void
 }
 
+; FIXME: This can be a signed vmax followed by vnclipu.
 define void @trunc_sat_u8u16_minmax(ptr %x, ptr %y) {
 ; CHECK-LABEL: trunc_sat_u8u16_minmax:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vl1re16.v v8, (a0)
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vnclipu.wi v8, v8, 0
+; CHECK-NEXT:    li a0, 255
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
 ; CHECK-NEXT:    vse8.v v8, (a1)
 ; CHECK-NEXT:    ret
   %1 = load <vscale x 4 x i16>, ptr %x, align 16
-  %2 = tail call <vscale x 4 x i16> @llvm.umin.v4i16(<vscale x 4 x i16> %1, <vscale x 4 x i16> splat (i16 255))
-  %3 = trunc <vscale x 4 x i16> %2 to <vscale x 4 x i8>
-  store <vscale x 4 x i8> %3, ptr %y, align 8
+  %2 = tail call <vscale x 4 x i16> @llvm.smin.v4i16(<vscale x 4 x i16> %1, <vscale x 4 x i16> splat (i16 255))
+  %3 = tail call <vscale x 4 x i16> @llvm.smax.v4i16(<vscale x 4 x i16> %2, <vscale x 4 x i16> splat (i16 0))
+  %4 = trunc <vscale x 4 x i16> %3 to <vscale x 4 x i8>
+  store <vscale x 4 x i8> %4, ptr %y, align 8
   ret void
 }
 
@@ -217,33 +229,49 @@ define void @trunc_sat_u16u32_min(ptr %x, ptr %y) {
   ret void
 }
 
-define void @trunc_sat_u16u32_minmax(ptr %x, ptr %y) {
-; CHECK-LABEL: trunc_sat_u16u32_minmax:
+; FIXME: This can be a signed vmax followed by vnclipu.
+define void @trunc_sat_u16u32_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u16u32_maxmin:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vl2re32.v v8, (a0)
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vnclipu.wi v10, v8, 0
+; CHECK-NEXT:    li a0, 1
+; CHECK-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vmax.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v8, 0
 ; CHECK-NEXT:    vs1r.v v10, (a1)
 ; CHECK-NEXT:    ret
-  %1 = load <vscale x 4 x i32>, ptr %x, align 32
-  %2 = tail call <vscale x 4 x i32> @llvm.umin.v4i32(<vscale x 4 x i32> %1, <vscale x 4 x i32> splat (i32 65535))
-  %3 = trunc <vscale x 4 x i32> %2 to <vscale x 4 x i16>
-  store <vscale x 4 x i16> %3, ptr %y, align 16
+  %1 = load <vscale x 4 x i32>, ptr %x, align 16
+  %2 = tail call <vscale x 4 x i32> @llvm.smax.v4i32(<vscale x 4 x i32> %1, <vscale x 4 x i32> splat (i32 1))
+  %3 = tail call <vscale x 4 x i32> @llvm.smin.v4i32(<vscale x 4 x i32> %2, <vscale x 4 x i32> splat (i32 65535))
+  %4 = trunc <vscale x 4 x i32> %3 to <vscale x 4 x i16>
+  store <vscale x 4 x i16> %4, ptr %y, align 8
   ret void
 }
 
-define void @trunc_sat_u16u32_maxmin(ptr %x, ptr %y) {
-; CHECK-LABEL: trunc_sat_u16u32_maxmin:
+; FIXME: This can be a signed vmax followed by vnclipu.
+define void @trunc_sat_u16u32_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u16u32_minmax:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vl2re32.v v8, (a0)
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vnclipu.wi v10, v8, 0
+; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 50
+; CHECK-NEXT:    vmax.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v8, 0
 ; CHECK-NEXT:    vs1r.v v10, (a1)
 ; CHECK-NEXT:    ret
-  %1 = load <vscale x 4 x i32>, ptr %x, align 32
-  %2 = tail call <vscale x 4 x i32> @llvm.umin.v4i32(<vscale x 4 x i32> %1, <vscale x 4 x i32> splat (i32 65535))
-  %3 = trunc <vscale x 4 x i32> %2 to <vscale x 4 x i16>
-  store <vscale x 4 x i16> %3, ptr %y, align 16
+  %1 = load <vscale x 4 x i32>, ptr %x, align 16
+  %2 = tail call <vscale x 4 x i32> @llvm.smin.v4i32(<vscale x 4 x i32> %1, <vscale x 4 x i32> splat (i32 65535))
+  %3 = tail call <vscale x 4 x i32> @llvm.smax.v4i32(<vscale x 4 x i32> %2, <vscale x 4 x i32> splat (i32 50))
+  %4 = trunc <vscale x 4 x i32> %3 to <vscale x 4 x i16>
+  store <vscale x 4 x i16> %4, ptr %y, align 8
   ret void
 }
 
@@ -339,32 +367,46 @@ define void @trunc_sat_u32u64_min(ptr %x, ptr %y) {
 }
 
 
+; FIXME: This can be a signed vmax followed by vnclipu.
 define void @trunc_sat_u32u64_maxmin(ptr %x, ptr %y) {
 ; CHECK-LABEL: trunc_sat_u32u64_maxmin:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vl4re64.v v8, (a0)
-; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vnclipu.wi v12, v8, 0
+; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    li a0, -1
+; CHECK-NEXT:    srli a0, a0, 32
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v12, v8, 0
 ; CHECK-NEXT:    vs2r.v v12, (a1)
 ; CHECK-NEXT:    ret
-  %1 = load <vscale x 4 x i64>, ptr %x, align 64
-  %2 = tail call <vscale x 4 x i64> @llvm.umin.v4i64(<vscale x 4 x i64> %1, <vscale x 4 x i64> splat (i64 4294967295))
-  %3 = trunc <vscale x 4 x i64> %2 to <vscale x 4 x i32>
-  store <vscale x 4 x i32> %3, ptr %y, align 32
+  %1 = load <vscale x 4 x i64>, ptr %x, align 16
+  %2 = tail call <vscale x 4 x i64> @llvm.smax.v4i64(<vscale x 4 x i64> %1, <vscale x 4 x i64> splat (i64 0))
+  %3 = tail call <vscale x 4 x i64> @llvm.smin.v4i64(<vscale x 4 x i64> %2, <vscale x 4 x i64> splat (i64 4294967295))
+  %4 = trunc <vscale x 4 x i64> %3 to <vscale x 4 x i32>
+  store <vscale x 4 x i32> %4, ptr %y, align 8
   ret void
 }
 
+; FIXME: This can be a signed vmax followed by vnclipu.
 define void @trunc_sat_u32u64_minmax(ptr %x, ptr %y) {
 ; CHECK-LABEL: trunc_sat_u32u64_minmax:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vl4re64.v v8, (a0)
-; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vnclipu.wi v12, v8, 0
+; CHECK-NEXT:    li a0, -1
+; CHECK-NEXT:    srli a0, a0, 32
+; CHECK-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v12, v8, 0
 ; CHECK-NEXT:    vs2r.v v12, (a1)
 ; CHECK-NEXT:    ret
-  %1 = load <vscale x 4 x i64>, ptr %x, align 64
-  %2 = tail call <vscale x 4 x i64> @llvm.umin.v4i64(<vscale x 4 x i64> %1, <vscale x 4 x i64> splat (i64 4294967295))
-  %3 = trunc <vscale x 4 x i64> %2 to <vscale x 4 x i32>
-  store <vscale x 4 x i32> %3, ptr %y, align 32
+  %1 = load <vscale x 4 x i64>, ptr %x, align 16
+  %2 = tail call <vscale x 4 x i64> @llvm.smin.v4i64(<vscale x 4 x i64> %1, <vscale x 4 x i64> splat (i64 4294967295))
+  %3 = tail call <vscale x 4 x i64> @llvm.smax.v4i64(<vscale x 4 x i64> %2, <vscale x 4 x i64> splat (i64 0))
+  %4 = trunc <vscale x 4 x i64> %3 to <vscale x 4 x i32>
+  store <vscale x 4 x i32> %4, ptr %y, align 8
   ret void
 }

From b12f81b53ad6c3e1004f32eecbf4083d87731fbc Mon Sep 17 00:00:00 2001
From: Shubham Sandeep Rastogi <srastogi22@apple.com>
Date: Wed, 29 May 2024 16:09:59 -0700
Subject: [PATCH 198/230] Introduce DIExpression::foldConstantMath() (#71718)

DIExpressions can get very long and have a lot of redundant operations.
This function uses simple pattern matching to fold constant math that
can be evaluated at compile time.

The hope is that other people can contribute other patterns as well.

I also couldn't see a good way of combining this with
`DIExpression::constantFold` so it stands alone.

This is part of a stack of patches and comes after
https://github.com/llvm/llvm-project/pull/69768
https://github.com/llvm/llvm-project/pull/71717
---
 llvm/include/llvm/IR/DebugInfoMetadata.h |   5 +
 llvm/lib/IR/CMakeLists.txt               |   1 +
 llvm/lib/IR/DIExpressionOptimizer.cpp    | 378 +++++++++++++++++++
 llvm/lib/IR/DebugInfoMetadata.cpp        |   1 -
 llvm/unittests/IR/MetadataTest.cpp       | 457 +++++++++++++++++++++++
 5 files changed, 841 insertions(+), 1 deletion(-)
 create mode 100644 llvm/lib/IR/DIExpressionOptimizer.cpp

diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h
index 555bd623ad9ef7..18873a551595ae 100644
--- a/llvm/include/llvm/IR/DebugInfoMetadata.h
+++ b/llvm/include/llvm/IR/DebugInfoMetadata.h
@@ -3121,6 +3121,11 @@ class DIExpression : public MDNode {
   /// expression and constant on failure.
   std::pair<DIExpression *, const ConstantInt *>
   constantFold(const ConstantInt *CI);
+
+  /// Try to shorten an expression with constant math operations that can be
+  /// evaluated at compile time. Returns a new expression on success, or the old
+  /// expression if there is nothing to be reduced.
+  DIExpression *foldConstantMath();
 };
 
 inline bool operator==(const DIExpression::FragmentInfo &A,
diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt
index b5fb7409d8e88e..20f169913087a4 100644
--- a/llvm/lib/IR/CMakeLists.txt
+++ b/llvm/lib/IR/CMakeLists.txt
@@ -17,6 +17,7 @@ add_llvm_component_library(LLVMCore
   DataLayout.cpp
   DebugInfo.cpp
   DebugInfoMetadata.cpp
+  DIExpressionOptimizer.cpp
   DebugProgramInstruction.cpp
   DebugLoc.cpp
   DiagnosticHandler.cpp
diff --git a/llvm/lib/IR/DIExpressionOptimizer.cpp b/llvm/lib/IR/DIExpressionOptimizer.cpp
new file mode 100644
index 00000000000000..2bb8eac348c8e9
--- /dev/null
+++ b/llvm/lib/IR/DIExpressionOptimizer.cpp
@@ -0,0 +1,378 @@
+//===- DIExpressionOptimizer.cpp - Constant folding of DIExpressions ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements functions to constant fold DIExpressions, which are
+// declared in DebugInfoMetadata.h.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+
+using namespace llvm;
+
+/// Returns the argument of \p Op if it is a DW_OP_constu, else std::nullopt.
+static std::optional<uint64_t> isConstantVal(DIExpression::ExprOperand Op) {
+  if (Op.getOp() == dwarf::DW_OP_constu)
+    return Op.getArg(0);
+  return std::nullopt;
+}
+
+/// Returns true if an operation and operand result in a No Op.
+static bool isNeutralElement(uint64_t Op, uint64_t Val) {
+  switch (Op) {
+  case dwarf::DW_OP_plus:
+  case dwarf::DW_OP_minus:
+  case dwarf::DW_OP_shl:
+  case dwarf::DW_OP_shr:
+    return Val == 0;
+  case dwarf::DW_OP_mul:
+  case dwarf::DW_OP_div:
+    return Val == 1;
+  default:
+    return false;
+  }
+}
+
+/// Try to fold \p Const1 and \p Const2 by applying \p Operator and returning
+/// the result. If there is an overflow, return std::nullopt.
+static std::optional<uint64_t>
+foldOperationIfPossible(uint64_t Const1, uint64_t Const2,
+                        dwarf::LocationAtom Operator) {
+
+  bool ResultOverflowed;
+  switch (Operator) {
+  case dwarf::DW_OP_plus: {
+    auto Result = SaturatingAdd(Const1, Const2, &ResultOverflowed);
+    if (ResultOverflowed)
+      return std::nullopt;
+    return Result;
+  }
+  case dwarf::DW_OP_minus: {
+    if (Const1 < Const2)
+      return std::nullopt;
+    return Const1 - Const2;
+  }
+  case dwarf::DW_OP_shl: {
+    if ((uint64_t)countl_zero(Const1) < Const2)
+      return std::nullopt;
+    return Const1 << Const2;
+  }
+  case dwarf::DW_OP_shr: {
+    if ((uint64_t)countr_zero(Const1) < Const2)
+      return std::nullopt;
+    return Const1 >> Const2;
+  }
+  case dwarf::DW_OP_mul: {
+    auto Result = SaturatingMultiply(Const1, Const2, &ResultOverflowed);
+    if (ResultOverflowed)
+      return std::nullopt;
+    return Result;
+  }
+  case dwarf::DW_OP_div: {
+    if (Const2)
+      return Const1 / Const2;
+    return std::nullopt;
+  }
+  default:
+    return std::nullopt;
+  }
+}
+
+/// Returns true if the two operations \p Operator1 and \p Operator2 are
+/// commutative and can be folded.
+static bool operationsAreFoldableAndCommutative(dwarf::LocationAtom Operator1,
+                                                dwarf::LocationAtom Operator2) {
+  return Operator1 == Operator2 &&
+         (Operator1 == dwarf::DW_OP_plus || Operator1 == dwarf::DW_OP_mul);
+}
+
+/// Consume one operator and its operand(s).
+static void consumeOneOperator(DIExpressionCursor &Cursor, uint64_t &Loc,
+                               const DIExpression::ExprOperand &Op) {
+  Cursor.consume(1);
+  Loc = Loc + Op.getSize();
+}
+
+/// Reset the Cursor to the beginning of the WorkingOps.
+void startFromBeginning(uint64_t &Loc, DIExpressionCursor &Cursor,
+                        ArrayRef<uint64_t> WorkingOps) {
+  Cursor.assignNewExpr(WorkingOps);
+  Loc = 0;
+}
+
+/// This function will canonicalize:
+/// 1. DW_OP_plus_uconst to DW_OP_constu <const-val> DW_OP_plus
+/// 2. DW_OP_lit<n> to DW_OP_constu <n>
+static SmallVector<uint64_t>
+canonicalizeDwarfOperations(ArrayRef<uint64_t> WorkingOps) {
+  DIExpressionCursor Cursor(WorkingOps);
+  uint64_t Loc = 0;
+  SmallVector<uint64_t> ResultOps;
+  while (Loc < WorkingOps.size()) {
+    auto Op = Cursor.peek();
+    /// Expression has no operations, break.
+    if (!Op)
+      break;
+    auto OpRaw = Op->getOp();
+
+    if (OpRaw >= dwarf::DW_OP_lit0 && OpRaw <= dwarf::DW_OP_lit31) {
+      ResultOps.push_back(dwarf::DW_OP_constu);
+      ResultOps.push_back(OpRaw - dwarf::DW_OP_lit0);
+      consumeOneOperator(Cursor, Loc, *Cursor.peek());
+      continue;
+    }
+    if (OpRaw == dwarf::DW_OP_plus_uconst) {
+      ResultOps.push_back(dwarf::DW_OP_constu);
+      ResultOps.push_back(Op->getArg(0));
+      ResultOps.push_back(dwarf::DW_OP_plus);
+      consumeOneOperator(Cursor, Loc, *Cursor.peek());
+      continue;
+    }
+    uint64_t PrevLoc = Loc;
+    consumeOneOperator(Cursor, Loc, *Cursor.peek());
+    ResultOps.append(WorkingOps.begin() + PrevLoc, WorkingOps.begin() + Loc);
+  }
+  return ResultOps;
+}
+
+/// This function will convert:
+/// 1. DW_OP_constu <const-val> DW_OP_plus to DW_OP_plus_uconst
+/// 2. DW_OP_constu, 0 to DW_OP_lit0
+static SmallVector<uint64_t>
+optimizeDwarfOperations(ArrayRef<uint64_t> WorkingOps) {
+  DIExpressionCursor Cursor(WorkingOps);
+  uint64_t Loc = 0;
+  SmallVector<uint64_t> ResultOps;
+  while (Loc < WorkingOps.size()) {
+    auto Op1 = Cursor.peek();
+    /// Expression has no operations, exit.
+    if (!Op1)
+      break;
+    auto Op1Raw = Op1->getOp();
+
+    if (Op1Raw == dwarf::DW_OP_constu && Op1->getArg(0) == 0) {
+      ResultOps.push_back(dwarf::DW_OP_lit0);
+      consumeOneOperator(Cursor, Loc, *Cursor.peek());
+      continue;
+    }
+
+    auto Op2 = Cursor.peekNext();
+    /// Expression has no more operations, copy into ResultOps and exit.
+    if (!Op2) {
+      uint64_t PrevLoc = Loc;
+      consumeOneOperator(Cursor, Loc, *Cursor.peek());
+      ResultOps.append(WorkingOps.begin() + PrevLoc, WorkingOps.begin() + Loc);
+      break;
+    }
+    auto Op2Raw = Op2->getOp();
+
+    if (Op1Raw == dwarf::DW_OP_constu && Op2Raw == dwarf::DW_OP_plus) {
+      ResultOps.push_back(dwarf::DW_OP_plus_uconst);
+      ResultOps.push_back(Op1->getArg(0));
+      consumeOneOperator(Cursor, Loc, *Cursor.peek());
+      consumeOneOperator(Cursor, Loc, *Cursor.peek());
+      continue;
+    }
+    uint64_t PrevLoc = Loc;
+    consumeOneOperator(Cursor, Loc, *Cursor.peek());
+    ResultOps.append(WorkingOps.begin() + PrevLoc, WorkingOps.begin() + Loc);
+  }
+  return ResultOps;
+}
+
+/// {DW_OP_constu, 0, DW_OP_[plus, minus, shl, shr]} -> {}
+/// {DW_OP_constu, 1, DW_OP_[mul, div]} -> {}
+static bool tryFoldNoOpMath(uint64_t Const1,
+                            ArrayRef<DIExpression::ExprOperand> Ops,
+                            uint64_t &Loc, DIExpressionCursor &Cursor,
+                            SmallVectorImpl<uint64_t> &WorkingOps) {
+
+  if (isNeutralElement(Ops[1].getOp(), Const1)) {
+    WorkingOps.erase(WorkingOps.begin() + Loc, WorkingOps.begin() + Loc + 3);
+    startFromBeginning(Loc, Cursor, WorkingOps);
+    return true;
+  }
+  return false;
+}
+
+/// {DW_OP_constu, Const1, DW_OP_constu, Const2, DW_OP_[plus,
+/// minus, mul, div, shl, shr]} -> {DW_OP_constu, Const1 [+, -, *, /, <<, >>]
+/// Const2}
+static bool tryFoldConstants(uint64_t Const1,
+                             ArrayRef<DIExpression::ExprOperand> Ops,
+                             uint64_t &Loc, DIExpressionCursor &Cursor,
+                             SmallVectorImpl<uint64_t> &WorkingOps) {
+
+  auto Const2 = isConstantVal(Ops[1]);
+  if (!Const2)
+    return false;
+
+  auto Result = foldOperationIfPossible(
+      Const1, *Const2, static_cast<dwarf::LocationAtom>(Ops[2].getOp()));
+  if (!Result) {
+    consumeOneOperator(Cursor, Loc, Ops[0]);
+    return true;
+  }
+  WorkingOps.erase(WorkingOps.begin() + Loc + 2, WorkingOps.begin() + Loc + 5);
+  WorkingOps[Loc] = dwarf::DW_OP_constu;
+  WorkingOps[Loc + 1] = *Result;
+  startFromBeginning(Loc, Cursor, WorkingOps);
+  return true;
+}
+
+/// {DW_OP_constu, Const1, DW_OP_[plus, mul], DW_OP_constu, Const2,
+/// DW_OP_[plus, mul]} -> {DW_OP_constu, Const1 [+, *] Const2, DW_OP_[plus,
+/// mul]}
+static bool tryFoldCommutativeMath(uint64_t Const1,
+                                   ArrayRef<DIExpression::ExprOperand> Ops,
+                                   uint64_t &Loc, DIExpressionCursor &Cursor,
+                                   SmallVectorImpl<uint64_t> &WorkingOps) {
+
+  auto Const2 = isConstantVal(Ops[2]);
+  auto Operand1 = static_cast<dwarf::LocationAtom>(Ops[1].getOp());
+  auto Operand2 = static_cast<dwarf::LocationAtom>(Ops[3].getOp());
+
+  if (!Const2 || !operationsAreFoldableAndCommutative(Operand1, Operand2))
+    return false;
+
+  auto Result = foldOperationIfPossible(Const1, *Const2, Operand1);
+  if (!Result) {
+    consumeOneOperator(Cursor, Loc, Ops[0]);
+    return true;
+  }
+  WorkingOps.erase(WorkingOps.begin() + Loc + 3, WorkingOps.begin() + Loc + 6);
+  WorkingOps[Loc] = dwarf::DW_OP_constu;
+  WorkingOps[Loc + 1] = *Result;
+  startFromBeginning(Loc, Cursor, WorkingOps);
+  return true;
+}
+
+/// {DW_OP_constu, Const1, DW_OP_[plus, mul], DW_OP_LLVM_arg, Arg1,
+/// DW_OP_[plus, mul], DW_OP_constu, Const2, DW_OP_[plus, mul]} ->
+/// {DW_OP_constu, Const1 [+, *] Const2, DW_OP_[plus, mul], DW_OP_LLVM_arg,
+/// Arg1, DW_OP_[plus, mul]}
+static bool tryFoldCommutativeMathWithArgInBetween(
+    uint64_t Const1, ArrayRef<DIExpression::ExprOperand> Ops, uint64_t &Loc,
+    DIExpressionCursor &Cursor, SmallVectorImpl<uint64_t> &WorkingOps) {
+
+  auto Const2 = isConstantVal(Ops[4]);
+  auto Operand1 = static_cast<dwarf::LocationAtom>(Ops[1].getOp());
+  auto Operand2 = static_cast<dwarf::LocationAtom>(Ops[3].getOp());
+  auto Operand3 = static_cast<dwarf::LocationAtom>(Ops[5].getOp());
+
+  if (!Const2 || Ops[2].getOp() != dwarf::DW_OP_LLVM_arg ||
+      !operationsAreFoldableAndCommutative(Operand1, Operand2) ||
+      !operationsAreFoldableAndCommutative(Operand2, Operand3))
+    return false;
+
+  auto Result = foldOperationIfPossible(Const1, *Const2, Operand1);
+  if (!Result) {
+    consumeOneOperator(Cursor, Loc, Ops[0]);
+    return true;
+  }
+  WorkingOps.erase(WorkingOps.begin() + Loc + 6, WorkingOps.begin() + Loc + 9);
+  WorkingOps[Loc] = dwarf::DW_OP_constu;
+  WorkingOps[Loc + 1] = *Result;
+  startFromBeginning(Loc, Cursor, WorkingOps);
+  return true;
+}
+
+DIExpression *DIExpression::foldConstantMath() {
+
+  SmallVector<uint64_t, 8> WorkingOps(Elements.begin(), Elements.end());
+  uint64_t Loc = 0;
+  SmallVector<uint64_t> ResultOps = canonicalizeDwarfOperations(WorkingOps);
+  DIExpressionCursor Cursor(ResultOps);
+  SmallVector<DIExpression::ExprOperand, 8> Ops;
+
+  // Iterate over all Operations in a DIExpression to match the smallest pattern
+  // that can be folded.
+  while (Loc < ResultOps.size()) {
+    Ops.clear();
+
+    auto Op = Cursor.peek();
+    // Expression has no operations, exit.
+    if (!Op)
+      break;
+
+    auto Const1 = isConstantVal(*Op);
+
+    if (!Const1) {
+      // Early exit, all of the following patterns start with a constant value.
+      consumeOneOperator(Cursor, Loc, *Op);
+      continue;
+    }
+
+    Ops.push_back(*Op);
+
+    Op = Cursor.peekNext();
+    // All following patterns require at least 2 Operations, exit.
+    if (!Op)
+      break;
+
+    Ops.push_back(*Op);
+
+    // Try to fold a constant no-op, such as {+ 0}
+    if (tryFoldNoOpMath(*Const1, Ops, Loc, Cursor, ResultOps))
+      continue;
+
+    Op = Cursor.peekNextN(2);
+    // Op[1] could still match a pattern, skip iteration.
+    if (!Op) {
+      consumeOneOperator(Cursor, Loc, Ops[0]);
+      continue;
+    }
+
+    Ops.push_back(*Op);
+
+    // Try to fold a pattern of two constants such as {C1 + C2}.
+    if (tryFoldConstants(*Const1, Ops, Loc, Cursor, ResultOps))
+      continue;
+
+    Op = Cursor.peekNextN(3);
+    // Op[1] and Op[2] could still match a pattern, skip iteration.
+    if (!Op) {
+      consumeOneOperator(Cursor, Loc, Ops[0]);
+      continue;
+    }
+
+    Ops.push_back(*Op);
+
+    // Try to fold commutative constant math, such as {C1 + C2 +}.
+    if (tryFoldCommutativeMath(*Const1, Ops, Loc, Cursor, ResultOps))
+      continue;
+
+    Op = Cursor.peekNextN(4);
+    if (!Op) {
+      consumeOneOperator(Cursor, Loc, Ops[0]);
+      continue;
+    }
+
+    Ops.push_back(*Op);
+    Op = Cursor.peekNextN(5);
+    if (!Op) {
+      consumeOneOperator(Cursor, Loc, Ops[0]);
+      continue;
+    }
+
+    Ops.push_back(*Op);
+
+    // Try to fold commutative constant math with an LLVM_Arg in between, such
+    // as {C1 + Arg + C2 +}.
+    if (tryFoldCommutativeMathWithArgInBetween(*Const1, Ops, Loc, Cursor,
+                                               ResultOps))
+      continue;
+
+    consumeOneOperator(Cursor, Loc, Ops[0]);
+  }
+  ResultOps = optimizeDwarfOperations(ResultOps);
+  auto *Result = DIExpression::get(getContext(), ResultOps);
+  assert(Result->isValid() && "concatenated expression is not valid");
+  return Result;
+}
diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp
index 570515505607fb..229ee2bd0164c6 100644
--- a/llvm/lib/IR/DebugInfoMetadata.cpp
+++ b/llvm/lib/IR/DebugInfoMetadata.cpp
@@ -1880,7 +1880,6 @@ DIExpression *DIExpression::append(const DIExpression *Expr,
     }
     Op.appendToVector(NewOps);
   }
-
   NewOps.append(Ops.begin(), Ops.end());
   auto *result = DIExpression::get(Expr->getContext(), NewOps);
   assert(result->isValid() && "concatenated expression is not valid");
diff --git a/llvm/unittests/IR/MetadataTest.cpp b/llvm/unittests/IR/MetadataTest.cpp
index 4c2e5f77a54038..9647ac8c439666 100644
--- a/llvm/unittests/IR/MetadataTest.cpp
+++ b/llvm/unittests/IR/MetadataTest.cpp
@@ -3153,6 +3153,463 @@ TEST_F(DIExpressionTest, get) {
   EXPECT_EQ(N0WithPrependedOps, N2);
 }
 
+TEST_F(DIExpressionTest, Fold) {
+
+  // Remove a No-op DW_OP_plus_uconst from an expression.
+  SmallVector<uint64_t, 8> Ops = {dwarf::DW_OP_plus_uconst, 0};
+  auto *Expr = DIExpression::get(Context, Ops);
+  auto *E = Expr->foldConstantMath();
+  SmallVector<uint64_t, 8> ResOps;
+  auto *EmptyExpr = DIExpression::get(Context, ResOps);
+  EXPECT_EQ(E, EmptyExpr);
+
+  // Remove a No-op add from an expression.
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(0);
+  Ops.push_back(dwarf::DW_OP_plus);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  EXPECT_EQ(E, EmptyExpr);
+
+  // Remove a No-op subtract from an expression.
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(0);
+  Ops.push_back(dwarf::DW_OP_minus);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  EXPECT_EQ(E, EmptyExpr);
+
+  // Remove a No-op shift left from an expression.
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(0);
+  Ops.push_back(dwarf::DW_OP_shl);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  EXPECT_EQ(E, EmptyExpr);
+
+  // Remove a No-op shift right from an expression.
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(0);
+  Ops.push_back(dwarf::DW_OP_shr);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  EXPECT_EQ(E, EmptyExpr);
+
+  // Remove a No-op multiply from an expression.
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(1);
+  Ops.push_back(dwarf::DW_OP_mul);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  EXPECT_EQ(E, EmptyExpr);
+
+  // Remove a No-op divide from an expression.
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(1);
+  Ops.push_back(dwarf::DW_OP_div);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  EXPECT_EQ(E, EmptyExpr);
+
+  // Test fold {DW_OP_plus_uconst, Const1, DW_OP_plus_uconst, Const2} ->
+  // {DW_OP_plus_uconst, Const1 + Const2}
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_plus_uconst);
+  Ops.push_back(2);
+  Ops.push_back(dwarf::DW_OP_plus_uconst);
+  Ops.push_back(3);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  ResOps.push_back(dwarf::DW_OP_plus_uconst);
+  ResOps.push_back(5);
+  auto *ResExpr = DIExpression::get(Context, ResOps);
+  EXPECT_EQ(E, ResExpr);
+
+  // Test {DW_OP_constu, Const1, DW_OP_plus_uconst, Const2} -> {DW_OP_constu,
+  // Const1 + Const2}
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(2);
+  Ops.push_back(dwarf::DW_OP_plus_uconst);
+  Ops.push_back(3);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  ResOps.clear();
+  ResOps.push_back(dwarf::DW_OP_constu);
+  ResOps.push_back(5);
+  ResExpr = DIExpression::get(Context, ResOps);
+  EXPECT_EQ(E, ResExpr);
+
+  // Test {DW_OP_constu, Const1, DW_OP_constu, Const2, DW_OP_plus} ->
+  // {DW_OP_constu, Const1 + Const2}
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(8);
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(2);
+  Ops.push_back(dwarf::DW_OP_plus);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  ResOps.clear();
+  ResOps.push_back(dwarf::DW_OP_constu);
+  ResOps.push_back(10);
+  ResExpr = DIExpression::get(Context, ResOps);
+  EXPECT_EQ(E, ResExpr);
+
+  // Test {DW_OP_constu, Const1, DW_OP_constu, Const2, DW_OP_minus} ->
+  // {DW_OP_constu, Const1 - Const2}
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(8);
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(2);
+  Ops.push_back(dwarf::DW_OP_minus);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  ResOps.clear();
+  ResOps.push_back(dwarf::DW_OP_constu);
+  ResOps.push_back(6);
+  ResExpr = DIExpression::get(Context, ResOps);
+  EXPECT_EQ(E, ResExpr);
+
+  // Test {DW_OP_constu, Const1, DW_OP_constu, Const2, DW_OP_mul} ->
+  // {DW_OP_constu, Const1 * Const2}
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(8);
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(2);
+  Ops.push_back(dwarf::DW_OP_mul);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  ResOps.clear();
+  ResOps.push_back(dwarf::DW_OP_constu);
+  ResOps.push_back(16);
+  ResExpr = DIExpression::get(Context, ResOps);
+  EXPECT_EQ(E, ResExpr);
+
+  // Test {DW_OP_constu, Const1, DW_OP_constu, Const2, DW_OP_div} ->
+  // {DW_OP_constu, Const1 / Const2}
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(8);
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(2);
+  Ops.push_back(dwarf::DW_OP_div);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  ResOps.clear();
+  ResOps.push_back(dwarf::DW_OP_constu);
+  ResOps.push_back(4);
+  ResExpr = DIExpression::get(Context, ResOps);
+  EXPECT_EQ(E, ResExpr);
+
+  // Test {DW_OP_constu, Const1, DW_OP_constu, Const2, DW_OP_shl} ->
+  // {DW_OP_constu, Const1 << Const2}
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(8);
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(2);
+  Ops.push_back(dwarf::DW_OP_shl);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  ResOps.clear();
+  ResOps.push_back(dwarf::DW_OP_constu);
+  ResOps.push_back(32);
+  ResExpr = DIExpression::get(Context, ResOps);
+  EXPECT_EQ(E, ResExpr);
+
+  // Test {DW_OP_constu, Const1, DW_OP_constu, Const2, DW_OP_shr} ->
+  // {DW_OP_constu, Const1 >> Const2}
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(8);
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(2);
+  Ops.push_back(dwarf::DW_OP_shr);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  ResOps.clear();
+  ResOps.push_back(dwarf::DW_OP_constu);
+  ResOps.push_back(2);
+  ResExpr = DIExpression::get(Context, ResOps);
+  EXPECT_EQ(E, ResExpr);
+
+  // Test {DW_OP_plus_uconst, Const1, DW_OP_constu, Const2, DW_OP_plus} ->
+  // {DW_OP_plus_uconst, Const1 + Const2}
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_plus_uconst);
+  Ops.push_back(8);
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(2);
+  Ops.push_back(dwarf::DW_OP_plus);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  ResOps.clear();
+  ResOps.push_back(dwarf::DW_OP_plus_uconst);
+  ResOps.push_back(10);
+  ResExpr = DIExpression::get(Context, ResOps);
+  EXPECT_EQ(E, ResExpr);
+
+  // Test {DW_OP_constu, Const1, DW_OP_plus, DW_OP_plus_uconst, Const2} ->
+  // {DW_OP_plus_uconst, Const1 + Const2}
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(8);
+  Ops.push_back(dwarf::DW_OP_plus);
+  Ops.push_back(dwarf::DW_OP_plus_uconst);
+  Ops.push_back(2);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  ResOps.clear();
+  ResOps.push_back(dwarf::DW_OP_plus_uconst);
+  ResOps.push_back(10);
+  ResExpr = DIExpression::get(Context, ResOps);
+  EXPECT_EQ(E, ResExpr);
+
+  // Test {DW_OP_constu, Const1, DW_OP_plus, DW_OP_constu, Const2, DW_OP_plus}
+  // -> {DW_OP_plus_uconst, Const1 + Const2}
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(8);
+  Ops.push_back(dwarf::DW_OP_plus);
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(2);
+  Ops.push_back(dwarf::DW_OP_plus);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  ResOps.clear();
+  ResOps.push_back(dwarf::DW_OP_plus_uconst);
+  ResOps.push_back(10);
+  ResExpr = DIExpression::get(Context, ResOps);
+  EXPECT_EQ(E, ResExpr);
+
+  // Test {DW_OP_constu, Const1, DW_OP_mul, DW_OP_constu, Const2, DW_OP_mul} ->
+  // {DW_OP_constu, Const1 * Const2, DW_OP_mul}
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(8);
+  Ops.push_back(dwarf::DW_OP_mul);
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(2);
+  Ops.push_back(dwarf::DW_OP_mul);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  ResOps.clear();
+  ResOps.push_back(dwarf::DW_OP_constu);
+  ResOps.push_back(16);
+  ResOps.push_back(dwarf::DW_OP_mul);
+  ResExpr = DIExpression::get(Context, ResOps);
+  EXPECT_EQ(E, ResExpr);
+
+  // Test {DW_OP_plus_uconst, Const1, DW_OP_LLVM_arg, Arg, DW_OP_plus,
+  // DW_OP_constu, Const2, DW_OP_plus} -> {DW_OP_plus_uconst, Const1 + Const2,
+  // DW_OP_LLVM_arg, Arg, DW_OP_plus}
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_plus_uconst);
+  Ops.push_back(8);
+  Ops.push_back(dwarf::DW_OP_LLVM_arg);
+  Ops.push_back(0);
+  Ops.push_back(dwarf::DW_OP_plus);
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(2);
+  Ops.push_back(dwarf::DW_OP_plus);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  ResOps.clear();
+  ResOps.push_back(dwarf::DW_OP_plus_uconst);
+  ResOps.push_back(10);
+  ResOps.push_back(dwarf::DW_OP_LLVM_arg);
+  ResOps.push_back(0);
+  ResOps.push_back(dwarf::DW_OP_plus);
+  ResExpr = DIExpression::get(Context, ResOps);
+  EXPECT_EQ(E, ResExpr);
+
+  // Test {DW_OP_constu, Const1, DW_OP_plus, DW_OP_LLVM_arg, Arg, DW_OP_plus,
+  // DW_OP_plus_uconst, Const2} -> {DW_OP_plus_uconst, Const1 + Const2,
+  // DW_OP_LLVM_arg, Arg, DW_OP_plus}
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(8);
+  Ops.push_back(dwarf::DW_OP_plus);
+  Ops.push_back(dwarf::DW_OP_LLVM_arg);
+  Ops.push_back(0);
+  Ops.push_back(dwarf::DW_OP_plus);
+  Ops.push_back(dwarf::DW_OP_plus_uconst);
+  Ops.push_back(2);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  ResOps.clear();
+  ResOps.push_back(dwarf::DW_OP_plus_uconst);
+  ResOps.push_back(10);
+  ResOps.push_back(dwarf::DW_OP_LLVM_arg);
+  ResOps.push_back(0);
+  ResOps.push_back(dwarf::DW_OP_plus);
+  ResExpr = DIExpression::get(Context, ResOps);
+  EXPECT_EQ(E, ResExpr);
+
+  // Test {DW_OP_constu, Const1, DW_OP_plus, DW_OP_LLVM_arg, Arg, DW_OP_plus,
+  // DW_OP_constu, Const2, DW_OP_plus} -> {DW_OP_plus_uconst, Const1 + Const2,
+  // DW_OP_LLVM_arg, Arg, DW_OP_plus}
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(8);
+  Ops.push_back(dwarf::DW_OP_plus);
+  Ops.push_back(dwarf::DW_OP_LLVM_arg);
+  Ops.push_back(0);
+  Ops.push_back(dwarf::DW_OP_plus);
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(2);
+  Ops.push_back(dwarf::DW_OP_plus);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  ResOps.clear();
+  ResOps.push_back(dwarf::DW_OP_plus_uconst);
+  ResOps.push_back(10);
+  ResOps.push_back(dwarf::DW_OP_LLVM_arg);
+  ResOps.push_back(0);
+  ResOps.push_back(dwarf::DW_OP_plus);
+  ResExpr = DIExpression::get(Context, ResOps);
+  EXPECT_EQ(E, ResExpr);
+
+  // Test {DW_OP_constu, Const1, DW_OP_mul, DW_OP_LLVM_arg, Arg, DW_OP_mul,
+  // DW_OP_constu, Const2, DW_OP_mul} -> {DW_OP_constu, Const1 * Const2,
+  // DW_OP_mul, DW_OP_LLVM_arg, Arg, DW_OP_mul}
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(8);
+  Ops.push_back(dwarf::DW_OP_mul);
+  Ops.push_back(dwarf::DW_OP_LLVM_arg);
+  Ops.push_back(0);
+  Ops.push_back(dwarf::DW_OP_mul);
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(2);
+  Ops.push_back(dwarf::DW_OP_mul);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  ResOps.clear();
+  ResOps.push_back(dwarf::DW_OP_constu);
+  ResOps.push_back(16);
+  ResOps.push_back(dwarf::DW_OP_mul);
+  ResOps.push_back(dwarf::DW_OP_LLVM_arg);
+  ResOps.push_back(0);
+  ResOps.push_back(dwarf::DW_OP_mul);
+  ResExpr = DIExpression::get(Context, ResOps);
+  EXPECT_EQ(E, ResExpr);
+
+  // Test an overflow addition.
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_plus_uconst);
+  Ops.push_back(UINT64_MAX);
+  Ops.push_back(dwarf::DW_OP_plus_uconst);
+  Ops.push_back(2);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  ResOps.clear();
+  ResOps.push_back(dwarf::DW_OP_plus_uconst);
+  ResOps.push_back(UINT64_MAX);
+  ResOps.push_back(dwarf::DW_OP_plus_uconst);
+  ResOps.push_back(2);
+  ResExpr = DIExpression::get(Context, ResOps);
+  EXPECT_EQ(E, ResExpr);
+
+  // Test an underflow subtraction.
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(1);
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(2);
+  Ops.push_back(dwarf::DW_OP_minus);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  ResOps.clear();
+  ResOps.push_back(dwarf::DW_OP_constu);
+  ResOps.push_back(1);
+  ResOps.push_back(dwarf::DW_OP_constu);
+  ResOps.push_back(2);
+  ResOps.push_back(dwarf::DW_OP_minus);
+  ResExpr = DIExpression::get(Context, ResOps);
+  EXPECT_EQ(E, ResExpr);
+
+  // Test a left shift greater than 64.
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(1);
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(65);
+  Ops.push_back(dwarf::DW_OP_shl);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  ResOps.clear();
+  ResOps.push_back(dwarf::DW_OP_constu);
+  ResOps.push_back(1);
+  ResOps.push_back(dwarf::DW_OP_constu);
+  ResOps.push_back(65);
+  ResOps.push_back(dwarf::DW_OP_shl);
+  ResExpr = DIExpression::get(Context, ResOps);
+  EXPECT_EQ(E, ResExpr);
+
+  // Test a right shift greater than 64.
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(1);
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(65);
+  Ops.push_back(dwarf::DW_OP_shr);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  ResOps.clear();
+  ResOps.push_back(dwarf::DW_OP_constu);
+  ResOps.push_back(1);
+  ResOps.push_back(dwarf::DW_OP_constu);
+  ResOps.push_back(65);
+  ResOps.push_back(dwarf::DW_OP_shr);
+  ResExpr = DIExpression::get(Context, ResOps);
+  EXPECT_EQ(E, ResExpr);
+
+  // Test an overflow multiplication.
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(UINT64_MAX);
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(2);
+  Ops.push_back(dwarf::DW_OP_mul);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  ResOps.clear();
+  ResOps.push_back(dwarf::DW_OP_constu);
+  ResOps.push_back(UINT64_MAX);
+  ResOps.push_back(dwarf::DW_OP_constu);
+  ResOps.push_back(2);
+  ResOps.push_back(dwarf::DW_OP_mul);
+  ResExpr = DIExpression::get(Context, ResOps);
+  EXPECT_EQ(E, ResExpr);
+
+  // Test a divide by 0: not folded, but {DW_OP_constu, 0} becomes DW_OP_lit0.
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(2);
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(0);
+  Ops.push_back(dwarf::DW_OP_div);
+  Expr = DIExpression::get(Context, Ops);
+  E = Expr->foldConstantMath();
+  ResOps.clear();
+  ResOps.push_back(dwarf::DW_OP_constu);
+  ResOps.push_back(2);
+  ResOps.push_back(dwarf::DW_OP_lit0);
+  ResOps.push_back(dwarf::DW_OP_div);
+  ResExpr = DIExpression::get(Context, ResOps);
+  EXPECT_EQ(E, ResExpr);
+}
+
 TEST_F(DIExpressionTest, isValid) {
 #define EXPECT_VALID(...)                                                      \
   do {                                                                         \

From fb607c9019dcfb3bec4c4ab84b67112b52fbe083 Mon Sep 17 00:00:00 2001
From: Florian Mayer <fmayer@google.com>
Date: Wed, 29 May 2024 16:16:04 -0700
Subject: [PATCH 199/230] [MTE] add tests for stack tagging debug info (#93743)

These are equivalent to the tests in HWASan of the same name.
---
 .../stack-tagging-dbg-assign-tag-offset.ll    | 60 +++++++++++++++++++
 .../stack-tagging-dbg-declare-tag-offset.ll   | 55 +++++++++++++++++
 ...tack-tagging-dbg-value-tag-offset-nopad.ll | 59 ++++++++++++++++++
 .../stack-tagging-dbg-value-tag-offset.ll     | 59 ++++++++++++++++++
 4 files changed, 233 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/stack-tagging-dbg-assign-tag-offset.ll
 create mode 100644 llvm/test/CodeGen/AArch64/stack-tagging-dbg-declare-tag-offset.ll
 create mode 100644 llvm/test/CodeGen/AArch64/stack-tagging-dbg-value-tag-offset-nopad.ll
 create mode 100644 llvm/test/CodeGen/AArch64/stack-tagging-dbg-value-tag-offset.ll

diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-dbg-assign-tag-offset.ll b/llvm/test/CodeGen/AArch64/stack-tagging-dbg-assign-tag-offset.ll
new file mode 100644
index 00000000000000..4845bd9936e6b8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-tagging-dbg-assign-tag-offset.ll
@@ -0,0 +1,60 @@
+; RUN: opt -aarch64-stack-tagging -stack-tagging-record-stack-history=instr -S -o - %s | FileCheck %s
+; RUN: opt --try-experimental-debuginfo-iterators  -aarch64-stack-tagging -stack-tagging-record-stack-history=instr -S -o - %s | FileCheck %s
+
+source_filename = "test.ll"
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android10000"
+
+declare void @g(ptr, ptr, ptr, ptr, ptr, ptr)
+
+; Function Attrs: sanitize_memtag
+define void @f() #0 !dbg !7 {
+entry:
+  %nodebug0 = alloca ptr, align 8
+  %nodebug1 = alloca ptr, align 8
+  %nodebug2 = alloca ptr, align 8
+  %nodebug3 = alloca ptr, align 8
+  ; CHECK: %a = alloca{{.*}} !DIAssignID ![[ID1:[0-9]+]]
+  %a = alloca ptr, align 8, !DIAssignID !13
+  ; CHECK: @llvm.dbg.assign{{.*}} metadata ![[ID1]]{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 4)
+  call void @llvm.dbg.assign(metadata i1 undef, metadata !14, metadata !DIExpression(), metadata !13, metadata ptr %a, metadata !DIExpression()), !dbg !15
+  ; CHECK: %b = alloca{{.*}} !DIAssignID ![[ID2:[0-9]+]]
+  %b = alloca ptr, align 8, !DIAssignID !16
+  ; CHECK: @llvm.dbg.assign{{.*}} metadata ![[ID2]]{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 5)
+  call void @llvm.dbg.assign(metadata i1 undef, metadata !17, metadata !DIExpression(), metadata !16, metadata ptr %b, metadata !DIExpression()), !dbg !15
+  call void @g(ptr %nodebug0, ptr %nodebug1, ptr %nodebug2, ptr %nodebug3, ptr %a, ptr %b)
+  ret void, !dbg !18
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) #1
+
+attributes #0 = { sanitize_memtag }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "x.c", directory: "/")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 7, !"debug-info-assignment-tracking", i1 true}
+!6 = !{!"clang"}
+!7 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null, !10}
+!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64)
+!11 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !12)
+!12 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char)
+!13 = distinct !DIAssignID()
+!14 = !DILocalVariable(name: "a", scope: !7, file: !1, line: 1, type: !10)
+!15 = !DILocation(line: 0, scope: !7)
+!16 = distinct !DIAssignID()
+!17 = !DILocalVariable(name: "b", scope: !7, file: !1, line: 1, type: !10)
+!18 = !DILocation(line: 1, column: 37, scope: !7)
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-dbg-declare-tag-offset.ll b/llvm/test/CodeGen/AArch64/stack-tagging-dbg-declare-tag-offset.ll
new file mode 100644
index 00000000000000..0655eaee34e91f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-tagging-dbg-declare-tag-offset.ll
@@ -0,0 +1,55 @@
+; RUN: opt -aarch64-stack-tagging -stack-tagging-record-stack-history=instr -S -o - %s | FileCheck %s
+
+;; Also test with RemoveDIs to verify that debug intrinsics immediately
+;; preceding an alloca (or other instruction of interest to stack tagging) will
+;; be correctly processed.
+; RUN: opt --try-experimental-debuginfo-iterators -aarch64-stack-tagging -stack-tagging-record-stack-history=instr -S -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android10000"
+
+declare void @g(ptr, ptr, ptr, ptr, ptr, ptr)
+
+define void @f() sanitize_memtag !dbg !6 {
+entry:
+  %nodebug0 = alloca ptr
+  %nodebug1 = alloca ptr
+  %nodebug2 = alloca ptr
+  %nodebug3 = alloca ptr
+  %a = alloca ptr
+  ; CHECK: @llvm.dbg.declare{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 4)
+  call void @llvm.dbg.declare(metadata ptr %a, metadata !12, metadata !DIExpression()), !dbg !14
+  ; CHECK: @llvm.dbg.declare{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 4)
+  call void @llvm.dbg.declare(metadata ptr %a, metadata !12, metadata !DIExpression()), !dbg !14
+  %b = alloca ptr
+  ; CHECK: @llvm.dbg.declare{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 5)
+  call void @llvm.dbg.declare(metadata ptr %b, metadata !13, metadata !DIExpression()), !dbg !14
+  ; CHECK: @llvm.dbg.declare{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 5)
+  call void @llvm.dbg.declare(metadata ptr %b, metadata !13, metadata !DIExpression()), !dbg !14
+  call void @g(ptr %nodebug0, ptr %nodebug1, ptr %nodebug2, ptr %nodebug3, ptr %a, ptr %b)
+  ret void, !dbg !15
+}
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "x.c", directory: "/")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{!"clang"}
+!6 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags:
+DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!7 = !DISubroutineType(types: !8)
+!8 = !{null, !9}
+!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64)
+!10 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !11)
+!11 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char)
+!12 = !DILocalVariable(name: "a", scope: !6, file: !1, line: 1, type: !9)
+!13 = !DILocalVariable(name: "b", scope: !6, file: !1, line: 1, type: !9)
+!14 = !DILocation(line: 1, column: 29, scope: !6)
+!15 = !DILocation(line: 1, column: 37, scope: !6)
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-dbg-value-tag-offset-nopad.ll b/llvm/test/CodeGen/AArch64/stack-tagging-dbg-value-tag-offset-nopad.ll
new file mode 100644
index 00000000000000..470018759af63f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-tagging-dbg-value-tag-offset-nopad.ll
@@ -0,0 +1,59 @@
+; RUN: opt -aarch64-stack-tagging -stack-tagging-record-stack-history=instr  -S -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-android10000"
+
+define dso_local void @f() sanitize_memtag !dbg !14 {
+  %a1 = alloca i128, align 4
+  %a2 = alloca i128, align 4
+; CHECK: call void @llvm.dbg.value(metadata i128 1, {{.*}}, metadata !DIExpression())
+  call void @llvm.dbg.value(metadata i128 1, metadata !20, metadata !DIExpression()), !dbg !22
+  store i128 1, ptr %a2, align 4, !dbg !23, !tbaa !24
+; CHECK: call void @llvm.dbg.value(metadata ptr %a1, {{.*}}, metadata !DIExpression(DW_OP_LLVM_tag_offset, 0, DW_OP_deref))
+  call void @llvm.dbg.value(metadata ptr %a1, metadata !18, metadata !DIExpression(DW_OP_deref)), !dbg !22
+  call void @use(ptr nonnull %a1), !dbg !28
+; CHECK: call void @llvm.dbg.value(metadata ptr %a2, {{.*}}, metadata !DIExpression(DW_OP_LLVM_tag_offset, 1, DW_OP_deref))
+  call void @llvm.dbg.value(metadata ptr %a2, metadata !20, metadata !DIExpression(DW_OP_deref)), !dbg !22
+  call void @use(ptr nonnull %a2), !dbg !29
+  ret void, !dbg !30
+}
+
+declare !dbg !5 void @use(ptr)
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8, !9, !10, !11, !12}
+!llvm.ident = !{!13}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 10.0.0 (git@github.com:llvm/llvm-project.git 5560dd08b99a0e8b0c55116376624e4f967caec5)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, nameTableKind: None)
+!1 = !DIFile(filename: "dbg.cc", directory: "/tmp")
+!2 = !{}
+!3 = !{!4, !5}
+!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64)
+!5 = !DISubprogram(name: "use", scope: !1, file: !1, line: 2, type: !6, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2)
+!6 = !DISubroutineType(types: !7)
+!7 = !{null, !4}
+!8 = !{i32 7, !"Dwarf Version", i32 4}
+!9 = !{i32 2, !"Debug Info Version", i32 3}
+!10 = !{i32 1, !"wchar_size", i32 4}
+!11 = !{i32 7, !"PIC Level", i32 2}
+!12 = !{i32 7, !"PIE Level", i32 2}
+!13 = !{!"clang version 10.0.0 (git@github.com:llvm/llvm-project.git 5560dd08b99a0e8b0c55116376624e4f967caec5)"}
+!14 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !15, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17)
+!15 = !DISubroutineType(types: !16)
+!16 = !{null}
+!17 = !{!18, !20}
+!18 = !DILocalVariable(name: "x", scope: !14, file: !1, line: 5, type: !19)
+!19 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!20 = !DILocalVariable(name: "y", scope: !14, file: !1, line: 5, type: !19)
+!21 = !DILocation(line: 5, column: 3, scope: !14)
+!22 = !DILocation(line: 0, scope: !14)
+!23 = !DILocation(line: 5, column: 10, scope: !14)
+!24 = !{!25, !25, i64 0}
+!25 = !{!"int", !26, i64 0}
+!26 = !{!"omnipotent char", !27, i64 0}
+!27 = !{!"Simple C++ TBAA"}
+!28 = !DILocation(line: 6, column: 3, scope: !14)
+!29 = !DILocation(line: 7, column: 3, scope: !14)
+!30 = !DILocation(line: 8, column: 1, scope: !14)
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-dbg-value-tag-offset.ll b/llvm/test/CodeGen/AArch64/stack-tagging-dbg-value-tag-offset.ll
new file mode 100644
index 00000000000000..6b3e34c265a1ee
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-tagging-dbg-value-tag-offset.ll
@@ -0,0 +1,59 @@
+; RUN: opt -aarch64-stack-tagging -stack-tagging-record-stack-history=instr -S -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-android10000"
+
+define dso_local void @f() sanitize_memtag !dbg !14 {
+  %a1 = alloca i32, align 4
+  %a2 = alloca i32, align 4
+; CHECK: call void @llvm.dbg.value(metadata i32 1, {{.*}}, metadata !DIExpression())
+  call void @llvm.dbg.value(metadata i32 1, metadata !20, metadata !DIExpression()), !dbg !22
+  store i32 1, ptr %a2, align 4, !dbg !23, !tbaa !24
+; CHECK: call void @llvm.dbg.value(metadata ptr %a1, {{.*}} metadata !DIExpression(DW_OP_LLVM_tag_offset, 0, DW_OP_deref))
+  call void @llvm.dbg.value(metadata ptr %a1, metadata !18, metadata !DIExpression(DW_OP_deref)), !dbg !22
+  call void @use(ptr nonnull %a1), !dbg !28
+; CHECK: call void @llvm.dbg.value(metadata ptr %a2, {{.*}} metadata !DIExpression(DW_OP_LLVM_tag_offset, 1, DW_OP_deref))
+  call void @llvm.dbg.value(metadata ptr %a2, metadata !20, metadata !DIExpression(DW_OP_deref)), !dbg !22
+  call void @use(ptr nonnull %a2), !dbg !29
+  ret void, !dbg !30
+}
+
+declare !dbg !5 void @use(ptr)
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8, !9, !10, !11, !12}
+!llvm.ident = !{!13}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 10.0.0 (git@github.com:llvm/llvm-project.git 5560dd08b99a0e8b0c55116376624e4f967caec5)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, nameTableKind: None)
+!1 = !DIFile(filename: "dbg.cc", directory: "/tmp")
+!2 = !{}
+!3 = !{!4, !5}
+!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64)
+!5 = !DISubprogram(name: "use", scope: !1, file: !1, line: 2, type: !6, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2)
+!6 = !DISubroutineType(types: !7)
+!7 = !{null, !4}
+!8 = !{i32 7, !"Dwarf Version", i32 4}
+!9 = !{i32 2, !"Debug Info Version", i32 3}
+!10 = !{i32 1, !"wchar_size", i32 4}
+!11 = !{i32 7, !"PIC Level", i32 2}
+!12 = !{i32 7, !"PIE Level", i32 2}
+!13 = !{!"clang version 10.0.0 (git@github.com:llvm/llvm-project.git 5560dd08b99a0e8b0c55116376624e4f967caec5)"}
+!14 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !15, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17)
+!15 = !DISubroutineType(types: !16)
+!16 = !{null}
+!17 = !{!18, !20}
+!18 = !DILocalVariable(name: "x", scope: !14, file: !1, line: 5, type: !19)
+!19 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!20 = !DILocalVariable(name: "y", scope: !14, file: !1, line: 5, type: !19)
+!21 = !DILocation(line: 5, column: 3, scope: !14)
+!22 = !DILocation(line: 0, scope: !14)
+!23 = !DILocation(line: 5, column: 10, scope: !14)
+!24 = !{!25, !25, i64 0}
+!25 = !{!"int", !26, i64 0}
+!26 = !{!"omnipotent char", !27, i64 0}
+!27 = !{!"Simple C++ TBAA"}
+!28 = !DILocation(line: 6, column: 3, scope: !14)
+!29 = !DILocation(line: 7, column: 3, scope: !14)
+!30 = !DILocation(line: 8, column: 1, scope: !14)

From 69969c725b0987a12a27a8ac787f06be672a4d09 Mon Sep 17 00:00:00 2001
From: Shubham Sandeep Rastogi <srastogi22@apple.com>
Date: Wed, 29 May 2024 16:19:53 -0700
Subject: [PATCH 200/230] Use DIExpression::foldConstantMath() at the result of
 an append() (#71719)

This patch calls `DIExpression::foldConstantMath()` on the result of
`DIExpression::append()`, which should help reduce the size of
DIExpressions that grow because of salvaging debug info.

This is part of a stack of patches and comes after:
https://github.com/llvm/llvm-project/pull/69768
https://github.com/llvm/llvm-project/pull/71717
https://github.com/llvm/llvm-project/pull/71718
---
 llvm/lib/IR/DebugInfoMetadata.cpp             |   3 +-
 llvm/test/Bitcode/upgrade-dbg-addr.ll         |   2 +-
 .../MIR/AArch64/dbgcall-site-expr-chain.mir   |   4 +-
 llvm/unittests/IR/MetadataTest.cpp            | 138 ++++++++++++++++++
 4 files changed, 143 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp
index 229ee2bd0164c6..9bd1d7880c9f81 100644
--- a/llvm/lib/IR/DebugInfoMetadata.cpp
+++ b/llvm/lib/IR/DebugInfoMetadata.cpp
@@ -1881,7 +1881,8 @@ DIExpression *DIExpression::append(const DIExpression *Expr,
     Op.appendToVector(NewOps);
   }
   NewOps.append(Ops.begin(), Ops.end());
-  auto *result = DIExpression::get(Expr->getContext(), NewOps);
+  auto *result =
+      DIExpression::get(Expr->getContext(), NewOps)->foldConstantMath();
   assert(result->isValid() && "concatenated expression is not valid");
   return result;
 }
diff --git a/llvm/test/Bitcode/upgrade-dbg-addr.ll b/llvm/test/Bitcode/upgrade-dbg-addr.ll
index 06a411c2c83486..de35609713f933 100644
--- a/llvm/test/Bitcode/upgrade-dbg-addr.ll
+++ b/llvm/test/Bitcode/upgrade-dbg-addr.ll
@@ -9,7 +9,7 @@ entry:
   %num.addr = alloca i32, align 4
   store i32 %num, ptr %num.addr, align 4
   ; CHECK-NOT: call void @llvm.dbg.addr
-  ; CHECK: call void @llvm.dbg.value(metadata ptr %num.addr, metadata ![[#]], metadata !DIExpression(DW_OP_plus_uconst, 0, DW_OP_deref))
+  ; CHECK: call void @llvm.dbg.value(metadata ptr %num.addr, metadata ![[#]], metadata !DIExpression(DW_OP_deref))
   call void @llvm.dbg.addr(metadata ptr %num.addr, metadata !16, metadata !DIExpression(DW_OP_plus_uconst, 0)), !dbg !17
   %0 = load i32, ptr %num.addr, align 4
   ret i32 %0
diff --git a/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-expr-chain.mir b/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-expr-chain.mir
index cb3e780664404c..02f4ce1200ca17 100644
--- a/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-expr-chain.mir
+++ b/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-expr-chain.mir
@@ -105,7 +105,7 @@ body:             |
 
 # CHECK: DW_TAG_GNU_call_site_parameter
 # CHECK-NEXT: DW_AT_location (DW_OP_reg2 W2)
-# CHECK-NEXT: DW_AT_GNU_call_site_value (DW_OP_breg19 W19+700, DW_OP_plus_uconst 0x9, DW_OP_plus_uconst 0x50)
+# CHECK-NEXT: DW_AT_GNU_call_site_value (DW_OP_breg19 W19+789)
 
 # CHECK: DW_TAG_GNU_call_site_parameter
 # CHECK-NEXT: DW_AT_location (DW_OP_reg1 W1)
@@ -113,4 +113,4 @@ body:             |
 
 # CHECK: DW_TAG_GNU_call_site_parameter
 # CHECK-NEXT: DW_AT_location (DW_OP_reg0 W0)
-# CHECK-NEXT: DW_AT_GNU_call_site_value (DW_OP_breg19 W19+100, DW_OP_plus_uconst 0x17)
+# CHECK-NEXT: DW_AT_GNU_call_site_value (DW_OP_breg19 W19+123)
diff --git a/llvm/unittests/IR/MetadataTest.cpp b/llvm/unittests/IR/MetadataTest.cpp
index 9647ac8c439666..3f766a414f08f2 100644
--- a/llvm/unittests/IR/MetadataTest.cpp
+++ b/llvm/unittests/IR/MetadataTest.cpp
@@ -3610,6 +3610,144 @@ TEST_F(DIExpressionTest, Fold) {
   EXPECT_EQ(E, ResExpr);
 }
 
+TEST_F(DIExpressionTest, Append) {
+  // Test appending a {dwarf::DW_OP_constu, <const>, DW_OP_plus} to a DW_OP_plus
+  // expression
+  SmallVector<uint64_t, 8> Ops = {dwarf::DW_OP_LLVM_arg, 0, dwarf::DW_OP_constu,
+                                  2, dwarf::DW_OP_plus};
+  auto *Expr = DIExpression::get(Context, Ops);
+  SmallVector<uint64_t, 8> AppendOps = {dwarf::DW_OP_constu, 3,
+                                        dwarf::DW_OP_plus};
+  auto *AppendExpr = DIExpression::append(Expr, AppendOps);
+  SmallVector<uint64_t, 8> OpsRes = {dwarf::DW_OP_LLVM_arg, 0,
+                                     dwarf::DW_OP_plus_uconst, 5};
+  auto *ResExpr = DIExpression::get(Context, OpsRes);
+  EXPECT_EQ(ResExpr, AppendExpr);
+
+  // Test appending a {dwarf::DW_OP_plus_uconst, <const>} to a DW_OP_plus
+  // expression
+  AppendOps.clear();
+  AppendOps.push_back(dwarf::DW_OP_plus_uconst);
+  AppendOps.push_back(3);
+  AppendExpr = DIExpression::append(Expr, AppendOps);
+  OpsRes.clear();
+  OpsRes.push_back(dwarf::DW_OP_LLVM_arg);
+  OpsRes.push_back(0);
+  OpsRes.push_back(dwarf::DW_OP_plus_uconst);
+  OpsRes.push_back(5);
+  ResExpr = DIExpression::get(Context, OpsRes);
+  EXPECT_EQ(ResExpr, AppendExpr);
+
+  // Test appending a {dwarf::DW_OP_constu, 0, DW_OP_plus} to an expression
+  AppendOps.clear();
+  AppendOps.push_back(dwarf::DW_OP_constu);
+  AppendOps.push_back(0);
+  AppendOps.push_back(dwarf::DW_OP_plus);
+  AppendExpr = DIExpression::append(Expr, AppendOps);
+  OpsRes.clear();
+  OpsRes.push_back(dwarf::DW_OP_LLVM_arg);
+  OpsRes.push_back(0);
+  OpsRes.push_back(dwarf::DW_OP_plus_uconst);
+  OpsRes.push_back(2);
+  ResExpr = DIExpression::get(Context, OpsRes);
+  EXPECT_EQ(ResExpr, AppendExpr);
+
+  // Test appending a {dwarf::DW_OP_constu, 0, DW_OP_minus} to an expression
+  AppendOps.clear();
+  AppendOps.push_back(dwarf::DW_OP_constu);
+  AppendOps.push_back(0);
+  AppendOps.push_back(dwarf::DW_OP_minus);
+  AppendExpr = DIExpression::append(Expr, AppendOps);
+  OpsRes.clear();
+  OpsRes.push_back(dwarf::DW_OP_LLVM_arg);
+  OpsRes.push_back(0);
+  OpsRes.push_back(dwarf::DW_OP_plus_uconst);
+  OpsRes.push_back(2);
+  ResExpr = DIExpression::get(Context, OpsRes);
+  EXPECT_EQ(ResExpr, AppendExpr);
+
+  // Test appending a {dwarf::DW_OP_constu, 0, DW_OP_shl} to an expression
+  AppendOps.clear();
+  AppendOps.push_back(dwarf::DW_OP_constu);
+  AppendOps.push_back(0);
+  AppendOps.push_back(dwarf::DW_OP_shl);
+  AppendExpr = DIExpression::append(Expr, AppendOps);
+  OpsRes.clear();
+  OpsRes.push_back(dwarf::DW_OP_LLVM_arg);
+  OpsRes.push_back(0);
+  OpsRes.push_back(dwarf::DW_OP_plus_uconst);
+  OpsRes.push_back(2);
+  ResExpr = DIExpression::get(Context, OpsRes);
+  EXPECT_EQ(ResExpr, AppendExpr);
+
+  // Test appending a {dwarf::DW_OP_constu, 0, DW_OP_shr} to an expression
+  AppendOps.clear();
+  AppendOps.push_back(dwarf::DW_OP_constu);
+  AppendOps.push_back(0);
+  AppendOps.push_back(dwarf::DW_OP_shr);
+  AppendExpr = DIExpression::append(Expr, AppendOps);
+  OpsRes.clear();
+  OpsRes.push_back(dwarf::DW_OP_LLVM_arg);
+  OpsRes.push_back(0);
+  OpsRes.push_back(dwarf::DW_OP_plus_uconst);
+  OpsRes.push_back(2);
+  ResExpr = DIExpression::get(Context, OpsRes);
+  EXPECT_EQ(ResExpr, AppendExpr);
+
+  // Test appending a {dwarf::DW_OP_constu, <const>, DW_OP_mul} to a DW_OP_mul
+  // expression
+  Ops.clear();
+  Ops.push_back(dwarf::DW_OP_LLVM_arg);
+  Ops.push_back(0);
+  Ops.push_back(dwarf::DW_OP_constu);
+  Ops.push_back(2);
+  Ops.push_back(dwarf::DW_OP_mul);
+  Expr = DIExpression::get(Context, Ops);
+  AppendOps.clear();
+  AppendOps.push_back(dwarf::DW_OP_constu);
+  AppendOps.push_back(3);
+  AppendOps.push_back(dwarf::DW_OP_mul);
+  AppendExpr = DIExpression::append(Expr, AppendOps);
+  OpsRes.clear();
+  OpsRes.push_back(dwarf::DW_OP_LLVM_arg);
+  OpsRes.push_back(0);
+  OpsRes.push_back(dwarf::DW_OP_constu);
+  OpsRes.push_back(6);
+  OpsRes.push_back(dwarf::DW_OP_mul);
+  ResExpr = DIExpression::get(Context, OpsRes);
+  EXPECT_EQ(ResExpr, AppendExpr);
+
+  // Test appending a {dwarf::DW_OP_constu, 1, DW_OP_mul} to an expression
+  AppendOps.clear();
+  AppendOps.push_back(dwarf::DW_OP_constu);
+  AppendOps.push_back(1);
+  AppendOps.push_back(dwarf::DW_OP_mul);
+  AppendExpr = DIExpression::append(Expr, AppendOps);
+  OpsRes.clear();
+  OpsRes.push_back(dwarf::DW_OP_LLVM_arg);
+  OpsRes.push_back(0);
+  OpsRes.push_back(dwarf::DW_OP_constu);
+  OpsRes.push_back(2);
+  OpsRes.push_back(dwarf::DW_OP_mul);
+  ResExpr = DIExpression::get(Context, OpsRes);
+  EXPECT_EQ(ResExpr, AppendExpr);
+
+  // Test appending a {dwarf::DW_OP_constu, 1, DW_OP_div} to an expression
+  AppendOps.clear();
+  AppendOps.push_back(dwarf::DW_OP_constu);
+  AppendOps.push_back(1);
+  AppendOps.push_back(dwarf::DW_OP_div);
+  AppendExpr = DIExpression::append(Expr, AppendOps);
+  OpsRes.clear();
+  OpsRes.push_back(dwarf::DW_OP_LLVM_arg);
+  OpsRes.push_back(0);
+  OpsRes.push_back(dwarf::DW_OP_constu);
+  OpsRes.push_back(2);
+  OpsRes.push_back(dwarf::DW_OP_mul);
+  ResExpr = DIExpression::get(Context, OpsRes);
+  EXPECT_EQ(ResExpr, AppendExpr);
+}
+
 TEST_F(DIExpressionTest, isValid) {
 #define EXPECT_VALID(...)                                                      \
   do {                                                                         \

From f4681be06b465736cc993b114dd8e2625d37b779 Mon Sep 17 00:00:00 2001
From: Shubham Sandeep Rastogi <srastogi22@apple.com>
Date: Wed, 29 May 2024 16:25:02 -0700
Subject: [PATCH 201/230] Use DIExpression::foldConstantMath at the result of a
 Salvaged expression (#71721)

This patch applies `DIExpression::foldConstantMath()` to the result of a
salvaged expression; that is, it runs the folding optimizations once an
expression has been salvaged completely, to reduce how many times the
fold function is called. This should help reduce the size of
DIExpressions that grow because of salvaging debug info.

After checking the size of the dSYM with and without this change, I saw
a decrease of about 300KB, where the debug_loc section is about 1.6 GB
in size.

Of that, the debug_loc section (193MB in size) shrank by 212KB; the
rest comes from the debug_info section.

This is part of a stack of patches and comes after:
https://github.com/llvm/llvm-project/pull/69768
https://github.com/llvm/llvm-project/pull/71717
https://github.com/llvm/llvm-project/pull/71718
https://github.com/llvm/llvm-project/pull/71719
---
 llvm/lib/Transforms/Utils/Local.cpp                 | 4 ++++
 llvm/test/DebugInfo/salvage-icmp.ll                 | 2 +-
 llvm/test/DebugInfo/salvage-limit-expr-size.ll      | 8 ++++----
 llvm/test/Transforms/InstCombine/cast-mul-select.ll | 2 +-
 llvm/test/Transforms/InstCombine/debuginfo-dce.ll   | 2 +-
 llvm/test/Transforms/InstCombine/debuginfo-sink.ll  | 2 +-
 llvm/unittests/Transforms/Utils/LocalTest.cpp       | 3 +--
 7 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index f3cd3104c31280..ce0f4c7668a40e 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -2230,6 +2230,8 @@ template <typename T> static void salvageDbgAssignAddress(T *Assign) {
   assert(!SalvagedExpr->getFragmentInfo().has_value() &&
          "address-expression shouldn't have fragment info");
 
+  SalvagedExpr = SalvagedExpr->foldConstantMath();
+
   // Salvage succeeds if no additional values are required.
   if (AdditionalValues.empty()) {
     Assign->setAddress(NewV);
@@ -2290,6 +2292,7 @@ void llvm::salvageDebugInfoForDbgValues(
     if (!Op0)
       break;
 
+    SalvagedExpr = SalvagedExpr->foldConstantMath();
     DII->replaceVariableLocationOp(&I, Op0);
     bool IsValidSalvageExpr = SalvagedExpr->getNumElements() <= MaxExpressionSize;
     if (AdditionalValues.empty() && IsValidSalvageExpr) {
@@ -2351,6 +2354,7 @@ void llvm::salvageDebugInfoForDbgValues(
     if (!Op0)
       break;
 
+    SalvagedExpr = SalvagedExpr->foldConstantMath();
     DVR->replaceVariableLocationOp(&I, Op0);
     bool IsValidSalvageExpr =
         SalvagedExpr->getNumElements() <= MaxExpressionSize;
diff --git a/llvm/test/DebugInfo/salvage-icmp.ll b/llvm/test/DebugInfo/salvage-icmp.ll
index ce9e809e12c1a8..f47c20e7992e65 100644
--- a/llvm/test/DebugInfo/salvage-icmp.ll
+++ b/llvm/test/DebugInfo/salvage-icmp.ll
@@ -5,7 +5,7 @@
 
 ; CHECK: call void @llvm.dbg.value(metadata i32 %a,
 ; CHECK-SAME: ![[VAR_C:[0-9]+]],
-; CHECK-SAME: !DIExpression(DW_OP_constu, 0, DW_OP_ne, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_constu, 0, DW_OP_eq, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_constu, 1, DW_OP_gt, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_consts, 18446744073709551615, DW_OP_gt, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_constu, 2, DW_OP_ge, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_consts, 18446744073709551614, DW_OP_ge, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_constu, 3, DW_OP_lt, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_consts, 18446744073709551613, DW_OP_lt, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_constu, 4, DW_OP_le, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_consts, 18446744073709551612, DW_OP_le, DW_OP_stack_value))
+; CHECK-SAME: !DIExpression(DW_OP_lit0, DW_OP_ne, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_lit0, DW_OP_eq, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_constu, 1, DW_OP_gt, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_consts, 18446744073709551615, DW_OP_gt, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_constu, 2, DW_OP_ge, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_consts, 18446744073709551614, DW_OP_ge, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_constu, 3, DW_OP_lt, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_consts, 18446744073709551613, DW_OP_lt, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_constu, 4, DW_OP_le, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_consts, 18446744073709551612, DW_OP_le, DW_OP_stack_value))
 
 ; CHECK: call void @llvm.dbg.value(metadata !DIArgList(i32 %a, i32 %a, i32 %a, i32 %b, i32 %a, i32 %b, i32 %b, i32 %a, i32 %a, i32 %b, i32 %b),
 ; CHECK-SAME: ![[VAR_C:[0-9]+]],
diff --git a/llvm/test/DebugInfo/salvage-limit-expr-size.ll b/llvm/test/DebugInfo/salvage-limit-expr-size.ll
index 94e451327b2148..379a4ecec43eb9 100644
--- a/llvm/test/DebugInfo/salvage-limit-expr-size.ll
+++ b/llvm/test/DebugInfo/salvage-limit-expr-size.ll
@@ -11,13 +11,13 @@ entry:
   ;; These expressions should salvage successfully, up to exactly 128 elements.
   ; CHECK: call void @llvm.dbg.value(metadata i32 %a, metadata ![[VAR_C:[0-9]+]]
   ; CHECK-NEXT: call void @llvm.dbg.value(metadata !DIArgList(i32 %a, i32 %b), metadata ![[VAR_C]]
-  call void @llvm.dbg.value(metadata i32 %add.1, metadata !12, metadata !DIExpression(DW_OP_constu, 1, DW_OP_plus, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_stack_value)), !dbg !13
-  call void @llvm.dbg.value(metadata i32 %add.2, metadata !12, metadata !DIExpression(DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_stack_value)), !dbg !13
+    call void @llvm.dbg.value(metadata i32 %add.1, metadata !12, metadata !DIExpression(DW_OP_lit0, DW_OP_ne, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned,  DW_OP_stack_value)), !dbg !13
+  call void @llvm.dbg.value(metadata i32 %add.2, metadata !12, metadata !DIExpression(DW_OP_lit0, DW_OP_ne, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_stack_value)), !dbg !13
   ;; These expressions should be set undef, as they would salvage up to exactly 129 elements.
   ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 poison, metadata ![[VAR_C]]
   ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 poison, metadata ![[VAR_C]]
-  call void @llvm.dbg.value(metadata i32 %add.1, metadata !12, metadata !DIExpression(DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_stack_value)), !dbg !13
-  call void @llvm.dbg.value(metadata i32 %add.2, metadata !12, metadata !DIExpression(DW_OP_constu, 1, DW_OP_plus, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_plus_uconst, 1, DW_OP_stack_value)), !dbg !13
+  call void @llvm.dbg.value(metadata i32 %add.1, metadata !12, metadata !DIExpression(DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_stack_value)), !dbg !13
+  call void @llvm.dbg.value(metadata i32 %add.2, metadata !12, metadata !DIExpression(DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_stack_value)), !dbg !13
   %mul = mul nsw i32 %a, %b, !dbg !15
   ret i32 %mul, !dbg !15
 }
diff --git a/llvm/test/Transforms/InstCombine/cast-mul-select.ll b/llvm/test/Transforms/InstCombine/cast-mul-select.ll
index 50769ebe76f5c5..79d7ac9e6d0bed 100644
--- a/llvm/test/Transforms/InstCombine/cast-mul-select.ll
+++ b/llvm/test/Transforms/InstCombine/cast-mul-select.ll
@@ -207,7 +207,7 @@ define void @PR36225(i32 %a, i32 %b, i1 %c1, i3 %v1, i3 %v2) {
 ; DBGINFO-NEXT:  entry:
 ; DBGINFO-NEXT:    br label [[WHILE_BODY:%.*]], !dbg [[DBG94:![0-9]+]]
 ; DBGINFO:       while.body:
-; DBGINFO-NEXT:    call void @llvm.dbg.value(metadata i32 [[B:%.*]], metadata [[META89:![0-9]+]], metadata !DIExpression(DW_OP_constu, 0, DW_OP_eq, DW_OP_stack_value)), !dbg [[DBG95:![0-9]+]]
+; DBGINFO-NEXT:    call void @llvm.dbg.value(metadata i32 [[B:%.*]], metadata [[META89:![0-9]+]], metadata !DIExpression(DW_OP_lit0, DW_OP_eq, DW_OP_stack_value)), !dbg [[DBG95:![0-9]+]]
 ; DBGINFO-NEXT:    br i1 [[C1:%.*]], label [[FOR_BODY3_US:%.*]], label [[FOR_BODY3:%.*]], !dbg [[DBG96:![0-9]+]]
 ; DBGINFO:       for.body3.us:
 ; DBGINFO-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[B]], 0, !dbg [[DBG95]]
diff --git a/llvm/test/Transforms/InstCombine/debuginfo-dce.ll b/llvm/test/Transforms/InstCombine/debuginfo-dce.ll
index 257222cb70c238..27e0580804cae7 100644
--- a/llvm/test/Transforms/InstCombine/debuginfo-dce.ll
+++ b/llvm/test/Transforms/InstCombine/debuginfo-dce.ll
@@ -61,7 +61,7 @@ entry:
 ; CHECK: define void @salvage_gep0
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: call void @llvm.dbg.value(metadata ptr %queue,
-; CHECK-SAME:                           metadata !DIExpression(DW_OP_constu, 8, DW_OP_minus, DW_OP_plus_uconst, 0, DW_OP_stack_value))
+; CHECK-SAME:                           metadata !DIExpression(DW_OP_constu, 8, DW_OP_minus, DW_OP_stack_value))
   store ptr %1, ptr %im_not_dead, align 8
   ret void, !dbg !26
 }
diff --git a/llvm/test/Transforms/InstCombine/debuginfo-sink.ll b/llvm/test/Transforms/InstCombine/debuginfo-sink.ll
index 311948262fac23..63f09d26cce4f9 100644
--- a/llvm/test/Transforms/InstCombine/debuginfo-sink.ll
+++ b/llvm/test/Transforms/InstCombine/debuginfo-sink.ll
@@ -62,7 +62,7 @@ sink2:
 ; CHECK:       call void @llvm.dbg.value(metadata ptr %a, metadata !{{[0-9]+}},
 ; CHECK-SAME:  metadata !DIExpression(DW_OP_plus_uconst, 4, DW_OP_stack_value))
 ; CHECK-NEXT:  call void @llvm.dbg.value(metadata ptr %a, metadata !{{[0-9]+}},
-; CHECK-SAME:  metadata !DIExpression(DW_OP_plus_uconst, 4, DW_OP_plus_uconst, 5, DW_OP_stack_value))
+; CHECK-SAME:  metadata !DIExpression(DW_OP_plus_uconst, 9, DW_OP_stack_value))
 ; CHECK-NEXT:  br label %sink1
 
 define i32 @baz(ptr %a) !dbg !80 {
diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp
index 32c5244d3ff505..9b1176765c17f1 100644
--- a/llvm/unittests/Transforms/Utils/LocalTest.cpp
+++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp
@@ -544,8 +544,7 @@ struct SalvageDebugInfoTest : ::testing::Test {
     const auto &CI = *cast<ConstantInt>(DI.getVariableLocationOp(0));
     if (CI.isZero())
       return DI.getExpression()->getElements().equals(
-          {dwarf::DW_OP_plus_uconst, 1, dwarf::DW_OP_plus_uconst, 2,
-           dwarf::DW_OP_stack_value});
+          {dwarf::DW_OP_plus_uconst, 3, dwarf::DW_OP_stack_value});
     else if (CI.isOneValue())
       return DI.getExpression()->getElements().equals(
           {dwarf::DW_OP_plus_uconst, 2, dwarf::DW_OP_stack_value});

From 424f82c204173889a93a74910e63dc53931c3ec9 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Wed, 29 May 2024 12:24:10 -0700
Subject: [PATCH 202/230] [RISCV] Refactor combineTruncToVnclipu to prepare for
 adding signed vnclip support. NFC

Reviewed as part of #93728.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 51 +++++++++++++--------
 1 file changed, 32 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 0242cfe1785246..ff21e10392a800 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -16185,8 +16185,8 @@ static SDValue combineTruncOfSraSext(SDNode *N, SelectionDAG &DAG) {
 
 // Combine (truncate_vector_vl (umin X, C)) -> (vnclipu_vl X) if C is maximum
 // value for the truncated type.
-static SDValue combineTruncToVnclipu(SDNode *N, SelectionDAG &DAG,
-                                     const RISCVSubtarget &Subtarget) {
+static SDValue combineTruncToVnclip(SDNode *N, SelectionDAG &DAG,
+                                    const RISCVSubtarget &Subtarget) {
   assert(N->getOpcode() == RISCVISD::TRUNCATE_VECTOR_VL);
 
   MVT VT = N->getSimpleValueType(0);
@@ -16194,15 +16194,15 @@ static SDValue combineTruncToVnclipu(SDNode *N, SelectionDAG &DAG,
   SDValue Mask = N->getOperand(1);
   SDValue VL = N->getOperand(2);
 
-  SDValue Src = N->getOperand(0);
+  auto MatchMinMax = [&VL, &Mask](SDValue V, unsigned Opc, unsigned OpcVL,
+                                  APInt &SplatVal) {
+    if (V.getOpcode() != Opc &&
+        !(V.getOpcode() == OpcVL && V.getOperand(2).isUndef() &&
+          V.getOperand(3) == Mask && V.getOperand(4) == VL))
+      return SDValue();
 
-  // Src must be a UMIN or UMIN_VL.
-  if (Src.getOpcode() != ISD::UMIN &&
-      !(Src.getOpcode() == RISCVISD::UMIN_VL && Src.getOperand(2).isUndef() &&
-        Src.getOperand(3) == Mask && Src.getOperand(4) == VL))
-    return SDValue();
+    SDValue Op = V.getOperand(1);
 
-  auto IsSplat = [&VL](SDValue Op, APInt &SplatVal) {
     // Peek through conversion between fixed and scalable vectors.
     if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
         isNullConstant(Op.getOperand(2)) &&
@@ -16213,32 +16213,45 @@ static SDValue combineTruncToVnclipu(SDNode *N, SelectionDAG &DAG,
       Op = Op.getOperand(1).getOperand(0);
 
     if (ISD::isConstantSplatVector(Op.getNode(), SplatVal))
-      return true;
+      return V.getOperand(0);
 
     if (Op.getOpcode() == RISCVISD::VMV_V_X_VL && Op.getOperand(0).isUndef() &&
         Op.getOperand(2) == VL) {
       if (auto *Op1 = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
         SplatVal =
             Op1->getAPIntValue().sextOrTrunc(Op.getScalarValueSizeInBits());
-        return true;
+        return V.getOperand(0);
       }
     }
 
-    return false;
+    return SDValue();
   };
 
-  APInt C;
-  if (!IsSplat(Src.getOperand(1), C))
-    return SDValue();
+  auto DetectUSatPattern = [&](SDValue V) {
+    // Src must be a UMIN or UMIN_VL.
+    APInt C;
+    SDValue UMin = MatchMinMax(V, ISD::UMIN, RISCVISD::UMIN_VL, C);
+    if (!UMin)
+      return SDValue();
+
+    if (!C.isMask(VT.getScalarSizeInBits()))
+      return SDValue();
 
-  if (!C.isMask(VT.getScalarSizeInBits()))
+    return UMin;
+  };
+
+  SDValue Val;
+  unsigned ClipOpc;
+  if ((Val = DetectUSatPattern(N->getOperand(0))))
+    ClipOpc = RISCVISD::VNCLIPU_VL;
+  else
     return SDValue();
 
   SDLoc DL(N);
   // Rounding mode here is arbitrary since we aren't shifting out any bits.
   return DAG.getNode(
-      RISCVISD::VNCLIPU_VL, DL, VT,
-      {Src.getOperand(0), DAG.getConstant(0, DL, VT), DAG.getUNDEF(VT), Mask,
+      ClipOpc, DL, VT,
+      {Val, DAG.getConstant(0, DL, VT), DAG.getUNDEF(VT), Mask,
        DAG.getTargetConstant(RISCVVXRndMode::RNU, DL, Subtarget.getXLenVT()),
        VL});
 }
@@ -16462,7 +16475,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
   case RISCVISD::TRUNCATE_VECTOR_VL:
     if (SDValue V = combineTruncOfSraSext(N, DAG))
       return V;
-    return combineTruncToVnclipu(N, DAG, Subtarget);
+    return combineTruncToVnclip(N, DAG, Subtarget);
   case ISD::TRUNCATE:
     return performTRUNCATECombine(N, DAG, Subtarget);
   case ISD::SELECT:

From ea1ecb50fa831583241fc531153bd2c072955d29 Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Wed, 29 May 2024 23:36:43 +0000
Subject: [PATCH 203/230] Fix test - remove unnecessary/incorrect `-S`, in
 favor of `-emit-llvm`

---
 clang/test/CodeGenCXX/debug-info-ptr-to-ptr.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/test/CodeGenCXX/debug-info-ptr-to-ptr.cpp b/clang/test/CodeGenCXX/debug-info-ptr-to-ptr.cpp
index 8e465a1febf7ce..0885e7076d51c5 100644
--- a/clang/test/CodeGenCXX/debug-info-ptr-to-ptr.cpp
+++ b/clang/test/CodeGenCXX/debug-info-ptr-to-ptr.cpp
@@ -1,6 +1,6 @@
 // Test debug info for intermediate value of a chained pointer deferencing
 // expression when the flag -fdebug-info-for-pointer-type is enabled.
-// RUN: %clang_cc1 %s -fdebug-info-for-profiling -debug-info-kind=constructor -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -fdebug-info-for-profiling -debug-info-kind=constructor -emit-llvm -o - | FileCheck %s
 
 class A {
 public:

From 2e0cfe69d0d705e9c5d5f217625bf7e3a0e90871 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Wed, 29 May 2024 16:41:12 -0700
Subject: [PATCH 204/230] [ELF] Simplify getSectionRank

Follow-up to a previous simplification
2473b1af085ad54e89666cedf684fdf10a84f058.

The xor difference between a SHT_NOTE and a read-only SHT_PROGBITS
(previously >=NOT_SPECIAL) should be smaller than RF_EXEC. Otherwise,
for the following section layout, `findOrphanPos` would place .text
before note.

```
// simplified from linkerscript/custom-section-type.s
non orphans:
progbits 0x8060c00 NOT_SPECIAL
note     0x8040003

orphan:
.text    0x8061000 NOT_SPECIAL
```
---
 lld/ELF/Writer.cpp | 43 +++++++++++++++++++------------------------
 1 file changed, 19 insertions(+), 24 deletions(-)

diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index d2cc6d8ff5f2cb..c498153f3348b1 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -618,7 +618,6 @@ enum RankFlags {
   RF_NOT_ADDR_SET = 1 << 27,
   RF_NOT_ALLOC = 1 << 26,
   RF_PARTITION = 1 << 18, // Partition number (8 bits)
-  RF_NOT_SPECIAL = 1 << 17,
   RF_LARGE_ALT = 1 << 15,
   RF_WRITE = 1 << 14,
   RF_EXEC_WRITE = 1 << 13,
@@ -644,24 +643,6 @@ unsigned elf::getSectionRank(OutputSection &osec) {
   if (!(osec.flags & SHF_ALLOC))
     return rank | RF_NOT_ALLOC;
 
-  if (osec.type == SHT_LLVM_PART_EHDR)
-    return rank;
-  if (osec.type == SHT_LLVM_PART_PHDR)
-    return rank | 1;
-
-  // Put .interp first because some loaders want to see that section
-  // on the first page of the executable file when loaded into memory.
-  if (osec.name == ".interp")
-    return rank | 2;
-
-  // Put .note sections at the beginning so that they are likely to be included
-  // in a truncate core file. In particular, .note.gnu.build-id, if available,
-  // can identify the object file.
-  if (osec.type == SHT_NOTE)
-    return rank | 3;
-
-  rank |= RF_NOT_SPECIAL;
-
   // Sort sections based on their access permission in the following
   // order: R, RX, RXW, RW(RELRO), RW(non-RELRO).
   //
@@ -677,11 +658,6 @@ unsigned elf::getSectionRank(OutputSection &osec) {
   bool isWrite = osec.flags & SHF_WRITE;
 
   if (!isWrite && !isExec) {
-    // Make PROGBITS sections (e.g .rodata .eh_frame) closer to .text to
-    // alleviate relocation overflow pressure. Large special sections such as
-    // .dynstr and .dynsym can be away from .text.
-    if (osec.type == SHT_PROGBITS)
-      rank |= RF_RODATA;
     // Among PROGBITS sections, place .lrodata further from .text.
     // For -z lrodata-after-bss, place .lrodata after .lbss like GNU ld. This
     // layout has one extra PT_LOAD, but alleviates relocation overflow
@@ -691,6 +667,25 @@ unsigned elf::getSectionRank(OutputSection &osec) {
       rank |= config->zLrodataAfterBss ? RF_LARGE_ALT : 0;
     else
       rank |= config->zLrodataAfterBss ? 0 : RF_LARGE;
+
+    if (osec.type == SHT_LLVM_PART_EHDR)
+      ;
+    else if (osec.type == SHT_LLVM_PART_PHDR)
+      rank |= 1;
+    else if (osec.name == ".interp")
+      rank |= 2;
+    // Put .note sections at the beginning so that they are likely to be
+    // included in a truncate core file. In particular, .note.gnu.build-id, if
+    // available, can identify the object file.
+    else if (osec.type == SHT_NOTE)
+      rank |= 3;
+    // Make PROGBITS sections (e.g .rodata .eh_frame) closer to .text to
+    // alleviate relocation overflow pressure. Large special sections such as
+    // .dynstr and .dynsym can be away from .text.
+    else if (osec.type != SHT_PROGBITS)
+      rank |= 4;
+    else
+      rank |= RF_RODATA;
   } else if (isExec) {
     rank |= isWrite ? RF_EXEC_WRITE : RF_EXEC;
   } else {

From 8a8cd8a766081eebaf4dd51a1012d093713dfe59 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Wed, 29 May 2024 16:46:36 -0700
Subject: [PATCH 205/230] [RISCV] Move vnclip patterns into DAGCombiner.
 (#93728)

Similar to #93596, this moves the signed vnclip patterns into DAG
combine.

This will allow us to support more than 1 level of truncate in a
future patch.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 29 +++++++++++++-
 .../Target/RISCV/RISCVInstrInfoVSDPatterns.td | 34 ----------------
 .../Target/RISCV/RISCVInstrInfoVVLPatterns.td | 40 -------------------
 3 files changed, 27 insertions(+), 76 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index ff21e10392a800..0e7713509e969b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -16183,8 +16183,11 @@ static SDValue combineTruncOfSraSext(SDNode *N, SelectionDAG &DAG) {
   return DAG.getNode(ISD::SRA, SDLoc(N), N->getValueType(0), N00, SMin);
 }
 
-// Combine (truncate_vector_vl (umin X, C)) -> (vnclipu_vl X) if C is maximum
-// value for the truncated type.
+// Combine (truncate_vector_vl (umin X, C)) -> (vnclipu_vl X) if C is the
+// maximum value for the truncated type.
+// Combine (truncate_vector_vl (smin (smax X, C2), C1)) -> (vnclip_vl X) if C1
+// is the signed maximum value for the truncated type and C2 is the signed
+// minimum value.
 static SDValue combineTruncToVnclip(SDNode *N, SelectionDAG &DAG,
                                     const RISCVSubtarget &Subtarget) {
   assert(N->getOpcode() == RISCVISD::TRUNCATE_VECTOR_VL);
@@ -16240,10 +16243,32 @@ static SDValue combineTruncToVnclip(SDNode *N, SelectionDAG &DAG,
     return UMin;
   };
 
+  auto DetectSSatPattern = [&](SDValue V) {
+    unsigned NumDstBits = VT.getScalarSizeInBits();
+    unsigned NumSrcBits = V.getScalarValueSizeInBits();
+    APInt SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
+    APInt SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
+
+    APInt CMin, CMax;
+    if (SDValue SMin = MatchMinMax(V, ISD::SMIN, RISCVISD::SMIN_VL, CMin))
+      if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, RISCVISD::SMAX_VL, CMax))
+        if (CMin == SignedMax && CMax == SignedMin)
+          return SMax;
+
+    if (SDValue SMax = MatchMinMax(V, ISD::SMAX, RISCVISD::SMAX_VL, CMax))
+      if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, RISCVISD::SMIN_VL, CMin))
+        if (CMin == SignedMax && CMax == SignedMin)
+          return SMin;
+
+    return SDValue();
+  };
+
   SDValue Val;
   unsigned ClipOpc;
   if ((Val = DetectUSatPattern(N->getOperand(0))))
     ClipOpc = RISCVISD::VNCLIPU_VL;
+  else if ((Val = DetectSSatPattern(N->getOperand(0))))
+    ClipOpc = RISCVISD::VNCLIP_VL;
   else
     return SDValue();
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
index 691f2052ab29d8..3163e4bafd4b0d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -1168,40 +1168,6 @@ defm : VPatAVGADD_VV_VX_RM<avgflooru, 0b10, suffix = "U">;
 defm : VPatAVGADD_VV_VX_RM<avgceils, 0b00>;
 defm : VPatAVGADD_VV_VX_RM<avgceilu, 0b00, suffix = "U">;
 
-// 12.5. Vector Narrowing Fixed-Point Clip Instructions
-multiclass VPatTruncSatClipSDNode<VTypeInfo vti, VTypeInfo wti> {
-  defvar sew = vti.SEW;
-  defvar uminval = !sub(!shl(1, sew), 1);
-  defvar sminval = !sub(!shl(1, !sub(sew, 1)), 1);
-  defvar smaxval = !sub(0, !shl(1, !sub(sew, 1)));
-
-  let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
-                               GetVTypePredicates<wti>.Predicates) in {
-    def : Pat<(vti.Vector (riscv_trunc_vector_vl
-        (wti.Vector (smin
-          (wti.Vector (smax (wti.Vector wti.RegClass:$rs1),
-            (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), smaxval, (XLenVT srcvalue))))),
-          (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), sminval, (XLenVT srcvalue))))),
-        (vti.Mask V0), VLOpFrag)),
-      (!cast<Instruction>("PseudoVNCLIP_WI_"#vti.LMul.MX#"_MASK")
-        (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs1, 0,
-        (vti.Mask V0), 0, GPR:$vl, vti.Log2SEW, TA_MA)>;
-
-    def : Pat<(vti.Vector (riscv_trunc_vector_vl
-        (wti.Vector (smax
-          (wti.Vector (smin (wti.Vector wti.RegClass:$rs1),
-            (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), sminval, (XLenVT srcvalue))))),
-          (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), smaxval, (XLenVT srcvalue))))),
-        (vti.Mask V0), VLOpFrag)),
-      (!cast<Instruction>("PseudoVNCLIP_WI_"#vti.LMul.MX#"_MASK")
-        (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs1, 0,
-        (vti.Mask V0), 0, GPR:$vl, vti.Log2SEW, TA_MA)>;
-  }
-}
-
-foreach vtiToWti = AllWidenableIntVectors in
-  defm : VPatTruncSatClipSDNode<vtiToWti.Vti, vtiToWti.Wti>;
-
 // 15. Vector Mask Instructions
 
 // 15.1. Vector Mask-Register Logical Instructions
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index 610a72dd02b388..ce8133a5a297b9 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -2470,46 +2470,6 @@ defm : VPatAVGADDVL_VV_VX_RM<riscv_avgceilu_vl, 0b00, suffix="U">;
 defm : VPatBinaryRM_NVL_WV_WX_WI<riscv_vnclip_vl, "PseudoVNCLIP">;
 defm : VPatBinaryRM_NVL_WV_WX_WI<riscv_vnclipu_vl, "PseudoVNCLIPU">;
 
-// 12.5. Vector Narrowing Fixed-Point Clip Instructions
-multiclass VPatTruncSatClipVL<VTypeInfo vti, VTypeInfo wti> {
-  defvar sew = vti.SEW;
-  defvar uminval = !sub(!shl(1, sew), 1);
-  defvar sminval = !sub(!shl(1, !sub(sew, 1)), 1);
-  defvar smaxval = !sub(0, !shl(1, !sub(sew, 1)));
-
-  let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
-                               GetVTypePredicates<wti>.Predicates) in {
-    def : Pat<(vti.Vector (riscv_trunc_vector_vl
-        (wti.Vector (riscv_smin_vl
-          (wti.Vector (riscv_smax_vl
-            (wti.Vector wti.RegClass:$rs1),
-            (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), smaxval, (XLenVT srcvalue))),
-            (wti.Vector undef),(wti.Mask V0), VLOpFrag)),
-          (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), sminval, (XLenVT srcvalue))),
-          (wti.Vector undef), (wti.Mask V0), VLOpFrag)),
-        (vti.Mask V0), VLOpFrag)),
-      (!cast<Instruction>("PseudoVNCLIP_WI_"#vti.LMul.MX#"_MASK")
-        (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs1, 0,
-        (vti.Mask V0), 0, GPR:$vl, vti.Log2SEW, TA_MA)>;
-
-    def : Pat<(vti.Vector (riscv_trunc_vector_vl
-        (wti.Vector (riscv_smax_vl
-          (wti.Vector (riscv_smin_vl
-            (wti.Vector wti.RegClass:$rs1),
-            (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), sminval, (XLenVT srcvalue))),
-            (wti.Vector undef),(wti.Mask V0), VLOpFrag)),
-          (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), smaxval, (XLenVT srcvalue))),
-          (wti.Vector undef), (wti.Mask V0), VLOpFrag)),
-        (vti.Mask V0), VLOpFrag)),
-      (!cast<Instruction>("PseudoVNCLIP_WI_"#vti.LMul.MX#"_MASK")
-        (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs1, 0,
-        (vti.Mask V0), 0, GPR:$vl, vti.Log2SEW, TA_MA)>;
-  }
-}
-
-foreach vtiToWti = AllWidenableIntVectors in
-  defm : VPatTruncSatClipVL<vtiToWti.Vti, vtiToWti.Wti>;
-
 // 13. Vector Floating-Point Instructions
 
 // 13.2. Vector Single-Width Floating-Point Add/Subtract Instructions

From c0873fa20eb5dcba303a003bdd5192d341f89eaa Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Wed, 29 May 2024 15:26:16 -0700
Subject: [PATCH 206/230] [RISCV] Add trunc-sat-clip tests for i32->i8,
 i64->i8, and i64->i16. NFC

These can be implemented with multiple vnclips.
---
 .../RISCV/rvv/fixed-vectors-trunc-sat-clip.ll | 374 ++++++++++++++++++
 .../RISCV/rvv/trunc-sat-clip-sdnode.ll        | 353 +++++++++++++++++
 2 files changed, 727 insertions(+)

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-sat-clip.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-sat-clip.ll
index 414b23ffb582ab..9f82eddf432da7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-sat-clip.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-sat-clip.ll
@@ -410,3 +410,377 @@ define void @trunc_sat_u32u64_minmax(ptr %x, ptr %y) {
   store <4 x i32> %4, ptr %y, align 8
   ret void
 }
+
+define void @trunc_sat_i8i32_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i8i32_maxmin:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    li a0, -128
+; CHECK-NEXT:    vmax.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i32>, ptr %x, align 16
+  %2 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %1, <4 x i32> <i32 -128, i32 -128, i32 -128, i32 -128>)
+  %3 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %2, <4 x i32> <i32 127, i32 127, i32 127, i32 127>)
+  %4 = trunc <4 x i32> %3 to <4 x i8>
+  store <4 x i8> %4, ptr %y, align 8
+  ret void
+}
+
+define void @trunc_sat_i8i32_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i8i32_minmax:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    li a0, -128
+; CHECK-NEXT:    vmax.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i32>, ptr %x, align 16
+  %2 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %1, <4 x i32> <i32 127, i32 127, i32 127, i32 127>)
+  %3 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %2, <4 x i32> <i32 -128, i32 -128, i32 -128, i32 -128>)
+  %4 = trunc <4 x i32> %3 to <4 x i8>
+  store <4 x i8> %4, ptr %y, align 8
+  ret void
+}
+
+define void @trunc_sat_u8u32_min(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u8u32_min:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    li a0, 255
+; CHECK-NEXT:    vminu.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i32>, ptr %x, align 16
+  %2 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %1, <4 x i32> <i32 255, i32 255, i32 255, i32 255>)
+  %3 = trunc <4 x i32> %2 to <4 x i8>
+  store <4 x i8> %3, ptr %y, align 8
+  ret void
+}
+
+; FIXME: This can be a signed vmax followed by vnclipu.
+define void @trunc_sat_u8u32_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u8u32_maxmin:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    li a0, 255
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i32>, ptr %x, align 16
+  %2 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %1, <4 x i32> zeroinitializer)
+  %3 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %2, <4 x i32> <i32 255, i32 255, i32 255, i32 255>)
+  %4 = trunc <4 x i32> %3 to <4 x i8>
+  store <4 x i8> %4, ptr %y, align 8
+  ret void
+}
+
+; FIXME: This can be a signed vmax followed by vnclipu.
+define void @trunc_sat_u8u32_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u8u32_minmax:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    li a0, 255
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i32>, ptr %x, align 16
+  %2 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %1, <4 x i32> <i32 255, i32 255, i32 255, i32 255>)
+  %3 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %2, <4 x i32> zeroinitializer)
+  %4 = trunc <4 x i32> %3 to <4 x i8>
+  store <4 x i8> %4, ptr %y, align 8
+  ret void
+}
+
+define void @trunc_sat_i8i64_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i8i64_maxmin:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    li a0, -128
+; CHECK-NEXT:    vmax.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i64>, ptr %x, align 16
+  %2 = tail call <4 x i64> @llvm.smax.v4i64(<4 x i64> %1, <4 x i64> <i64 -128, i64 -128, i64 -128, i64 -128>)
+  %3 = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %2, <4 x i64> <i64 127, i64 127, i64 127, i64 127>)
+  %4 = trunc <4 x i64> %3 to <4 x i8>
+  store <4 x i8> %4, ptr %y, align 8
+  ret void
+}
+
+define void @trunc_sat_i8i64_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i8i64_minmax:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    li a0, -128
+; CHECK-NEXT:    vmax.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i64>, ptr %x, align 16
+  %2 = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %1, <4 x i64> <i64 127, i64 127, i64 127, i64 127>)
+  %3 = tail call <4 x i64> @llvm.smax.v4i64(<4 x i64> %2, <4 x i64> <i64 -128, i64 -128, i64 -128, i64 -128>)
+  %4 = trunc <4 x i64> %3 to <4 x i8>
+  store <4 x i8> %4, ptr %y, align 8
+  ret void
+}
+
+define void @trunc_sat_u8u64_min(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u8u64_min:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    li a0, 255
+; CHECK-NEXT:    vminu.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i64>, ptr %x, align 16
+  %2 = tail call <4 x i64> @llvm.umin.v4i64(<4 x i64> %1, <4 x i64> <i64 255, i64 255, i64 255, i64 255>)
+  %3 = trunc <4 x i64> %2 to <4 x i8>
+  store <4 x i8> %3, ptr %y, align 8
+  ret void
+}
+
+; FIXME: This can be a signed vmax followed by vnclipu.
+define void @trunc_sat_u8u64_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u8u64_maxmin:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    li a0, 255
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i64>, ptr %x, align 16
+  %2 = tail call <4 x i64> @llvm.smax.v4i64(<4 x i64> %1, <4 x i64> zeroinitializer)
+  %3 = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %2, <4 x i64> <i64 255, i64 255, i64 255, i64 255>)
+  %4 = trunc <4 x i64> %3 to <4 x i8>
+  store <4 x i8> %4, ptr %y, align 8
+  ret void
+}
+
+; FIXME: This can be a signed vmax followed by vnclipu.
+define void @trunc_sat_u8u64_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u8u64_minmax:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    li a0, 255
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i64>, ptr %x, align 16
+  %2 = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %1, <4 x i64> <i64 255, i64 255, i64 255, i64 255>)
+  %3 = tail call <4 x i64> @llvm.smax.v4i64(<4 x i64> %2, <4 x i64> zeroinitializer)
+  %4 = trunc <4 x i64> %3 to <4 x i8>
+  store <4 x i8> %4, ptr %y, align 8
+  ret void
+}
+
+define void @trunc_sat_i16i64_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i16i64_maxmin:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    vmax.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 8
+; CHECK-NEXT:    addiw a0, a0, -1
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i64>, ptr %x, align 32
+  %2 = tail call <4 x i64> @llvm.smax.v4i64(<4 x i64> %1, <4 x i64> <i64 -32768, i64 -32768, i64 -32768, i64 -32768>)
+  %3 = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %2, <4 x i64> <i64 32767, i64 32767, i64 32767, i64 32767>)
+  %4 = trunc <4 x i64> %3 to <4 x i16>
+  store <4 x i16> %4, ptr %y, align 16
+  ret void
+}
+
+define void @trunc_sat_i16i64_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i16i64_minmax:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    lui a0, 8
+; CHECK-NEXT:    addiw a0, a0, -1
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    vmax.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i64>, ptr %x, align 32
+  %2 = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %1, <4 x i64> <i64 32767, i64 32767, i64 32767, i64 32767>)
+  %3 = tail call <4 x i64> @llvm.smax.v4i64(<4 x i64> %2, <4 x i64> <i64 -32768, i64 -32768, i64 -32768, i64 -32768>)
+  %4 = trunc <4 x i64> %3 to <4 x i16>
+  store <4 x i16> %4, ptr %y, align 16
+  ret void
+}
+
+define void @trunc_sat_u16u64_notopt(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u16u64_notopt:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    lui a0, 8
+; CHECK-NEXT:    addiw a0, a0, -1
+; CHECK-NEXT:    vminu.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i64>, ptr %x, align 32
+  %2 = tail call <4 x i64> @llvm.umin.v4i64(<4 x i64> %1, <4 x i64> <i64 32767, i64 32767, i64 32767, i64 32767>)
+  %3 = trunc <4 x i64> %2 to <4 x i16>
+  store <4 x i16> %3, ptr %y, align 16
+  ret void
+}
+
+define void @trunc_sat_u16u64_min(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u16u64_min:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    addiw a0, a0, -1
+; CHECK-NEXT:    vminu.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i64>, ptr %x, align 32
+  %2 = tail call <4 x i64> @llvm.umin.v4i64(<4 x i64> %1, <4 x i64> <i64 65535, i64 65535, i64 65535, i64 65535>)
+  %3 = trunc <4 x i64> %2 to <4 x i16>
+  store <4 x i16> %3, ptr %y, align 16
+  ret void
+}
+
+; FIXME: This can be a signed vmax followed by vnclipu.
+define void @trunc_sat_u16u64_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u16u64_maxmin:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    li a0, 1
+; CHECK-NEXT:    vmax.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    addiw a0, a0, -1
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i64>, ptr %x, align 16
+  %2 = tail call <4 x i64> @llvm.smax.v4i64(<4 x i64> %1, <4 x i64> <i64 1, i64 1, i64 1, i64 1>)
+  %3 = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %2, <4 x i64> <i64 65535, i64 65535, i64 65535, i64 65535>)
+  %4 = trunc <4 x i64> %3 to <4 x i16>
+  store <4 x i16> %4, ptr %y, align 8
+  ret void
+}
+
+; FIXME: This can be a signed vmax followed by vnclipu.
+define void @trunc_sat_u16u64_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u16u64_minmax:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    addiw a0, a0, -1
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 50
+; CHECK-NEXT:    vmax.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i64>, ptr %x, align 16
+  %2 = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %1, <4 x i64> <i64 65535, i64 65535, i64 65535, i64 65535>)
+  %3 = tail call <4 x i64> @llvm.smax.v4i64(<4 x i64> %2, <4 x i64> <i64 50, i64 50, i64 50, i64 50>)
+  %4 = trunc <4 x i64> %3 to <4 x i16>
+  store <4 x i16> %4, ptr %y, align 8
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip-sdnode.ll
index fcb49c21871919..78e8f0fbbbdd7b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip-sdnode.ll
@@ -410,3 +410,356 @@ define void @trunc_sat_u32u64_minmax(ptr %x, ptr %y) {
   store <vscale x 4 x i32> %4, ptr %y, align 8
   ret void
 }
+
+define void @trunc_sat_i8i32_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i8i32_maxmin:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl2re32.v v8, (a0)
+; CHECK-NEXT:    li a0, -128
+; CHECK-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vmax.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <vscale x 4 x i32>, ptr %x, align 16
+  %2 = tail call <vscale x 4 x i32> @llvm.smax.v4i32(<vscale x 4 x i32> %1, <vscale x 4 x i32> splat (i32 -128))
+  %3 = tail call <vscale x 4 x i32> @llvm.smin.v4i32(<vscale x 4 x i32> %2, <vscale x 4 x i32> splat (i32 127))
+  %4 = trunc <vscale x 4 x i32> %3 to <vscale x 4 x i8>
+  store <vscale x 4 x i8> %4, ptr %y, align 8
+  ret void
+}
+
+define void @trunc_sat_i8i32_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i8i32_minmax:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl2re32.v v8, (a0)
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    li a0, -128
+; CHECK-NEXT:    vmax.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <vscale x 4 x i32>, ptr %x, align 16
+  %2 = tail call <vscale x 4 x i32> @llvm.smin.v4i32(<vscale x 4 x i32> %1, <vscale x 4 x i32> splat (i32 127))
+  %3 = tail call <vscale x 4 x i32> @llvm.smax.v4i32(<vscale x 4 x i32> %2, <vscale x 4 x i32> splat (i32 -128))
+  %4 = trunc <vscale x 4 x i32> %3 to <vscale x 4 x i8>
+  store <vscale x 4 x i8> %4, ptr %y, align 8
+  ret void
+}
+
+define void @trunc_sat_u8u32_min(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u8u32_min:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl2re32.v v8, (a0)
+; CHECK-NEXT:    li a0, 255
+; CHECK-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vminu.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <vscale x 4 x i32>, ptr %x, align 16
+  %2 = tail call <vscale x 4 x i32> @llvm.umin.v4i32(<vscale x 4 x i32> %1, <vscale x 4 x i32> splat (i32 255))
+  %3 = trunc <vscale x 4 x i32> %2 to <vscale x 4 x i8>
+  store <vscale x 4 x i8> %3, ptr %y, align 8
+  ret void
+}
+
+; FIXME: This can be a signed vmax followed by vnclipu.
+define void @trunc_sat_u8u32_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u8u32_maxmin:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl2re32.v v8, (a0)
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    li a0, 255
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <vscale x 4 x i32>, ptr %x, align 16
+  %2 = tail call <vscale x 4 x i32> @llvm.smax.v4i32(<vscale x 4 x i32> %1, <vscale x 4 x i32> splat (i32 0))
+  %3 = tail call <vscale x 4 x i32> @llvm.smin.v4i32(<vscale x 4 x i32> %2, <vscale x 4 x i32> splat (i32 255))
+  %4 = trunc <vscale x 4 x i32> %3 to <vscale x 4 x i8>
+  store <vscale x 4 x i8> %4, ptr %y, align 8
+  ret void
+}
+
+; FIXME: This can be a signed vmax followed by vnclipu.
+define void @trunc_sat_u8u32_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u8u32_minmax:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl2re32.v v8, (a0)
+; CHECK-NEXT:    li a0, 255
+; CHECK-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <vscale x 4 x i32>, ptr %x, align 16
+  %2 = tail call <vscale x 4 x i32> @llvm.smin.v4i32(<vscale x 4 x i32> %1, <vscale x 4 x i32> splat (i32 255))
+  %3 = tail call <vscale x 4 x i32> @llvm.smax.v4i32(<vscale x 4 x i32> %2, <vscale x 4 x i32> splat (i32 0))
+  %4 = trunc <vscale x 4 x i32> %3 to <vscale x 4 x i8>
+  store <vscale x 4 x i8> %4, ptr %y, align 8
+  ret void
+}
+
+define void @trunc_sat_i8i64_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i8i64_maxmin:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl4re64.v v8, (a0)
+; CHECK-NEXT:    li a0, -128
+; CHECK-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vmax.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v12, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <vscale x 4 x i64>, ptr %x, align 16
+  %2 = tail call <vscale x 4 x i64> @llvm.smax.v4i64(<vscale x 4 x i64> %1, <vscale x 4 x i64> splat (i64 -128))
+  %3 = tail call <vscale x 4 x i64> @llvm.smin.v4i64(<vscale x 4 x i64> %2, <vscale x 4 x i64> splat (i64 127))
+  %4 = trunc <vscale x 4 x i64> %3 to <vscale x 4 x i8>
+  store <vscale x 4 x i8> %4, ptr %y, align 8
+  ret void
+}
+
+define void @trunc_sat_i8i64_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i8i64_minmax:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl4re64.v v8, (a0)
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    li a0, -128
+; CHECK-NEXT:    vmax.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v12, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <vscale x 4 x i64>, ptr %x, align 16
+  %2 = tail call <vscale x 4 x i64> @llvm.smin.v4i64(<vscale x 4 x i64> %1, <vscale x 4 x i64> splat (i64 127))
+  %3 = tail call <vscale x 4 x i64> @llvm.smax.v4i64(<vscale x 4 x i64> %2, <vscale x 4 x i64> splat (i64 -128))
+  %4 = trunc <vscale x 4 x i64> %3 to <vscale x 4 x i8>
+  store <vscale x 4 x i8> %4, ptr %y, align 8
+  ret void
+}
+
+define void @trunc_sat_u8u64_min(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u8u64_min:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl4re64.v v8, (a0)
+; CHECK-NEXT:    li a0, 255
+; CHECK-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vminu.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v12, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <vscale x 4 x i64>, ptr %x, align 16
+  %2 = tail call <vscale x 4 x i64> @llvm.umin.v4i64(<vscale x 4 x i64> %1, <vscale x 4 x i64> splat (i64 255))
+  %3 = trunc <vscale x 4 x i64> %2 to <vscale x 4 x i8>
+  store <vscale x 4 x i8> %3, ptr %y, align 8
+  ret void
+}
+
+; FIXME: This can be a signed vmax followed by vnclipu.
+define void @trunc_sat_u8u64_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u8u64_maxmin:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl4re64.v v8, (a0)
+; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    li a0, 255
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v12, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <vscale x 4 x i64>, ptr %x, align 16
+  %2 = tail call <vscale x 4 x i64> @llvm.smax.v4i64(<vscale x 4 x i64> %1, <vscale x 4 x i64> splat (i64 0))
+  %3 = tail call <vscale x 4 x i64> @llvm.smin.v4i64(<vscale x 4 x i64> %2, <vscale x 4 x i64> splat (i64 255))
+  %4 = trunc <vscale x 4 x i64> %3 to <vscale x 4 x i8>
+  store <vscale x 4 x i8> %4, ptr %y, align 8
+  ret void
+}
+
+; FIXME: This can be a signed vmax followed by vnclipu.
+define void @trunc_sat_u8u64_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u8u64_minmax:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl4re64.v v8, (a0)
+; CHECK-NEXT:    li a0, 255
+; CHECK-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vmax.vx v8, v8, zero
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v12, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <vscale x 4 x i64>, ptr %x, align 16
+  %2 = tail call <vscale x 4 x i64> @llvm.smin.v4i64(<vscale x 4 x i64> %1, <vscale x 4 x i64> splat (i64 255))
+  %3 = tail call <vscale x 4 x i64> @llvm.smax.v4i64(<vscale x 4 x i64> %2, <vscale x 4 x i64> splat (i64 0))
+  %4 = trunc <vscale x 4 x i64> %3 to <vscale x 4 x i8>
+  store <vscale x 4 x i8> %4, ptr %y, align 8
+  ret void
+}
+
+define void @trunc_sat_i16i64_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i16i64_maxmin:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl4re64.v v8, (a0)
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vmax.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 8
+; CHECK-NEXT:    addiw a0, a0, -1
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v12, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
+; CHECK-NEXT:    vs1r.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <vscale x 4 x i64>, ptr %x, align 32
+  %2 = tail call <vscale x 4 x i64> @llvm.smax.v4i64(<vscale x 4 x i64> %1, <vscale x 4 x i64> splat (i64 -32768))
+  %3 = tail call <vscale x 4 x i64> @llvm.smin.v4i64(<vscale x 4 x i64> %2, <vscale x 4 x i64> splat (i64 32767))
+  %4 = trunc <vscale x 4 x i64> %3 to <vscale x 4 x i16>
+  store <vscale x 4 x i16> %4, ptr %y, align 16
+  ret void
+}
+
+define void @trunc_sat_i16i64_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i16i64_minmax:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl4re64.v v8, (a0)
+; CHECK-NEXT:    lui a0, 8
+; CHECK-NEXT:    addiw a0, a0, -1
+; CHECK-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    vmax.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v12, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
+; CHECK-NEXT:    vs1r.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <vscale x 4 x i64>, ptr %x, align 32
+  %2 = tail call <vscale x 4 x i64> @llvm.smin.v4i64(<vscale x 4 x i64> %1, <vscale x 4 x i64> splat (i64 32767))
+  %3 = tail call <vscale x 4 x i64> @llvm.smax.v4i64(<vscale x 4 x i64> %2, <vscale x 4 x i64> splat (i64 -32768))
+  %4 = trunc <vscale x 4 x i64> %3 to <vscale x 4 x i16>
+  store <vscale x 4 x i16> %4, ptr %y, align 16
+  ret void
+}
+
+define void @trunc_sat_u16u64_min(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u16u64_min:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl4re64.v v8, (a0)
+; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    addiw a0, a0, -1
+; CHECK-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vminu.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v12, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
+; CHECK-NEXT:    vs1r.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <vscale x 4 x i64>, ptr %x, align 32
+  %2 = tail call <vscale x 4 x i64> @llvm.umin.v4i64(<vscale x 4 x i64> %1, <vscale x 4 x i64> splat (i64 65535))
+  %3 = trunc <vscale x 4 x i64> %2 to <vscale x 4 x i16>
+  store <vscale x 4 x i16> %3, ptr %y, align 16
+  ret void
+}
+
+; FIXME: This can be a signed vmax followed by vnclipu.
+define void @trunc_sat_u16u64_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u16u64_maxmin:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl4re64.v v8, (a0)
+; CHECK-NEXT:    li a0, 1
+; CHECK-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vmax.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    addiw a0, a0, -1
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v12, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
+; CHECK-NEXT:    vs1r.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <vscale x 4 x i64>, ptr %x, align 16
+  %2 = tail call <vscale x 4 x i64> @llvm.smax.v4i64(<vscale x 4 x i64> %1, <vscale x 4 x i64> splat (i64 1))
+  %3 = tail call <vscale x 4 x i64> @llvm.smin.v4i64(<vscale x 4 x i64> %2, <vscale x 4 x i64> splat (i64 65535))
+  %4 = trunc <vscale x 4 x i64> %3 to <vscale x 4 x i16>
+  store <vscale x 4 x i16> %4, ptr %y, align 8
+  ret void
+}
+
+; FIXME: This can be a signed vmax followed by vnclipu.
+define void @trunc_sat_u16u64_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u16u64_minmax:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl4re64.v v8, (a0)
+; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    addiw a0, a0, -1
+; CHECK-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 50
+; CHECK-NEXT:    vmax.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v12, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
+; CHECK-NEXT:    vs1r.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <vscale x 4 x i64>, ptr %x, align 16
+  %2 = tail call <vscale x 4 x i64> @llvm.smin.v4i64(<vscale x 4 x i64> %1, <vscale x 4 x i64> splat (i64 65535))
+  %3 = tail call <vscale x 4 x i64> @llvm.smax.v4i64(<vscale x 4 x i64> %2, <vscale x 4 x i64> splat (i64 50))
+  %4 = trunc <vscale x 4 x i64> %3 to <vscale x 4 x i16>
+  store <vscale x 4 x i16> %4, ptr %y, align 8
+  ret void
+}

From 3cee5672fd645cd58b8c4ee6187f10b509a34d77 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Wed, 29 May 2024 23:55:42 +0000
Subject: [PATCH 207/230] [gn build] Port b12f81b53ad6

---
 llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn
index 247ef480f5f1a4..de0c661aaf7c15 100644
--- a/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn
@@ -28,6 +28,7 @@ static_library("IR") {
     "Core.cpp",
     "CycleInfo.cpp",
     "DIBuilder.cpp",
+    "DIExpressionOptimizer.cpp",
     "DataLayout.cpp",
     "DebugInfo.cpp",
     "DebugInfoMetadata.cpp",

From 246234ac70faa1e3281a2bb83dfc4dd206a7d59c Mon Sep 17 00:00:00 2001
From: Charlie Barto <chbarto@microsoft.com>
Date: Wed, 29 May 2024 17:24:45 -0700
Subject: [PATCH 208/230] [asan][windows] Eliminate the static asan runtime on
 windows (#81677)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is one of the major changes we (Microsoft) have made in the version
of asan we ship with Visual Studio.

@amyw-msft wrote a blog post outlining this work at
https://devblogs.microsoft.com/cppblog/msvc-address-sanitizer-one-dll-for-all-runtime-configurations/

> With Visual Studio 2022 version 17.7 Preview 3, we have refactored the
MSVC Address Sanitizer (ASan) to depend on one runtime DLL regardless of
the runtime configuration. This simplifies project onboarding and
supports more scenarios, particularly for projects statically linked
(/MT, /MTd) to the C Runtimes. However, static configurations have a new
dependency on the ASan runtime DLL.

> Summary of the changes:

> * ASan now works with /MT or /MTd built DLLs when the host EXE was not
compiled with ASan. This includes Windows services, COM components, and
plugins.
> * Configuring your project with ASan is now simpler, since your project
doesn’t need to uniformly specify the same [runtime
configuration](https://learn.microsoft.com/en-us/cpp/build/reference/md-mt-ld-use-run-time-library?view=msvc-170)
(/MT, /MTd, /MD, /MDd).
> * ASan workflows and pipelines for /MT or /MTd built projects will need to
ensure the ASan DLL (clang_rt.asan_dynamic-<arch>.dll) is available on
PATH.
> * The names of the ASan .lib files needed by the linker have changed (the
linker normally takes care of this if not manually specifying lib names
via /INFERASANLIBS).
> * You cannot mix ASan-compiled binaries from previous versions of the MSVC
Address Sanitizer (this is always true, but especially true in this
case).

Here's the description of these changes from our internal PR

1. Build one DLL that includes everything debug mode needs (not included
here, already contributed upstream).
* Remove #if _DEBUG checks everywhere.
* In some places, this needed to be replaced with a runtime check. In
asan_win.cpp, IsDebugRuntimePresent was added where we are searching for
allocations prior to ASAN initialization.
* In asan_win_runtime_functions.cpp and interception_win.cpp, we need to
be aware of debug runtime DLLs even when not built with _DEBUG.
2. Redirect statically linked functions to the ASAN DLL for /MT
* New exports for each of the C allocation APIs so that the statically
linked portion of the runtime can call them (see asan_malloc_win.cpp,
search MALLOC_DLL_EXPORT). Since we want our stack trace information to
be accurate and without noise, this means we need to capture stack frame
info from the original call and tell it to our DLL export. For this, I
have reused the __asan_win_new_delete_data used for op new/delete
support from asan_win_new_delete_thunk_common.h and moved it into
asan_win_thunk_common.h renamed as __asan_win_stack_data.
* For the C allocation APIs, a new file is included in the
statically-linked /WHOLEARCHIVE lib - asan_malloc_win_thunk.cpp. These
functions simply provide definitions for malloc/free/etc to be used
instead of the UCRT's definitions for /MT and instead call the ASAN DLL
export. /INFERASANLIBS ensures libucrt.lib will not take precedence via
/WHOLEARCHIVE.
* For other APIs, the interception code was called, so a new export is
provided: __sanitizer_override_function.
__sanitizer_override_function_by_addr is also provided to support
__except_handler4 on x86 (due to the security cookie being per-module).
3. Support weak symbols for /MD
* We have customers (CoreCLR) that rely on this behavior and would force
/MT to get it.
* There was sanitizer_win_weak_interception.cpp before, which did some
stuff for setting up the .WEAK section, but this only worked on /MT. Now
stuff registered in the .WEAK section is passed to the ASAN DLL via new
export __sanitizer_register_weak_function (impl in
sanitizer_win_interception.cpp). Unlike Linux, multiple weak symbol
registrations are possible here. The current behavior is to prioritize by
module load order, such that whichever module loads last (so priority is
given to the EXE) will have its weak symbol registered.
* Unfortunately, the registration can only occur during the user module
startup, which is after ASAN DLL startup, so any weak symbols used by
ASAN during initialization will not be picked up. This is most notable
for __asan_default_options and friends (see asan_flags.cpp). A mechanism
was made to add a callback for when a certain weak symbol was
registered, so now we process __asan_default_options during module
startup instead of ASAN startup. This is a change in behavior, but
there's no real way around this due to how DLLs are.
4. Build reorganization
* I noticed that our current build configuration is very MSVC-specific
and so did a bit of reworking. Removed a lot of
create_multiple_windows_obj_lib use since it's no longer needed and it
changed how we needed to refer to each object_lib by adding runtime
configuration to the name, conflicting with how it works for non-MSVC.
* No more Win32 static build, use /MD everywhere.
* Building with /Zl to avoid defaultlib warnings.

In addition:
* I've reapplied "[sanitizer][asan][win] Intercept _strdup on Windows
instead of strdup" which broke the previous static asan runtime. That
runtime is gone now and this change is required for the strdup tests to
work.
* I've modified the MSVC clang driver to support linking the correct
asan libraries, including via defining _DLL (which triggers different
defaultlibs and should result in the asan dll thunk being linked, along
with the dll CRT, via defaultlib directives).
* I've made passing -static-libsan an error on windows, and made
-shared-libsan the default. I'm not sure I did this correctly, or in the
best way.
* Modified the test harnesses to add substitutions for the dynamic and
static thunks and to make the library substitutions point to the dynamic
asan runtime for all test configurations on windows. Both the static and
dynamic windows test configurations remain, because they correspond to
the static and dynamic CRT, not the static and dynamic asan runtime
library.

---------

Co-authored-by: Amy Wishnousky <amyw@microsoft.com>
---
 clang/lib/Driver/SanitizerArgs.cpp            |  14 +-
 clang/lib/Driver/ToolChains/MSVC.cpp          |  26 +-
 clang/test/Driver/cl-link.c                   |  10 +-
 compiler-rt/CMakeLists.txt                    |   8 +-
 compiler-rt/lib/asan/CMakeLists.txt           | 159 ++++++------
 compiler-rt/lib/asan/asan_flags.cpp           |  96 +++++++-
 compiler-rt/lib/asan/asan_globals_win.cpp     |   4 +-
 compiler-rt/lib/asan/asan_malloc_win.cpp      |  97 ++++----
 .../lib/asan/asan_malloc_win_thunk.cpp        | 229 ++++++++++++++++++
 .../asan/asan_win_common_runtime_thunk.cpp    | 112 +++++++++
 .../lib/asan/asan_win_common_runtime_thunk.h  |  38 +++
 compiler-rt/lib/asan/asan_win_dll_thunk.cpp   | 165 -------------
 .../asan/asan_win_dynamic_runtime_thunk.cpp   | 104 ++------
 .../asan/asan_win_static_runtime_thunk.cpp    | 110 +++++++++
 compiler-rt/lib/asan/tests/CMakeLists.txt     |   2 +-
 compiler-rt/lib/profile/CMakeLists.txt        |   6 +
 .../lib/sanitizer_common/CMakeLists.txt       |  55 +----
 .../sanitizer_common_interface.inc            |   6 +
 .../sanitizer_coverage_win_dll_thunk.cpp      |  20 --
 ... sanitizer_coverage_win_runtime_thunk.cpp} |  21 +-
 ...nitizer_coverage_win_weak_interception.cpp |  23 --
 .../sanitizer_win_dll_thunk.cpp               | 101 --------
 .../sanitizer_win_dll_thunk.h                 | 181 --------------
 .../sanitizer_win_dynamic_runtime_thunk.cpp   |  26 --
 .../sanitizer_win_immortalize.h               |  71 ++++++
 .../sanitizer_win_interception.cpp            | 156 ++++++++++++
 .../sanitizer_win_interception.h              |  32 +++
 .../sanitizer_win_thunk_interception.cpp      | 110 +++++++++
 .../sanitizer_win_thunk_interception.h        |  81 +++++++
 .../sanitizer_win_weak_interception.cpp       |  94 -------
 .../sanitizer_win_weak_interception.h         |  32 ---
 compiler-rt/lib/ubsan/CMakeLists.txt          |  27 +--
 compiler-rt/lib/ubsan/ubsan_win_dll_thunk.cpp |  20 --
 ..._thunk.cpp => ubsan_win_runtime_thunk.cpp} |  11 +-
 .../lib/ubsan/ubsan_win_weak_interception.cpp |  23 --
 .../Darwin/interface_symbols_darwin.cpp       |   3 +
 .../Linux/interface_symbols_linux.cpp         |   3 +
 .../asan/TestCases/Windows/double_free.cpp    |   6 +-
 .../TestCases/Windows/free_hook_realloc.cpp   |   3 -
 .../TestCases/Windows/malloc_left_oob.cpp     |   2 +-
 .../TestCases/Windows/malloc_right_oob.cpp    |   2 +-
 .../asan/TestCases/Windows/malloc_uaf.cpp     |   4 +-
 .../TestCases/Windows/msvc/dll_and_lib.cpp    |   5 +-
 .../Windows/msvc/dll_large_function.cpp       |   3 +-
 .../TestCases/Windows/realloc_left_oob.cpp    |   2 +-
 .../TestCases/Windows/realloc_right_oob.cpp   |   2 +-
 .../asan/TestCases/Windows/realloc_uaf.cpp    |   4 +-
 .../asan/TestCases/Windows/symbols_path.cpp   |   2 +-
 .../asan/TestCases/Windows/unsymbolized.cpp   |   2 +-
 .../TestCases/Windows/use_after_realloc.cpp   |   4 +-
 .../test/asan/TestCases/debug_double_free.cpp |   3 -
 .../test/asan/TestCases/debug_report.cpp      |   3 -
 .../test/asan/TestCases/default_options.cpp   |   4 -
 .../test/asan/TestCases/on_error_callback.cpp |   3 -
 .../asan/TestCases/report_error_summary.cpp   |   3 -
 compiler-rt/test/asan/lit.cfg.py              |  23 +-
 56 files changed, 1287 insertions(+), 1069 deletions(-)
 create mode 100644 compiler-rt/lib/asan/asan_malloc_win_thunk.cpp
 create mode 100644 compiler-rt/lib/asan/asan_win_common_runtime_thunk.cpp
 create mode 100644 compiler-rt/lib/asan/asan_win_common_runtime_thunk.h
 delete mode 100644 compiler-rt/lib/asan/asan_win_dll_thunk.cpp
 create mode 100644 compiler-rt/lib/asan/asan_win_static_runtime_thunk.cpp
 delete mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_dll_thunk.cpp
 rename compiler-rt/lib/sanitizer_common/{sanitizer_coverage_win_dynamic_runtime_thunk.cpp => sanitizer_coverage_win_runtime_thunk.cpp} (59%)
 delete mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_weak_interception.cpp
 delete mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_win_dll_thunk.cpp
 delete mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_win_dll_thunk.h
 delete mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_win_dynamic_runtime_thunk.cpp
 create mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_win_immortalize.h
 create mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_win_interception.cpp
 create mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_win_interception.h
 create mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_win_thunk_interception.cpp
 create mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_win_thunk_interception.h
 delete mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_win_weak_interception.cpp
 delete mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_win_weak_interception.h
 delete mode 100644 compiler-rt/lib/ubsan/ubsan_win_dll_thunk.cpp
 rename compiler-rt/lib/ubsan/{ubsan_win_dynamic_runtime_thunk.cpp => ubsan_win_runtime_thunk.cpp} (62%)
 delete mode 100644 compiler-rt/lib/ubsan/ubsan_win_weak_interception.cpp

diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index 273f215ca94a88..7b7fd2d9d47421 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -909,10 +909,16 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC,
         DiagnoseErrors);
   }
 
-  SharedRuntime =
-      Args.hasFlag(options::OPT_shared_libsan, options::OPT_static_libsan,
-                   TC.getTriple().isAndroid() || TC.getTriple().isOSFuchsia() ||
-                       TC.getTriple().isOSDarwin());
+  SharedRuntime = Args.hasFlag(
+      options::OPT_shared_libsan, options::OPT_static_libsan,
+      TC.getTriple().isAndroid() || TC.getTriple().isOSFuchsia() ||
+          TC.getTriple().isOSDarwin() || TC.getTriple().isOSWindows());
+  if (!SharedRuntime && TC.getTriple().isOSWindows()) {
+    Arg *A =
+        Args.getLastArg(options::OPT_shared_libsan, options::OPT_static_libsan);
+    D.Diag(clang::diag::err_drv_unsupported_opt_for_target)
+        << A->getSpelling() << TC.getTriple().str();
+  }
 
   ImplicitCfiRuntime = TC.getTriple().isAndroid();
 
diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp
index b7021d4b996ddd..bf54f04363851b 100644
--- a/clang/lib/Driver/ToolChains/MSVC.cpp
+++ b/clang/lib/Driver/ToolChains/MSVC.cpp
@@ -201,10 +201,10 @@ void visualstudio::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   if (TC.getSanitizerArgs(Args).needsAsanRt()) {
     CmdArgs.push_back(Args.MakeArgString("-debug"));
     CmdArgs.push_back(Args.MakeArgString("-incremental:no"));
-    if (TC.getSanitizerArgs(Args).needsSharedRt() ||
-        Args.hasArg(options::OPT__SLASH_MD, options::OPT__SLASH_MDd)) {
-      for (const auto &Lib : {"asan_dynamic", "asan_dynamic_runtime_thunk"})
-        CmdArgs.push_back(TC.getCompilerRTArgString(Args, Lib));
+    CmdArgs.push_back(TC.getCompilerRTArgString(Args, "asan_dynamic"));
+    auto defines = Args.getAllArgValues(options::OPT_D);
+    if (Args.hasArg(options::OPT__SLASH_MD, options::OPT__SLASH_MDd) ||
+        find(begin(defines), end(defines), "_DLL") != end(defines)) {
       // Make sure the dynamic runtime thunk is not optimized out at link time
       // to ensure proper SEH handling.
       CmdArgs.push_back(Args.MakeArgString(
@@ -213,19 +213,15 @@ void visualstudio::Linker::ConstructJob(Compilation &C, const JobAction &JA,
               : "-include:__asan_seh_interceptor"));
       // Make sure the linker consider all object files from the dynamic runtime
       // thunk.
-      CmdArgs.push_back(Args.MakeArgString(std::string("-wholearchive:") +
+      CmdArgs.push_back(Args.MakeArgString(
+          std::string("-wholearchive:") +
           TC.getCompilerRT(Args, "asan_dynamic_runtime_thunk")));
-    } else if (DLL) {
-      CmdArgs.push_back(TC.getCompilerRTArgString(Args, "asan_dll_thunk"));
     } else {
-      for (const auto &Lib : {"asan", "asan_cxx"}) {
-        CmdArgs.push_back(TC.getCompilerRTArgString(Args, Lib));
-        // Make sure the linker consider all object files from the static lib.
-        // This is necessary because instrumented dlls need access to all the
-        // interface exported by the static lib in the main executable.
-        CmdArgs.push_back(Args.MakeArgString(std::string("-wholearchive:") +
-            TC.getCompilerRT(Args, Lib)));
-      }
+      // Make sure the linker consider all object files from the static runtime
+      // thunk.
+      CmdArgs.push_back(Args.MakeArgString(
+          std::string("-wholearchive:") +
+          TC.getCompilerRT(Args, "asan_static_runtime_thunk")));
     }
   }
 
diff --git a/clang/test/Driver/cl-link.c b/clang/test/Driver/cl-link.c
index ffd0b5ac4bade8..f5260442760452 100644
--- a/clang/test/Driver/cl-link.c
+++ b/clang/test/Driver/cl-link.c
@@ -13,10 +13,8 @@
 // ASAN: link.exe
 // ASAN: "-debug"
 // ASAN: "-incremental:no"
-// ASAN: "{{[^"]*}}clang_rt.asan.lib"
-// ASAN: "-wholearchive:{{.*}}clang_rt.asan.lib"
-// ASAN: "{{[^"]*}}clang_rt.asan_cxx.lib"
-// ASAN: "-wholearchive:{{.*}}clang_rt.asan_cxx.lib"
+// ASAN: "{{[^"]*}}clang_rt.asan_dynamic.lib"
+// ASAN: "-wholearchive:{{.*}}clang_rt.asan_static_runtime_thunk.lib"
 // ASAN: "{{.*}}cl-link{{.*}}.obj"
 
 // RUN: %clang_cl -m32 -arch:IA32 --target=i386-pc-win32 /MD /Tc%s -fuse-ld=link -### -fsanitize=address 2>&1 | FileCheck --check-prefix=ASAN-MD %s
@@ -24,7 +22,6 @@
 // ASAN-MD: "-debug"
 // ASAN-MD: "-incremental:no"
 // ASAN-MD: "{{.*}}clang_rt.asan_dynamic.lib"
-// ASAN-MD: "{{[^"]*}}clang_rt.asan_dynamic_runtime_thunk.lib"
 // ASAN-MD: "-include:___asan_seh_interceptor"
 // ASAN-MD: "-wholearchive:{{.*}}clang_rt.asan_dynamic_runtime_thunk.lib"
 // ASAN-MD: "{{.*}}cl-link{{.*}}.obj"
@@ -40,7 +37,8 @@
 // ASAN-DLL: "-dll"
 // ASAN-DLL: "-debug"
 // ASAN-DLL: "-incremental:no"
-// ASAN-DLL: "{{.*}}clang_rt.asan_dll_thunk.lib"
+// ASAN-DLL: "{{.*}}clang_rt.asan_dynamic.lib"
+// ASAN-DLL: "-wholearchive:{{.*}}clang_rt.asan_static_runtime_thunk.lib"
 // ASAN-DLL: "{{.*}}cl-link{{.*}}.obj"
 
 // RUN: %clang_cl /Zi /Tc%s -fuse-ld=link -### 2>&1 | FileCheck --check-prefix=DEBUG %s
diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt
index 6ce451e3cac2e3..158fa270c3f15a 100644
--- a/compiler-rt/CMakeLists.txt
+++ b/compiler-rt/CMakeLists.txt
@@ -378,8 +378,12 @@ if("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "s390x")
 endif()
 
 if(MSVC)
-  # FIXME: In fact, sanitizers should support both /MT and /MD, see PR20214.
-  set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded)
+
+  # ASan on Windows only supports the release DLL version of the runtimes, in the interest of
+  # only having one ASan DLL to support/test. Having ASan statically linked
+  # with the runtime might be possible, but it multiplies the number of scenarios to test.
+  # The program USING sanitizers can use whatever version of the runtime it wants to.
+  set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDLL)
 
   # Remove any /M[DT][d] flags, and strip any definitions of _DEBUG.
   # Since we're using CMAKE_MSVC_RUNTIME_LIBRARY (CMP0091 set to NEW),
diff --git a/compiler-rt/lib/asan/CMakeLists.txt b/compiler-rt/lib/asan/CMakeLists.txt
index 463ea233b37aa4..f992419c6d9822 100644
--- a/compiler-rt/lib/asan/CMakeLists.txt
+++ b/compiler-rt/lib/asan/CMakeLists.txt
@@ -32,6 +32,20 @@ set(ASAN_SOURCES
   asan_win.cpp
   )
 
+if(WIN32)
+  set(ASAN_DYNAMIC_RUNTIME_THUNK_SOURCES
+    asan_globals_win.cpp
+    asan_win_common_runtime_thunk.cpp
+    asan_win_dynamic_runtime_thunk.cpp
+    )
+  set(ASAN_STATIC_RUNTIME_THUNK_SOURCES
+    asan_globals_win.cpp
+    asan_malloc_win_thunk.cpp
+    asan_win_common_runtime_thunk.cpp
+    asan_win_static_runtime_thunk.cpp
+    )
+endif()
+
 if (NOT WIN32 AND NOT APPLE)
   list(APPEND ASAN_SOURCES
     asan_interceptors_vfork.S
@@ -136,7 +150,7 @@ append_list_if(MINGW "${MINGW_LIBRARIES}" ASAN_DYNAMIC_LIBS)
 add_compiler_rt_object_libraries(RTAsan_dynamic
   OS ${SANITIZER_COMMON_SUPPORTED_OS}
   ARCHS ${ASAN_SUPPORTED_ARCH}
-  SOURCES ${ASAN_SOURCES} ${ASAN_CXX_SOURCES}
+  SOURCES ${ASAN_SOURCES}
   ADDITIONAL_HEADERS ${ASAN_HEADERS}
   CFLAGS ${ASAN_DYNAMIC_CFLAGS}
   DEFS ${ASAN_DYNAMIC_DEFINITIONS})
@@ -221,46 +235,52 @@ else()
     RTSanitizerCommonSymbolizerInternal
     RTLSanCommon
     RTUbsan)
+  if (NOT WIN32)
+    add_compiler_rt_runtime(clang_rt.asan
+      STATIC
+      ARCHS ${ASAN_SUPPORTED_ARCH}
+      OBJECT_LIBS RTAsan_preinit
+                  RTAsan
+                  ${ASAN_COMMON_RUNTIME_OBJECT_LIBS}
+      CFLAGS ${ASAN_CFLAGS}
+      DEFS ${ASAN_COMMON_DEFINITIONS}
+      PARENT_TARGET asan)
 
-  add_compiler_rt_runtime(clang_rt.asan
-    STATIC
-    ARCHS ${ASAN_SUPPORTED_ARCH}
-    OBJECT_LIBS RTAsan_preinit
-                RTAsan
-                ${ASAN_COMMON_RUNTIME_OBJECT_LIBS}
-    CFLAGS ${ASAN_CFLAGS}
-    DEFS ${ASAN_COMMON_DEFINITIONS}
-    PARENT_TARGET asan)
-
-  add_compiler_rt_runtime(clang_rt.asan_cxx
-    STATIC
-    ARCHS ${ASAN_SUPPORTED_ARCH}
-    OBJECT_LIBS RTAsan_cxx
-                RTUbsan_cxx
-    CFLAGS ${ASAN_CFLAGS}
-    DEFS ${ASAN_COMMON_DEFINITIONS}
-    PARENT_TARGET asan)
+    add_compiler_rt_runtime(clang_rt.asan_cxx
+      STATIC
+      ARCHS ${ASAN_SUPPORTED_ARCH}
+      OBJECT_LIBS RTAsan_cxx
+                  RTUbsan_cxx
+      CFLAGS ${ASAN_CFLAGS}
+      DEFS ${ASAN_COMMON_DEFINITIONS}
+      PARENT_TARGET asan)
 
-  add_compiler_rt_runtime(clang_rt.asan_static
-    STATIC
-    ARCHS ${ASAN_SUPPORTED_ARCH}
-    OBJECT_LIBS RTAsan_static
-    CFLAGS ${ASAN_CFLAGS}
-    DEFS ${ASAN_COMMON_DEFINITIONS}
-    PARENT_TARGET asan)
+    add_compiler_rt_runtime(clang_rt.asan_static
+      STATIC
+      ARCHS ${ASAN_SUPPORTED_ARCH}
+      OBJECT_LIBS RTAsan_static
+      CFLAGS ${ASAN_CFLAGS}
+      DEFS ${ASAN_COMMON_DEFINITIONS}
+      PARENT_TARGET asan)
 
-  add_compiler_rt_runtime(clang_rt.asan-preinit
-    STATIC
-    ARCHS ${ASAN_SUPPORTED_ARCH}
-    OBJECT_LIBS RTAsan_preinit
-    CFLAGS ${ASAN_CFLAGS}
-    DEFS ${ASAN_COMMON_DEFINITIONS}
-    PARENT_TARGET asan)
+    add_compiler_rt_runtime(clang_rt.asan-preinit
+      STATIC
+      ARCHS ${ASAN_SUPPORTED_ARCH}
+      OBJECT_LIBS RTAsan_preinit
+      CFLAGS ${ASAN_CFLAGS}
+      DEFS ${ASAN_COMMON_DEFINITIONS}
+      PARENT_TARGET asan)
+  endif()
 
   foreach(arch ${ASAN_SUPPORTED_ARCH})
     if (COMPILER_RT_HAS_VERSION_SCRIPT)
+      if(WIN32)
+        set(SANITIZER_RT_VERSION_LIST_LIBS clang_rt.asan-${arch})
+      else()
+        set(SANITIZER_RT_VERSION_LIST_LIBS clang_rt.asan-${arch} clang_rt.asan_cxx-${arch})
+      endif()
       add_sanitizer_rt_version_list(clang_rt.asan-dynamic-${arch}
-                                    LIBS clang_rt.asan-${arch} clang_rt.asan_cxx-${arch}
+                                    LIBS ${SANITIZER_RT_VERSION_LIST_LIBS}
                                     EXTRA asan.syms.extra)
       set(VERSION_SCRIPT_FLAG
            -Wl,--version-script,${CMAKE_CURRENT_BINARY_DIR}/clang_rt.asan-dynamic-${arch}.vers)
@@ -278,25 +298,11 @@ else()
     endif()
 
     set(ASAN_DYNAMIC_WEAK_INTERCEPTION)
-    if (WIN32)
-      add_compiler_rt_object_libraries(AsanWeakInterception
-        ${SANITIZER_COMMON_SUPPORTED_OS}
-        ARCHS ${arch}
-        SOURCES
-          asan_win_weak_interception.cpp
-        CFLAGS ${ASAN_CFLAGS} -DSANITIZER_DYNAMIC
-        DEFS ${ASAN_COMMON_DEFINITIONS})
-      set(ASAN_DYNAMIC_WEAK_INTERCEPTION
-          AsanWeakInterception
-          UbsanWeakInterception
-          SancovWeakInterception
-          SanitizerCommonWeakInterception)
-    endif()
-
     add_compiler_rt_runtime(clang_rt.asan
       SHARED
       ARCHS ${arch}
       OBJECT_LIBS ${ASAN_COMMON_RUNTIME_OBJECT_LIBS}
+              RTAsan_cxx
               RTAsan_dynamic
               # The only purpose of RTAsan_dynamic_version_script_dummy is to
               # carry a dependency of the shared runtime on the version script.
@@ -324,36 +330,12 @@ else()
     endif()
 
     if (WIN32)
-      add_compiler_rt_object_libraries(AsanDllThunk
-        ${SANITIZER_COMMON_SUPPORTED_OS}
-        ARCHS ${arch}
-        SOURCES asan_globals_win.cpp
-                asan_win_dll_thunk.cpp
-        CFLAGS ${ASAN_CFLAGS} -DSANITIZER_DLL_THUNK
-        DEFS ${ASAN_COMMON_DEFINITIONS})
-
-      add_compiler_rt_runtime(clang_rt.asan_dll_thunk
-        STATIC
-        ARCHS ${arch}
-        OBJECT_LIBS AsanDllThunk
-                    UbsanDllThunk
-                    SancovDllThunk
-                    SanitizerCommonDllThunk
-        SOURCES $<TARGET_OBJECTS:RTInterception.${arch}>
-        PARENT_TARGET asan)
-
       set(DYNAMIC_RUNTIME_THUNK_CFLAGS "-DSANITIZER_DYNAMIC_RUNTIME_THUNK")
-      if(MSVC)
-        list(APPEND DYNAMIC_RUNTIME_THUNK_CFLAGS "-Zl")
-      elseif(CMAKE_C_COMPILER_ID MATCHES Clang)
-        list(APPEND DYNAMIC_RUNTIME_THUNK_CFLAGS "-nodefaultlibs")
-      endif()
 
       add_compiler_rt_object_libraries(AsanDynamicRuntimeThunk
         ${SANITIZER_COMMON_SUPPORTED_OS}
         ARCHS ${arch}
-        SOURCES asan_globals_win.cpp
-                asan_win_dynamic_runtime_thunk.cpp
+        SOURCES ${ASAN_DYNAMIC_RUNTIME_THUNK_SOURCES}
         CFLAGS ${ASAN_CFLAGS} ${DYNAMIC_RUNTIME_THUNK_CFLAGS}
         DEFS ${ASAN_COMMON_DEFINITIONS})
 
@@ -361,12 +343,35 @@ else()
         STATIC
         ARCHS ${arch}
         OBJECT_LIBS AsanDynamicRuntimeThunk
-                    UbsanDynamicRuntimeThunk
-                    SancovDynamicRuntimeThunk
-                    SanitizerCommonDynamicRuntimeThunk
+                    UbsanRuntimeThunk
+                    SancovRuntimeThunk
+                    SanitizerRuntimeThunk
         CFLAGS ${ASAN_CFLAGS} ${DYNAMIC_RUNTIME_THUNK_CFLAGS}
         DEFS ${ASAN_COMMON_DEFINITIONS}
         PARENT_TARGET asan)
+
+      # mingw does not support static linkage of the CRT
+      if(NOT MINGW)
+        set(STATIC_RUNTIME_THUNK_CFLAGS "-DSANITIZER_STATIC_RUNTIME_THUNK")
+
+        add_compiler_rt_object_libraries(AsanStaticRuntimeThunk
+          ${SANITIZER_COMMON_SUPPORTED_OS}
+          ARCHS ${arch}
+          SOURCES ${ASAN_STATIC_RUNTIME_THUNK_SOURCES}
+          CFLAGS ${ASAN_DYNAMIC_CFLAGS} ${STATIC_RUNTIME_THUNK_CFLAGS}
+          DEFS ${ASAN_DYNAMIC_DEFINITIONS})
+
+        add_compiler_rt_runtime(clang_rt.asan_static_runtime_thunk
+          STATIC
+          ARCHS ${arch}
+          OBJECT_LIBS AsanStaticRuntimeThunk
+                      UbsanRuntimeThunk
+                      SancovRuntimeThunk
+                      SanitizerRuntimeThunk
+          CFLAGS ${ASAN_DYNAMIC_CFLAGS} ${STATIC_RUNTIME_THUNK_CFLAGS}
+          DEFS ${ASAN_DYNAMIC_DEFINITIONS}
+          PARENT_TARGET asan)
+      endif()
     endif()
   endforeach()
 endif()
diff --git a/compiler-rt/lib/asan/asan_flags.cpp b/compiler-rt/lib/asan/asan_flags.cpp
index 23989843323211..56deb1b0d082b8 100644
--- a/compiler-rt/lib/asan/asan_flags.cpp
+++ b/compiler-rt/lib/asan/asan_flags.cpp
@@ -11,14 +11,16 @@
 // ASan flag parsing logic.
 //===----------------------------------------------------------------------===//
 
-#include "asan_activation.h"
 #include "asan_flags.h"
+
+#include "asan_activation.h"
 #include "asan_interface_internal.h"
 #include "asan_stack.h"
 #include "lsan/lsan_common.h"
 #include "sanitizer_common/sanitizer_common.h"
-#include "sanitizer_common/sanitizer_flags.h"
 #include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_flags.h"
+#include "sanitizer_common/sanitizer_win_interception.h"
 #include "ubsan/ubsan_flags.h"
 #include "ubsan/ubsan_platform.h"
 
@@ -47,7 +49,21 @@ static void RegisterAsanFlags(FlagParser *parser, Flags *f) {
 #undef ASAN_FLAG
 }
 
-void InitializeFlags() {
+static void DisplayHelpMessages(FlagParser *parser) {
+  // TODO(eugenis): dump all flags at verbosity>=2?
+  if (Verbosity()) {
+    ReportUnrecognizedFlags();
+  }
+
+  if (common_flags()->help) {
+    parser->PrintFlagDescriptions();
+  }
+}
+
+static void InitializeDefaultFlags() {
+  Flags *f = flags();
+  FlagParser asan_parser;
+
   // Set the default values and prepare for parsing ASan and common flags.
   SetCommonFlagsDefaults();
   {
@@ -60,10 +76,8 @@ void InitializeFlags() {
     cf.exitcode = 1;
     OverrideCommonFlags(cf);
   }
-  Flags *f = flags();
   f->SetDefaults();
 
-  FlagParser asan_parser;
   RegisterAsanFlags(&asan_parser, f);
   RegisterCommonFlags(&asan_parser);
 
@@ -126,13 +140,12 @@ void InitializeFlags() {
 
   InitializeCommonFlags();
 
-  // TODO(eugenis): dump all flags at verbosity>=2?
-  if (Verbosity()) ReportUnrecognizedFlags();
+  // TODO(samsonov): print all of the flags (ASan, LSan, common).
+  DisplayHelpMessages(&asan_parser);
+}
 
-  if (common_flags()->help) {
-    // TODO(samsonov): print all of the flags (ASan, LSan, common).
-    asan_parser.PrintFlagDescriptions();
-  }
+static void ProcessFlags() {
+  Flags *f = flags();
 
   // Flag validation:
   if (!CAN_SANITIZE_LEAKS && common_flags()->detect_leaks) {
@@ -199,6 +212,67 @@ void InitializeFlags() {
   }
 }
 
+void InitializeFlags() {
+  InitializeDefaultFlags();
+  ProcessFlags();
+
+#if SANITIZER_WINDOWS
+  // On Windows, weak symbols are emulated by having the user program
+  // register which weak functions are defined.
+  // The ASAN DLL will initialize flags prior to user module initialization,
+  // so __asan_default_options will not point to the user definition yet.
+  // We still want to ensure we capture when options are passed via
+  // __asan_default_options, so we add a callback to be run
+  // when it is registered with the runtime.
+
+  // There is theoretically time between the initial ProcessFlags and
+  // registering the weak callback where a weak function could be added and we
+  // would miss it, but in practice, InitializeFlags will always happen under
+  // the loader lock (if built as a DLL) and so will any calls to
+  // __sanitizer_register_weak_function.
+  AddRegisterWeakFunctionCallback(
+      reinterpret_cast<uptr>(__asan_default_options), []() {
+        FlagParser asan_parser;
+
+        RegisterAsanFlags(&asan_parser, flags());
+        RegisterCommonFlags(&asan_parser);
+        asan_parser.ParseString(__asan_default_options());
+
+        DisplayHelpMessages(&asan_parser);
+        ProcessFlags();
+      });
+
+#  if CAN_SANITIZE_UB
+  AddRegisterWeakFunctionCallback(
+      reinterpret_cast<uptr>(__ubsan_default_options), []() {
+        FlagParser ubsan_parser;
+
+        __ubsan::RegisterUbsanFlags(&ubsan_parser, __ubsan::flags());
+        RegisterCommonFlags(&ubsan_parser);
+        ubsan_parser.ParseString(__ubsan_default_options());
+
+        // To match normal behavior, do not print UBSan help.
+        ProcessFlags();
+      });
+#  endif
+
+#  if CAN_SANITIZE_LEAKS
+  AddRegisterWeakFunctionCallback(
+      reinterpret_cast<uptr>(__lsan_default_options), []() {
+        FlagParser lsan_parser;
+
+        __lsan::RegisterLsanFlags(&lsan_parser, __lsan::flags());
+        RegisterCommonFlags(&lsan_parser);
+        lsan_parser.ParseString(__lsan_default_options());
+
+        // To match normal behavior, do not print LSan help.
+        ProcessFlags();
+      });
+#  endif
+
+#endif
+}
+
 }  // namespace __asan
 
 SANITIZER_INTERFACE_WEAK_DEF(const char*, __asan_default_options, void) {
diff --git a/compiler-rt/lib/asan/asan_globals_win.cpp b/compiler-rt/lib/asan/asan_globals_win.cpp
index 19af88ab12b40a..8267f07b9cce49 100644
--- a/compiler-rt/lib/asan/asan_globals_win.cpp
+++ b/compiler-rt/lib/asan/asan_globals_win.cpp
@@ -28,7 +28,9 @@ static void call_on_globals(void (*hook)(__asan_global *, uptr)) {
   __asan_global *end = &__asan_globals_end;
   uptr bytediff = (uptr)end - (uptr)start;
   if (bytediff % sizeof(__asan_global) != 0) {
-#if defined(SANITIZER_DLL_THUNK) || defined(SANITIZER_DYNAMIC_RUNTIME_THUNK)
+#  if defined(SANITIZER_DLL_THUNK) ||             \
+      defined(SANITIZER_DYNAMIC_RUNTIME_THUNK) || \
+      defined(SANITIZER_STATIC_RUNTIME_THUNK)
     __debugbreak();
 #else
     CHECK("corrupt asan global array");
diff --git a/compiler-rt/lib/asan/asan_malloc_win.cpp b/compiler-rt/lib/asan/asan_malloc_win.cpp
index 7e1d04c36dd580..3278f072198769 100644
--- a/compiler-rt/lib/asan/asan_malloc_win.cpp
+++ b/compiler-rt/lib/asan/asan_malloc_win.cpp
@@ -58,97 +58,69 @@ using namespace __asan;
 // MD: Memory allocation functions are defined in the CRT .dll,
 // so we have to intercept them before they are called for the first time.
 
-#if ASAN_DYNAMIC
-# define ALLOCATION_FUNCTION_ATTRIBUTE
-#else
-# define ALLOCATION_FUNCTION_ATTRIBUTE SANITIZER_INTERFACE_ATTRIBUTE
-#endif
-
 extern "C" {
-ALLOCATION_FUNCTION_ATTRIBUTE
-size_t _msize(void *ptr) {
+__declspec(noinline) size_t _msize(void *ptr) {
   GET_CURRENT_PC_BP_SP;
   (void)sp;
   return asan_malloc_usable_size(ptr, pc, bp);
 }
 
-ALLOCATION_FUNCTION_ATTRIBUTE
-size_t _msize_base(void *ptr) {
-  return _msize(ptr);
-}
+__declspec(noinline) size_t _msize_base(void *ptr) { return _msize(ptr); }
 
-ALLOCATION_FUNCTION_ATTRIBUTE
-void free(void *ptr) {
+__declspec(noinline) void free(void *ptr) {
   GET_STACK_TRACE_FREE;
   return asan_free(ptr, &stack, FROM_MALLOC);
 }
 
-ALLOCATION_FUNCTION_ATTRIBUTE
-void _free_dbg(void *ptr, int) {
-  free(ptr);
-}
+__declspec(noinline) void _free_dbg(void *ptr, int) { free(ptr); }
 
-ALLOCATION_FUNCTION_ATTRIBUTE
-void _free_base(void *ptr) {
-  free(ptr);
-}
+__declspec(noinline) void _free_base(void *ptr) { free(ptr); }
 
-ALLOCATION_FUNCTION_ATTRIBUTE
-void *malloc(size_t size) {
+__declspec(noinline) void *malloc(size_t size) {
   GET_STACK_TRACE_MALLOC;
   return asan_malloc(size, &stack);
 }
 
-ALLOCATION_FUNCTION_ATTRIBUTE
-void *_malloc_base(size_t size) {
-  return malloc(size);
-}
+__declspec(noinline) void *_malloc_base(size_t size) { return malloc(size); }
 
-ALLOCATION_FUNCTION_ATTRIBUTE
-void *_malloc_dbg(size_t size, int, const char *, int) {
+__declspec(noinline) void *_malloc_dbg(size_t size, int, const char *, int) {
   return malloc(size);
 }
 
-ALLOCATION_FUNCTION_ATTRIBUTE
-void *calloc(size_t nmemb, size_t size) {
+__declspec(noinline) void *calloc(size_t nmemb, size_t size) {
   GET_STACK_TRACE_MALLOC;
   return asan_calloc(nmemb, size, &stack);
 }
 
-ALLOCATION_FUNCTION_ATTRIBUTE
-void *_calloc_base(size_t nmemb, size_t size) {
+__declspec(noinline) void *_calloc_base(size_t nmemb, size_t size) {
   return calloc(nmemb, size);
 }
 
-ALLOCATION_FUNCTION_ATTRIBUTE
-void *_calloc_dbg(size_t nmemb, size_t size, int, const char *, int) {
+__declspec(noinline) void *_calloc_dbg(size_t nmemb, size_t size, int,
+                                       const char *, int) {
   return calloc(nmemb, size);
 }
 
-ALLOCATION_FUNCTION_ATTRIBUTE
-void *_calloc_impl(size_t nmemb, size_t size, int *errno_tmp) {
+__declspec(noinline) void *_calloc_impl(size_t nmemb, size_t size,
+                                        int *errno_tmp) {
   return calloc(nmemb, size);
 }
 
-ALLOCATION_FUNCTION_ATTRIBUTE
-void *realloc(void *ptr, size_t size) {
+__declspec(noinline) void *realloc(void *ptr, size_t size) {
   GET_STACK_TRACE_MALLOC;
   return asan_realloc(ptr, size, &stack);
 }
 
-ALLOCATION_FUNCTION_ATTRIBUTE
-void *_realloc_dbg(void *ptr, size_t size, int) {
+__declspec(noinline) void *_realloc_dbg(void *ptr, size_t size, int) {
   UNREACHABLE("_realloc_dbg should not exist!");
   return 0;
 }
 
-ALLOCATION_FUNCTION_ATTRIBUTE
-void *_realloc_base(void *ptr, size_t size) {
+__declspec(noinline) void *_realloc_base(void *ptr, size_t size) {
   return realloc(ptr, size);
 }
 
-ALLOCATION_FUNCTION_ATTRIBUTE
-void *_recalloc(void *p, size_t n, size_t elem_size) {
+__declspec(noinline) void *_recalloc(void *p, size_t n, size_t elem_size) {
   if (!p)
     return calloc(n, elem_size);
   const size_t size = n * elem_size;
@@ -166,23 +138,41 @@ void *_recalloc(void *p, size_t n, size_t elem_size) {
   return new_alloc;
 }
 
-ALLOCATION_FUNCTION_ATTRIBUTE
-void *_recalloc_base(void *p, size_t n, size_t elem_size) {
+__declspec(noinline) void *_recalloc_base(void *p, size_t n, size_t elem_size) {
   return _recalloc(p, n, elem_size);
 }
 
-ALLOCATION_FUNCTION_ATTRIBUTE
-void *_expand(void *memblock, size_t size) {
+__declspec(noinline) void *_expand(void *memblock, size_t size) {
   // _expand is used in realloc-like functions to resize the buffer if possible.
   // We don't want memory to stand still while resizing buffers, so return 0.
   return 0;
 }
 
-ALLOCATION_FUNCTION_ATTRIBUTE
-void *_expand_dbg(void *memblock, size_t size) {
+__declspec(noinline) void *_expand_dbg(void *memblock, size_t size) {
   return _expand(memblock, size);
 }
 
+__declspec(dllexport) size_t __cdecl __asan_msize(void *ptr) {
+  return _msize(ptr);
+}
+__declspec(dllexport) void __cdecl __asan_free(void *const ptr) { free(ptr); }
+__declspec(dllexport) void *__cdecl __asan_malloc(const size_t size) {
+  return malloc(size);
+}
+__declspec(dllexport) void *__cdecl __asan_calloc(const size_t nmemb,
+                                                  const size_t size) {
+  return calloc(nmemb, size);
+}
+__declspec(dllexport) void *__cdecl __asan_realloc(void *const ptr,
+                                                   const size_t size) {
+  return realloc(ptr, size);
+}
+__declspec(dllexport) void *__cdecl __asan_recalloc(void *const ptr,
+                                                    const size_t nmemb,
+                                                    const size_t size) {
+  return _recalloc(ptr, nmemb, size);
+}
+
 // TODO(timurrrr): Might want to add support for _aligned_* allocation
 // functions to detect a bit more bugs.  Those functions seem to wrap malloc().
 
@@ -487,7 +477,6 @@ static void TryToOverrideFunction(const char *fname, uptr new_func) {
 }
 
 void ReplaceSystemMalloc() {
-#if defined(ASAN_DYNAMIC)
   TryToOverrideFunction("free", (uptr)free);
   TryToOverrideFunction("_free_base", (uptr)free);
   TryToOverrideFunction("malloc", (uptr)malloc);
@@ -543,8 +532,6 @@ void ReplaceSystemMalloc() {
   // allocation API will be directed to ASan's heap. We don't currently
   // intercept all calls to HeapAlloc. If we did, we would have to check on
   // HeapFree whether the pointer came from ASan of from the system.
-
-#endif  // defined(ASAN_DYNAMIC)
 }
 }  // namespace __asan
 
diff --git a/compiler-rt/lib/asan/asan_malloc_win_thunk.cpp b/compiler-rt/lib/asan/asan_malloc_win_thunk.cpp
new file mode 100644
index 00000000000000..abf515b77c4a9f
--- /dev/null
+++ b/compiler-rt/lib/asan/asan_malloc_win_thunk.cpp
@@ -0,0 +1,229 @@
+//===-- asan_malloc_win_thunk.cpp
+//-----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of AddressSanitizer, an address sanity checker.
+//
+// Windows-specific malloc interception.
+// This is included statically for projects statically linking
+// with the C Runtime (/MT, /MTd) in order to provide ASAN-aware
+// versions of the C allocation functions.
+//===----------------------------------------------------------------------===//
+
+#ifdef SANITIZER_STATIC_RUNTIME_THUNK
+#  include "..\sanitizer_common\sanitizer_allocator_interface.h"
+// #include "asan_win_thunk_common.h"
+
+// Preserve stack traces with noinline.
+#  define STATIC_MALLOC_INTERFACE __declspec(noinline)
+
+extern "C" {
+__declspec(dllimport) size_t __cdecl __asan_msize(void *ptr);
+__declspec(dllimport) void __cdecl __asan_free(void *const ptr);
+__declspec(dllimport) void *__cdecl __asan_malloc(const size_t size);
+__declspec(dllimport) void *__cdecl __asan_calloc(const size_t nmemb,
+                                                  const size_t size);
+__declspec(dllimport) void *__cdecl __asan_realloc(void *const ptr,
+                                                   const size_t size);
+__declspec(dllimport) void *__cdecl __asan_recalloc(void *const ptr,
+                                                    const size_t nmemb,
+                                                    const size_t size);
+
+// Avoid tailcall optimization to preserve stack frames.
+#  pragma optimize("", off)
+
+// _msize
+STATIC_MALLOC_INTERFACE size_t _msize(void *ptr) { return __asan_msize(ptr); }
+
+STATIC_MALLOC_INTERFACE size_t _msize_base(void *ptr) {
+  return __asan_msize(ptr);
+}
+
+STATIC_MALLOC_INTERFACE size_t _msize_dbg(void *ptr) {
+  return __asan_msize(ptr);
+}
+
+// free
+STATIC_MALLOC_INTERFACE void free(void *const ptr) { return __asan_free(ptr); }
+
+STATIC_MALLOC_INTERFACE void _free_base(void *const ptr) {
+  return __asan_free(ptr);
+}
+
+STATIC_MALLOC_INTERFACE void _free_dbg(void *const ptr) {
+  return __asan_free(ptr);
+}
+
+// malloc
+STATIC_MALLOC_INTERFACE void *malloc(const size_t size) {
+  return __asan_malloc(size);
+}
+
+STATIC_MALLOC_INTERFACE void *_malloc_base(const size_t size) {
+  return __asan_malloc(size);
+}
+
+STATIC_MALLOC_INTERFACE void *_malloc_dbg(const size_t size) {
+  return __asan_malloc(size);
+}
+
+// calloc
+STATIC_MALLOC_INTERFACE void *calloc(const size_t nmemb, const size_t size) {
+  return __asan_calloc(nmemb, size);
+}
+
+STATIC_MALLOC_INTERFACE void *_calloc_base(const size_t nmemb,
+                                           const size_t size) {
+  return __asan_calloc(nmemb, size);
+}
+
+STATIC_MALLOC_INTERFACE void *_calloc_impl(const size_t nmemb,
+                                           const size_t size,
+                                           int *const errno_tmp) {
+  // Provided by legacy msvcrt.
+  (void)errno_tmp;
+
+  return __asan_calloc(nmemb, size);
+}
+
+STATIC_MALLOC_INTERFACE void *_calloc_dbg(const size_t nmemb, const size_t size,
+                                          int, const char *, int) {
+  return __asan_calloc(nmemb, size);
+}
+
+// realloc
+STATIC_MALLOC_INTERFACE void *realloc(void *const ptr, const size_t size) {
+  return __asan_realloc(ptr, size);
+}
+
+STATIC_MALLOC_INTERFACE void *_realloc_base(void *const ptr,
+                                            const size_t size) {
+  return __asan_realloc(ptr, size);
+}
+
+STATIC_MALLOC_INTERFACE void *_realloc_dbg(void *const ptr, const size_t size,
+                                           int, const char *, int) {
+  return __asan_realloc(ptr, size);
+}
+
+// recalloc
+STATIC_MALLOC_INTERFACE void *_recalloc(void *const ptr, const size_t nmemb,
+                                        const size_t size) {
+  return __asan_recalloc(ptr, nmemb, size);
+}
+
+STATIC_MALLOC_INTERFACE void *_recalloc_base(void *const ptr,
+                                             const size_t nmemb,
+                                             const size_t size) {
+  return __asan_recalloc(ptr, nmemb, size);
+}
+
+STATIC_MALLOC_INTERFACE void *_recalloc_dbg(void *const ptr, const size_t nmemb,
+                                            const size_t size, int,
+                                            const char *, int) {
+  return __asan_recalloc(ptr, nmemb, size);
+}
+
+// expand
+STATIC_MALLOC_INTERFACE void *_expand(void *, size_t) {
+  // _expand is used in realloc-like functions to resize the buffer if possible.
+  // We don't want memory to stand still while resizing buffers, so return 0.
+  return nullptr;
+}
+
+STATIC_MALLOC_INTERFACE void *_expand_dbg(void *, size_t, int, const char *,
+                                          int) {
+  return nullptr;
+}
+
+// We need to provide symbols for all the debug CRT functions if we decide to
+// provide any. Most of these functions make no sense under ASan and so we
+// make them no-ops.
+long _CrtSetBreakAlloc(long const) { return ~0; }
+
+void _CrtSetDbgBlockType(void *const, int const) { return; }
+
+typedef int(__cdecl *CRT_ALLOC_HOOK)(int, void *, size_t, int, long,
+                                     const unsigned char *, int);
+
+CRT_ALLOC_HOOK _CrtGetAllocHook() { return nullptr; }
+
+CRT_ALLOC_HOOK _CrtSetAllocHook(CRT_ALLOC_HOOK const hook) { return hook; }
+
+int _CrtCheckMemory() { return 1; }
+
+int _CrtSetDbgFlag(int const new_bits) { return new_bits; }
+
+typedef void (*CrtDoForAllClientObjectsCallback)(void *, void *);
+
+void _CrtDoForAllClientObjects(CrtDoForAllClientObjectsCallback const,
+                               void *const) {
+  return;
+}
+
+int _CrtIsValidPointer(void const *const p, unsigned int const, int const) {
+  return p != nullptr;
+}
+
+int _CrtIsValidHeapPointer(void const *const block) {
+  if (!block) {
+    return 0;
+  }
+
+  return __sanitizer_get_ownership(block);
+}
+
+int _CrtIsMemoryBlock(void const *const, unsigned const, long *const,
+                      char **const, int *const) {
+  return 0;
+}
+
+int _CrtReportBlockType(void const *const) { return -1; }
+
+typedef void(__cdecl *CRT_DUMP_CLIENT)(void *, size_t);
+
+CRT_DUMP_CLIENT _CrtGetDumpClient() { return nullptr; }
+
+CRT_DUMP_CLIENT _CrtSetDumpClient(CRT_DUMP_CLIENT new_client) {
+  return new_client;
+}
+
+void _CrtMemCheckpoint(void *const) { return; }
+
+int _CrtMemDifference(void *const, void const *const, void const *const) {
+  return 0;
+}
+
+void _CrtMemDumpAllObjectsSince(void const *const) { return; }
+
+int _CrtDumpMemoryLeaks() { return 0; }
+
+void _CrtMemDumpStatistics(void const *const) { return; }
+
+int _crtDbgFlag{0};
+long _crtBreakAlloc{-1};
+CRT_DUMP_CLIENT _pfnDumpClient{nullptr};
+
+int *__p__crtDbgFlag() { return &_crtDbgFlag; }
+
+long *__p__crtBreakAlloc() { return &_crtBreakAlloc; }
+
+// TODO: These were added upstream but conflict with definitions in ucrtbased.
+// int _CrtDbgReport(int, const char *, int, const char *, const char *, ...) {
+//   ShowStatsAndAbort();
+// }
+//
+// int _CrtDbgReportW(int reportType, const wchar_t *, int, const wchar_t *,
+//                    const wchar_t *, ...) {
+//   ShowStatsAndAbort();
+// }
+//
+// int _CrtSetReportMode(int, int) { return 0; }
+
+}  // extern "C"
+#endif  // SANITIZER_STATIC_RUNTIME_THUNK
diff --git a/compiler-rt/lib/asan/asan_win_common_runtime_thunk.cpp b/compiler-rt/lib/asan/asan_win_common_runtime_thunk.cpp
new file mode 100644
index 00000000000000..d2c9e66c313379
--- /dev/null
+++ b/compiler-rt/lib/asan/asan_win_common_runtime_thunk.cpp
@@ -0,0 +1,112 @@
+//===-- asan_win_common_runtime_thunk.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of AddressSanitizer, an address sanity checker.
+//
+// This file defines things that need to be present in the application modules
+// to interact with the ASan DLL runtime correctly and can't be implemented
+// using the default "import library" generated when linking the DLL.
+//
+// This includes:
+//  - Cloning shadow memory dynamic address from ASAN DLL
+//  - Creating weak aliases to default implementation imported from ASAN DLL
+//  - Forwarding the detect_stack_use_after_return runtime option
+//  - Installing a custom SEH handler
+//
+//===----------------------------------------------------------------------===//
+
+#if defined(SANITIZER_DYNAMIC_RUNTIME_THUNK) || \
+    defined(SANITIZER_STATIC_RUNTIME_THUNK)
+#  define SANITIZER_IMPORT_INTERFACE 1
+#  define WIN32_LEAN_AND_MEAN
+#  include "asan_win_common_runtime_thunk.h"
+
+#  include <windows.h>
+
+#  include "sanitizer_common/sanitizer_win_defs.h"
+#  include "sanitizer_common/sanitizer_win_thunk_interception.h"
+
+// Define weak alias for all weak functions imported from asan dll.
+#  define INTERFACE_FUNCTION(Name)
+#  define INTERFACE_WEAK_FUNCTION(Name) REGISTER_WEAK_FUNCTION(Name)
+#  include "asan_interface.inc"
+
+////////////////////////////////////////////////////////////////////////////////
+// Define a copy of __asan_option_detect_stack_use_after_return that should be
+// used when linking an MD runtime with a set of object files on Windows.
+//
+// The ASan MD runtime dllexports '__asan_option_detect_stack_use_after_return',
+// so normally we would just dllimport it.  Unfortunately, the dllimport
+// attribute adds __imp_ prefix to the symbol name of a variable.
+// Since in general we don't know if a given TU is going to be used
+// with a MT or MD runtime and we don't want to use ugly __imp_ names on Windows
+// just to work around this issue, let's clone the variable that is constant
+// after initialization anyways.
+
+extern "C" {
+__declspec(dllimport) int __asan_should_detect_stack_use_after_return();
+int __asan_option_detect_stack_use_after_return;
+
+__declspec(dllimport) void *__asan_get_shadow_memory_dynamic_address();
+void *__asan_shadow_memory_dynamic_address;
+
+static void __asan_initialize_cloned_variables() {
+  __asan_option_detect_stack_use_after_return =
+      __asan_should_detect_stack_use_after_return();
+  __asan_shadow_memory_dynamic_address =
+      __asan_get_shadow_memory_dynamic_address();
+}
+}
+
+static int asan_thunk_init() {
+  __asan_initialize_cloned_variables();
+
+#  ifdef SANITIZER_STATIC_RUNTIME_THUNK
+  __asan_initialize_static_thunk();
+#  endif
+
+  return 0;
+}
+
+static void WINAPI asan_thread_init(void *mod, unsigned long reason,
+                                    void *reserved) {
+  if (reason == DLL_PROCESS_ATTACH) {
+    asan_thunk_init();
+  }
+}
+
+// Our cloned variables must be initialized before C/C++ constructors.  If TLS
+// is used, our .CRT$XLAB initializer will run first. If not, our .CRT$XIB
+// initializer is needed as a backup.
+extern "C" __declspec(allocate(".CRT$XIB")) int (*__asan_thunk_init)() =
+    asan_thunk_init;
+WIN_FORCE_LINK(__asan_thunk_init);
+
+extern "C" __declspec(allocate(".CRT$XLAB")) void(WINAPI *__asan_tls_init)(
+    void *, unsigned long, void *) = asan_thread_init;
+WIN_FORCE_LINK(__asan_tls_init);
+
+////////////////////////////////////////////////////////////////////////////////
+// ASan SEH handling.
+// We need to set the ASan-specific SEH handler at the end of CRT initialization
+// of each module (see also asan_win.cpp).
+extern "C" {
+__declspec(dllimport) int __asan_set_seh_filter();
+static int SetSEHFilter() { return __asan_set_seh_filter(); }
+
+// Unfortunately, putting a pointer to __asan_set_seh_filter directly into
+// __asan_seh_interceptor gets optimized out, so we need an extra function.
+extern "C" __declspec(allocate(".CRT$XCAB")) int (*__asan_seh_interceptor)() =
+    SetSEHFilter;
+WIN_FORCE_LINK(__asan_seh_interceptor);
+}
+
+WIN_FORCE_LINK(__asan_dso_reg_hook)
+
+#endif  // defined(SANITIZER_DYNAMIC_RUNTIME_THUNK) ||
+        // defined(SANITIZER_STATIC_RUNTIME_THUNK)
diff --git a/compiler-rt/lib/asan/asan_win_common_runtime_thunk.h b/compiler-rt/lib/asan/asan_win_common_runtime_thunk.h
new file mode 100644
index 00000000000000..66285eb31ae994
--- /dev/null
+++ b/compiler-rt/lib/asan/asan_win_common_runtime_thunk.h
@@ -0,0 +1,38 @@
+//===-- asan_win_common_runtime_thunk.h -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of AddressSanitizer, an address sanity checker.
+//
+// This file defines things that need to be present in the application modules
+// to interact with the ASan DLL runtime correctly and can't be implemented
+// using the default "import library" generated when linking the DLL.
+//
+//===----------------------------------------------------------------------===//
+
+#if defined(SANITIZER_STATIC_RUNTIME_THUNK) || \
+    defined(SANITIZER_DYNAMIC_RUNTIME_THUNK)
+#  include "sanitizer_common/sanitizer_win_defs.h"
+
+#  pragma section(".CRT$XIB", long, \
+                  read)  // C initializer (during C init before dyninit)
+#  pragma section(".CRT$XID", long, \
+                  read)  // First C initializer after CRT initializers
+#  pragma section(".CRT$XCAB", long, \
+                  read)  // First C++ initializer after startup initializers
+
+#  pragma section(".CRT$XTW", long, read)  // First ASAN globals terminator
+#  pragma section(".CRT$XTY", long, read)  // Last ASAN globals terminator
+
+#  pragma section(".CRT$XLAB", long, read)  // First TLS initializer
+
+#  ifdef SANITIZER_STATIC_RUNTIME_THUNK
+extern "C" void __asan_initialize_static_thunk();
+#  endif
+
+#endif  // defined(SANITIZER_STATIC_RUNTIME_THUNK) ||
+        // defined(SANITIZER_DYNAMIC_RUNTIME_THUNK)
\ No newline at end of file
diff --git a/compiler-rt/lib/asan/asan_win_dll_thunk.cpp b/compiler-rt/lib/asan/asan_win_dll_thunk.cpp
deleted file mode 100644
index 35871a942a7a12..00000000000000
--- a/compiler-rt/lib/asan/asan_win_dll_thunk.cpp
+++ /dev/null
@@ -1,165 +0,0 @@
-//===-- asan_win_dll_thunk.cpp --------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of AddressSanitizer, an address sanity checker.
-//
-// This file defines a family of thunks that should be statically linked into
-// the DLLs that have ASan instrumentation in order to delegate the calls to the
-// shared runtime that lives in the main binary.
-// See https://github.com/google/sanitizers/issues/209 for the details.
-//===----------------------------------------------------------------------===//
-
-#ifdef SANITIZER_DLL_THUNK
-#include "asan_init_version.h"
-#include "interception/interception.h"
-#include "sanitizer_common/sanitizer_win_defs.h"
-#include "sanitizer_common/sanitizer_win_dll_thunk.h"
-#include "sanitizer_common/sanitizer_platform_interceptors.h"
-
-// ASan own interface functions.
-#define INTERFACE_FUNCTION(Name) INTERCEPT_SANITIZER_FUNCTION(Name)
-#define INTERFACE_WEAK_FUNCTION(Name) INTERCEPT_SANITIZER_WEAK_FUNCTION(Name)
-#include "asan_interface.inc"
-
-// Memory allocation functions.
-INTERCEPT_WRAP_V_W(free)
-INTERCEPT_WRAP_V_W(_free_base)
-INTERCEPT_WRAP_V_WW(_free_dbg)
-
-INTERCEPT_WRAP_W_W(malloc)
-INTERCEPT_WRAP_W_W(_malloc_base)
-INTERCEPT_WRAP_W_WWWW(_malloc_dbg)
-
-INTERCEPT_WRAP_W_WW(calloc)
-INTERCEPT_WRAP_W_WW(_calloc_base)
-INTERCEPT_WRAP_W_WWWWW(_calloc_dbg)
-INTERCEPT_WRAP_W_WWW(_calloc_impl)
-
-INTERCEPT_WRAP_W_WW(realloc)
-INTERCEPT_WRAP_W_WW(_realloc_base)
-INTERCEPT_WRAP_W_WWW(_realloc_dbg)
-INTERCEPT_WRAP_W_WWW(_recalloc)
-INTERCEPT_WRAP_W_WWW(_recalloc_base)
-
-INTERCEPT_WRAP_W_W(_msize)
-INTERCEPT_WRAP_W_W(_msize_base)
-INTERCEPT_WRAP_W_W(_expand)
-INTERCEPT_WRAP_W_W(_expand_dbg)
-
-// TODO(timurrrr): Might want to add support for _aligned_* allocation
-// functions to detect a bit more bugs.  Those functions seem to wrap malloc().
-
-// TODO(timurrrr): Do we need to add _Crt* stuff here? (see asan_malloc_win.cpp)
-
-#  if defined(_MSC_VER) && !defined(__clang__)
-// Disable warnings such as: 'void memchr(void)': incorrect number of arguments
-// for intrinsic function, expected '3' arguments.
-#    pragma warning(push)
-#    pragma warning(disable : 4392)
-#  endif
-
-INTERCEPT_LIBRARY_FUNCTION(atoi);
-INTERCEPT_LIBRARY_FUNCTION(atol);
-INTERCEPT_LIBRARY_FUNCTION(atoll);
-INTERCEPT_LIBRARY_FUNCTION(frexp);
-INTERCEPT_LIBRARY_FUNCTION(longjmp);
-#if SANITIZER_INTERCEPT_MEMCHR
-INTERCEPT_LIBRARY_FUNCTION(memchr);
-#endif
-INTERCEPT_LIBRARY_FUNCTION(memcmp);
-INTERCEPT_LIBRARY_FUNCTION(memcpy);
-INTERCEPT_LIBRARY_FUNCTION(memmove);
-INTERCEPT_LIBRARY_FUNCTION(memset);
-INTERCEPT_LIBRARY_FUNCTION(strcat);
-INTERCEPT_LIBRARY_FUNCTION(strchr);
-INTERCEPT_LIBRARY_FUNCTION(strcmp);
-INTERCEPT_LIBRARY_FUNCTION(strcpy);
-INTERCEPT_LIBRARY_FUNCTION(strcspn);
-INTERCEPT_LIBRARY_FUNCTION(_strdup);
-INTERCEPT_LIBRARY_FUNCTION(strlen);
-INTERCEPT_LIBRARY_FUNCTION(strncat);
-INTERCEPT_LIBRARY_FUNCTION(strncmp);
-INTERCEPT_LIBRARY_FUNCTION(strncpy);
-INTERCEPT_LIBRARY_FUNCTION(strnlen);
-INTERCEPT_LIBRARY_FUNCTION(strpbrk);
-INTERCEPT_LIBRARY_FUNCTION(strrchr);
-INTERCEPT_LIBRARY_FUNCTION(strspn);
-INTERCEPT_LIBRARY_FUNCTION(strstr);
-INTERCEPT_LIBRARY_FUNCTION(strtok);
-INTERCEPT_LIBRARY_FUNCTION(strtol);
-INTERCEPT_LIBRARY_FUNCTION(strtoll);
-INTERCEPT_LIBRARY_FUNCTION(wcslen);
-INTERCEPT_LIBRARY_FUNCTION(wcsnlen);
-
-#  if defined(_MSC_VER) && !defined(__clang__)
-#    pragma warning(pop)
-#  endif
-
-#ifdef _WIN64
-INTERCEPT_LIBRARY_FUNCTION(__C_specific_handler);
-#else
-INTERCEPT_LIBRARY_FUNCTION(_except_handler3);
-// _except_handler4 checks -GS cookie which is different for each module, so we
-// can't use INTERCEPT_LIBRARY_FUNCTION(_except_handler4).
-INTERCEPTOR(int, _except_handler4, void *a, void *b, void *c, void *d) {
-  __asan_handle_no_return();
-  return REAL(_except_handler4)(a, b, c, d);
-}
-#endif
-
-// Windows specific functions not included in asan_interface.inc.
-INTERCEPT_WRAP_W_V(__asan_should_detect_stack_use_after_return)
-INTERCEPT_WRAP_W_V(__asan_get_shadow_memory_dynamic_address)
-INTERCEPT_WRAP_W_W(__asan_unhandled_exception_filter)
-
-using namespace __sanitizer;
-
-extern "C" {
-int __asan_option_detect_stack_use_after_return;
-uptr __asan_shadow_memory_dynamic_address;
-} // extern "C"
-
-static int asan_dll_thunk_init() {
-  typedef void (*fntype)();
-  static fntype fn = 0;
-  // asan_dll_thunk_init is expected to be called by only one thread.
-  if (fn) return 0;
-
-  // Ensure all interception was executed.
-  __dll_thunk_init();
-
-  fn = (fntype) dllThunkGetRealAddrOrDie("__asan_init");
-  fn();
-  __asan_option_detect_stack_use_after_return =
-      (__asan_should_detect_stack_use_after_return() != 0);
-  __asan_shadow_memory_dynamic_address =
-      (uptr)__asan_get_shadow_memory_dynamic_address();
-
-#ifndef _WIN64
-  INTERCEPT_FUNCTION(_except_handler4);
-#endif
-  // In DLLs, the callbacks are expected to return 0,
-  // otherwise CRT initialization fails.
-  return 0;
-}
-
-#pragma section(".CRT$XIB", long, read)
-__declspec(allocate(".CRT$XIB")) int (*__asan_preinit)() = asan_dll_thunk_init;
-
-static void WINAPI asan_thread_init(void *mod, unsigned long reason,
-                                    void *reserved) {
-  if (reason == /*DLL_PROCESS_ATTACH=*/1) asan_dll_thunk_init();
-}
-
-#pragma section(".CRT$XLAB", long, read)
-__declspec(allocate(".CRT$XLAB")) void (WINAPI *__asan_tls_init)(void *,
-    unsigned long, void *) = asan_thread_init;
-
-WIN_FORCE_LINK(__asan_dso_reg_hook)
-
-#endif // SANITIZER_DLL_THUNK
diff --git a/compiler-rt/lib/asan/asan_win_dynamic_runtime_thunk.cpp b/compiler-rt/lib/asan/asan_win_dynamic_runtime_thunk.cpp
index f0b5ec9eef7f99..421fe651b7d919 100644
--- a/compiler-rt/lib/asan/asan_win_dynamic_runtime_thunk.cpp
+++ b/compiler-rt/lib/asan/asan_win_dynamic_runtime_thunk.cpp
@@ -8,76 +8,17 @@
 //
 // This file is a part of AddressSanitizer, an address sanity checker.
 //
-// This file defines things that need to be present in the application modules
-// to interact with the ASan DLL runtime correctly and can't be implemented
-// using the default "import library" generated when linking the DLL RTL.
-//
-// This includes:
-//  - creating weak aliases to default implementation imported from asan dll.
-//  - forwarding the detect_stack_use_after_return runtime option
-//  - working around deficiencies of the MD runtime
-//  - installing a custom SEH handler
+// This file defines things that need to be present for application modules
+// that are dynamically linked with the C Runtime.
 //
 //===----------------------------------------------------------------------===//
 
 #ifdef SANITIZER_DYNAMIC_RUNTIME_THUNK
-#define SANITIZER_IMPORT_INTERFACE 1
-#include "sanitizer_common/sanitizer_win_defs.h"
-#define WIN32_LEAN_AND_MEAN
-#include <windows.h>
-
-// Define weak alias for all weak functions imported from asan dll.
-#define INTERFACE_FUNCTION(Name)
-#define INTERFACE_WEAK_FUNCTION(Name) WIN_WEAK_IMPORT_DEF(Name)
-#include "asan_interface.inc"
-
-// First, declare CRT sections we'll be using in this file
-#pragma section(".CRT$XIB", long, read)
-#pragma section(".CRT$XID", long, read)
-#pragma section(".CRT$XCAB", long, read)
-#pragma section(".CRT$XTW", long, read)
-#pragma section(".CRT$XTY", long, read)
-#pragma section(".CRT$XLAB", long, read)
-
-////////////////////////////////////////////////////////////////////////////////
-// Define a copy of __asan_option_detect_stack_use_after_return that should be
-// used when linking an MD runtime with a set of object files on Windows.
-//
-// The ASan MD runtime dllexports '__asan_option_detect_stack_use_after_return',
-// so normally we would just dllimport it.  Unfortunately, the dllimport
-// attribute adds __imp_ prefix to the symbol name of a variable.
-// Since in general we don't know if a given TU is going to be used
-// with a MT or MD runtime and we don't want to use ugly __imp_ names on Windows
-// just to work around this issue, let's clone the variable that is constant
-// after initialization anyways.
-extern "C" {
-__declspec(dllimport) int __asan_should_detect_stack_use_after_return();
-int __asan_option_detect_stack_use_after_return;
-
-__declspec(dllimport) void* __asan_get_shadow_memory_dynamic_address();
-void* __asan_shadow_memory_dynamic_address;
-}
-
-static int InitializeClonedVariables() {
-  __asan_option_detect_stack_use_after_return =
-    __asan_should_detect_stack_use_after_return();
-  __asan_shadow_memory_dynamic_address =
-    __asan_get_shadow_memory_dynamic_address();
-  return 0;
-}
-
-static void NTAPI asan_thread_init(void *mod, unsigned long reason,
-    void *reserved) {
-  if (reason == DLL_PROCESS_ATTACH) InitializeClonedVariables();
-}
+#  define WIN32_LEAN_AND_MEAN
+#  include <windows.h>
 
-// Our cloned variables must be initialized before C/C++ constructors.  If TLS
-// is used, our .CRT$XLAB initializer will run first. If not, our .CRT$XIB
-// initializer is needed as a backup.
-__declspec(allocate(".CRT$XIB")) int (*__asan_initialize_cloned_variables)() =
-    InitializeClonedVariables;
-__declspec(allocate(".CRT$XLAB")) void (NTAPI *__asan_tls_init)(void *,
-    unsigned long, void *) = asan_thread_init;
+#  include "asan_win_common_runtime_thunk.h"
+#  include "sanitizer_common/sanitizer_win_defs.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 // For some reason, the MD CRT doesn't call the C/C++ terminators during on DLL
@@ -88,43 +29,26 @@ __declspec(allocate(".CRT$XLAB")) void (NTAPI *__asan_tls_init)(void *,
 // using atexit() that calls a small subset of C terminators
 // where LLVM global_dtors is placed.  Fingers crossed, no other C terminators
 // are there.
-extern "C" int __cdecl atexit(void (__cdecl *f)(void));
+extern "C" int __cdecl atexit(void(__cdecl *f)(void));
 extern "C" void __cdecl _initterm(void *a, void *b);
 
 namespace {
-__declspec(allocate(".CRT$XTW")) void* before_global_dtors = 0;
-__declspec(allocate(".CRT$XTY")) void* after_global_dtors = 0;
+__declspec(allocate(".CRT$XTW")) void *before_global_dtors = 0;
+__declspec(allocate(".CRT$XTY")) void *after_global_dtors = 0;
 
 void UnregisterGlobals() {
   _initterm(&before_global_dtors, &after_global_dtors);
 }
 
-int ScheduleUnregisterGlobals() {
-  return atexit(UnregisterGlobals);
-}
+int ScheduleUnregisterGlobals() { return atexit(UnregisterGlobals); }
 }  // namespace
 
 // We need to call 'atexit(UnregisterGlobals);' as early as possible, but after
 // atexit() is initialized (.CRT$XIC).  As this is executed before C++
 // initializers (think ctors for globals), UnregisterGlobals gets executed after
 // dtors for C++ globals.
-__declspec(allocate(".CRT$XID"))
-int (*__asan_schedule_unregister_globals)() = ScheduleUnregisterGlobals;
-
-////////////////////////////////////////////////////////////////////////////////
-// ASan SEH handling.
-// We need to set the ASan-specific SEH handler at the end of CRT initialization
-// of each module (see also asan_win.cpp).
-extern "C" {
-__declspec(dllimport) int __asan_set_seh_filter();
-static int SetSEHFilter() { return __asan_set_seh_filter(); }
-
-// Unfortunately, putting a pointer to __asan_set_seh_filter into
-// __asan_intercept_seh gets optimized out, so we have to use an extra function.
-__declspec(allocate(".CRT$XCAB")) int (*__asan_seh_interceptor)() =
-    SetSEHFilter;
-}
-
-WIN_FORCE_LINK(__asan_dso_reg_hook)
+extern "C" __declspec(allocate(".CRT$XID")) int (
+    *__asan_schedule_unregister_globals)() = ScheduleUnregisterGlobals;
+WIN_FORCE_LINK(__asan_schedule_unregister_globals)
 
-#endif // SANITIZER_DYNAMIC_RUNTIME_THUNK
+#endif  // SANITIZER_DYNAMIC_RUNTIME_THUNK
diff --git a/compiler-rt/lib/asan/asan_win_static_runtime_thunk.cpp b/compiler-rt/lib/asan/asan_win_static_runtime_thunk.cpp
new file mode 100644
index 00000000000000..dec50a5e1d4d9e
--- /dev/null
+++ b/compiler-rt/lib/asan/asan_win_static_runtime_thunk.cpp
@@ -0,0 +1,110 @@
+//===-- asan_win_static_runtime_thunk.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of AddressSanitizer, an address sanity checker.
+//
+// This file defines a family of thunks that should be statically linked into
+// modules that are statically linked with the C Runtime in order to delegate
+// the calls to the ASAN runtime DLL.
+// See https://github.com/google/sanitizers/issues/209 for the details.
+//===----------------------------------------------------------------------===//
+
+#ifdef SANITIZER_STATIC_RUNTIME_THUNK
+#  include "asan_init_version.h"
+#  include "asan_interface_internal.h"
+#  include "asan_win_common_runtime_thunk.h"
+#  include "sanitizer_common/sanitizer_platform_interceptors.h"
+#  include "sanitizer_common/sanitizer_win_defs.h"
+#  include "sanitizer_common/sanitizer_win_thunk_interception.h"
+
+#  if defined(_MSC_VER) && !defined(__clang__)
+// Disable warnings such as: 'void memchr(void)': incorrect number of arguments
+// for intrinsic function, expected '3' arguments.
+#    pragma warning(push)
+#    pragma warning(disable : 4392)
+#  endif
+
+#  define INTERCEPT_LIBRARY_FUNCTION_ASAN(X) \
+    INTERCEPT_LIBRARY_FUNCTION(X, "__asan_wrap_" #X)
+
+INTERCEPT_LIBRARY_FUNCTION_ASAN(atoi);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(atol);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(atoll);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(frexp);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(longjmp);
+#  if SANITIZER_INTERCEPT_MEMCHR
+INTERCEPT_LIBRARY_FUNCTION_ASAN(memchr);
+#  endif
+INTERCEPT_LIBRARY_FUNCTION_ASAN(memcmp);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(memcpy);
+#  ifndef _WIN64
+// memmove and memcpy share an implementation on amd64
+INTERCEPT_LIBRARY_FUNCTION_ASAN(memmove);
+#  endif
+INTERCEPT_LIBRARY_FUNCTION_ASAN(memset);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(strcat);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(strchr);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(strcmp);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(strcpy);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(strcspn);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(_strdup);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(strlen);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(strncat);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(strncmp);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(strncpy);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(strnlen);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(strpbrk);
+// INTERCEPT_LIBRARY_FUNCTION_ASAN(strrchr);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(strspn);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(strstr);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(strtok);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(strtol);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(wcslen);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(wcsnlen);
+
+#  if defined(_MSC_VER) && !defined(__clang__)
+#    pragma warning(pop)
+#  endif
+
+#  ifdef _WIN64
+INTERCEPT_LIBRARY_FUNCTION_ASAN(__C_specific_handler);
+#  else
+extern "C" void abort();
+INTERCEPT_LIBRARY_FUNCTION_ASAN(_except_handler3);
+// _except_handler4 checks -GS cookie which is different for each module, so we
+// can't use INTERCEPT_LIBRARY_FUNCTION_ASAN(_except_handler4), need to apply
+// manually
+extern "C" int _except_handler4(void *, void *, void *, void *);
+static int (*real_except_handler4)(void *, void *, void *,
+                                   void *) = &_except_handler4;
+static int intercept_except_handler4(void *a, void *b, void *c, void *d) {
+  __asan_handle_no_return();
+  return real_except_handler4(a, b, c, d);
+}
+#  endif
+
+// Windows specific functions not included in asan_interface.inc.
+// INTERCEPT_WRAP_W_V(__asan_should_detect_stack_use_after_return)
+// INTERCEPT_WRAP_W_V(__asan_get_shadow_memory_dynamic_address)
+// INTERCEPT_WRAP_W_W(__asan_unhandled_exception_filter)
+
+extern "C" void __asan_initialize_static_thunk() {
+#  ifndef _WIN64
+  if (real_except_handler4 == &_except_handler4) {
+    // Single threaded, no need for synchronization.
+    if (!__sanitizer_override_function_by_addr(
+            reinterpret_cast<__sanitizer::uptr>(&intercept_except_handler4),
+            reinterpret_cast<__sanitizer::uptr>(&_except_handler4),
+            reinterpret_cast<__sanitizer::uptr*>(&real_except_handler4))) {
+      abort();
+    }
+  }
+#  endif
+}
+
+#endif  // SANITIZER_STATIC_RUNTIME_THUNK
diff --git a/compiler-rt/lib/asan/tests/CMakeLists.txt b/compiler-rt/lib/asan/tests/CMakeLists.txt
index bda47bd7fd6a22..9c1db7caeb7b7d 100644
--- a/compiler-rt/lib/asan/tests/CMakeLists.txt
+++ b/compiler-rt/lib/asan/tests/CMakeLists.txt
@@ -203,7 +203,7 @@ function(add_asan_tests arch test_runtime)
         CFLAGS ${ASAN_UNITTEST_INSTRUMENTED_CFLAGS} -D_MT -D_DLL
         SOURCES ${ASAN_INST_TEST_SOURCES}
         LINK_FLAGS ${ASAN_DYNAMIC_UNITTEST_INSTRUMENTED_LINK_FLAGS}
-          -Wl,-nodefaultlib:libcmt,-defaultlib:msvcrt,-defaultlib:oldnames
+          -D_MT -D_DLL -Wl,-nodefaultlib:libcmt,-defaultlib:msvcrt,-defaultlib:oldnames
         )
     else()
 
diff --git a/compiler-rt/lib/profile/CMakeLists.txt b/compiler-rt/lib/profile/CMakeLists.txt
index 45e51648917515..ef23492514898b 100644
--- a/compiler-rt/lib/profile/CMakeLists.txt
+++ b/compiler-rt/lib/profile/CMakeLists.txt
@@ -111,6 +111,12 @@ if(COMPILER_RT_TARGET_HAS_UNAME)
      -DCOMPILER_RT_HAS_UNAME=1)
 endif()
 
+if(MSVC)
+  # profile historically has only been supported with the static runtime
+  # on Windows
+  set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded)
+endif()
+
 # We don't use the C++ Standard Library here, so avoid including it by mistake.
 append_list_if(COMPILER_RT_HAS_NOSTDINCXX_FLAG -nostdinc++ EXTRA_FLAGS)
 # XRay uses C++ standard library headers.
diff --git a/compiler-rt/lib/sanitizer_common/CMakeLists.txt b/compiler-rt/lib/sanitizer_common/CMakeLists.txt
index 66f2d259aa5fd4..41c3888275a0f2 100644
--- a/compiler-rt/lib/sanitizer_common/CMakeLists.txt
+++ b/compiler-rt/lib/sanitizer_common/CMakeLists.txt
@@ -42,6 +42,7 @@ set(SANITIZER_SOURCES_NOTERMINATION
   sanitizer_thread_registry.cpp
   sanitizer_type_traits.cpp
   sanitizer_win.cpp
+  sanitizer_win_interception.cpp
   )
 
 set(SANITIZER_SOURCES
@@ -206,8 +207,8 @@ set(SANITIZER_IMPL_HEADERS
   sanitizer_vector.h
   sanitizer_win.h
   sanitizer_win_defs.h
-  sanitizer_win_dll_thunk.h
-  sanitizer_win_weak_interception.h
+  sanitizer_win_interception.h
+  sanitizer_win_thunk_interception.h
   )
 
 include_directories(..)
@@ -301,57 +302,23 @@ add_compiler_rt_object_libraries(RTSanitizerCommonSymbolizerNoHooks
   DEFS ${SANITIZER_COMMON_DEFINITIONS})
 
 if(WIN32)
-  add_compiler_rt_object_libraries(SanitizerCommonWeakInterception
+  set(RUNTIME_THUNK_CFLAGS -DSANITIZER_DYNAMIC_RUNTIME_THUNK -DSANITIZER_STATIC_RUNTIME_THUNK)
+  append_list_if(MSVC /Zl RUNTIME_THUNK_CFLAGS)
+  add_compiler_rt_object_libraries(SanitizerRuntimeThunk
     ${SANITIZER_COMMON_SUPPORTED_OS}
     ARCHS ${SANITIZER_COMMON_SUPPORTED_ARCH}
     SOURCES
-      sanitizer_win_weak_interception.cpp
-    CFLAGS ${SANITIZER_CFLAGS} -DSANITIZER_DYNAMIC
-    DEFS ${SANITIZER_COMMON_DEFINITIONS})
-  add_compiler_rt_object_libraries(SancovWeakInterception
-    ${SANITIZER_COMMON_SUPPORTED_OS}
-    ARCHS ${SANITIZER_COMMON_SUPPORTED_ARCH}
-    SOURCES
-      sanitizer_coverage_win_weak_interception.cpp
-    CFLAGS ${SANITIZER_CFLAGS} -DSANITIZER_DYNAMIC
-    DEFS ${SANITIZER_COMMON_DEFINITIONS})
-
-  add_compiler_rt_object_libraries(SanitizerCommonDllThunk
-    ${SANITIZER_COMMON_SUPPORTED_OS}
-    ARCHS ${SANITIZER_COMMON_SUPPORTED_ARCH}
-    SOURCES
-      sanitizer_win_dll_thunk.cpp
-    CFLAGS ${SANITIZER_CFLAGS} -DSANITIZER_DLL_THUNK
-    DEFS ${SANITIZER_COMMON_DEFINITIONS})
-  add_compiler_rt_object_libraries(SancovDllThunk
-    ${SANITIZER_COMMON_SUPPORTED_OS}
-    ARCHS ${SANITIZER_COMMON_SUPPORTED_ARCH}
-    SOURCES
-      sanitizer_coverage_win_dll_thunk.cpp
-      sanitizer_coverage_win_sections.cpp
-    CFLAGS ${SANITIZER_CFLAGS} -DSANITIZER_DLL_THUNK
+      sanitizer_win_thunk_interception.cpp
+    CFLAGS ${SANITIZER_CFLAGS} ${RUNTIME_THUNK_CFLAGS}
     DEFS ${SANITIZER_COMMON_DEFINITIONS})
 
-  set(DYNAMIC_RUNTIME_THUNK_CFLAGS "-DSANITIZER_DYNAMIC_RUNTIME_THUNK")
-  if(MSVC)
-    list(APPEND DYNAMIC_RUNTIME_THUNK_CFLAGS "-Zl")
-  elseif(CMAKE_C_COMPILER_ID MATCHES Clang)
-    list(APPEND DYNAMIC_RUNTIME_THUNK_CFLAGS "-nodefaultlibs")
-  endif()
-  add_compiler_rt_object_libraries(SanitizerCommonDynamicRuntimeThunk
-    ${SANITIZER_COMMON_SUPPORTED_OS}
-    ARCHS ${SANITIZER_COMMON_SUPPORTED_ARCH}
-    SOURCES
-      sanitizer_win_dynamic_runtime_thunk.cpp
-    CFLAGS ${SANITIZER_CFLAGS} ${DYNAMIC_RUNTIME_THUNK_CFLAGS}
-    DEFS ${SANITIZER_COMMON_DEFINITIONS})
-  add_compiler_rt_object_libraries(SancovDynamicRuntimeThunk
+  add_compiler_rt_object_libraries(SancovRuntimeThunk
     ${SANITIZER_COMMON_SUPPORTED_OS}
     ARCHS ${SANITIZER_COMMON_SUPPORTED_ARCH}
     SOURCES
-      sanitizer_coverage_win_dynamic_runtime_thunk.cpp
+      sanitizer_coverage_win_runtime_thunk.cpp
       sanitizer_coverage_win_sections.cpp
-    CFLAGS ${SANITIZER_CFLAGS} ${DYNAMIC_RUNTIME_THUNK_CFLAGS}
+    CFLAGS ${SANITIZER_CFLAGS} ${RUNTIME_THUNK_CFLAGS}
     DEFS ${SANITIZER_COMMON_DEFINITIONS})
 endif()
 
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interface.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interface.inc
index 557207fe62ac6d..11f1d963bd6f43 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interface.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interface.inc
@@ -50,3 +50,9 @@ INTERFACE_WEAK_FUNCTION(__sanitizer_malloc_hook)
 INTERFACE_FUNCTION(__sanitizer_internal_memcpy)
 INTERFACE_FUNCTION(__sanitizer_internal_memmove)
 INTERFACE_FUNCTION(__sanitizer_internal_memset)
+
+#if SANITIZER_WINDOWS
+INTERFACE_FUNCTION(__sanitizer_override_function)
+INTERFACE_FUNCTION(__sanitizer_override_function_by_addr)
+INTERFACE_FUNCTION(__sanitizer_register_weak_function)
+#endif
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_dll_thunk.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_dll_thunk.cpp
deleted file mode 100644
index d0bf8a4556436c..00000000000000
--- a/compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_dll_thunk.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-//===-- sanitizer_coverage_win_dll_thunk.cpp ------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines a family of thunks that should be statically linked into
-// the DLLs that have instrumentation in order to delegate the calls to the
-// shared runtime that lives in the main binary.
-// See https://github.com/google/sanitizers/issues/209 for the details.
-//===----------------------------------------------------------------------===//
-#ifdef SANITIZER_DLL_THUNK
-#include "sanitizer_win_dll_thunk.h"
-// Sanitizer Coverage interface functions.
-#define INTERFACE_FUNCTION(Name) INTERCEPT_SANITIZER_FUNCTION(Name)
-#define INTERFACE_WEAK_FUNCTION(Name) INTERCEPT_SANITIZER_WEAK_FUNCTION(Name)
-#include "sanitizer_coverage_interface.inc"
-#endif // SANITIZER_DLL_THUNK
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_dynamic_runtime_thunk.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_runtime_thunk.cpp
similarity index 59%
rename from compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_dynamic_runtime_thunk.cpp
rename to compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_runtime_thunk.cpp
index 0bdf0c5aed418d..281944643f216f 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_dynamic_runtime_thunk.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_runtime_thunk.cpp
@@ -1,4 +1,4 @@
-//===-- sanitizer_coverage_win_dynamic_runtime_thunk.cpp ------------------===//
+//===-- sanitizer_coverage_win_runtime_thunk.cpp --------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,17 +10,20 @@
 // to interact with Sanitizer Coverage, when it is included in a dll.
 //
 //===----------------------------------------------------------------------===//
-#ifdef SANITIZER_DYNAMIC_RUNTIME_THUNK
-#define SANITIZER_IMPORT_INTERFACE 1
-#include "sanitizer_win_defs.h"
+#if defined(SANITIZER_DYNAMIC_RUNTIME_THUNK) || \
+    defined(SANITIZER_STATIC_RUNTIME_THUNK)
+#  define SANITIZER_IMPORT_INTERFACE 1
+#  include "sanitizer_win_defs.h"
+#  include "sanitizer_win_thunk_interception.h"
 // Define weak alias for all weak functions imported from sanitizer coverage.
-#define INTERFACE_FUNCTION(Name)
-#define INTERFACE_WEAK_FUNCTION(Name) WIN_WEAK_IMPORT_DEF(Name)
-#include "sanitizer_coverage_interface.inc"
-#endif // SANITIZER_DYNAMIC_RUNTIME_THUNK
+#  define INTERFACE_FUNCTION(Name)
+#  define INTERFACE_WEAK_FUNCTION(Name) REGISTER_WEAK_FUNCTION(Name)
+#  include "sanitizer_coverage_interface.inc"
+#endif  // defined(SANITIZER_DYNAMIC_RUNTIME_THUNK) ||
+        // defined(SANITIZER_STATIC_RUNTIME_THUNK)
 
 namespace __sanitizer {
 // Add one, otherwise unused, external symbol to this object file so that the
 // Visual C++ linker includes it and reads the .drective section.
 void ForceWholeArchiveIncludeForSanCov() {}
-}
+}  // namespace __sanitizer
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_weak_interception.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_weak_interception.cpp
deleted file mode 100644
index 55263981705fa6..00000000000000
--- a/compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_weak_interception.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-//===-- sanitizer_coverage_win_weak_interception.cpp ----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// This module should be included in Sanitizer Coverage when it implemented as a
-// shared library on Windows (dll), in order to delegate the calls of weak
-// functions to the implementation in the main executable when a strong
-// definition is provided.
-//===----------------------------------------------------------------------===//
-#ifdef SANITIZER_DYNAMIC
-#include "sanitizer_win_weak_interception.h"
-#include "sanitizer_interface_internal.h"
-#include "sancov_flags.h"
-// Check if strong definitions for weak functions are present in the main
-// executable. If that is the case, override dll functions to point to strong
-// implementations.
-#define INTERFACE_FUNCTION(Name)
-#define INTERFACE_WEAK_FUNCTION(Name) INTERCEPT_SANITIZER_WEAK_FUNCTION(Name)
-#include "sanitizer_coverage_interface.inc"
-#endif // SANITIZER_DYNAMIC
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win_dll_thunk.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_win_dll_thunk.cpp
deleted file mode 100644
index 1562c161a76260..00000000000000
--- a/compiler-rt/lib/sanitizer_common/sanitizer_win_dll_thunk.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-//===-- sanitizer_win_dll_thunk.cpp ---------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// This file defines a family of thunks that should be statically linked into
-// the DLLs that have instrumentation in order to delegate the calls to the
-// shared runtime that lives in the main binary.
-// See https://github.com/google/sanitizers/issues/209 for the details.
-//===----------------------------------------------------------------------===//
-
-#ifdef SANITIZER_DLL_THUNK
-#include "sanitizer_win_defs.h"
-#include "sanitizer_win_dll_thunk.h"
-#include "interception/interception.h"
-
-extern "C" {
-void *WINAPI GetModuleHandleA(const char *module_name);
-void abort();
-}
-
-namespace __sanitizer {
-uptr dllThunkGetRealAddrOrDie(const char *name) {
-  uptr ret =
-      __interception::InternalGetProcAddress((void *)GetModuleHandleA(0), name);
-  if (!ret)
-    abort();
-  return ret;
-}
-
-int dllThunkIntercept(const char* main_function, uptr dll_function) {
-  uptr wrapper = dllThunkGetRealAddrOrDie(main_function);
-  if (!__interception::OverrideFunction(dll_function, wrapper, 0))
-    abort();
-  return 0;
-}
-
-int dllThunkInterceptWhenPossible(const char* main_function,
-    const char* default_function, uptr dll_function) {
-  uptr wrapper = __interception::InternalGetProcAddress(
-    (void *)GetModuleHandleA(0), main_function);
-  if (!wrapper)
-    wrapper = dllThunkGetRealAddrOrDie(default_function);
-  if (!__interception::OverrideFunction(dll_function, wrapper, 0))
-    abort();
-  return 0;
-}
-} // namespace __sanitizer
-
-// Include Sanitizer Common interface.
-#define INTERFACE_FUNCTION(Name) INTERCEPT_SANITIZER_FUNCTION(Name)
-#define INTERFACE_WEAK_FUNCTION(Name) INTERCEPT_SANITIZER_WEAK_FUNCTION(Name)
-#include "sanitizer_common_interface.inc"
-
-#pragma section(".DLLTH$A", read)
-#pragma section(".DLLTH$Z", read)
-
-typedef void (*DllThunkCB)();
-extern "C" {
-__declspec(allocate(".DLLTH$A")) DllThunkCB __start_dll_thunk;
-__declspec(allocate(".DLLTH$Z")) DllThunkCB __stop_dll_thunk;
-}
-
-// Disable compiler warnings that show up if we declare our own version
-// of a compiler intrinsic (e.g. strlen).
-#pragma warning(disable: 4391)
-#pragma warning(disable: 4392)
-
-extern "C" int __dll_thunk_init() {
-  static bool flag = false;
-  // __dll_thunk_init is expected to be called by only one thread.
-  if (flag) return 0;
-  flag = true;
-
-  for (DllThunkCB *it = &__start_dll_thunk; it < &__stop_dll_thunk; ++it)
-    if (*it)
-      (*it)();
-
-  // In DLLs, the callbacks are expected to return 0,
-  // otherwise CRT initialization fails.
-  return 0;
-}
-
-// We want to call dll_thunk_init before C/C++ initializers / constructors are
-// executed, otherwise functions like memset might be invoked.
-#pragma section(".CRT$XIB", long, read)
-__declspec(allocate(".CRT$XIB")) int (*__dll_thunk_preinit)() =
-    __dll_thunk_init;
-
-static void WINAPI dll_thunk_thread_init(void *mod, unsigned long reason,
-                                         void *reserved) {
-  if (reason == /*DLL_PROCESS_ATTACH=*/1) __dll_thunk_init();
-}
-
-#pragma section(".CRT$XLAB", long, read)
-__declspec(allocate(".CRT$XLAB")) void (WINAPI *__dll_thunk_tls_init)(void *,
-    unsigned long, void *) = dll_thunk_thread_init;
-
-#endif // SANITIZER_DLL_THUNK
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win_dll_thunk.h b/compiler-rt/lib/sanitizer_common/sanitizer_win_dll_thunk.h
deleted file mode 100644
index 639d91a2edaec4..00000000000000
--- a/compiler-rt/lib/sanitizer_common/sanitizer_win_dll_thunk.h
+++ /dev/null
@@ -1,181 +0,0 @@
-//===-- sanitizer_win_dll_thunk.h -----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// This header provide helper macros to delegate calls to the shared runtime
-// that lives in the main executable. It should be included to dll_thunks that
-// will be linked to the dlls, when the sanitizer is a static library included
-// in the main executable.
-//===----------------------------------------------------------------------===//
-#ifndef SANITIZER_WIN_DLL_THUNK_H
-#define SANITIZER_WIN_DLL_THUNK_H
-#include "sanitizer_internal_defs.h"
-
-namespace __sanitizer {
-uptr dllThunkGetRealAddrOrDie(const char *name);
-
-int dllThunkIntercept(const char* main_function, uptr dll_function);
-
-int dllThunkInterceptWhenPossible(const char* main_function,
-    const char* default_function, uptr dll_function);
-}
-
-extern "C" int __dll_thunk_init();
-
-// ----------------- Function interception helper macros -------------------- //
-// Override dll_function with main_function from main executable.
-#define INTERCEPT_OR_DIE(main_function, dll_function)                          \
-  static int intercept_##dll_function() {                                      \
-    return __sanitizer::dllThunkIntercept(main_function, (__sanitizer::uptr)   \
-        dll_function);                                                         \
-  }                                                                            \
-  __pragma(section(".DLLTH$M", long, read))                                    \
-  __declspec(allocate(".DLLTH$M")) int (*__dll_thunk_##dll_function)() =       \
-    intercept_##dll_function;
-
-// Try to override dll_function with main_function from main executable.
-// If main_function is not present, override dll_function with default_function.
-#define INTERCEPT_WHEN_POSSIBLE(main_function, default_function, dll_function) \
-  static int intercept_##dll_function() {                                      \
-    return __sanitizer::dllThunkInterceptWhenPossible(main_function,           \
-        default_function, (__sanitizer::uptr)dll_function);                    \
-  }                                                                            \
-  __pragma(section(".DLLTH$M", long, read))                                    \
-  __declspec(allocate(".DLLTH$M")) int (*__dll_thunk_##dll_function)() =       \
-    intercept_##dll_function;
-
-// -------------------- Function interception macros ------------------------ //
-// Special case of hooks -- ASan own interface functions.  Those are only called
-// after __asan_init, thus an empty implementation is sufficient.
-#define INTERCEPT_SANITIZER_FUNCTION(name)                                     \
-  extern "C" __declspec(noinline) void name() {                                \
-    volatile int prevent_icf = (__LINE__ << 8) ^ __COUNTER__;                  \
-    static const char function_name[] = #name;                                 \
-    for (const char* ptr = &function_name[0]; *ptr; ++ptr)                     \
-      prevent_icf ^= *ptr;                                                     \
-    (void)prevent_icf;                                                         \
-    __debugbreak();                                                            \
-  }                                                                            \
-  INTERCEPT_OR_DIE(#name, name)
-
-// Special case of hooks -- Weak functions, could be redefined in the main
-// executable, but that is not necessary, so we shouldn't die if we can not find
-// a reference. Instead, when the function is not present in the main executable
-// we consider the default impl provided by asan library.
-#define INTERCEPT_SANITIZER_WEAK_FUNCTION(name)                                \
-  extern "C" __declspec(noinline) void name() {                                \
-    volatile int prevent_icf = (__LINE__ << 8) ^ __COUNTER__;                  \
-    static const char function_name[] = #name;                                 \
-    for (const char* ptr = &function_name[0]; *ptr; ++ptr)                     \
-      prevent_icf ^= *ptr;                                                     \
-    (void)prevent_icf;                                                         \
-    __debugbreak();                                                            \
-  }                                                                            \
-  INTERCEPT_WHEN_POSSIBLE(#name, STRINGIFY(WEAK_EXPORT_NAME(name)), name)
-
-// We can't define our own version of strlen etc. because that would lead to
-// link-time or even type mismatch errors.  Instead, we can declare a function
-// just to be able to get its address.  Me may miss the first few calls to the
-// functions since it can be called before __dll_thunk_init, but that would lead
-// to false negatives in the startup code before user's global initializers,
-// which isn't a big deal.
-#define INTERCEPT_LIBRARY_FUNCTION(name)                                       \
-  extern "C" void name();                                                      \
-  INTERCEPT_OR_DIE(STRINGIFY(WRAP(name)), name)
-
-// Use these macros for functions that could be called before __dll_thunk_init()
-// is executed and don't lead to errors if defined (free, malloc, etc).
-#define INTERCEPT_WRAP_V_V(name)                                               \
-  extern "C" void name() {                                                     \
-    typedef decltype(name) *fntype;                                            \
-    static fntype fn = (fntype)__sanitizer::dllThunkGetRealAddrOrDie(#name);   \
-    fn();                                                                      \
-  }                                                                            \
-  INTERCEPT_OR_DIE(#name, name);
-
-#define INTERCEPT_WRAP_V_W(name)                                               \
-  extern "C" void name(void *arg) {                                            \
-    typedef decltype(name) *fntype;                                            \
-    static fntype fn = (fntype)__sanitizer::dllThunkGetRealAddrOrDie(#name);   \
-    fn(arg);                                                                   \
-  }                                                                            \
-  INTERCEPT_OR_DIE(#name, name);
-
-#define INTERCEPT_WRAP_V_WW(name)                                              \
-  extern "C" void name(void *arg1, void *arg2) {                               \
-    typedef decltype(name) *fntype;                                            \
-    static fntype fn = (fntype)__sanitizer::dllThunkGetRealAddrOrDie(#name);   \
-    fn(arg1, arg2);                                                            \
-  }                                                                            \
-  INTERCEPT_OR_DIE(#name, name);
-
-#define INTERCEPT_WRAP_V_WWW(name)                                             \
-  extern "C" void name(void *arg1, void *arg2, void *arg3) {                   \
-    typedef decltype(name) *fntype;                                            \
-    static fntype fn = (fntype)__sanitizer::dllThunkGetRealAddrOrDie(#name);   \
-    fn(arg1, arg2, arg3);                                                      \
-  }                                                                            \
-  INTERCEPT_OR_DIE(#name, name);
-
-#define INTERCEPT_WRAP_W_V(name)                                               \
-  extern "C" void *name() {                                                    \
-    typedef decltype(name) *fntype;                                            \
-    static fntype fn = (fntype)__sanitizer::dllThunkGetRealAddrOrDie(#name);   \
-    return fn();                                                               \
-  }                                                                            \
-  INTERCEPT_OR_DIE(#name, name);
-
-#define INTERCEPT_WRAP_W_W(name)                                               \
-  extern "C" void *name(void *arg) {                                           \
-    typedef decltype(name) *fntype;                                            \
-    static fntype fn = (fntype)__sanitizer::dllThunkGetRealAddrOrDie(#name);   \
-    return fn(arg);                                                            \
-  }                                                                            \
-  INTERCEPT_OR_DIE(#name, name);
-
-#define INTERCEPT_WRAP_W_WW(name)                                              \
-  extern "C" void *name(void *arg1, void *arg2) {                              \
-    typedef decltype(name) *fntype;                                            \
-    static fntype fn = (fntype)__sanitizer::dllThunkGetRealAddrOrDie(#name);   \
-    return fn(arg1, arg2);                                                     \
-  }                                                                            \
-  INTERCEPT_OR_DIE(#name, name);
-
-#define INTERCEPT_WRAP_W_WWW(name)                                             \
-  extern "C" void *name(void *arg1, void *arg2, void *arg3) {                  \
-    typedef decltype(name) *fntype;                                            \
-    static fntype fn = (fntype)__sanitizer::dllThunkGetRealAddrOrDie(#name);   \
-    return fn(arg1, arg2, arg3);                                               \
-  }                                                                            \
-  INTERCEPT_OR_DIE(#name, name);
-
-#define INTERCEPT_WRAP_W_WWWW(name)                                            \
-  extern "C" void *name(void *arg1, void *arg2, void *arg3, void *arg4) {      \
-    typedef decltype(name) *fntype;                                            \
-    static fntype fn = (fntype)__sanitizer::dllThunkGetRealAddrOrDie(#name);   \
-    return fn(arg1, arg2, arg3, arg4);                                         \
-  }                                                                            \
-  INTERCEPT_OR_DIE(#name, name);
-
-#define INTERCEPT_WRAP_W_WWWWW(name)                                           \
-  extern "C" void *name(void *arg1, void *arg2, void *arg3, void *arg4,        \
-                        void *arg5) {                                          \
-    typedef decltype(name) *fntype;                                            \
-    static fntype fn = (fntype)__sanitizer::dllThunkGetRealAddrOrDie(#name);   \
-    return fn(arg1, arg2, arg3, arg4, arg5);                                   \
-  }                                                                            \
-  INTERCEPT_OR_DIE(#name, name);
-
-#define INTERCEPT_WRAP_W_WWWWWW(name)                                          \
-  extern "C" void *name(void *arg1, void *arg2, void *arg3, void *arg4,        \
-                        void *arg5, void *arg6) {                              \
-    typedef decltype(name) *fntype;                                            \
-    static fntype fn = (fntype)__sanitizer::dllThunkGetRealAddrOrDie(#name);   \
-    return fn(arg1, arg2, arg3, arg4, arg5, arg6);                             \
-  }                                                                            \
-  INTERCEPT_OR_DIE(#name, name);
-
-#endif // SANITIZER_WIN_DLL_THUNK_H
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win_dynamic_runtime_thunk.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_win_dynamic_runtime_thunk.cpp
deleted file mode 100644
index 87c032c6e61bc9..00000000000000
--- a/compiler-rt/lib/sanitizer_common/sanitizer_win_dynamic_runtime_thunk.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-//===-- santizer_win_dynamic_runtime_thunk.cpp ----------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines things that need to be present in the application modules
-// to interact with Sanitizer Common, when it is included in a dll.
-//
-//===----------------------------------------------------------------------===//
-#ifdef SANITIZER_DYNAMIC_RUNTIME_THUNK
-#define SANITIZER_IMPORT_INTERFACE 1
-#include "sanitizer_win_defs.h"
-// Define weak alias for all weak functions imported from sanitizer common.
-#define INTERFACE_FUNCTION(Name)
-#define INTERFACE_WEAK_FUNCTION(Name) WIN_WEAK_IMPORT_DEF(Name)
-#include "sanitizer_common_interface.inc"
-#endif // SANITIZER_DYNAMIC_RUNTIME_THUNK
-
-namespace __sanitizer {
-// Add one, otherwise unused, external symbol to this object file so that the
-// Visual C++ linker includes it and reads the .drective section.
-void ForceWholeArchiveIncludeForSanitizerCommon() {}
-}
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win_immortalize.h b/compiler-rt/lib/sanitizer_common/sanitizer_win_immortalize.h
new file mode 100644
index 00000000000000..808cd2f771fe1e
--- /dev/null
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_win_immortalize.h
@@ -0,0 +1,71 @@
+//===-- sanitizer_win_immortalize.h ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is shared between AddressSanitizer and interception.
+//
+// Windows-specific thread-safe and pre-CRT global initialization safe
+// infrastructure to create an object whose destructor is never called.
+//===----------------------------------------------------------------------===//
+#if SANITIZER_WINDOWS
+#  pragma once
+// Requires including sanitizer_placement_new.h (which is not allowed to be
+// included in headers).
+
+#  include "sanitizer_win_defs.h"
+// These types are required to satisfy XFG, which requires the type names used
+// for indirect calls to be correct, including the name of the original type
+// behind any typedef.
+
+// TODO: There must be a better way to do this
+#  ifndef _WINDOWS_
+typedef void* PVOID;
+typedef int BOOL;
+typedef union _RTL_RUN_ONCE {
+  PVOID ptr;
+} INIT_ONCE, *PINIT_ONCE;
+
+extern "C" {
+__declspec(dllimport) int WINAPI InitOnceExecuteOnce(
+    PINIT_ONCE, BOOL(WINAPI*)(PINIT_ONCE, PVOID, PVOID*), void*, void*);
+}
+#  endif
+
+namespace __sanitizer {
+template <class Ty>
+BOOL WINAPI immortalize_impl(PINIT_ONCE, PVOID storage_ptr, PVOID*) noexcept {
+  // Ty must provide a placement new operator
+  new (storage_ptr) Ty();
+  return 1;
+}
+
+template <class Ty, typename Arg>
+BOOL WINAPI immortalize_impl(PINIT_ONCE, PVOID storage_ptr,
+                             PVOID* param) noexcept {
+  // Ty must provide a placement new operator
+  new (storage_ptr) Ty(*((Arg*)param));
+  return 1;
+}
+
+template <class Ty>
+Ty& immortalize() {  // return a reference to an object that will live forever
+  static INIT_ONCE flag;
+  alignas(Ty) static unsigned char storage[sizeof(Ty)];
+  InitOnceExecuteOnce(&flag, immortalize_impl<Ty>, &storage, nullptr);
+  return reinterpret_cast<Ty&>(storage);
+}
+
+template <class Ty, typename Arg>
+Ty& immortalize(
+    Arg arg) {  // return a reference to an object that will live forever
+  static INIT_ONCE flag;
+  alignas(Ty) static unsigned char storage[sizeof(Ty)];
+  InitOnceExecuteOnce(&flag, immortalize_impl<Ty, Arg>, &storage, &arg);
+  return reinterpret_cast<Ty&>(storage);
+}
+}  // namespace __sanitizer
+#endif  // SANITIZER_WINDOWS
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win_interception.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_win_interception.cpp
new file mode 100644
index 00000000000000..75a1545d00d8b5
--- /dev/null
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_win_interception.cpp
@@ -0,0 +1,156 @@
+//===-- sanitizer_win_interception.cpp --------------------    --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Windows-specific export surface to provide interception for parts of the
+// runtime that are always statically linked, both for overriding user-defined
+// functions as well as registering weak functions that the ASAN runtime should
+// use over defaults.
+//
+//===----------------------------------------------------------------------===//
+
+#include "sanitizer_platform.h"
+#if SANITIZER_WINDOWS
+#  include <stddef.h>
+
+#  include "interception/interception.h"
+#  include "sanitizer_addrhashmap.h"
+#  include "sanitizer_common.h"
+#  include "sanitizer_internal_defs.h"
+#  include "sanitizer_placement_new.h"
+#  include "sanitizer_win_immortalize.h"
+#  include "sanitizer_win_interception.h"
+
+using namespace __sanitizer;
+
+extern "C" void *__ImageBase;
+
+namespace __sanitizer {
+
+static uptr GetSanitizerDllExport(const char *export_name) {
+  const uptr function_address =
+      __interception::InternalGetProcAddress(&__ImageBase, export_name);
+  if (function_address == 0) {
+    Report("ERROR: Failed to find sanitizer DLL export '%s'\n", export_name);
+    CHECK("Failed to find sanitizer DLL export" && 0);
+  }
+  return function_address;
+}
+
+struct WeakCallbackList {
+  explicit constexpr WeakCallbackList(RegisterWeakFunctionCallback cb)
+      : callback(cb), next(nullptr) {}
+
+  static void *operator new(size_t size) { return InternalAlloc(size); }
+
+  static void operator delete(void *p) { InternalFree(p); }
+
+  RegisterWeakFunctionCallback callback;
+  WeakCallbackList *next;
+};
+using WeakCallbackMap = AddrHashMap<WeakCallbackList *, 11>;
+
+static WeakCallbackMap *GetWeakCallbackMap() {
+  return &immortalize<WeakCallbackMap>();
+}
+
+void AddRegisterWeakFunctionCallback(uptr export_address,
+                                     RegisterWeakFunctionCallback cb) {
+  WeakCallbackMap::Handle h_find_or_create(GetWeakCallbackMap(), export_address,
+                                           false, true);
+  CHECK(h_find_or_create.exists());
+  // Append at the tail: overwriting head->next would drop callbacks once an
+  // address has more than two registrations. A fresh slot is set directly.
+  WeakCallbackList **tail = &*h_find_or_create;
+  while (!h_find_or_create.created() && *tail) tail = &(*tail)->next;
+  *tail = new WeakCallbackList(cb);
+}
+
+static void RunWeakFunctionCallbacks(uptr export_address) {
+  WeakCallbackMap::Handle h_find(GetWeakCallbackMap(), export_address, false,
+                                 false);
+  if (!h_find.exists()) {
+    return;
+  }
+
+  WeakCallbackList *list = *h_find;
+  do {
+    list->callback();
+  } while ((list = list->next));
+}
+
+}  // namespace __sanitizer
+
+extern "C" __declspec(dllexport) bool __cdecl __sanitizer_override_function(
+    const char *export_name, const uptr user_function,
+    uptr *const old_user_function) {
+  CHECK(export_name);
+  CHECK(user_function);
+
+  const uptr sanitizer_function = GetSanitizerDllExport(export_name);
+
+  const bool function_overridden = __interception::OverrideFunction(
+      user_function, sanitizer_function, old_user_function);
+  if (!function_overridden) {
+    Report(
+        "ERROR: Failed to override local function at '%p' with sanitizer "
+        "function '%s'\n",
+        user_function, export_name);
+    CHECK("Failed to replace local function with sanitizer version." && 0);
+  }
+
+  return function_overridden;
+}
+
+extern "C"
+    __declspec(dllexport) bool __cdecl __sanitizer_override_function_by_addr(
+        const uptr source_function, const uptr target_function,
+        uptr *const old_target_function) {
+  CHECK(source_function);
+  CHECK(target_function);
+
+  const bool function_overridden = __interception::OverrideFunction(
+      target_function, source_function, old_target_function);
+  if (!function_overridden) {
+    Report(
+        "ERROR: Failed to override function at '%p' with function at "
+        "'%p'\n",
+        target_function, source_function);
+    CHECK("Failed to apply function override." && 0);
+  }
+
+  return function_overridden;
+}
+
+extern "C"
+    __declspec(dllexport) bool __cdecl __sanitizer_register_weak_function(
+        const char *export_name, const uptr user_function,
+        uptr *const old_user_function) {
+  CHECK(export_name);
+  CHECK(user_function);
+  // Look up the runtime's own export; this dies if the name is unknown.
+  const uptr sanitizer_function = GetSanitizerDllExport(export_name);
+  // Redirect the runtime's default implementation to the user's definition.
+  const bool function_overridden = __interception::OverrideFunction(
+      sanitizer_function, user_function, old_user_function);
+  if (!function_overridden) {
+    Report(
+        "ERROR: Failed to register local function at '%p' to be used in "
+        "place of sanitizer function '%s'.\n",
+        user_function, export_name);
+    CHECK("Failed to register weak function." && 0);
+  }
+
+  // Note that thread-safety of RunWeakFunctionCallbacks in InitializeFlags
+  // depends on __sanitizer_register_weak_function being called under the
+  // loader lock.
+  RunWeakFunctionCallbacks(sanitizer_function);
+
+  return function_overridden;
+}
+
+#endif  // SANITIZER_WINDOWS
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win_interception.h b/compiler-rt/lib/sanitizer_common/sanitizer_win_interception.h
new file mode 100644
index 00000000000000..70ae3d6bf31f2a
--- /dev/null
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_win_interception.h
@@ -0,0 +1,32 @@
+//===-- sanitizer_win_interception.h ----------------------    --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Windows-specific export surface to provide interception for parts of the
+// runtime that are always statically linked, both for overriding user-defined
+// functions as well as registering weak functions that the ASAN runtime should
+// use over defaults.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SANITIZER_WIN_INTERCEPTION_H
+#define SANITIZER_WIN_INTERCEPTION_H
+
+#include "sanitizer_platform.h"
+#if SANITIZER_WINDOWS
+
+#  include "sanitizer_common.h"
+#  include "sanitizer_internal_defs.h"
+
+namespace __sanitizer {
+using RegisterWeakFunctionCallback = void (*)();
+void AddRegisterWeakFunctionCallback(uptr export_address,
+                                     RegisterWeakFunctionCallback cb);
+}  // namespace __sanitizer
+
+#endif  // SANITIZER_WINDOWS
+#endif  // SANITIZER_WIN_INTERCEPTION_H
\ No newline at end of file
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win_thunk_interception.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_win_thunk_interception.cpp
new file mode 100644
index 00000000000000..13db8869abadd5
--- /dev/null
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_win_thunk_interception.cpp
@@ -0,0 +1,110 @@
+//===-- sanitizer_win_thunk_interception.cpp -----------------------  -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines things that need to be present in the application modules
+// to interact with sanitizer DLL correctly and cannot be implemented using the
+// default "import library" generated when linking the DLL.
+//
+// This includes the common infrastructure required to intercept local functions
+// that must be replaced with sanitizer-aware versions, as well as the
+// registration of weak functions with the sanitizer DLL. With this in-place,
+// other sanitizer components can simply write to the .INTR and .WEAK sections.
+//
+//===----------------------------------------------------------------------===//
+
+#if defined(SANITIZER_STATIC_RUNTIME_THUNK) || \
+    defined(SANITIZER_DYNAMIC_RUNTIME_THUNK)
+#  include "sanitizer_win_thunk_interception.h"
+
+extern "C" void abort();
+
+namespace __sanitizer {
+
+int override_function(const char *export_name, const uptr user_function) {
+  if (!__sanitizer_override_function(export_name, user_function)) {
+    abort();
+  }
+
+  return 0;
+}
+
+int register_weak(const char *export_name, const uptr user_function) {
+  if (!__sanitizer_register_weak_function(export_name, user_function)) {
+    abort();
+  }
+
+  return 0;
+}
+
+void initialize_thunks(const sanitizer_thunk *first,
+                       const sanitizer_thunk *last) {
+  for (const sanitizer_thunk *it = first; it < last; ++it) {
+    if (*it) {
+      (*it)();
+    }
+  }
+}
+}  // namespace __sanitizer
+
+#  define INTERFACE_FUNCTION(Name)
+#  define INTERFACE_WEAK_FUNCTION(Name) REGISTER_WEAK_FUNCTION(Name)
+#  include "sanitizer_common_interface.inc"
+
+#  pragma section(".INTR$A", read)  // intercept begin
+#  pragma section(".INTR$Z", read)  // intercept end
+#  pragma section(".WEAK$A", read)  // weak begin
+#  pragma section(".WEAK$Z", read)  // weak end
+
+extern "C" {
+__declspec(allocate(
+    ".INTR$A")) sanitizer_thunk __sanitizer_intercept_thunk_begin;
+__declspec(allocate(".INTR$Z")) sanitizer_thunk __sanitizer_intercept_thunk_end;
+
+__declspec(allocate(
+    ".WEAK$A")) sanitizer_thunk __sanitizer_register_weak_thunk_begin;
+__declspec(allocate(
+    ".WEAK$Z")) sanitizer_thunk __sanitizer_register_weak_thunk_end;
+}
+
+extern "C" int __sanitizer_thunk_init() {
+  // __sanitizer_thunk_init is expected to be called by only one thread.
+  static bool flag = false;
+  if (flag) {
+    return 0;
+  }
+  flag = true;
+
+  __sanitizer::initialize_thunks(&__sanitizer_intercept_thunk_begin,
+                                 &__sanitizer_intercept_thunk_end);
+  __sanitizer::initialize_thunks(&__sanitizer_register_weak_thunk_begin,
+                                 &__sanitizer_register_weak_thunk_end);
+
+  // In DLLs, the callbacks are expected to return 0,
+  // otherwise CRT initialization fails.
+  return 0;
+}
+
+// We want to call __sanitizer_thunk_init before C/C++ initializers /
+// constructors are executed, otherwise functions like memset might be invoked.
+#  pragma section(".CRT$XIB", long, read)
+__declspec(allocate(".CRT$XIB")) int (*__sanitizer_thunk_init_ptr)() =
+    __sanitizer_thunk_init;
+
+static void WINAPI sanitizer_thunk_thread_init(void *mod, unsigned long reason,
+                                               void *reserved) {
+  if (reason == /*DLL_PROCESS_ATTACH=*/1)
+    __sanitizer_thunk_init();
+}
+
+#  pragma section(".CRT$XLAB", long, read)
+__declspec(allocate(".CRT$XLAB")) void(
+    WINAPI *__sanitizer_thunk_thread_init_ptr)(void *, unsigned long, void *) =
+    sanitizer_thunk_thread_init;
+
+#endif  // defined(SANITIZER_STATIC_RUNTIME_THUNK) ||
+        // defined(SANITIZER_DYNAMIC_RUNTIME_THUNK)
\ No newline at end of file
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win_thunk_interception.h b/compiler-rt/lib/sanitizer_common/sanitizer_win_thunk_interception.h
new file mode 100644
index 00000000000000..70177d68aa8e65
--- /dev/null
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_win_thunk_interception.h
@@ -0,0 +1,81 @@
+//===-- sanitizer_win_thunk_interception.h -------------------------  -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This header provides helper macros and functions to delegate calls to the
+// shared runtime that lives in the sanitizer DLL.
+//===----------------------------------------------------------------------===//
+
+#ifndef SANITIZER_WIN_THUNK_INTERCEPTION_H
+#define SANITIZER_WIN_THUNK_INTERCEPTION_H
+#include <stdint.h>
+
+#include "sanitizer_internal_defs.h"
+
+extern "C" {
+__declspec(dllimport) bool __cdecl __sanitizer_override_function(
+    const char *export_name, __sanitizer::uptr user_function,
+    __sanitizer::uptr *old_function = nullptr);
+__declspec(dllimport) bool __cdecl __sanitizer_override_function_by_addr(
+    __sanitizer::uptr source_function, __sanitizer::uptr target_function,
+    __sanitizer::uptr *old_target_function = nullptr);
+__declspec(dllimport) bool __cdecl __sanitizer_register_weak_function(
+    const char *export_name, __sanitizer::uptr user_function,
+    __sanitizer::uptr *old_function = nullptr);
+}
+
+using sanitizer_thunk = int (*)();
+
+namespace __sanitizer {
+int override_function(const char *export_name, uptr user_function);
+int register_weak(const char *export_name, uptr user_function);
+void initialize_thunks(const sanitizer_thunk *begin,
+                       const sanitizer_thunk *end);
+}  // namespace __sanitizer
+
+// -------------------- Function interception macros ------------------------ //
+// We can't define our own version of strlen etc. because that would lead to
+// link-time or even type mismatch errors.  Instead, we can declare a function
+// just to be able to get its address.  We may miss the first few calls to the
+// functions since they can be called before __sanitizer_thunk_init, but that
+// would lead to false negatives in the startup code before user's global
+// initializers, which isn't a big deal.
+// Use .INTR segment to register function pointers that are iterated over during
+// startup that will replace local_function with sanitizer_export.
+
+#define INTERCEPT_LIBRARY_FUNCTION(local_function, sanitizer_export)   \
+  extern "C" void local_function();                                    \
+  static int intercept_##local_function() {                            \
+    return __sanitizer::override_function(                             \
+        sanitizer_export,                                              \
+        reinterpret_cast<__sanitizer::uptr>(local_function));          \
+  }                                                                    \
+  __pragma(section(".INTR$M", long, read)) __declspec(allocate(        \
+      ".INTR$M")) int (*__sanitizer_static_thunk_##local_function)() = \
+      intercept_##local_function;
+
+// ------------------ Weak symbol registration macros ---------------------- //
+// Use .WEAK segment to register function pointers that are iterated over during
+// startup that will replace sanitizer_export with local_function
+
+#define REGISTER_WEAK_FUNCTION(local_function)                           \
+  extern "C" void local_function();                                      \
+  extern "C" void WEAK_EXPORT_NAME(local_function)();                    \
+  WIN_WEAK_IMPORT_DEF(local_function)                                    \
+  __attribute__((optnone)) static int register_weak_##local_function() { \
+    if ((uintptr_t) & local_function != (uintptr_t) &                    \
+        WEAK_EXPORT_NAME(local_function)) {                              \
+      return __sanitizer::register_weak(                                 \
+          SANITIZER_STRINGIFY(WEAK_EXPORT_NAME(local_function)),         \
+          reinterpret_cast<__sanitizer::uptr>(local_function));          \
+    }                                                                    \
+    return 0;                                                            \
+  }                                                                      \
+  __pragma(section(".WEAK$M", long, read)) __declspec(allocate(          \
+      ".WEAK$M")) int (*__sanitizer_register_weak_##local_function)() =  \
+      register_weak_##local_function;
+
+#endif  // SANITIZER_WIN_THUNK_INTERCEPTION_H
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win_weak_interception.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_win_weak_interception.cpp
deleted file mode 100644
index b14bbf76d9a765..00000000000000
--- a/compiler-rt/lib/sanitizer_common/sanitizer_win_weak_interception.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-//===-- sanitizer_win_weak_interception.cpp -------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// This module should be included in the sanitizer when it is implemented as a
-// shared library on Windows (dll), in order to delegate the calls of weak
-// functions to the implementation in the main executable when a strong
-// definition is provided.
-//===----------------------------------------------------------------------===//
-
-#include "sanitizer_platform.h"
-#if SANITIZER_WINDOWS && SANITIZER_DYNAMIC
-#include "sanitizer_win_weak_interception.h"
-#include "sanitizer_allocator_interface.h"
-#include "sanitizer_interface_internal.h"
-#include "sanitizer_win_defs.h"
-#include "interception/interception.h"
-
-extern "C" {
-void *WINAPI GetModuleHandleA(const char *module_name);
-void abort();
-}
-
-namespace __sanitizer {
-// Try to get a pointer to real_function in the main module and override
-// dll_function with that pointer. If the function isn't found, nothing changes.
-int interceptWhenPossible(uptr dll_function, const char *real_function) {
-  uptr real = __interception::InternalGetProcAddress(
-      (void *)GetModuleHandleA(0), real_function);
-  if (real && !__interception::OverrideFunction((uptr)dll_function, real, 0))
-    abort();
-  return 0;
-}
-} // namespace __sanitizer
-
-// Declare weak hooks.
-extern "C" {
-void __sanitizer_on_print(const char *str);
-void __sanitizer_weak_hook_memcmp(uptr called_pc, const void *s1,
-                                  const void *s2, uptr n, int result);
-void __sanitizer_weak_hook_strcmp(uptr called_pc, const char *s1,
-                                  const char *s2, int result);
-void __sanitizer_weak_hook_strncmp(uptr called_pc, const char *s1,
-                                   const char *s2, uptr n, int result);
-void __sanitizer_weak_hook_strstr(uptr called_pc, const char *s1,
-                                  const char *s2, char *result);
-}
-
-// Include Sanitizer Common interface.
-#define INTERFACE_FUNCTION(Name)
-#define INTERFACE_WEAK_FUNCTION(Name) INTERCEPT_SANITIZER_WEAK_FUNCTION(Name)
-#include "sanitizer_common_interface.inc"
-
-#pragma section(".WEAK$A", read)
-#pragma section(".WEAK$Z", read)
-
-typedef void (*InterceptCB)();
-extern "C" {
-__declspec(allocate(".WEAK$A")) InterceptCB __start_weak_list;
-__declspec(allocate(".WEAK$Z")) InterceptCB __stop_weak_list;
-}
-
-static int weak_intercept_init() {
-  static bool flag = false;
-  // weak_interception_init is expected to be called by only one thread.
-  if (flag) return 0;
-  flag = true;
-
-  for (InterceptCB *it = &__start_weak_list; it < &__stop_weak_list; ++it)
-    if (*it)
-      (*it)();
-
-  // In DLLs, the callbacks are expected to return 0,
-  // otherwise CRT initialization fails.
-  return 0;
-}
-
-#pragma section(".CRT$XIB", long, read)
-__declspec(allocate(".CRT$XIB")) int (*__weak_intercept_preinit)() =
-    weak_intercept_init;
-
-static void WINAPI weak_intercept_thread_init(void *mod, unsigned long reason,
-                                              void *reserved) {
-  if (reason == /*DLL_PROCESS_ATTACH=*/1) weak_intercept_init();
-}
-
-#pragma section(".CRT$XLAB", long, read)
-__declspec(allocate(".CRT$XLAB")) void(WINAPI *__weak_intercept_tls_init)(
-    void *, unsigned long, void *) = weak_intercept_thread_init;
-
-#endif // SANITIZER_WINDOWS && SANITIZER_DYNAMIC
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win_weak_interception.h b/compiler-rt/lib/sanitizer_common/sanitizer_win_weak_interception.h
deleted file mode 100644
index 5e4d8b8def3e7d..00000000000000
--- a/compiler-rt/lib/sanitizer_common/sanitizer_win_weak_interception.h
+++ /dev/null
@@ -1,32 +0,0 @@
-//===-- sanitizer_win_weak_interception.h ---------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// This header provide helper macros to delegate calls of weak functions to the
-// implementation in the main executable when a strong definition is present.
-//===----------------------------------------------------------------------===//
-#ifndef SANITIZER_WIN_WEAK_INTERCEPTION_H
-#define SANITIZER_WIN_WEAK_INTERCEPTION_H
-#include "sanitizer_internal_defs.h"
-
-namespace __sanitizer {
-int interceptWhenPossible(uptr dll_function, const char *real_function);
-}
-
-// ----------------- Function interception helper macros -------------------- //
-// Weak functions, could be redefined in the main executable, but that is not
-// necessary, so we shouldn't die if we can not find a reference.
-#define INTERCEPT_WEAK(Name) interceptWhenPossible((uptr) Name, #Name);
-
-#define INTERCEPT_SANITIZER_WEAK_FUNCTION(Name)                                \
-  static int intercept_##Name() {                                              \
-    return __sanitizer::interceptWhenPossible((__sanitizer::uptr) Name, #Name);\
-  }                                                                            \
-  __pragma(section(".WEAK$M", long, read))                                     \
-  __declspec(allocate(".WEAK$M")) int (*__weak_intercept_##Name)() =           \
-      intercept_##Name;
-
-#endif // SANITIZER_WIN_WEAK_INTERCEPTION_H
diff --git a/compiler-rt/lib/ubsan/CMakeLists.txt b/compiler-rt/lib/ubsan/CMakeLists.txt
index db0b33f1276ef2..5d45a53d02dbd3 100644
--- a/compiler-rt/lib/ubsan/CMakeLists.txt
+++ b/compiler-rt/lib/ubsan/CMakeLists.txt
@@ -159,33 +159,12 @@ else()
     CFLAGS ${UBSAN_CXXFLAGS})
 
   if (WIN32)
-    add_compiler_rt_object_libraries(UbsanWeakInterception
+    set(DYNAMIC_RUNTIME_THUNK_CFLAGS -DSANITIZER_DYNAMIC_RUNTIME_THUNK -DSANITIZER_STATIC_RUNTIME_THUNK)
+    add_compiler_rt_object_libraries(UbsanRuntimeThunk
       ${SANITIZER_COMMON_SUPPORTED_OS}
       ARCHS ${UBSAN_SUPPORTED_ARCH}
       SOURCES
-        ubsan_win_weak_interception.cpp
-      CFLAGS ${UBSAN_CFLAGS} -DSANITIZER_DYNAMIC
-      DEFS ${UBSAN_COMMON_DEFINITIONS})
-
-    add_compiler_rt_object_libraries(UbsanDllThunk
-      ${SANITIZER_COMMON_SUPPORTED_OS}
-      ARCHS ${UBSAN_SUPPORTED_ARCH}
-      SOURCES
-        ubsan_win_dll_thunk.cpp
-      CFLAGS ${UBSAN_CFLAGS} -DSANITIZER_DLL_THUNK
-      DEFS ${UBSAN_COMMON_DEFINITIONS})
-
-    set(DYNAMIC_RUNTIME_THUNK_CFLAGS "-DSANITIZER_DYNAMIC_RUNTIME_THUNK")
-    if(MSVC)
-      list(APPEND DYNAMIC_RUNTIME_THUNK_CFLAGS "-Zl")
-    elseif(CMAKE_C_COMPILER_ID MATCHES Clang)
-      list(APPEND DYNAMIC_RUNTIME_THUNK_CFLAGS "-nodefaultlibs")
-    endif()
-    add_compiler_rt_object_libraries(UbsanDynamicRuntimeThunk
-      ${SANITIZER_COMMON_SUPPORTED_OS}
-      ARCHS ${UBSAN_SUPPORTED_ARCH}
-      SOURCES
-        ubsan_win_dynamic_runtime_thunk.cpp
+        ubsan_win_runtime_thunk.cpp
       CFLAGS ${UBSAN_CFLAGS} ${DYNAMIC_RUNTIME_THUNK_CFLAGS}
       DEFS ${UBSAN_COMMON_DEFINITIONS})
   endif()
diff --git a/compiler-rt/lib/ubsan/ubsan_win_dll_thunk.cpp b/compiler-rt/lib/ubsan/ubsan_win_dll_thunk.cpp
deleted file mode 100644
index 5ac7fc3e08e4c7..00000000000000
--- a/compiler-rt/lib/ubsan/ubsan_win_dll_thunk.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-//===-- ubsan_win_dll_thunk.cpp -------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines a family of thunks that should be statically linked into
-// the DLLs that have instrumentation in order to delegate the calls to the
-// shared runtime that lives in the main binary.
-// See https://github.com/google/sanitizers/issues/209 for the details.
-//===----------------------------------------------------------------------===//
-#ifdef SANITIZER_DLL_THUNK
-#include "sanitizer_common/sanitizer_win_dll_thunk.h"
-// Ubsan interface functions.
-#define INTERFACE_FUNCTION(Name) INTERCEPT_SANITIZER_FUNCTION(Name)
-#define INTERFACE_WEAK_FUNCTION(Name) INTERCEPT_SANITIZER_WEAK_FUNCTION(Name)
-#include "ubsan_interface.inc"
-#endif // SANITIZER_DLL_THUNK
diff --git a/compiler-rt/lib/ubsan/ubsan_win_dynamic_runtime_thunk.cpp b/compiler-rt/lib/ubsan/ubsan_win_runtime_thunk.cpp
similarity index 62%
rename from compiler-rt/lib/ubsan/ubsan_win_dynamic_runtime_thunk.cpp
rename to compiler-rt/lib/ubsan/ubsan_win_runtime_thunk.cpp
index 00722b4033a53f..5ca7d6f385cf27 100644
--- a/compiler-rt/lib/ubsan/ubsan_win_dynamic_runtime_thunk.cpp
+++ b/compiler-rt/lib/ubsan/ubsan_win_runtime_thunk.cpp
@@ -1,4 +1,4 @@
-//===-- ubsan_win_dynamic_runtime_thunk.cpp -------------------------------===//
+//===-- ubsan_win_runtime_thunk.cpp ---------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,11 +10,14 @@
 // to interact with Ubsan, when it is included in a dll.
 //
 //===----------------------------------------------------------------------===//
-#ifdef SANITIZER_DYNAMIC_RUNTIME_THUNK
+#if defined(SANITIZER_DYNAMIC_RUNTIME_THUNK) ||                                \
+    defined(SANITIZER_STATIC_RUNTIME_THUNK)
 #define SANITIZER_IMPORT_INTERFACE 1
 #include "sanitizer_common/sanitizer_win_defs.h"
+#include "sanitizer_common/sanitizer_win_thunk_interception.h"
 // Define weak alias for all weak functions imported from ubsan.
 #define INTERFACE_FUNCTION(Name)
-#define INTERFACE_WEAK_FUNCTION(Name) WIN_WEAK_IMPORT_DEF(Name)
+#define INTERFACE_WEAK_FUNCTION(Name) REGISTER_WEAK_FUNCTION(Name)
 #include "ubsan_interface.inc"
-#endif // SANITIZER_DYNAMIC_RUNTIME_THUNK
+#endif // defined(SANITIZER_DYNAMIC_RUNTIME_THUNK) ||
+       // defined(SANITIZER_STATIC_RUNTIME_THUNK)
diff --git a/compiler-rt/lib/ubsan/ubsan_win_weak_interception.cpp b/compiler-rt/lib/ubsan/ubsan_win_weak_interception.cpp
deleted file mode 100644
index 01db0c0ce78abe..00000000000000
--- a/compiler-rt/lib/ubsan/ubsan_win_weak_interception.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-//===-- ubsan_win_weak_interception.cpp -----------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// This module should be included in Ubsan when it is implemented as a shared
-// library on Windows (dll), in order to delegate the calls of weak functions to
-// the implementation in the main executable when a strong definition is
-// provided.
-//===----------------------------------------------------------------------===//
-#ifdef SANITIZER_DYNAMIC
-#include "sanitizer_common/sanitizer_win_weak_interception.h"
-#include "ubsan_flags.h"
-#include "ubsan_monitor.h"
-// Check if strong definitions for weak functions are present in the main
-// executable. If that is the case, override dll functions to point to strong
-// implementations.
-#define INTERFACE_FUNCTION(Name)
-#define INTERFACE_WEAK_FUNCTION(Name) INTERCEPT_SANITIZER_WEAK_FUNCTION(Name)
-#include "ubsan_interface.inc"
-#endif // SANITIZER_DYNAMIC
diff --git a/compiler-rt/test/asan/TestCases/Darwin/interface_symbols_darwin.cpp b/compiler-rt/test/asan/TestCases/Darwin/interface_symbols_darwin.cpp
index 2cedbc722c4635..59dca32672901a 100644
--- a/compiler-rt/test/asan/TestCases/Darwin/interface_symbols_darwin.cpp
+++ b/compiler-rt/test/asan/TestCases/Darwin/interface_symbols_darwin.cpp
@@ -35,6 +35,9 @@
 // RUN:  %p/../../../../lib/sanitizer_common/sanitizer_coverage_interface.inc     \
 // RUN:  | grep -e "INTERFACE_\(WEAK_\)\?FUNCTION"                                \
 // RUN:  | grep -v "__sanitizer_weak_hook"                                        \
+// RUN:  | grep -v "__sanitizer_override_function"                                \
+// RUN:  | grep -v "__sanitizer_override_function_by_addr"                        \
+// RUN:  | grep -v "__sanitizer_register_weak_function"                           \
 // RUN:  | sed -e "s/.*(//" -e "s/).*//" > %t.imports
 //
 // RUN: cat %t.imports | sort | uniq > %t.imports-sorted
diff --git a/compiler-rt/test/asan/TestCases/Linux/interface_symbols_linux.cpp b/compiler-rt/test/asan/TestCases/Linux/interface_symbols_linux.cpp
index ce1255c9578317..2d729497548d90 100644
--- a/compiler-rt/test/asan/TestCases/Linux/interface_symbols_linux.cpp
+++ b/compiler-rt/test/asan/TestCases/Linux/interface_symbols_linux.cpp
@@ -21,6 +21,9 @@
 // RUN:  %p/../../../../lib/sanitizer_common/sanitizer_coverage_interface.inc      \
 // RUN:  | grep -e "INTERFACE_\(WEAK_\)\?FUNCTION"                                 \
 // RUN:  | grep -v "__sanitizer_weak_hook"                                         \
+// RUN:  | grep -v "__sanitizer_override_function"                                 \
+// RUN:  | grep -v "__sanitizer_override_function_by_addr"                         \
+// RUN:  | grep -v "__sanitizer_register_weak_function"                            \
 // RUN:  | sed -e "s/.*(//" -e "s/).*//" > %t.imports
 //
 // RUN: cat %t.imports | sort | uniq > %t.imports-sorted
diff --git a/compiler-rt/test/asan/TestCases/Windows/double_free.cpp b/compiler-rt/test/asan/TestCases/Windows/double_free.cpp
index e288b40fac47a3..71c45e7e889a22 100644
--- a/compiler-rt/test/asan/TestCases/Windows/double_free.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/double_free.cpp
@@ -9,13 +9,13 @@ int main() {
   free(x);
   // CHECK: AddressSanitizer: attempting double-free on [[ADDR:0x[0-9a-f]+]]
   // CHECK-NEXT: {{#0 .* free }}
-  // CHECK: {{ #[1-2] .* main .*double_free.cpp}}:[[@LINE-3]]
+  // CHECK: {{ #[1-3] .* main .*double_free.cpp}}:[[@LINE-3]]
   // CHECK: [[ADDR]] is located 0 bytes inside of 168-byte region
   // CHECK-LABEL: freed by thread T0 here:
   // CHECK-NEXT: {{#0 .* free }}
-  // CHECK: {{ #[1-2] .* main .*double_free.cpp}}:[[@LINE-8]]
+  // CHECK: {{ #[1-3] .* main .*double_free.cpp}}:[[@LINE-8]]
   // CHECK-LABEL: previously allocated by thread T0 here:
   // CHECK-NEXT: {{#0 .* malloc }}
-  // CHECK: {{ #[1-2] .* main .*double_free.cpp}}:[[@LINE-12]]
+  // CHECK: {{ #[1-3] .* main .*double_free.cpp}}:[[@LINE-12]]
   return 0;
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/free_hook_realloc.cpp b/compiler-rt/test/asan/TestCases/Windows/free_hook_realloc.cpp
index 11e8c9975cf3bf..297218bf8e99f1 100644
--- a/compiler-rt/test/asan/TestCases/Windows/free_hook_realloc.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/free_hook_realloc.cpp
@@ -5,9 +5,6 @@
 // FIXME: merge this with the common free_hook_realloc test when we can run
 // common tests on Windows.
 
-// FIXME: Doesn't work with DLLs
-// XFAIL: win32-dynamic-asan
-
 #include <stdlib.h>
 #include <io.h>
 #include <sanitizer/allocator_interface.h>
diff --git a/compiler-rt/test/asan/TestCases/Windows/malloc_left_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/malloc_left_oob.cpp
index 7ea95d2b2184a0..e5de2269ffee04 100644
--- a/compiler-rt/test/asan/TestCases/Windows/malloc_left_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/malloc_left_oob.cpp
@@ -12,6 +12,6 @@ int main() {
   // CHECK: [[ADDR]] is located 1 bytes before 42-byte region
   // CHECK: allocated by thread T0 here:
   // CHECK-NEXT: {{#0 .* malloc }}
-  // CHECK: {{ #[1-2] .* main .*malloc_left_oob.cpp}}:[[@LINE-8]]
+  // CHECK: {{ #[1-3] .* main .*malloc_left_oob.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/malloc_right_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/malloc_right_oob.cpp
index 1495632456e081..6007345755d88e 100644
--- a/compiler-rt/test/asan/TestCases/Windows/malloc_right_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/malloc_right_oob.cpp
@@ -12,6 +12,6 @@ int main() {
   // CHECK: [[ADDR]] is located 0 bytes after 42-byte region
   // CHECK: allocated by thread T0 here:
   // CHECK-NEXT: {{#0 .* malloc }}
-  // CHECK: {{ #[1-2] .* main .*malloc_right_oob.cpp}}:[[@LINE-8]]
+  // CHECK: {{ #[1-3] .* main .*malloc_right_oob.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/malloc_uaf.cpp b/compiler-rt/test/asan/TestCases/Windows/malloc_uaf.cpp
index d1eac7e55f6010..59a944c75b60db 100644
--- a/compiler-rt/test/asan/TestCases/Windows/malloc_uaf.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/malloc_uaf.cpp
@@ -13,8 +13,8 @@ int main() {
   // CHECK: [[ADDR]] is located 0 bytes inside of 42-byte region
   // CHECK: freed by thread T0 here:
   // CHECK-NEXT: {{#0 .* free }}
-  // CHECK: {{ #[1-2] .* main .*malloc_uaf.cpp}}:[[@LINE-8]]
+  // CHECK: {{ #[1-3] .* main .*malloc_uaf.cpp}}:[[@LINE-8]]
   // CHECK: previously allocated by thread T0 here:
   // CHECK-NEXT: {{#0 .* malloc }}
-  // CHECK: {{ #[1-2] .* main .*malloc_uaf.cpp}}:[[@LINE-12]]
+  // CHECK: {{ #[1-3] .* main .*malloc_uaf.cpp}}:[[@LINE-12]]
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/msvc/dll_and_lib.cpp b/compiler-rt/test/asan/TestCases/Windows/msvc/dll_and_lib.cpp
index 96fae6b1d60392..175bdefa7c995d 100644
--- a/compiler-rt/test/asan/TestCases/Windows/msvc/dll_and_lib.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/msvc/dll_and_lib.cpp
@@ -1,10 +1,9 @@
 // Just make sure we can link an implib into another DLL
 // This used to fail between r212699 and r212814.
 // RUN: %clang_cl_asan -DCONFIG=1 %s -c -Fo%t.1.obj
-// RUN: lld-link /nologo /DLL /OUT:%t.1.dll %t.1.obj %asan_dll_thunk
+// RUN: lld-link /nologo /DLL /OUT:%t.1.dll %t.1.obj %asan_lib %asan_thunk
 // RUN: %clang_cl_asan -DCONFIG=2 %s -c -Fo%t.2.obj
-// RUN: lld-link /nologo /DLL /OUT:%t.2.dll %t.2.obj %t.1.lib %asan_dll_thunk
-// REQUIRES: asan-static-runtime
+// RUN: lld-link /nologo /DLL /OUT:%t.2.dll %t.2.obj %t.1.lib %asan_lib %asan_thunk
 // REQUIRES: lld-available
 
 #if CONFIG==1
diff --git a/compiler-rt/test/asan/TestCases/Windows/msvc/dll_large_function.cpp b/compiler-rt/test/asan/TestCases/Windows/msvc/dll_large_function.cpp
index 788488dbb8ed82..f0c3deabbcf970 100644
--- a/compiler-rt/test/asan/TestCases/Windows/msvc/dll_large_function.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/msvc/dll_large_function.cpp
@@ -3,8 +3,7 @@
 // from the DLL.  We simulate the large function with
 // -mllvm -asan-instrumentation-with-call-threshold=0.
 // RUN: %clang_cl_asan %s -c -Fo%t.obj -mllvm -asan-instrumentation-with-call-threshold=0
-// RUN: lld-link /nologo /DLL /OUT:%t.dll %t.obj %asan_dll_thunk
-// REQUIRES: asan-static-runtime
+// RUN: lld-link /nologo /DLL /OUT:%t.dll %t.obj  %asan_lib %asan_thunk
 // REQUIRES: lld-available
 
 void f(long* foo, long* bar) {
diff --git a/compiler-rt/test/asan/TestCases/Windows/realloc_left_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/realloc_left_oob.cpp
index ebde5f159ae383..f1fd139c582511 100644
--- a/compiler-rt/test/asan/TestCases/Windows/realloc_left_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/realloc_left_oob.cpp
@@ -12,6 +12,6 @@ int main() {
   // CHECK: [[ADDR]] is located 1 bytes before 42-byte region
   // CHECK: allocated by thread T0 here:
   // CHECK-NEXT: {{#0 .* realloc }}
-  // CHECK: {{ #[1-2] .* main .*realloc_left_oob.cpp}}:[[@LINE-8]]
+  // CHECK: {{ #[1-3] .* main .*realloc_left_oob.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/realloc_right_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/realloc_right_oob.cpp
index 281efed5d30740..ea674f53def793 100644
--- a/compiler-rt/test/asan/TestCases/Windows/realloc_right_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/realloc_right_oob.cpp
@@ -12,6 +12,6 @@ int main() {
   // CHECK: [[ADDR]] is located 0 bytes after 42-byte region
   // CHECK: allocated by thread T0 here:
   // CHECK-NEXT: {{#0 .* realloc }}
-  // CHECK: {{ #[1-2] .* main .*realloc_right_oob.cpp}}:[[@LINE-8]]
+  // CHECK: {{ #[1-3] .* main .*realloc_right_oob.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/realloc_uaf.cpp b/compiler-rt/test/asan/TestCases/Windows/realloc_uaf.cpp
index 6ff2217b11a257..7d9c41ef0f4621 100644
--- a/compiler-rt/test/asan/TestCases/Windows/realloc_uaf.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/realloc_uaf.cpp
@@ -13,8 +13,8 @@ int main() {
   // CHECK: [[ADDR]] is located 0 bytes inside of 42-byte region
   // CHECK: freed by thread T0 here:
   // CHECK-NEXT: {{#0 .* free }}
-  // CHECK: {{ #[1-2] .* main .*realloc_uaf.cpp}}:[[@LINE-8]]
+  // CHECK: {{ #[1-3] .* main .*realloc_uaf.cpp}}:[[@LINE-8]]
   // CHECK: previously allocated by thread T0 here:
   // CHECK-NEXT: {{#0 .* realloc }}
-  // CHECK: {{ #[1-2] .* main .*realloc_uaf.cpp}}:[[@LINE-12]]
+  // CHECK: {{ #[1-3] .* main .*realloc_uaf.cpp}}:[[@LINE-12]]
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/symbols_path.cpp b/compiler-rt/test/asan/TestCases/Windows/symbols_path.cpp
index be99c89e7083ef..05437abc07c829 100644
--- a/compiler-rt/test/asan/TestCases/Windows/symbols_path.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/symbols_path.cpp
@@ -17,6 +17,6 @@ int main() {
   // CHECK: [[ADDR]] is located 1 bytes before 42-byte region
   // CHECK: allocated by thread T0 here:
   // CHECK-NEXT: {{#0 .* malloc}}
-  // CHECK: {{ #[1-2] .* main .*symbols_path.cpp}}:[[@LINE-8]]
+  // CHECK: {{ #[1-3] .* main .*symbols_path.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/unsymbolized.cpp b/compiler-rt/test/asan/TestCases/Windows/unsymbolized.cpp
index 0eb1e9ee91b0a7..00428b809fccd7 100644
--- a/compiler-rt/test/asan/TestCases/Windows/unsymbolized.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/unsymbolized.cpp
@@ -4,7 +4,7 @@
 
 // RUN: rm -f %t.pdb
 // RUN: %clangxx_asan -c -O2 %s -o %t.obj
-// RUN: lld-link /nologo /OUT:%t.exe %t.obj %asan_lib %asan_cxx_lib
+// RUN: lld-link /nologo /OUT:%t.exe %t.obj -defaultlib:libcmt -nodefaultlib:msvcrt -defaultlib:oldnames %asan_static_runtime_thunk %asan_lib
 // RUN: not %run %t.exe 2>&1 | FileCheck %s
 // REQUIRES: lld-available
 
diff --git a/compiler-rt/test/asan/TestCases/Windows/use_after_realloc.cpp b/compiler-rt/test/asan/TestCases/Windows/use_after_realloc.cpp
index 4c32c63c38fa1f..35947b3253857c 100644
--- a/compiler-rt/test/asan/TestCases/Windows/use_after_realloc.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/use_after_realloc.cpp
@@ -15,9 +15,9 @@ int main() {
   // CHECK: [[ADDR]] is located 0 bytes inside of 32-byte region
   // CHECK: freed by thread T0 here:
   // CHECK-NEXT: {{#0 .* realloc }}
-  // CHECK: {{ #[1-2] .* main .*use_after_realloc.cpp}}:[[@LINE-9]]
+  // CHECK: {{ #[1-3] .* main .*use_after_realloc.cpp}}:[[@LINE-9]]
   // CHECK: previously allocated by thread T0 here:
   // CHECK-NEXT: {{#0 .* realloc }}
-  // CHECK: {{ #[1-2] .* main .*use_after_realloc.cpp}}:[[@LINE-14]]
+  // CHECK: {{ #[1-3] .* main .*use_after_realloc.cpp}}:[[@LINE-14]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/debug_double_free.cpp b/compiler-rt/test/asan/TestCases/debug_double_free.cpp
index de5ac7b0c8d5cd..8a2ce40bc561f6 100644
--- a/compiler-rt/test/asan/TestCases/debug_double_free.cpp
+++ b/compiler-rt/test/asan/TestCases/debug_double_free.cpp
@@ -4,9 +4,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-// FIXME: Doesn't work with DLLs
-// XFAIL: win32-dynamic-asan
-
 // If we use %p with MSVC, it comes out all upper case. Use %08x to get
 // lowercase hex.
 #ifdef _MSC_VER
diff --git a/compiler-rt/test/asan/TestCases/debug_report.cpp b/compiler-rt/test/asan/TestCases/debug_report.cpp
index 617b7ee91e18d7..855642bdc0d3bb 100644
--- a/compiler-rt/test/asan/TestCases/debug_report.cpp
+++ b/compiler-rt/test/asan/TestCases/debug_report.cpp
@@ -6,9 +6,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-// FIXME: Doesn't work with DLLs
-// XFAIL: win32-dynamic-asan
-
 int main() {
   // Disable stderr buffering. Needed on Windows.
   setvbuf(stderr, NULL, _IONBF, 0);
diff --git a/compiler-rt/test/asan/TestCases/default_options.cpp b/compiler-rt/test/asan/TestCases/default_options.cpp
index 526dab6450e9bd..845e8a5f1793e4 100644
--- a/compiler-rt/test/asan/TestCases/default_options.cpp
+++ b/compiler-rt/test/asan/TestCases/default_options.cpp
@@ -1,11 +1,7 @@
 // RUN: %clangxx_asan -O2 %s -o %t
 // RUN: %run %t 2>&1 | FileCheck %s
 
-// FIXME: Doesn't work with DLLs
-// XFAIL: win32-dynamic-asan
-
 const char *kAsanDefaultOptions = "verbosity=1 help=1";
-
 // Required for dyld macOS 12.0+
 #if (__APPLE__)
 __attribute__((weak))
diff --git a/compiler-rt/test/asan/TestCases/on_error_callback.cpp b/compiler-rt/test/asan/TestCases/on_error_callback.cpp
index f65a8f1abe8310..c38a36f0e33bda 100644
--- a/compiler-rt/test/asan/TestCases/on_error_callback.cpp
+++ b/compiler-rt/test/asan/TestCases/on_error_callback.cpp
@@ -1,8 +1,5 @@
 // RUN: %clangxx_asan -O2 %s -o %t && not %run %t 2>&1 | FileCheck %s
 
-// FIXME: Doesn't work with DLLs
-// XFAIL: win32-dynamic-asan
-
 #include <stdio.h>
 #include <stdlib.h>
 
diff --git a/compiler-rt/test/asan/TestCases/report_error_summary.cpp b/compiler-rt/test/asan/TestCases/report_error_summary.cpp
index d565d2add77934..9e024e35bed864 100644
--- a/compiler-rt/test/asan/TestCases/report_error_summary.cpp
+++ b/compiler-rt/test/asan/TestCases/report_error_summary.cpp
@@ -1,8 +1,5 @@
 // RUN: %clangxx_asan -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s
 
-// FIXME: Doesn't work with DLLs
-// XFAIL: win32-dynamic-asan
-
 #include <stdio.h>
 
 // Required for ld64 macOS 12.0+
diff --git a/compiler-rt/test/asan/lit.cfg.py b/compiler-rt/test/asan/lit.cfg.py
index 83b3cbe789cacc..05ed7e8dd294e3 100644
--- a/compiler-rt/test/asan/lit.cfg.py
+++ b/compiler-rt/test/asan/lit.cfg.py
@@ -130,6 +130,11 @@ def build_invocation(compile_flags, with_lto=False):
             config.compiler_rt_libdir,
             "libclang_rt.asan_{}_dynamic.dylib".format(config.apple_platform),
         )
+    elif config.host_os == "Windows":
+        shared_libasan_path = os.path.join(
+            config.compiler_rt_libdir,
+            "clang_rt.asan_dynamic-{}.lib".format(config.target_suffix),
+        )
     else:
         lit_config.warning(
             "%shared_libasan substitution not set but dynamic ASan is available."
@@ -178,8 +183,22 @@ def build_invocation(compile_flags, with_lto=False):
         base_lib = os.path.join(
             config.compiler_rt_libdir, "clang_rt.asan%%s%s.lib" % config.target_suffix
         )
-        config.substitutions.append(("%asan_lib", base_lib % ""))
+        config.substitutions.append(("%asan_lib", base_lib % "_dynamic"))
+        if config.asan_dynamic:
+            config.substitutions.append(
+                ("%asan_thunk", base_lib % "_dynamic_runtime_thunk")
+            )
+        else:
+            config.substitutions.append(
+                ("%asan_thunk", base_lib % "_static_runtime_thunk")
+            )
         config.substitutions.append(("%asan_cxx_lib", base_lib % "_cxx"))
+        config.substitutions.append(
+            ("%asan_dynamic_runtime_thunk", base_lib % "_dynamic_runtime_thunk")
+        )
+        config.substitutions.append(
+            ("%asan_static_runtime_thunk", base_lib % "_static_runtime_thunk")
+        )
         config.substitutions.append(("%asan_dll_thunk", base_lib % "_dll_thunk"))
     else:
         # To make some of these tests work on MinGW target without changing their
@@ -262,7 +281,7 @@ def build_invocation(compile_flags, with_lto=False):
 
 # Add the RT libdir to PATH directly so that we can successfully run the gtest
 # binary to list its tests.
-if config.host_os == "Windows" and config.asan_dynamic:
+if config.host_os == "Windows":
     os.environ["PATH"] = os.path.pathsep.join(
         [config.compiler_rt_libdir, os.environ.get("PATH", "")]
     )

From ce2927a396c506cbf06ea39eff8f226fd8d46b94 Mon Sep 17 00:00:00 2001
From: Matheus Izvekov <mizvekov@gmail.com>
Date: Wed, 29 May 2024 21:45:01 -0300
Subject: [PATCH 209/230] [clang] fix printing of canonical template template
 parameters take 2 (#93448)

Since they can also occur as the template name of
template specializations, handle them from TemplateName printing instead
of TemplateArgument.
---
 clang/lib/AST/TemplateBase.cpp              | 11 +----------
 clang/lib/AST/TemplateName.cpp              | 14 ++++++++++++++
 clang/test/SemaTemplate/deduction-guide.cpp | 10 +++++-----
 3 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/clang/lib/AST/TemplateBase.cpp b/clang/lib/AST/TemplateBase.cpp
index 6d3c843cfd29e0..46f7b79b272ef3 100644
--- a/clang/lib/AST/TemplateBase.cpp
+++ b/clang/lib/AST/TemplateBase.cpp
@@ -544,16 +544,7 @@ void TemplateArgument::print(const PrintingPolicy &Policy, raw_ostream &Out,
     break;
 
   case Template: {
-    TemplateName TN = getAsTemplate();
-    if (const auto *TD = TN.getAsTemplateDecl();
-        TD && TD->getDeclName().isEmpty()) {
-      assert(isa<TemplateTemplateParmDecl>(TD) &&
-             "Unexpected anonymous template");
-      const auto *TTP = cast<TemplateTemplateParmDecl>(TD);
-      Out << "template-parameter-" << TTP->getDepth() << "-" << TTP->getIndex();
-    } else {
-      TN.print(Out, Policy);
-    }
+    getAsTemplate().print(Out, Policy);
     break;
   }
 
diff --git a/clang/lib/AST/TemplateName.cpp b/clang/lib/AST/TemplateName.cpp
index 3aae998eceeb05..3dbdad92813f6f 100644
--- a/clang/lib/AST/TemplateName.cpp
+++ b/clang/lib/AST/TemplateName.cpp
@@ -292,6 +292,14 @@ void TemplateName::Profile(llvm::FoldingSetNodeID &ID) {
 
 void TemplateName::print(raw_ostream &OS, const PrintingPolicy &Policy,
                          Qualified Qual) const {
+  auto handleAnonymousTTP = [](TemplateDecl *TD, raw_ostream &OS) {
+    if (TemplateTemplateParmDecl *TTP = dyn_cast<TemplateTemplateParmDecl>(TD);
+        TTP && TTP->getIdentifier() == nullptr) {
+      OS << "template-parameter-" << TTP->getDepth() << "-" << TTP->getIndex();
+      return true;
+    }
+    return false;
+  };
   if (NameKind Kind = getKind();
       Kind == TemplateName::Template || Kind == TemplateName::UsingTemplate) {
     // After `namespace ns { using std::vector }`, what is the fully-qualified
@@ -304,6 +312,8 @@ void TemplateName::print(raw_ostream &OS, const PrintingPolicy &Policy,
     // names more often than to export them, thus using the original name is
     // most useful in this case.
     TemplateDecl *Template = getAsTemplateDecl();
+    if (handleAnonymousTTP(Template, OS))
+      return;
     if (Qual == Qualified::None)
       OS << *Template;
     else
@@ -320,6 +330,10 @@ void TemplateName::print(raw_ostream &OS, const PrintingPolicy &Policy,
            Underlying.getKind() == TemplateName::UsingTemplate);
 
     TemplateDecl *UTD = Underlying.getAsTemplateDecl();
+
+    if (handleAnonymousTTP(UTD, OS))
+      return;
+
     if (IdentifierInfo *II = UTD->getIdentifier();
         Policy.CleanUglifiedParameters && II &&
         isa<TemplateTemplateParmDecl>(UTD))
diff --git a/clang/test/SemaTemplate/deduction-guide.cpp b/clang/test/SemaTemplate/deduction-guide.cpp
index 96b4cd9622a24f..100b580fe9f02d 100644
--- a/clang/test/SemaTemplate/deduction-guide.cpp
+++ b/clang/test/SemaTemplate/deduction-guide.cpp
@@ -315,19 +315,19 @@ namespace TTP {
 // CHECK-NEXT:  |-TemplateTypeParmDecl {{.+}} class depth 0 index 0 T{{$}}
 // CHECK-NEXT:  |-TemplateTemplateParmDecl {{.+}} depth 0 index 1 TT{{$}}
 // CHECK-NEXT:  | `-TemplateTypeParmDecl {{.+}} class depth 1 index 0{{$}}
-// CHECK-NEXT:  |-CXXDeductionGuideDecl {{.+}} 'auto (<T>) -> B<T>'{{$}}
-// CHECK-NEXT:  | `-ParmVarDecl {{.+}} '<T>'{{$}}
+// CHECK-NEXT:  |-CXXDeductionGuideDecl {{.+}} 'auto (template-parameter-0-1<T>) -> B<T>'{{$}}
+// CHECK-NEXT:  | `-ParmVarDecl {{.+}} 'template-parameter-0-1<T>'{{$}}
 // CHECK-NEXT:  `-CXXDeductionGuideDecl {{.+}} 'auto (A<int>) -> TTP::B<int>'
 // CHECK-NEXT:    |-TemplateArgument type 'int'
 // CHECK-NEXT:    | `-BuiltinType {{.+}} 'int'{{$}}
 // CHECK-NEXT:    |-TemplateArgument template 'TTP::A'{{$}}
 // CHECK-NEXT:    | `-ClassTemplateDecl {{.+}} A{{$}}
 // CHECK-NEXT:    `-ParmVarDecl {{.+}} 'A<int>':'TTP::A<int>'{{$}}
-// CHECK-NEXT:  FunctionProtoType {{.+}} 'auto (<T>) -> B<T>' dependent trailing_return cdecl{{$}}
+// CHECK-NEXT:  FunctionProtoType {{.+}} 'auto (template-parameter-0-1<T>) -> B<T>' dependent trailing_return cdecl{{$}}
 // CHECK-NEXT:  |-InjectedClassNameType {{.+}} 'B<T>' dependent{{$}}
 // CHECK-NEXT:  | `-CXXRecord {{.+}} 'B'{{$}}
-// CHECK-NEXT:  `-ElaboratedType {{.+}} '<T>' sugar dependent{{$}}
-// CHECK-NEXT:    `-TemplateSpecializationType {{.+}} '<T>' dependent {{$}}
+// CHECK-NEXT:  `-ElaboratedType {{.+}} 'template-parameter-0-1<T>' sugar dependent{{$}}
+// CHECK-NEXT:    `-TemplateSpecializationType {{.+}} 'template-parameter-0-1<T>' dependent template-parameter-0-1{{$}}
 // CHECK-NEXT:      `-TemplateArgument type 'T':'type-parameter-0-0'{{$}}
 // CHECK-NEXT:        `-TemplateTypeParmType {{.+}} 'T' dependent depth 0 index 0{{$}}
 // CHECK-NEXT:          `-TemplateTypeParm {{.+}} 'T'{{$}}

From 10436aede261db5260e0e11a4aa66565ba975848 Mon Sep 17 00:00:00 2001
From: Charlie Barto <chbarto@microsoft.com>
Date: Wed, 29 May 2024 17:52:54 -0700
Subject: [PATCH 210/230] Revert "[asan][windows] Eliminate the static asan
 runtime on windows (#81677)"

This reverts commit 246234ac70faa1e3281a2bb83dfc4dd206a7d59c.
---
 clang/lib/Driver/SanitizerArgs.cpp            |  14 +-
 clang/lib/Driver/ToolChains/MSVC.cpp          |  26 +-
 clang/test/Driver/cl-link.c                   |  10 +-
 compiler-rt/CMakeLists.txt                    |   8 +-
 compiler-rt/lib/asan/CMakeLists.txt           | 159 ++++++------
 compiler-rt/lib/asan/asan_flags.cpp           |  96 +-------
 compiler-rt/lib/asan/asan_globals_win.cpp     |   4 +-
 compiler-rt/lib/asan/asan_malloc_win.cpp      |  97 ++++----
 .../lib/asan/asan_malloc_win_thunk.cpp        | 229 ------------------
 .../asan/asan_win_common_runtime_thunk.cpp    | 112 ---------
 .../lib/asan/asan_win_common_runtime_thunk.h  |  38 ---
 compiler-rt/lib/asan/asan_win_dll_thunk.cpp   | 165 +++++++++++++
 .../asan/asan_win_dynamic_runtime_thunk.cpp   | 104 ++++++--
 .../asan/asan_win_static_runtime_thunk.cpp    | 110 ---------
 compiler-rt/lib/asan/tests/CMakeLists.txt     |   2 +-
 compiler-rt/lib/profile/CMakeLists.txt        |   6 -
 .../lib/sanitizer_common/CMakeLists.txt       |  55 ++++-
 .../sanitizer_common_interface.inc            |   6 -
 .../sanitizer_coverage_win_dll_thunk.cpp      |  20 ++
 ...er_coverage_win_dynamic_runtime_thunk.cpp} |  21 +-
 ...nitizer_coverage_win_weak_interception.cpp |  23 ++
 .../sanitizer_win_dll_thunk.cpp               | 101 ++++++++
 .../sanitizer_win_dll_thunk.h                 | 181 ++++++++++++++
 .../sanitizer_win_dynamic_runtime_thunk.cpp   |  26 ++
 .../sanitizer_win_immortalize.h               |  71 ------
 .../sanitizer_win_interception.cpp            | 156 ------------
 .../sanitizer_win_interception.h              |  32 ---
 .../sanitizer_win_thunk_interception.cpp      | 110 ---------
 .../sanitizer_win_thunk_interception.h        |  81 -------
 .../sanitizer_win_weak_interception.cpp       |  94 +++++++
 .../sanitizer_win_weak_interception.h         |  32 +++
 compiler-rt/lib/ubsan/CMakeLists.txt          |  27 ++-
 compiler-rt/lib/ubsan/ubsan_win_dll_thunk.cpp |  20 ++
 ...pp => ubsan_win_dynamic_runtime_thunk.cpp} |  11 +-
 .../lib/ubsan/ubsan_win_weak_interception.cpp |  23 ++
 .../Darwin/interface_symbols_darwin.cpp       |   3 -
 .../Linux/interface_symbols_linux.cpp         |   3 -
 .../asan/TestCases/Windows/double_free.cpp    |   6 +-
 .../TestCases/Windows/free_hook_realloc.cpp   |   3 +
 .../TestCases/Windows/malloc_left_oob.cpp     |   2 +-
 .../TestCases/Windows/malloc_right_oob.cpp    |   2 +-
 .../asan/TestCases/Windows/malloc_uaf.cpp     |   4 +-
 .../TestCases/Windows/msvc/dll_and_lib.cpp    |   5 +-
 .../Windows/msvc/dll_large_function.cpp       |   3 +-
 .../TestCases/Windows/realloc_left_oob.cpp    |   2 +-
 .../TestCases/Windows/realloc_right_oob.cpp   |   2 +-
 .../asan/TestCases/Windows/realloc_uaf.cpp    |   4 +-
 .../asan/TestCases/Windows/symbols_path.cpp   |   2 +-
 .../asan/TestCases/Windows/unsymbolized.cpp   |   2 +-
 .../TestCases/Windows/use_after_realloc.cpp   |   4 +-
 .../test/asan/TestCases/debug_double_free.cpp |   3 +
 .../test/asan/TestCases/debug_report.cpp      |   3 +
 .../test/asan/TestCases/default_options.cpp   |   4 +
 .../test/asan/TestCases/on_error_callback.cpp |   3 +
 .../asan/TestCases/report_error_summary.cpp   |   3 +
 compiler-rt/test/asan/lit.cfg.py              |  23 +-
 56 files changed, 1069 insertions(+), 1287 deletions(-)
 delete mode 100644 compiler-rt/lib/asan/asan_malloc_win_thunk.cpp
 delete mode 100644 compiler-rt/lib/asan/asan_win_common_runtime_thunk.cpp
 delete mode 100644 compiler-rt/lib/asan/asan_win_common_runtime_thunk.h
 create mode 100644 compiler-rt/lib/asan/asan_win_dll_thunk.cpp
 delete mode 100644 compiler-rt/lib/asan/asan_win_static_runtime_thunk.cpp
 create mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_dll_thunk.cpp
 rename compiler-rt/lib/sanitizer_common/{sanitizer_coverage_win_runtime_thunk.cpp => sanitizer_coverage_win_dynamic_runtime_thunk.cpp} (59%)
 create mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_weak_interception.cpp
 create mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_win_dll_thunk.cpp
 create mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_win_dll_thunk.h
 create mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_win_dynamic_runtime_thunk.cpp
 delete mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_win_immortalize.h
 delete mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_win_interception.cpp
 delete mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_win_interception.h
 delete mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_win_thunk_interception.cpp
 delete mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_win_thunk_interception.h
 create mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_win_weak_interception.cpp
 create mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_win_weak_interception.h
 create mode 100644 compiler-rt/lib/ubsan/ubsan_win_dll_thunk.cpp
 rename compiler-rt/lib/ubsan/{ubsan_win_runtime_thunk.cpp => ubsan_win_dynamic_runtime_thunk.cpp} (62%)
 create mode 100644 compiler-rt/lib/ubsan/ubsan_win_weak_interception.cpp

diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index 7b7fd2d9d47421..273f215ca94a88 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -909,16 +909,10 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC,
         DiagnoseErrors);
   }
 
-  SharedRuntime = Args.hasFlag(
-      options::OPT_shared_libsan, options::OPT_static_libsan,
-      TC.getTriple().isAndroid() || TC.getTriple().isOSFuchsia() ||
-          TC.getTriple().isOSDarwin() || TC.getTriple().isOSWindows());
-  if (!SharedRuntime && TC.getTriple().isOSWindows()) {
-    Arg *A =
-        Args.getLastArg(options::OPT_shared_libsan, options::OPT_static_libsan);
-    D.Diag(clang::diag::err_drv_unsupported_opt_for_target)
-        << A->getSpelling() << TC.getTriple().str();
-  }
+  SharedRuntime =
+      Args.hasFlag(options::OPT_shared_libsan, options::OPT_static_libsan,
+                   TC.getTriple().isAndroid() || TC.getTriple().isOSFuchsia() ||
+                       TC.getTriple().isOSDarwin());
 
   ImplicitCfiRuntime = TC.getTriple().isAndroid();
 
diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp
index bf54f04363851b..b7021d4b996ddd 100644
--- a/clang/lib/Driver/ToolChains/MSVC.cpp
+++ b/clang/lib/Driver/ToolChains/MSVC.cpp
@@ -201,10 +201,10 @@ void visualstudio::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   if (TC.getSanitizerArgs(Args).needsAsanRt()) {
     CmdArgs.push_back(Args.MakeArgString("-debug"));
     CmdArgs.push_back(Args.MakeArgString("-incremental:no"));
-    CmdArgs.push_back(TC.getCompilerRTArgString(Args, "asan_dynamic"));
-    auto defines = Args.getAllArgValues(options::OPT_D);
-    if (Args.hasArg(options::OPT__SLASH_MD, options::OPT__SLASH_MDd) ||
-        find(begin(defines), end(defines), "_DLL") != end(defines)) {
+    if (TC.getSanitizerArgs(Args).needsSharedRt() ||
+        Args.hasArg(options::OPT__SLASH_MD, options::OPT__SLASH_MDd)) {
+      for (const auto &Lib : {"asan_dynamic", "asan_dynamic_runtime_thunk"})
+        CmdArgs.push_back(TC.getCompilerRTArgString(Args, Lib));
       // Make sure the dynamic runtime thunk is not optimized out at link time
       // to ensure proper SEH handling.
       CmdArgs.push_back(Args.MakeArgString(
@@ -213,15 +213,19 @@ void visualstudio::Linker::ConstructJob(Compilation &C, const JobAction &JA,
               : "-include:__asan_seh_interceptor"));
       // Make sure the linker consider all object files from the dynamic runtime
       // thunk.
-      CmdArgs.push_back(Args.MakeArgString(
-          std::string("-wholearchive:") +
+      CmdArgs.push_back(Args.MakeArgString(std::string("-wholearchive:") +
           TC.getCompilerRT(Args, "asan_dynamic_runtime_thunk")));
+    } else if (DLL) {
+      CmdArgs.push_back(TC.getCompilerRTArgString(Args, "asan_dll_thunk"));
     } else {
-      // Make sure the linker consider all object files from the static runtime
-      // thunk.
-      CmdArgs.push_back(Args.MakeArgString(
-          std::string("-wholearchive:") +
-          TC.getCompilerRT(Args, "asan_static_runtime_thunk")));
+      for (const auto &Lib : {"asan", "asan_cxx"}) {
+        CmdArgs.push_back(TC.getCompilerRTArgString(Args, Lib));
+        // Make sure the linker consider all object files from the static lib.
+        // This is necessary because instrumented dlls need access to all the
+        // interface exported by the static lib in the main executable.
+        CmdArgs.push_back(Args.MakeArgString(std::string("-wholearchive:") +
+            TC.getCompilerRT(Args, Lib)));
+      }
     }
   }
 
diff --git a/clang/test/Driver/cl-link.c b/clang/test/Driver/cl-link.c
index f5260442760452..ffd0b5ac4bade8 100644
--- a/clang/test/Driver/cl-link.c
+++ b/clang/test/Driver/cl-link.c
@@ -13,8 +13,10 @@
 // ASAN: link.exe
 // ASAN: "-debug"
 // ASAN: "-incremental:no"
-// ASAN: "{{[^"]*}}clang_rt.asan_dynamic.lib"
-// ASAN: "-wholearchive:{{.*}}clang_rt.asan_static_runtime_thunk.lib"
+// ASAN: "{{[^"]*}}clang_rt.asan.lib"
+// ASAN: "-wholearchive:{{.*}}clang_rt.asan.lib"
+// ASAN: "{{[^"]*}}clang_rt.asan_cxx.lib"
+// ASAN: "-wholearchive:{{.*}}clang_rt.asan_cxx.lib"
 // ASAN: "{{.*}}cl-link{{.*}}.obj"
 
 // RUN: %clang_cl -m32 -arch:IA32 --target=i386-pc-win32 /MD /Tc%s -fuse-ld=link -### -fsanitize=address 2>&1 | FileCheck --check-prefix=ASAN-MD %s
@@ -22,6 +24,7 @@
 // ASAN-MD: "-debug"
 // ASAN-MD: "-incremental:no"
 // ASAN-MD: "{{.*}}clang_rt.asan_dynamic.lib"
+// ASAN-MD: "{{[^"]*}}clang_rt.asan_dynamic_runtime_thunk.lib"
 // ASAN-MD: "-include:___asan_seh_interceptor"
 // ASAN-MD: "-wholearchive:{{.*}}clang_rt.asan_dynamic_runtime_thunk.lib"
 // ASAN-MD: "{{.*}}cl-link{{.*}}.obj"
@@ -37,8 +40,7 @@
 // ASAN-DLL: "-dll"
 // ASAN-DLL: "-debug"
 // ASAN-DLL: "-incremental:no"
-// ASAN-DLL: "{{.*}}clang_rt.asan_dynamic.lib"
-// ASAN-DLL: "-wholearchive:{{.*}}clang_rt.asan_static_runtime_thunk.lib"
+// ASAN-DLL: "{{.*}}clang_rt.asan_dll_thunk.lib"
 // ASAN-DLL: "{{.*}}cl-link{{.*}}.obj"
 
 // RUN: %clang_cl /Zi /Tc%s -fuse-ld=link -### 2>&1 | FileCheck --check-prefix=DEBUG %s
diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt
index 158fa270c3f15a..6ce451e3cac2e3 100644
--- a/compiler-rt/CMakeLists.txt
+++ b/compiler-rt/CMakeLists.txt
@@ -378,12 +378,8 @@ if("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "s390x")
 endif()
 
 if(MSVC)
-
-  # asan on windows only supports the release dll version of the runtimes, in the interest of
-  # only having one asan dll to support/test. Having asan statically linked
-  # with the runtime might be possible, but it multiplies the number of scenerios to test.
-  # the program USING sanitizers can use whatever version of the runtime it wants to.
-  set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDLL)
+  # FIXME: In fact, sanitizers should support both /MT and /MD, see PR20214.
+  set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded)
 
   # Remove any /M[DT][d] flags, and strip any definitions of _DEBUG.
   # Since we're using CMAKE_MSVC_RUNTIME_LIBRARY (CMP0091 set to NEW),
diff --git a/compiler-rt/lib/asan/CMakeLists.txt b/compiler-rt/lib/asan/CMakeLists.txt
index f992419c6d9822..463ea233b37aa4 100644
--- a/compiler-rt/lib/asan/CMakeLists.txt
+++ b/compiler-rt/lib/asan/CMakeLists.txt
@@ -32,20 +32,6 @@ set(ASAN_SOURCES
   asan_win.cpp
   )
 
-if(WIN32)
-  set(ASAN_DYNAMIC_RUNTIME_THUNK_SOURCES
-    asan_globals_win.cpp
-    asan_win_common_runtime_thunk.cpp
-    asan_win_dynamic_runtime_thunk.cpp
-    )
-  set(ASAN_STATIC_RUNTIME_THUNK_SOURCES
-    asan_globals_win.cpp
-    asan_malloc_win_thunk.cpp
-    asan_win_common_runtime_thunk.cpp
-    asan_win_static_runtime_thunk.cpp
-    )
-endif()
-
 if (NOT WIN32 AND NOT APPLE)
   list(APPEND ASAN_SOURCES
     asan_interceptors_vfork.S
@@ -150,7 +136,7 @@ append_list_if(MINGW "${MINGW_LIBRARIES}" ASAN_DYNAMIC_LIBS)
 add_compiler_rt_object_libraries(RTAsan_dynamic
   OS ${SANITIZER_COMMON_SUPPORTED_OS}
   ARCHS ${ASAN_SUPPORTED_ARCH}
-  SOURCES ${ASAN_SOURCES}
+  SOURCES ${ASAN_SOURCES} ${ASAN_CXX_SOURCES}
   ADDITIONAL_HEADERS ${ASAN_HEADERS}
   CFLAGS ${ASAN_DYNAMIC_CFLAGS}
   DEFS ${ASAN_DYNAMIC_DEFINITIONS})
@@ -235,52 +221,46 @@ else()
     RTSanitizerCommonSymbolizerInternal
     RTLSanCommon
     RTUbsan)
-  if (NOT WIN32)
-    add_compiler_rt_runtime(clang_rt.asan
-      STATIC
-      ARCHS ${ASAN_SUPPORTED_ARCH}
-      OBJECT_LIBS RTAsan_preinit
-                  RTAsan
-                  ${ASAN_COMMON_RUNTIME_OBJECT_LIBS}
-      CFLAGS ${ASAN_CFLAGS}
-      DEFS ${ASAN_COMMON_DEFINITIONS}
-      PARENT_TARGET asan)
 
-    add_compiler_rt_runtime(clang_rt.asan_cxx
-      STATIC
-      ARCHS ${ASAN_SUPPORTED_ARCH}
-      OBJECT_LIBS RTAsan_cxx
-                  RTUbsan_cxx
-      CFLAGS ${ASAN_CFLAGS}
-      DEFS ${ASAN_COMMON_DEFINITIONS}
-      PARENT_TARGET asan)
+  add_compiler_rt_runtime(clang_rt.asan
+    STATIC
+    ARCHS ${ASAN_SUPPORTED_ARCH}
+    OBJECT_LIBS RTAsan_preinit
+                RTAsan
+                ${ASAN_COMMON_RUNTIME_OBJECT_LIBS}
+    CFLAGS ${ASAN_CFLAGS}
+    DEFS ${ASAN_COMMON_DEFINITIONS}
+    PARENT_TARGET asan)
 
-    add_compiler_rt_runtime(clang_rt.asan_static
-      STATIC
-      ARCHS ${ASAN_SUPPORTED_ARCH}
-      OBJECT_LIBS RTAsan_static
-      CFLAGS ${ASAN_CFLAGS}
-      DEFS ${ASAN_COMMON_DEFINITIONS}
-      PARENT_TARGET asan)
+  add_compiler_rt_runtime(clang_rt.asan_cxx
+    STATIC
+    ARCHS ${ASAN_SUPPORTED_ARCH}
+    OBJECT_LIBS RTAsan_cxx
+                RTUbsan_cxx
+    CFLAGS ${ASAN_CFLAGS}
+    DEFS ${ASAN_COMMON_DEFINITIONS}
+    PARENT_TARGET asan)
 
-    add_compiler_rt_runtime(clang_rt.asan-preinit
-      STATIC
-      ARCHS ${ASAN_SUPPORTED_ARCH}
-      OBJECT_LIBS RTAsan_preinit
-      CFLAGS ${ASAN_CFLAGS}
-      DEFS ${ASAN_COMMON_DEFINITIONS}
-      PARENT_TARGET asan)
-  endif()
+  add_compiler_rt_runtime(clang_rt.asan_static
+    STATIC
+    ARCHS ${ASAN_SUPPORTED_ARCH}
+    OBJECT_LIBS RTAsan_static
+    CFLAGS ${ASAN_CFLAGS}
+    DEFS ${ASAN_COMMON_DEFINITIONS}
+    PARENT_TARGET asan)
+
+  add_compiler_rt_runtime(clang_rt.asan-preinit
+    STATIC
+    ARCHS ${ASAN_SUPPORTED_ARCH}
+    OBJECT_LIBS RTAsan_preinit
+    CFLAGS ${ASAN_CFLAGS}
+    DEFS ${ASAN_COMMON_DEFINITIONS}
+    PARENT_TARGET asan)
 
   foreach(arch ${ASAN_SUPPORTED_ARCH})
     if (COMPILER_RT_HAS_VERSION_SCRIPT)
-      if(WIN32)
-        set(SANITIZER_RT_VERSION_LIST_LIBS clang_rt.asan-${arch})
-      else()
-        set(SANITIZER_RT_VERSION_LIST_LIBS clang_rt.asan-${arch} clang_rt.asan_cxx-${arch})
-      endif()
       add_sanitizer_rt_version_list(clang_rt.asan-dynamic-${arch}
-                                    LIBS ${SANITIZER_RT_VERSION_LIST_LIBS}
+                                    LIBS clang_rt.asan-${arch} clang_rt.asan_cxx-${arch}
                                     EXTRA asan.syms.extra)
       set(VERSION_SCRIPT_FLAG
            -Wl,--version-script,${CMAKE_CURRENT_BINARY_DIR}/clang_rt.asan-dynamic-${arch}.vers)
@@ -298,11 +278,25 @@ else()
     endif()
 
     set(ASAN_DYNAMIC_WEAK_INTERCEPTION)
+    if (WIN32)
+      add_compiler_rt_object_libraries(AsanWeakInterception
+        ${SANITIZER_COMMON_SUPPORTED_OS}
+        ARCHS ${arch}
+        SOURCES
+          asan_win_weak_interception.cpp
+        CFLAGS ${ASAN_CFLAGS} -DSANITIZER_DYNAMIC
+        DEFS ${ASAN_COMMON_DEFINITIONS})
+      set(ASAN_DYNAMIC_WEAK_INTERCEPTION
+          AsanWeakInterception
+          UbsanWeakInterception
+          SancovWeakInterception
+          SanitizerCommonWeakInterception)
+    endif()
+
     add_compiler_rt_runtime(clang_rt.asan
       SHARED
       ARCHS ${arch}
       OBJECT_LIBS ${ASAN_COMMON_RUNTIME_OBJECT_LIBS}
-              RTAsan_cxx
               RTAsan_dynamic
               # The only purpose of RTAsan_dynamic_version_script_dummy is to
               # carry a dependency of the shared runtime on the version script.
@@ -330,12 +324,36 @@ else()
     endif()
 
     if (WIN32)
+      add_compiler_rt_object_libraries(AsanDllThunk
+        ${SANITIZER_COMMON_SUPPORTED_OS}
+        ARCHS ${arch}
+        SOURCES asan_globals_win.cpp
+                asan_win_dll_thunk.cpp
+        CFLAGS ${ASAN_CFLAGS} -DSANITIZER_DLL_THUNK
+        DEFS ${ASAN_COMMON_DEFINITIONS})
+
+      add_compiler_rt_runtime(clang_rt.asan_dll_thunk
+        STATIC
+        ARCHS ${arch}
+        OBJECT_LIBS AsanDllThunk
+                    UbsanDllThunk
+                    SancovDllThunk
+                    SanitizerCommonDllThunk
+        SOURCES $<TARGET_OBJECTS:RTInterception.${arch}>
+        PARENT_TARGET asan)
+
       set(DYNAMIC_RUNTIME_THUNK_CFLAGS "-DSANITIZER_DYNAMIC_RUNTIME_THUNK")
+      if(MSVC)
+        list(APPEND DYNAMIC_RUNTIME_THUNK_CFLAGS "-Zl")
+      elseif(CMAKE_C_COMPILER_ID MATCHES Clang)
+        list(APPEND DYNAMIC_RUNTIME_THUNK_CFLAGS "-nodefaultlibs")
+      endif()
 
       add_compiler_rt_object_libraries(AsanDynamicRuntimeThunk
         ${SANITIZER_COMMON_SUPPORTED_OS}
         ARCHS ${arch}
-        SOURCES ${ASAN_DYNAMIC_RUNTIME_THUNK_SOURCES}
+        SOURCES asan_globals_win.cpp
+                asan_win_dynamic_runtime_thunk.cpp
         CFLAGS ${ASAN_CFLAGS} ${DYNAMIC_RUNTIME_THUNK_CFLAGS}
         DEFS ${ASAN_COMMON_DEFINITIONS})
 
@@ -343,35 +361,12 @@ else()
         STATIC
         ARCHS ${arch}
         OBJECT_LIBS AsanDynamicRuntimeThunk
-                    UbsanRuntimeThunk
-                    SancovRuntimeThunk
-                    SanitizerRuntimeThunk
+                    UbsanDynamicRuntimeThunk
+                    SancovDynamicRuntimeThunk
+                    SanitizerCommonDynamicRuntimeThunk
         CFLAGS ${ASAN_CFLAGS} ${DYNAMIC_RUNTIME_THUNK_CFLAGS}
         DEFS ${ASAN_COMMON_DEFINITIONS}
         PARENT_TARGET asan)
-
-      # mingw does not support static linkage of the CRT
-      if(NOT MINGW)
-        set(STATIC_RUNTIME_THUNK_CFLAGS "-DSANITIZER_STATIC_RUNTIME_THUNK")
-
-        add_compiler_rt_object_libraries(AsanStaticRuntimeThunk
-          ${SANITIZER_COMMON_SUPPORTED_OS}
-          ARCHS ${arch}
-          SOURCES ${ASAN_STATIC_RUNTIME_THUNK_SOURCES}
-          CFLAGS ${ASAN_DYNAMIC_CFLAGS} ${STATIC_RUNTIME_THUNK_CFLAGS}
-          DEFS ${ASAN_DYNAMIC_DEFINITIONS})
-
-        add_compiler_rt_runtime(clang_rt.asan_static_runtime_thunk
-          STATIC
-          ARCHS ${arch}
-          OBJECT_LIBS AsanStaticRuntimeThunk
-                      UbsanRuntimeThunk
-                      SancovRuntimeThunk
-                      SanitizerRuntimeThunk
-          CFLAGS ${ASAN_DYNAMIC_CFLAGS} ${STATIC_RUNTIME_THUNK_CFLAGS}
-          DEFS ${ASAN_DYNAMIC_DEFINITIONS}
-          PARENT_TARGET asan)
-      endif()
     endif()
   endforeach()
 endif()
diff --git a/compiler-rt/lib/asan/asan_flags.cpp b/compiler-rt/lib/asan/asan_flags.cpp
index 56deb1b0d082b8..23989843323211 100644
--- a/compiler-rt/lib/asan/asan_flags.cpp
+++ b/compiler-rt/lib/asan/asan_flags.cpp
@@ -11,16 +11,14 @@
 // ASan flag parsing logic.
 //===----------------------------------------------------------------------===//
 
-#include "asan_flags.h"
-
 #include "asan_activation.h"
+#include "asan_flags.h"
 #include "asan_interface_internal.h"
 #include "asan_stack.h"
 #include "lsan/lsan_common.h"
 #include "sanitizer_common/sanitizer_common.h"
-#include "sanitizer_common/sanitizer_flag_parser.h"
 #include "sanitizer_common/sanitizer_flags.h"
-#include "sanitizer_common/sanitizer_win_interception.h"
+#include "sanitizer_common/sanitizer_flag_parser.h"
 #include "ubsan/ubsan_flags.h"
 #include "ubsan/ubsan_platform.h"
 
@@ -49,21 +47,7 @@ static void RegisterAsanFlags(FlagParser *parser, Flags *f) {
 #undef ASAN_FLAG
 }
 
-static void DisplayHelpMessages(FlagParser *parser) {
-  // TODO(eugenis): dump all flags at verbosity>=2?
-  if (Verbosity()) {
-    ReportUnrecognizedFlags();
-  }
-
-  if (common_flags()->help) {
-    parser->PrintFlagDescriptions();
-  }
-}
-
-static void InitializeDefaultFlags() {
-  Flags *f = flags();
-  FlagParser asan_parser;
-
+void InitializeFlags() {
   // Set the default values and prepare for parsing ASan and common flags.
   SetCommonFlagsDefaults();
   {
@@ -76,8 +60,10 @@ static void InitializeDefaultFlags() {
     cf.exitcode = 1;
     OverrideCommonFlags(cf);
   }
+  Flags *f = flags();
   f->SetDefaults();
 
+  FlagParser asan_parser;
   RegisterAsanFlags(&asan_parser, f);
   RegisterCommonFlags(&asan_parser);
 
@@ -140,12 +126,13 @@ static void InitializeDefaultFlags() {
 
   InitializeCommonFlags();
 
-  // TODO(samsonov): print all of the flags (ASan, LSan, common).
-  DisplayHelpMessages(&asan_parser);
-}
+  // TODO(eugenis): dump all flags at verbosity>=2?
+  if (Verbosity()) ReportUnrecognizedFlags();
 
-static void ProcessFlags() {
-  Flags *f = flags();
+  if (common_flags()->help) {
+    // TODO(samsonov): print all of the flags (ASan, LSan, common).
+    asan_parser.PrintFlagDescriptions();
+  }
 
   // Flag validation:
   if (!CAN_SANITIZE_LEAKS && common_flags()->detect_leaks) {
@@ -212,67 +199,6 @@ static void ProcessFlags() {
   }
 }
 
-void InitializeFlags() {
-  InitializeDefaultFlags();
-  ProcessFlags();
-
-#if SANITIZER_WINDOWS
-  // On Windows, weak symbols are emulated by having the user program
-  // register which weak functions are defined.
-  // The ASAN DLL will initialize flags prior to user module initialization,
-  // so __asan_default_options will not point to the user definition yet.
-  // We still want to ensure we capture when options are passed via
-  // __asan_default_options, so we add a callback to be run
-  // when it is registered with the runtime.
-
-  // There is theoretically time between the initial ProcessFlags and
-  // registering the weak callback where a weak function could be added and we
-  // would miss it, but in practice, InitializeFlags will always happen under
-  // the loader lock (if built as a DLL) and so will any calls to
-  // __sanitizer_register_weak_function.
-  AddRegisterWeakFunctionCallback(
-      reinterpret_cast<uptr>(__asan_default_options), []() {
-        FlagParser asan_parser;
-
-        RegisterAsanFlags(&asan_parser, flags());
-        RegisterCommonFlags(&asan_parser);
-        asan_parser.ParseString(__asan_default_options());
-
-        DisplayHelpMessages(&asan_parser);
-        ProcessFlags();
-      });
-
-#  if CAN_SANITIZE_UB
-  AddRegisterWeakFunctionCallback(
-      reinterpret_cast<uptr>(__ubsan_default_options), []() {
-        FlagParser ubsan_parser;
-
-        __ubsan::RegisterUbsanFlags(&ubsan_parser, __ubsan::flags());
-        RegisterCommonFlags(&ubsan_parser);
-        ubsan_parser.ParseString(__ubsan_default_options());
-
-        // To match normal behavior, do not print UBSan help.
-        ProcessFlags();
-      });
-#  endif
-
-#  if CAN_SANITIZE_LEAKS
-  AddRegisterWeakFunctionCallback(
-      reinterpret_cast<uptr>(__lsan_default_options), []() {
-        FlagParser lsan_parser;
-
-        __lsan::RegisterLsanFlags(&lsan_parser, __lsan::flags());
-        RegisterCommonFlags(&lsan_parser);
-        lsan_parser.ParseString(__lsan_default_options());
-
-        // To match normal behavior, do not print LSan help.
-        ProcessFlags();
-      });
-#  endif
-
-#endif
-}
-
 }  // namespace __asan
 
 SANITIZER_INTERFACE_WEAK_DEF(const char*, __asan_default_options, void) {
diff --git a/compiler-rt/lib/asan/asan_globals_win.cpp b/compiler-rt/lib/asan/asan_globals_win.cpp
index 8267f07b9cce49..19af88ab12b40a 100644
--- a/compiler-rt/lib/asan/asan_globals_win.cpp
+++ b/compiler-rt/lib/asan/asan_globals_win.cpp
@@ -28,9 +28,7 @@ static void call_on_globals(void (*hook)(__asan_global *, uptr)) {
   __asan_global *end = &__asan_globals_end;
   uptr bytediff = (uptr)end - (uptr)start;
   if (bytediff % sizeof(__asan_global) != 0) {
-#  if defined(SANITIZER_DLL_THUNK) ||             \
-      defined(SANITIZER_DYNAMIC_RUNTIME_THUNK) || \
-      defined(SANITIZER_STATIC_RUNTIME_THUNK)
+#if defined(SANITIZER_DLL_THUNK) || defined(SANITIZER_DYNAMIC_RUNTIME_THUNK)
     __debugbreak();
 #else
     CHECK("corrupt asan global array");
diff --git a/compiler-rt/lib/asan/asan_malloc_win.cpp b/compiler-rt/lib/asan/asan_malloc_win.cpp
index 3278f072198769..7e1d04c36dd580 100644
--- a/compiler-rt/lib/asan/asan_malloc_win.cpp
+++ b/compiler-rt/lib/asan/asan_malloc_win.cpp
@@ -58,69 +58,97 @@ using namespace __asan;
 // MD: Memory allocation functions are defined in the CRT .dll,
 // so we have to intercept them before they are called for the first time.
 
+#if ASAN_DYNAMIC
+# define ALLOCATION_FUNCTION_ATTRIBUTE
+#else
+# define ALLOCATION_FUNCTION_ATTRIBUTE SANITIZER_INTERFACE_ATTRIBUTE
+#endif
+
 extern "C" {
-__declspec(noinline) size_t _msize(void *ptr) {
+ALLOCATION_FUNCTION_ATTRIBUTE
+size_t _msize(void *ptr) {
   GET_CURRENT_PC_BP_SP;
   (void)sp;
   return asan_malloc_usable_size(ptr, pc, bp);
 }
 
-__declspec(noinline) size_t _msize_base(void *ptr) { return _msize(ptr); }
+ALLOCATION_FUNCTION_ATTRIBUTE
+size_t _msize_base(void *ptr) {
+  return _msize(ptr);
+}
 
-__declspec(noinline) void free(void *ptr) {
+ALLOCATION_FUNCTION_ATTRIBUTE
+void free(void *ptr) {
   GET_STACK_TRACE_FREE;
   return asan_free(ptr, &stack, FROM_MALLOC);
 }
 
-__declspec(noinline) void _free_dbg(void *ptr, int) { free(ptr); }
+ALLOCATION_FUNCTION_ATTRIBUTE
+void _free_dbg(void *ptr, int) {
+  free(ptr);
+}
 
-__declspec(noinline) void _free_base(void *ptr) { free(ptr); }
+ALLOCATION_FUNCTION_ATTRIBUTE
+void _free_base(void *ptr) {
+  free(ptr);
+}
 
-__declspec(noinline) void *malloc(size_t size) {
+ALLOCATION_FUNCTION_ATTRIBUTE
+void *malloc(size_t size) {
   GET_STACK_TRACE_MALLOC;
   return asan_malloc(size, &stack);
 }
 
-__declspec(noinline) void *_malloc_base(size_t size) { return malloc(size); }
+ALLOCATION_FUNCTION_ATTRIBUTE
+void *_malloc_base(size_t size) {
+  return malloc(size);
+}
 
-__declspec(noinline) void *_malloc_dbg(size_t size, int, const char *, int) {
+ALLOCATION_FUNCTION_ATTRIBUTE
+void *_malloc_dbg(size_t size, int, const char *, int) {
   return malloc(size);
 }
 
-__declspec(noinline) void *calloc(size_t nmemb, size_t size) {
+ALLOCATION_FUNCTION_ATTRIBUTE
+void *calloc(size_t nmemb, size_t size) {
   GET_STACK_TRACE_MALLOC;
   return asan_calloc(nmemb, size, &stack);
 }
 
-__declspec(noinline) void *_calloc_base(size_t nmemb, size_t size) {
+ALLOCATION_FUNCTION_ATTRIBUTE
+void *_calloc_base(size_t nmemb, size_t size) {
   return calloc(nmemb, size);
 }
 
-__declspec(noinline) void *_calloc_dbg(size_t nmemb, size_t size, int,
-                                       const char *, int) {
+ALLOCATION_FUNCTION_ATTRIBUTE
+void *_calloc_dbg(size_t nmemb, size_t size, int, const char *, int) {
   return calloc(nmemb, size);
 }
 
-__declspec(noinline) void *_calloc_impl(size_t nmemb, size_t size,
-                                        int *errno_tmp) {
+ALLOCATION_FUNCTION_ATTRIBUTE
+void *_calloc_impl(size_t nmemb, size_t size, int *errno_tmp) {
   return calloc(nmemb, size);
 }
 
-__declspec(noinline) void *realloc(void *ptr, size_t size) {
+ALLOCATION_FUNCTION_ATTRIBUTE
+void *realloc(void *ptr, size_t size) {
   GET_STACK_TRACE_MALLOC;
   return asan_realloc(ptr, size, &stack);
 }
 
-__declspec(noinline) void *_realloc_dbg(void *ptr, size_t size, int) {
+ALLOCATION_FUNCTION_ATTRIBUTE
+void *_realloc_dbg(void *ptr, size_t size, int) {
   UNREACHABLE("_realloc_dbg should not exist!");
   return 0;
 }
 
-__declspec(noinline) void *_realloc_base(void *ptr, size_t size) {
+ALLOCATION_FUNCTION_ATTRIBUTE
+void *_realloc_base(void *ptr, size_t size) {
   return realloc(ptr, size);
 }
 
-__declspec(noinline) void *_recalloc(void *p, size_t n, size_t elem_size) {
+ALLOCATION_FUNCTION_ATTRIBUTE
+void *_recalloc(void *p, size_t n, size_t elem_size) {
   if (!p)
     return calloc(n, elem_size);
   const size_t size = n * elem_size;
@@ -138,41 +166,23 @@ __declspec(noinline) void *_recalloc(void *p, size_t n, size_t elem_size) {
   return new_alloc;
 }
 
-__declspec(noinline) void *_recalloc_base(void *p, size_t n, size_t elem_size) {
+ALLOCATION_FUNCTION_ATTRIBUTE
+void *_recalloc_base(void *p, size_t n, size_t elem_size) {
   return _recalloc(p, n, elem_size);
 }
 
-__declspec(noinline) void *_expand(void *memblock, size_t size) {
+ALLOCATION_FUNCTION_ATTRIBUTE
+void *_expand(void *memblock, size_t size) {
   // _expand is used in realloc-like functions to resize the buffer if possible.
   // We don't want memory to stand still while resizing buffers, so return 0.
   return 0;
 }
 
-__declspec(noinline) void *_expand_dbg(void *memblock, size_t size) {
+ALLOCATION_FUNCTION_ATTRIBUTE
+void *_expand_dbg(void *memblock, size_t size) {
   return _expand(memblock, size);
 }
 
-__declspec(dllexport) size_t __cdecl __asan_msize(void *ptr) {
-  return _msize(ptr);
-}
-__declspec(dllexport) void __cdecl __asan_free(void *const ptr) { free(ptr); }
-__declspec(dllexport) void *__cdecl __asan_malloc(const size_t size) {
-  return malloc(size);
-}
-__declspec(dllexport) void *__cdecl __asan_calloc(const size_t nmemb,
-                                                  const size_t size) {
-  return calloc(nmemb, size);
-}
-__declspec(dllexport) void *__cdecl __asan_realloc(void *const ptr,
-                                                   const size_t size) {
-  return realloc(ptr, size);
-}
-__declspec(dllexport) void *__cdecl __asan_recalloc(void *const ptr,
-                                                    const size_t nmemb,
-                                                    const size_t size) {
-  return _recalloc(ptr, nmemb, size);
-}
-
 // TODO(timurrrr): Might want to add support for _aligned_* allocation
 // functions to detect a bit more bugs.  Those functions seem to wrap malloc().
 
@@ -477,6 +487,7 @@ static void TryToOverrideFunction(const char *fname, uptr new_func) {
 }
 
 void ReplaceSystemMalloc() {
+#if defined(ASAN_DYNAMIC)
   TryToOverrideFunction("free", (uptr)free);
   TryToOverrideFunction("_free_base", (uptr)free);
   TryToOverrideFunction("malloc", (uptr)malloc);
@@ -532,6 +543,8 @@ void ReplaceSystemMalloc() {
   // allocation API will be directed to ASan's heap. We don't currently
   // intercept all calls to HeapAlloc. If we did, we would have to check on
   // HeapFree whether the pointer came from ASan of from the system.
+
+#endif  // defined(ASAN_DYNAMIC)
 }
 }  // namespace __asan
 
diff --git a/compiler-rt/lib/asan/asan_malloc_win_thunk.cpp b/compiler-rt/lib/asan/asan_malloc_win_thunk.cpp
deleted file mode 100644
index abf515b77c4a9f..00000000000000
--- a/compiler-rt/lib/asan/asan_malloc_win_thunk.cpp
+++ /dev/null
@@ -1,229 +0,0 @@
-//===-- asan_malloc_win_thunk.cpp
-//-----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of AddressSanitizer, an address sanity checker.
-//
-// Windows-specific malloc interception.
-// This is included statically for projects statically linking
-// with the C Runtime (/MT, /MTd) in order to provide ASAN-aware
-// versions of the C allocation functions.
-//===----------------------------------------------------------------------===//
-
-#ifdef SANITIZER_STATIC_RUNTIME_THUNK
-#  include "..\sanitizer_common\sanitizer_allocator_interface.h"
-// #include "asan_win_thunk_common.h"
-
-// Preserve stack traces with noinline.
-#  define STATIC_MALLOC_INTERFACE __declspec(noinline)
-
-extern "C" {
-__declspec(dllimport) size_t __cdecl __asan_msize(void *ptr);
-__declspec(dllimport) void __cdecl __asan_free(void *const ptr);
-__declspec(dllimport) void *__cdecl __asan_malloc(const size_t size);
-__declspec(dllimport) void *__cdecl __asan_calloc(const size_t nmemb,
-                                                  const size_t size);
-__declspec(dllimport) void *__cdecl __asan_realloc(void *const ptr,
-                                                   const size_t size);
-__declspec(dllimport) void *__cdecl __asan_recalloc(void *const ptr,
-                                                    const size_t nmemb,
-                                                    const size_t size);
-
-// Avoid tailcall optimization to preserve stack frames.
-#  pragma optimize("", off)
-
-// _msize
-STATIC_MALLOC_INTERFACE size_t _msize(void *ptr) { return __asan_msize(ptr); }
-
-STATIC_MALLOC_INTERFACE size_t _msize_base(void *ptr) {
-  return __asan_msize(ptr);
-}
-
-STATIC_MALLOC_INTERFACE size_t _msize_dbg(void *ptr) {
-  return __asan_msize(ptr);
-}
-
-// free
-STATIC_MALLOC_INTERFACE void free(void *const ptr) { return __asan_free(ptr); }
-
-STATIC_MALLOC_INTERFACE void _free_base(void *const ptr) {
-  return __asan_free(ptr);
-}
-
-STATIC_MALLOC_INTERFACE void _free_dbg(void *const ptr) {
-  return __asan_free(ptr);
-}
-
-// malloc
-STATIC_MALLOC_INTERFACE void *malloc(const size_t size) {
-  return __asan_malloc(size);
-}
-
-STATIC_MALLOC_INTERFACE void *_malloc_base(const size_t size) {
-  return __asan_malloc(size);
-}
-
-STATIC_MALLOC_INTERFACE void *_malloc_dbg(const size_t size) {
-  return __asan_malloc(size);
-}
-
-// calloc
-STATIC_MALLOC_INTERFACE void *calloc(const size_t nmemb, const size_t size) {
-  return __asan_calloc(nmemb, size);
-}
-
-STATIC_MALLOC_INTERFACE void *_calloc_base(const size_t nmemb,
-                                           const size_t size) {
-  return __asan_calloc(nmemb, size);
-}
-
-STATIC_MALLOC_INTERFACE void *_calloc_impl(const size_t nmemb,
-                                           const size_t size,
-                                           int *const errno_tmp) {
-  // Provided by legacy msvcrt.
-  (void)errno_tmp;
-
-  return __asan_calloc(nmemb, size);
-}
-
-STATIC_MALLOC_INTERFACE void *_calloc_dbg(const size_t nmemb, const size_t size,
-                                          int, const char *, int) {
-  return __asan_calloc(nmemb, size);
-}
-
-// realloc
-STATIC_MALLOC_INTERFACE void *realloc(void *const ptr, const size_t size) {
-  return __asan_realloc(ptr, size);
-}
-
-STATIC_MALLOC_INTERFACE void *_realloc_base(void *const ptr,
-                                            const size_t size) {
-  return __asan_realloc(ptr, size);
-}
-
-STATIC_MALLOC_INTERFACE void *_realloc_dbg(void *const ptr, const size_t size,
-                                           int, const char *, int) {
-  return __asan_realloc(ptr, size);
-}
-
-// recalloc
-STATIC_MALLOC_INTERFACE void *_recalloc(void *const ptr, const size_t nmemb,
-                                        const size_t size) {
-  return __asan_recalloc(ptr, nmemb, size);
-}
-
-STATIC_MALLOC_INTERFACE void *_recalloc_base(void *const ptr,
-                                             const size_t nmemb,
-                                             const size_t size) {
-  return __asan_recalloc(ptr, nmemb, size);
-}
-
-STATIC_MALLOC_INTERFACE void *_recalloc_dbg(void *const ptr, const size_t nmemb,
-                                            const size_t size, int,
-                                            const char *, int) {
-  return __asan_recalloc(ptr, nmemb, size);
-}
-
-// expand
-STATIC_MALLOC_INTERFACE void *_expand(void *, size_t) {
-  // _expand is used in realloc-like functions to resize the buffer if possible.
-  // We don't want memory to stand still while resizing buffers, so return 0.
-  return nullptr;
-}
-
-STATIC_MALLOC_INTERFACE void *_expand_dbg(void *, size_t, int, const char *,
-                                          int) {
-  return nullptr;
-}
-
-// We need to provide symbols for all the debug CRT functions if we decide to
-// provide any. Most of these functions make no sense under ASan and so we
-// make them no-ops.
-long _CrtSetBreakAlloc(long const) { return ~0; }
-
-void _CrtSetDbgBlockType(void *const, int const) { return; }
-
-typedef int(__cdecl *CRT_ALLOC_HOOK)(int, void *, size_t, int, long,
-                                     const unsigned char *, int);
-
-CRT_ALLOC_HOOK _CrtGetAllocHook() { return nullptr; }
-
-CRT_ALLOC_HOOK _CrtSetAllocHook(CRT_ALLOC_HOOK const hook) { return hook; }
-
-int _CrtCheckMemory() { return 1; }
-
-int _CrtSetDbgFlag(int const new_bits) { return new_bits; }
-
-typedef void (*CrtDoForAllClientObjectsCallback)(void *, void *);
-
-void _CrtDoForAllClientObjects(CrtDoForAllClientObjectsCallback const,
-                               void *const) {
-  return;
-}
-
-int _CrtIsValidPointer(void const *const p, unsigned int const, int const) {
-  return p != nullptr;
-}
-
-int _CrtIsValidHeapPointer(void const *const block) {
-  if (!block) {
-    return 0;
-  }
-
-  return __sanitizer_get_ownership(block);
-}
-
-int _CrtIsMemoryBlock(void const *const, unsigned const, long *const,
-                      char **const, int *const) {
-  return 0;
-}
-
-int _CrtReportBlockType(void const *const) { return -1; }
-
-typedef void(__cdecl *CRT_DUMP_CLIENT)(void *, size_t);
-
-CRT_DUMP_CLIENT _CrtGetDumpClient() { return nullptr; }
-
-CRT_DUMP_CLIENT _CrtSetDumpClient(CRT_DUMP_CLIENT new_client) {
-  return new_client;
-}
-
-void _CrtMemCheckpoint(void *const) { return; }
-
-int _CrtMemDifference(void *const, void const *const, void const *const) {
-  return 0;
-}
-
-void _CrtMemDumpAllObjectsSince(void const *const) { return; }
-
-int _CrtDumpMemoryLeaks() { return 0; }
-
-void _CrtMemDumpStatistics(void const *const) { return; }
-
-int _crtDbgFlag{0};
-long _crtBreakAlloc{-1};
-CRT_DUMP_CLIENT _pfnDumpClient{nullptr};
-
-int *__p__crtDbgFlag() { return &_crtDbgFlag; }
-
-long *__p__crtBreakAlloc() { return &_crtBreakAlloc; }
-
-// TODO: These were added upstream but conflict with definitions in ucrtbased.
-// int _CrtDbgReport(int, const char *, int, const char *, const char *, ...) {
-//   ShowStatsAndAbort();
-// }
-//
-// int _CrtDbgReportW(int reportType, const wchar_t *, int, const wchar_t *,
-//                    const wchar_t *, ...) {
-//   ShowStatsAndAbort();
-// }
-//
-// int _CrtSetReportMode(int, int) { return 0; }
-
-}  // extern "C"
-#endif  // SANITIZER_STATIC_RUNTIME_THUNK
diff --git a/compiler-rt/lib/asan/asan_win_common_runtime_thunk.cpp b/compiler-rt/lib/asan/asan_win_common_runtime_thunk.cpp
deleted file mode 100644
index d2c9e66c313379..00000000000000
--- a/compiler-rt/lib/asan/asan_win_common_runtime_thunk.cpp
+++ /dev/null
@@ -1,112 +0,0 @@
-//===-- asan_win_common_runtime_thunk.cpp --------------------------- -----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of AddressSanitizer, an address sanity checker.
-//
-// This file defines things that need to be present in the application modules
-// to interact with the ASan DLL runtime correctly and can't be implemented
-// using the default "import library" generated when linking the DLL.
-//
-// This includes:
-//  - Cloning shadow memory dynamic address from ASAN DLL
-//  - Creating weak aliases to default implementation imported from asan dll
-//  - Forwarding the detect_stack_use_after_return runtime option
-//  - installing a custom SEH handler
-//
-//===----------------------------------------------------------------------===//
-
-#if defined(SANITIZER_DYNAMIC_RUNTIME_THUNK) || \
-    defined(SANITIZER_STATIC_RUNTIME_THUNK)
-#  define SANITIZER_IMPORT_INTERFACE 1
-#  define WIN32_LEAN_AND_MEAN
-#  include "asan_win_common_runtime_thunk.h"
-
-#  include <windows.h>
-
-#  include "sanitizer_common/sanitizer_win_defs.h"
-#  include "sanitizer_common/sanitizer_win_thunk_interception.h"
-
-// Define weak alias for all weak functions imported from asan dll.
-#  define INTERFACE_FUNCTION(Name)
-#  define INTERFACE_WEAK_FUNCTION(Name) REGISTER_WEAK_FUNCTION(Name)
-#  include "asan_interface.inc"
-
-////////////////////////////////////////////////////////////////////////////////
-// Define a copy of __asan_option_detect_stack_use_after_return that should be
-// used when linking an MD runtime with a set of object files on Windows.
-//
-// The ASan MD runtime dllexports '__asan_option_detect_stack_use_after_return',
-// so normally we would just dllimport it.  Unfortunately, the dllimport
-// attribute adds __imp_ prefix to the symbol name of a variable.
-// Since in general we don't know if a given TU is going to be used
-// with a MT or MD runtime and we don't want to use ugly __imp_ names on Windows
-// just to work around this issue, let's clone the variable that is constant
-// after initialization anyways.
-
-extern "C" {
-__declspec(dllimport) int __asan_should_detect_stack_use_after_return();
-int __asan_option_detect_stack_use_after_return;
-
-__declspec(dllimport) void *__asan_get_shadow_memory_dynamic_address();
-void *__asan_shadow_memory_dynamic_address;
-
-static void __asan_initialize_cloned_variables() {
-  __asan_option_detect_stack_use_after_return =
-      __asan_should_detect_stack_use_after_return();
-  __asan_shadow_memory_dynamic_address =
-      __asan_get_shadow_memory_dynamic_address();
-}
-}
-
-static int asan_thunk_init() {
-  __asan_initialize_cloned_variables();
-
-#  ifdef SANITIZER_STATIC_RUNTIME_THUNK
-  __asan_initialize_static_thunk();
-#  endif
-
-  return 0;
-}
-
-static void WINAPI asan_thread_init(void *mod, unsigned long reason,
-                                    void *reserved) {
-  if (reason == DLL_PROCESS_ATTACH) {
-    asan_thunk_init();
-  }
-}
-
-// Our cloned variables must be initialized before C/C++ constructors.  If TLS
-// is used, our .CRT$XLAB initializer will run first. If not, our .CRT$XIB
-// initializer is needed as a backup.
-extern "C" __declspec(allocate(".CRT$XIB")) int (*__asan_thunk_init)() =
-    asan_thunk_init;
-WIN_FORCE_LINK(__asan_thunk_init);
-
-extern "C" __declspec(allocate(".CRT$XLAB")) void(WINAPI *__asan_tls_init)(
-    void *, unsigned long, void *) = asan_thread_init;
-WIN_FORCE_LINK(__asan_tls_init);
-
-////////////////////////////////////////////////////////////////////////////////
-// ASan SEH handling.
-// We need to set the ASan-specific SEH handler at the end of CRT initialization
-// of each module (see also asan_win.cpp).
-extern "C" {
-__declspec(dllimport) int __asan_set_seh_filter();
-static int SetSEHFilter() { return __asan_set_seh_filter(); }
-
-// Unfortunately, putting a pointer to __asan_set_seh_filter into
-// __asan_intercept_seh gets optimized out, so we have to use an extra function.
-extern "C" __declspec(allocate(".CRT$XCAB")) int (*__asan_seh_interceptor)() =
-    SetSEHFilter;
-WIN_FORCE_LINK(__asan_seh_interceptor);
-}
-
-WIN_FORCE_LINK(__asan_dso_reg_hook)
-
-#endif  // defined(SANITIZER_DYNAMIC_RUNTIME_THUNK) ||
-        // defined(SANITIZER_STATIC_RUNTIME_THUNK)
diff --git a/compiler-rt/lib/asan/asan_win_common_runtime_thunk.h b/compiler-rt/lib/asan/asan_win_common_runtime_thunk.h
deleted file mode 100644
index 66285eb31ae994..00000000000000
--- a/compiler-rt/lib/asan/asan_win_common_runtime_thunk.h
+++ /dev/null
@@ -1,38 +0,0 @@
-//===-- asan_win_common_runtime_thunk.h -------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of AddressSanitizer, an address sanity checker.
-//
-// This file defines things that need to be present in the application modules
-// to interact with the ASan DLL runtime correctly and can't be implemented
-// using the default "import library" generated when linking the DLL.
-//
-//===----------------------------------------------------------------------===//
-
-#if defined(SANITIZER_STATIC_RUNTIME_THUNK) || \
-    defined(SANITIZER_DYNAMIC_RUNTIME_THUNK)
-#  include "sanitizer_common/sanitizer_win_defs.h"
-
-#  pragma section(".CRT$XIB", long, \
-                  read)  // C initializer (during C init before dyninit)
-#  pragma section(".CRT$XID", long, \
-                  read)  // First C initializer after CRT initializers
-#  pragma section(".CRT$XCAB", long, \
-                  read)  // First C++ initializer after startup initializers
-
-#  pragma section(".CRT$XTW", long, read)  // First ASAN globals terminator
-#  pragma section(".CRT$XTY", long, read)  // Last ASAN globals terminator
-
-#  pragma section(".CRT$XLAB", long, read)  // First TLS initializer
-
-#  ifdef SANITIZER_STATIC_RUNTIME_THUNK
-extern "C" void __asan_initialize_static_thunk();
-#  endif
-
-#endif  // defined(SANITIZER_STATIC_RUNTIME_THUNK) ||
-        // defined(SANITIZER_DYNAMIC_RUNTIME_THUNK)
\ No newline at end of file
diff --git a/compiler-rt/lib/asan/asan_win_dll_thunk.cpp b/compiler-rt/lib/asan/asan_win_dll_thunk.cpp
new file mode 100644
index 00000000000000..35871a942a7a12
--- /dev/null
+++ b/compiler-rt/lib/asan/asan_win_dll_thunk.cpp
@@ -0,0 +1,165 @@
+//===-- asan_win_dll_thunk.cpp --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of AddressSanitizer, an address sanity checker.
+//
+// This file defines a family of thunks that should be statically linked into
+// the DLLs that have ASan instrumentation in order to delegate the calls to the
+// shared runtime that lives in the main binary.
+// See https://github.com/google/sanitizers/issues/209 for the details.
+//===----------------------------------------------------------------------===//
+
+#ifdef SANITIZER_DLL_THUNK
+#include "asan_init_version.h"
+#include "interception/interception.h"
+#include "sanitizer_common/sanitizer_win_defs.h"
+#include "sanitizer_common/sanitizer_win_dll_thunk.h"
+#include "sanitizer_common/sanitizer_platform_interceptors.h"
+
+// ASan's own interface functions.
+#define INTERFACE_FUNCTION(Name) INTERCEPT_SANITIZER_FUNCTION(Name)
+#define INTERFACE_WEAK_FUNCTION(Name) INTERCEPT_SANITIZER_WEAK_FUNCTION(Name)
+#include "asan_interface.inc"
+
+// Memory allocation functions.
+INTERCEPT_WRAP_V_W(free)
+INTERCEPT_WRAP_V_W(_free_base)
+INTERCEPT_WRAP_V_WW(_free_dbg)
+
+INTERCEPT_WRAP_W_W(malloc)
+INTERCEPT_WRAP_W_W(_malloc_base)
+INTERCEPT_WRAP_W_WWWW(_malloc_dbg)
+
+INTERCEPT_WRAP_W_WW(calloc)
+INTERCEPT_WRAP_W_WW(_calloc_base)
+INTERCEPT_WRAP_W_WWWWW(_calloc_dbg)
+INTERCEPT_WRAP_W_WWW(_calloc_impl)
+
+INTERCEPT_WRAP_W_WW(realloc)
+INTERCEPT_WRAP_W_WW(_realloc_base)
+INTERCEPT_WRAP_W_WWW(_realloc_dbg)
+INTERCEPT_WRAP_W_WWW(_recalloc)
+INTERCEPT_WRAP_W_WWW(_recalloc_base)
+
+INTERCEPT_WRAP_W_W(_msize)
+INTERCEPT_WRAP_W_W(_msize_base)
+INTERCEPT_WRAP_W_W(_expand)
+INTERCEPT_WRAP_W_W(_expand_dbg)
+
+// TODO(timurrrr): Might want to add support for _aligned_* allocation
+// functions to detect a few more bugs.  Those functions seem to wrap malloc().
+
+// TODO(timurrrr): Do we need to add _Crt* stuff here? (see asan_malloc_win.cpp)
+
+#  if defined(_MSC_VER) && !defined(__clang__)
+// Disable warnings such as: 'void memchr(void)': incorrect number of arguments
+// for intrinsic function, expected '3' arguments.
+#    pragma warning(push)
+#    pragma warning(disable : 4392)
+#  endif
+
+INTERCEPT_LIBRARY_FUNCTION(atoi);
+INTERCEPT_LIBRARY_FUNCTION(atol);
+INTERCEPT_LIBRARY_FUNCTION(atoll);
+INTERCEPT_LIBRARY_FUNCTION(frexp);
+INTERCEPT_LIBRARY_FUNCTION(longjmp);
+#if SANITIZER_INTERCEPT_MEMCHR
+INTERCEPT_LIBRARY_FUNCTION(memchr);
+#endif
+INTERCEPT_LIBRARY_FUNCTION(memcmp);
+INTERCEPT_LIBRARY_FUNCTION(memcpy);
+INTERCEPT_LIBRARY_FUNCTION(memmove);
+INTERCEPT_LIBRARY_FUNCTION(memset);
+INTERCEPT_LIBRARY_FUNCTION(strcat);
+INTERCEPT_LIBRARY_FUNCTION(strchr);
+INTERCEPT_LIBRARY_FUNCTION(strcmp);
+INTERCEPT_LIBRARY_FUNCTION(strcpy);
+INTERCEPT_LIBRARY_FUNCTION(strcspn);
+INTERCEPT_LIBRARY_FUNCTION(_strdup);
+INTERCEPT_LIBRARY_FUNCTION(strlen);
+INTERCEPT_LIBRARY_FUNCTION(strncat);
+INTERCEPT_LIBRARY_FUNCTION(strncmp);
+INTERCEPT_LIBRARY_FUNCTION(strncpy);
+INTERCEPT_LIBRARY_FUNCTION(strnlen);
+INTERCEPT_LIBRARY_FUNCTION(strpbrk);
+INTERCEPT_LIBRARY_FUNCTION(strrchr);
+INTERCEPT_LIBRARY_FUNCTION(strspn);
+INTERCEPT_LIBRARY_FUNCTION(strstr);
+INTERCEPT_LIBRARY_FUNCTION(strtok);
+INTERCEPT_LIBRARY_FUNCTION(strtol);
+INTERCEPT_LIBRARY_FUNCTION(strtoll);
+INTERCEPT_LIBRARY_FUNCTION(wcslen);
+INTERCEPT_LIBRARY_FUNCTION(wcsnlen);
+
+#  if defined(_MSC_VER) && !defined(__clang__)
+#    pragma warning(pop)
+#  endif
+
+#ifdef _WIN64
+INTERCEPT_LIBRARY_FUNCTION(__C_specific_handler);
+#else
+INTERCEPT_LIBRARY_FUNCTION(_except_handler3);
+// _except_handler4 checks -GS cookie which is different for each module, so we
+// can't use INTERCEPT_LIBRARY_FUNCTION(_except_handler4).
+INTERCEPTOR(int, _except_handler4, void *a, void *b, void *c, void *d) {
+  __asan_handle_no_return();
+  return REAL(_except_handler4)(a, b, c, d);
+}
+#endif
+
+// Windows specific functions not included in asan_interface.inc.
+INTERCEPT_WRAP_W_V(__asan_should_detect_stack_use_after_return)
+INTERCEPT_WRAP_W_V(__asan_get_shadow_memory_dynamic_address)
+INTERCEPT_WRAP_W_W(__asan_unhandled_exception_filter)
+
+using namespace __sanitizer;
+
+extern "C" {
+int __asan_option_detect_stack_use_after_return;
+uptr __asan_shadow_memory_dynamic_address;
+} // extern "C"
+
+static int asan_dll_thunk_init() {
+  typedef void (*fntype)();
+  static fntype fn = 0;
+  // asan_dll_thunk_init is expected to be called by only one thread.
+  if (fn) return 0;
+
+  // Ensure all interception was executed.
+  __dll_thunk_init();
+
+  fn = (fntype) dllThunkGetRealAddrOrDie("__asan_init");
+  fn();
+  __asan_option_detect_stack_use_after_return =
+      (__asan_should_detect_stack_use_after_return() != 0);
+  __asan_shadow_memory_dynamic_address =
+      (uptr)__asan_get_shadow_memory_dynamic_address();
+
+#ifndef _WIN64
+  INTERCEPT_FUNCTION(_except_handler4);
+#endif
+  // In DLLs, the callbacks are expected to return 0,
+  // otherwise CRT initialization fails.
+  return 0;
+}
+
+#pragma section(".CRT$XIB", long, read)
+__declspec(allocate(".CRT$XIB")) int (*__asan_preinit)() = asan_dll_thunk_init;
+
+static void WINAPI asan_thread_init(void *mod, unsigned long reason,
+                                    void *reserved) {
+  if (reason == /*DLL_PROCESS_ATTACH=*/1) asan_dll_thunk_init();
+}
+
+#pragma section(".CRT$XLAB", long, read)
+__declspec(allocate(".CRT$XLAB")) void (WINAPI *__asan_tls_init)(void *,
+    unsigned long, void *) = asan_thread_init;
+
+WIN_FORCE_LINK(__asan_dso_reg_hook)
+
+#endif // SANITIZER_DLL_THUNK
diff --git a/compiler-rt/lib/asan/asan_win_dynamic_runtime_thunk.cpp b/compiler-rt/lib/asan/asan_win_dynamic_runtime_thunk.cpp
index 421fe651b7d919..f0b5ec9eef7f99 100644
--- a/compiler-rt/lib/asan/asan_win_dynamic_runtime_thunk.cpp
+++ b/compiler-rt/lib/asan/asan_win_dynamic_runtime_thunk.cpp
@@ -8,17 +8,76 @@
 //
 // This file is a part of AddressSanitizer, an address sanity checker.
 //
-// This file defines things that need to be present for application modules
-// that are dynamic linked with the C Runtime.
+// This file defines things that need to be present in the application modules
+// to interact with the ASan DLL runtime correctly and can't be implemented
+// using the default "import library" generated when linking the DLL RTL.
+//
+// This includes:
+//  - creating weak aliases to default implementation imported from asan dll.
+//  - forwarding the detect_stack_use_after_return runtime option
+//  - working around deficiencies of the MD runtime
+//  - installing a custom SEH handler
 //
 //===----------------------------------------------------------------------===//
 
 #ifdef SANITIZER_DYNAMIC_RUNTIME_THUNK
-#  define WIN32_LEAN_AND_MEAN
-#  include <windows.h>
+#define SANITIZER_IMPORT_INTERFACE 1
+#include "sanitizer_common/sanitizer_win_defs.h"
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+// Define weak alias for all weak functions imported from asan dll.
+#define INTERFACE_FUNCTION(Name)
+#define INTERFACE_WEAK_FUNCTION(Name) WIN_WEAK_IMPORT_DEF(Name)
+#include "asan_interface.inc"
+
+// First, declare CRT sections we'll be using in this file
+#pragma section(".CRT$XIB", long, read)
+#pragma section(".CRT$XID", long, read)
+#pragma section(".CRT$XCAB", long, read)
+#pragma section(".CRT$XTW", long, read)
+#pragma section(".CRT$XTY", long, read)
+#pragma section(".CRT$XLAB", long, read)
+
+////////////////////////////////////////////////////////////////////////////////
+// Define a copy of __asan_option_detect_stack_use_after_return that should be
+// used when linking an MD runtime with a set of object files on Windows.
+//
+// The ASan MD runtime dllexports '__asan_option_detect_stack_use_after_return',
+// so normally we would just dllimport it.  Unfortunately, the dllimport
+// attribute adds __imp_ prefix to the symbol name of a variable.
+// Since in general we don't know if a given TU is going to be used
+// with a MT or MD runtime and we don't want to use ugly __imp_ names on Windows
+// just to work around this issue, let's clone the variable that is constant
+// after initialization anyways.
+extern "C" {
+__declspec(dllimport) int __asan_should_detect_stack_use_after_return();
+int __asan_option_detect_stack_use_after_return;
+
+__declspec(dllimport) void* __asan_get_shadow_memory_dynamic_address();
+void* __asan_shadow_memory_dynamic_address;
+}
+
+static int InitializeClonedVariables() {
+  __asan_option_detect_stack_use_after_return =
+    __asan_should_detect_stack_use_after_return();
+  __asan_shadow_memory_dynamic_address =
+    __asan_get_shadow_memory_dynamic_address();
+  return 0;
+}
+
+static void NTAPI asan_thread_init(void *mod, unsigned long reason,
+    void *reserved) {
+  if (reason == DLL_PROCESS_ATTACH) InitializeClonedVariables();
+}
 
-#  include "asan_win_common_runtime_thunk.h"
-#  include "sanitizer_common/sanitizer_win_defs.h"
+// Our cloned variables must be initialized before C/C++ constructors.  If TLS
+// is used, our .CRT$XLAB initializer will run first. If not, our .CRT$XIB
+// initializer is needed as a backup.
+__declspec(allocate(".CRT$XIB")) int (*__asan_initialize_cloned_variables)() =
+    InitializeClonedVariables;
+__declspec(allocate(".CRT$XLAB")) void (NTAPI *__asan_tls_init)(void *,
+    unsigned long, void *) = asan_thread_init;
 
 ////////////////////////////////////////////////////////////////////////////////
 // For some reason, the MD CRT doesn't call the C/C++ terminators during on DLL
@@ -29,26 +88,43 @@
 // using atexit() that calls a small subset of C terminators
 // where LLVM global_dtors is placed.  Fingers crossed, no other C terminators
 // are there.
-extern "C" int __cdecl atexit(void(__cdecl *f)(void));
+extern "C" int __cdecl atexit(void (__cdecl *f)(void));
 extern "C" void __cdecl _initterm(void *a, void *b);
 
 namespace {
-__declspec(allocate(".CRT$XTW")) void *before_global_dtors = 0;
-__declspec(allocate(".CRT$XTY")) void *after_global_dtors = 0;
+__declspec(allocate(".CRT$XTW")) void* before_global_dtors = 0;
+__declspec(allocate(".CRT$XTY")) void* after_global_dtors = 0;
 
 void UnregisterGlobals() {
   _initterm(&before_global_dtors, &after_global_dtors);
 }
 
-int ScheduleUnregisterGlobals() { return atexit(UnregisterGlobals); }
+int ScheduleUnregisterGlobals() {
+  return atexit(UnregisterGlobals);
+}
 }  // namespace
 
 // We need to call 'atexit(UnregisterGlobals);' as early as possible, but after
 // atexit() is initialized (.CRT$XIC).  As this is executed before C++
 // initializers (think ctors for globals), UnregisterGlobals gets executed after
 // dtors for C++ globals.
-extern "C" __declspec(allocate(".CRT$XID")) int (
-    *__asan_schedule_unregister_globals)() = ScheduleUnregisterGlobals;
-WIN_FORCE_LINK(__asan_schedule_unregister_globals)
+__declspec(allocate(".CRT$XID"))
+int (*__asan_schedule_unregister_globals)() = ScheduleUnregisterGlobals;
+
+////////////////////////////////////////////////////////////////////////////////
+// ASan SEH handling.
+// We need to set the ASan-specific SEH handler at the end of CRT initialization
+// of each module (see also asan_win.cpp).
+extern "C" {
+__declspec(dllimport) int __asan_set_seh_filter();
+static int SetSEHFilter() { return __asan_set_seh_filter(); }
+
+// Unfortunately, putting a pointer to __asan_set_seh_filter into
+// __asan_seh_interceptor gets optimized out, so we use an extra function.
+__declspec(allocate(".CRT$XCAB")) int (*__asan_seh_interceptor)() =
+    SetSEHFilter;
+}
+
+WIN_FORCE_LINK(__asan_dso_reg_hook)
 
-#endif  // SANITIZER_DYNAMIC_RUNTIME_THUNK
+#endif // SANITIZER_DYNAMIC_RUNTIME_THUNK
diff --git a/compiler-rt/lib/asan/asan_win_static_runtime_thunk.cpp b/compiler-rt/lib/asan/asan_win_static_runtime_thunk.cpp
deleted file mode 100644
index dec50a5e1d4d9e..00000000000000
--- a/compiler-rt/lib/asan/asan_win_static_runtime_thunk.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-//===-- asan_win_static_runtime_thunk.cpp ---------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of AddressSanitizer, an address sanity checker.
-//
-// This file defines a family of thunks that should be statically linked into
-// modules that are statically linked with the C Runtime in order to delegate
-// the calls to the ASAN runtime DLL.
-// See https://github.com/google/sanitizers/issues/209 for the details.
-//===----------------------------------------------------------------------===//
-
-#ifdef SANITIZER_STATIC_RUNTIME_THUNK
-#  include "asan_init_version.h"
-#  include "asan_interface_internal.h"
-#  include "asan_win_common_runtime_thunk.h"
-#  include "sanitizer_common/sanitizer_platform_interceptors.h"
-#  include "sanitizer_common/sanitizer_win_defs.h"
-#  include "sanitizer_common/sanitizer_win_thunk_interception.h"
-
-#  if defined(_MSC_VER) && !defined(__clang__)
-// Disable warnings such as: 'void memchr(void)': incorrect number of arguments
-// for intrinsic function, expected '3' arguments.
-#    pragma warning(push)
-#    pragma warning(disable : 4392)
-#  endif
-
-#  define INTERCEPT_LIBRARY_FUNCTION_ASAN(X) \
-    INTERCEPT_LIBRARY_FUNCTION(X, "__asan_wrap_" #X)
-
-INTERCEPT_LIBRARY_FUNCTION_ASAN(atoi);
-INTERCEPT_LIBRARY_FUNCTION_ASAN(atol);
-INTERCEPT_LIBRARY_FUNCTION_ASAN(atoll);
-INTERCEPT_LIBRARY_FUNCTION_ASAN(frexp);
-INTERCEPT_LIBRARY_FUNCTION_ASAN(longjmp);
-#  if SANITIZER_INTERCEPT_MEMCHR
-INTERCEPT_LIBRARY_FUNCTION_ASAN(memchr);
-#  endif
-INTERCEPT_LIBRARY_FUNCTION_ASAN(memcmp);
-INTERCEPT_LIBRARY_FUNCTION_ASAN(memcpy);
-#  ifndef _WIN64
-// memmove and memcpy share an implementation on amd64
-INTERCEPT_LIBRARY_FUNCTION_ASAN(memmove);
-#  endif
-INTERCEPT_LIBRARY_FUNCTION_ASAN(memset);
-INTERCEPT_LIBRARY_FUNCTION_ASAN(strcat);
-INTERCEPT_LIBRARY_FUNCTION_ASAN(strchr);
-INTERCEPT_LIBRARY_FUNCTION_ASAN(strcmp);
-INTERCEPT_LIBRARY_FUNCTION_ASAN(strcpy);
-INTERCEPT_LIBRARY_FUNCTION_ASAN(strcspn);
-INTERCEPT_LIBRARY_FUNCTION_ASAN(_strdup);
-INTERCEPT_LIBRARY_FUNCTION_ASAN(strlen);
-INTERCEPT_LIBRARY_FUNCTION_ASAN(strncat);
-INTERCEPT_LIBRARY_FUNCTION_ASAN(strncmp);
-INTERCEPT_LIBRARY_FUNCTION_ASAN(strncpy);
-INTERCEPT_LIBRARY_FUNCTION_ASAN(strnlen);
-INTERCEPT_LIBRARY_FUNCTION_ASAN(strpbrk);
-// INTERCEPT_LIBRARY_FUNCTION_ASAN(strrchr);
-INTERCEPT_LIBRARY_FUNCTION_ASAN(strspn);
-INTERCEPT_LIBRARY_FUNCTION_ASAN(strstr);
-INTERCEPT_LIBRARY_FUNCTION_ASAN(strtok);
-INTERCEPT_LIBRARY_FUNCTION_ASAN(strtol);
-INTERCEPT_LIBRARY_FUNCTION_ASAN(wcslen);
-INTERCEPT_LIBRARY_FUNCTION_ASAN(wcsnlen);
-
-#  if defined(_MSC_VER) && !defined(__clang__)
-#    pragma warning(pop)
-#  endif
-
-#  ifdef _WIN64
-INTERCEPT_LIBRARY_FUNCTION_ASAN(__C_specific_handler);
-#  else
-extern "C" void abort();
-INTERCEPT_LIBRARY_FUNCTION_ASAN(_except_handler3);
-// _except_handler4 checks -GS cookie which is different for each module, so we
-// can't use INTERCEPT_LIBRARY_FUNCTION_ASAN(_except_handler4), need to apply
-// manually
-extern "C" int _except_handler4(void *, void *, void *, void *);
-static int (*real_except_handler4)(void *, void *, void *,
-                                   void *) = &_except_handler4;
-static int intercept_except_handler4(void *a, void *b, void *c, void *d) {
-  __asan_handle_no_return();
-  return real_except_handler4(a, b, c, d);
-}
-#  endif
-
-// Windows specific functions not included in asan_interface.inc.
-// INTERCEPT_WRAP_W_V(__asan_should_detect_stack_use_after_return)
-// INTERCEPT_WRAP_W_V(__asan_get_shadow_memory_dynamic_address)
-// INTERCEPT_WRAP_W_W(__asan_unhandled_exception_filter)
-
-extern "C" void __asan_initialize_static_thunk() {
-#  ifndef _WIN64
-  if (real_except_handler4 == &_except_handler4) {
-    // Single threaded, no need for synchronization.
-    if (!__sanitizer_override_function_by_addr(
-            reinterpret_cast<__sanitizer::uptr>(&intercept_except_handler4),
-            reinterpret_cast<__sanitizer::uptr>(&_except_handler4),
-            reinterpret_cast<__sanitizer::uptr*>(&real_except_handler4))) {
-      abort();
-    }
-  }
-#  endif
-}
-
-#endif  // SANITIZER_DLL_THUNK
diff --git a/compiler-rt/lib/asan/tests/CMakeLists.txt b/compiler-rt/lib/asan/tests/CMakeLists.txt
index 9c1db7caeb7b7d..bda47bd7fd6a22 100644
--- a/compiler-rt/lib/asan/tests/CMakeLists.txt
+++ b/compiler-rt/lib/asan/tests/CMakeLists.txt
@@ -203,7 +203,7 @@ function(add_asan_tests arch test_runtime)
         CFLAGS ${ASAN_UNITTEST_INSTRUMENTED_CFLAGS} -D_MT -D_DLL
         SOURCES ${ASAN_INST_TEST_SOURCES}
         LINK_FLAGS ${ASAN_DYNAMIC_UNITTEST_INSTRUMENTED_LINK_FLAGS}
-          -D_MT -D_DLL -Wl,-nodefaultlib:libcmt,-defaultlib:msvcrt,-defaultlib:oldnames
+          -Wl,-nodefaultlib:libcmt,-defaultlib:msvcrt,-defaultlib:oldnames
         )
     else()
 
diff --git a/compiler-rt/lib/profile/CMakeLists.txt b/compiler-rt/lib/profile/CMakeLists.txt
index ef23492514898b..45e51648917515 100644
--- a/compiler-rt/lib/profile/CMakeLists.txt
+++ b/compiler-rt/lib/profile/CMakeLists.txt
@@ -111,12 +111,6 @@ if(COMPILER_RT_TARGET_HAS_UNAME)
      -DCOMPILER_RT_HAS_UNAME=1)
 endif()
 
-if(MSVC)
-  # profile historically has only been supported with the static runtime
-  # on windows
-  set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded)
-endif()
-
 # We don't use the C++ Standard Library here, so avoid including it by mistake.
 append_list_if(COMPILER_RT_HAS_NOSTDINCXX_FLAG -nostdinc++ EXTRA_FLAGS)
 # XRay uses C++ standard library headers.
diff --git a/compiler-rt/lib/sanitizer_common/CMakeLists.txt b/compiler-rt/lib/sanitizer_common/CMakeLists.txt
index 41c3888275a0f2..66f2d259aa5fd4 100644
--- a/compiler-rt/lib/sanitizer_common/CMakeLists.txt
+++ b/compiler-rt/lib/sanitizer_common/CMakeLists.txt
@@ -42,7 +42,6 @@ set(SANITIZER_SOURCES_NOTERMINATION
   sanitizer_thread_registry.cpp
   sanitizer_type_traits.cpp
   sanitizer_win.cpp
-  sanitizer_win_interception.cpp
   )
 
 set(SANITIZER_SOURCES
@@ -207,8 +206,8 @@ set(SANITIZER_IMPL_HEADERS
   sanitizer_vector.h
   sanitizer_win.h
   sanitizer_win_defs.h
-  sanitizer_win_interception.h
-  sanitizer_win_thunk_interception.h
+  sanitizer_win_dll_thunk.h
+  sanitizer_win_weak_interception.h
   )
 
 include_directories(..)
@@ -302,23 +301,57 @@ add_compiler_rt_object_libraries(RTSanitizerCommonSymbolizerNoHooks
   DEFS ${SANITIZER_COMMON_DEFINITIONS})
 
 if(WIN32)
-  set(RUNTIME_THUNK_CFLAGS -DSANITIZER_DYNAMIC_RUNTIME_THUNK -DSANITIZER_STATIC_RUNTIME_THUNK)
-  append_list_if(MSVC /Zl RUNTIME_THUNK_CFLAGS)
-  add_compiler_rt_object_libraries(SanitizerRuntimeThunk
+  add_compiler_rt_object_libraries(SanitizerCommonWeakInterception
     ${SANITIZER_COMMON_SUPPORTED_OS}
     ARCHS ${SANITIZER_COMMON_SUPPORTED_ARCH}
     SOURCES
-      sanitizer_win_thunk_interception.cpp
-    CFLAGS ${SANITIZER_CFLAGS} ${RUNTIME_THUNK_CFLAGS}
+      sanitizer_win_weak_interception.cpp
+    CFLAGS ${SANITIZER_CFLAGS} -DSANITIZER_DYNAMIC
+    DEFS ${SANITIZER_COMMON_DEFINITIONS})
+  add_compiler_rt_object_libraries(SancovWeakInterception
+    ${SANITIZER_COMMON_SUPPORTED_OS}
+    ARCHS ${SANITIZER_COMMON_SUPPORTED_ARCH}
+    SOURCES
+      sanitizer_coverage_win_weak_interception.cpp
+    CFLAGS ${SANITIZER_CFLAGS} -DSANITIZER_DYNAMIC
+    DEFS ${SANITIZER_COMMON_DEFINITIONS})
+
+  add_compiler_rt_object_libraries(SanitizerCommonDllThunk
+    ${SANITIZER_COMMON_SUPPORTED_OS}
+    ARCHS ${SANITIZER_COMMON_SUPPORTED_ARCH}
+    SOURCES
+      sanitizer_win_dll_thunk.cpp
+    CFLAGS ${SANITIZER_CFLAGS} -DSANITIZER_DLL_THUNK
+    DEFS ${SANITIZER_COMMON_DEFINITIONS})
+  add_compiler_rt_object_libraries(SancovDllThunk
+    ${SANITIZER_COMMON_SUPPORTED_OS}
+    ARCHS ${SANITIZER_COMMON_SUPPORTED_ARCH}
+    SOURCES
+      sanitizer_coverage_win_dll_thunk.cpp
+      sanitizer_coverage_win_sections.cpp
+    CFLAGS ${SANITIZER_CFLAGS} -DSANITIZER_DLL_THUNK
     DEFS ${SANITIZER_COMMON_DEFINITIONS})
 
-  add_compiler_rt_object_libraries(SancovRuntimeThunk
+  set(DYNAMIC_RUNTIME_THUNK_CFLAGS "-DSANITIZER_DYNAMIC_RUNTIME_THUNK")
+  if(MSVC)
+    list(APPEND DYNAMIC_RUNTIME_THUNK_CFLAGS "-Zl")
+  elseif(CMAKE_C_COMPILER_ID MATCHES Clang)
+    list(APPEND DYNAMIC_RUNTIME_THUNK_CFLAGS "-nodefaultlibs")
+  endif()
+  add_compiler_rt_object_libraries(SanitizerCommonDynamicRuntimeThunk
+    ${SANITIZER_COMMON_SUPPORTED_OS}
+    ARCHS ${SANITIZER_COMMON_SUPPORTED_ARCH}
+    SOURCES
+      sanitizer_win_dynamic_runtime_thunk.cpp
+    CFLAGS ${SANITIZER_CFLAGS} ${DYNAMIC_RUNTIME_THUNK_CFLAGS}
+    DEFS ${SANITIZER_COMMON_DEFINITIONS})
+  add_compiler_rt_object_libraries(SancovDynamicRuntimeThunk
     ${SANITIZER_COMMON_SUPPORTED_OS}
     ARCHS ${SANITIZER_COMMON_SUPPORTED_ARCH}
     SOURCES
-      sanitizer_coverage_win_runtime_thunk.cpp
+      sanitizer_coverage_win_dynamic_runtime_thunk.cpp
       sanitizer_coverage_win_sections.cpp
-    CFLAGS ${SANITIZER_CFLAGS} ${RUNTIME_THUNK_CFLAGS}
+    CFLAGS ${SANITIZER_CFLAGS} ${DYNAMIC_RUNTIME_THUNK_CFLAGS}
     DEFS ${SANITIZER_COMMON_DEFINITIONS})
 endif()
 
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interface.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interface.inc
index 11f1d963bd6f43..557207fe62ac6d 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interface.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interface.inc
@@ -50,9 +50,3 @@ INTERFACE_WEAK_FUNCTION(__sanitizer_malloc_hook)
 INTERFACE_FUNCTION(__sanitizer_internal_memcpy)
 INTERFACE_FUNCTION(__sanitizer_internal_memmove)
 INTERFACE_FUNCTION(__sanitizer_internal_memset)
-
-#if SANITIZER_WINDOWS
-INTERFACE_FUNCTION(__sanitizer_override_function)
-INTERFACE_FUNCTION(__sanitizer_override_function_by_addr)
-INTERFACE_FUNCTION(__sanitizer_register_weak_function)
-#endif
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_dll_thunk.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_dll_thunk.cpp
new file mode 100644
index 00000000000000..d0bf8a4556436c
--- /dev/null
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_dll_thunk.cpp
@@ -0,0 +1,20 @@
+//===-- sanitizer_coverage_win_dll_thunk.cpp ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a family of thunks that should be statically linked into
+// the DLLs that have instrumentation in order to delegate the calls to the
+// shared runtime that lives in the main binary.
+// See https://github.com/google/sanitizers/issues/209 for the details.
+//===----------------------------------------------------------------------===//
+#ifdef SANITIZER_DLL_THUNK
+#include "sanitizer_win_dll_thunk.h"
+// Sanitizer Coverage interface functions.
+#define INTERFACE_FUNCTION(Name) INTERCEPT_SANITIZER_FUNCTION(Name)
+#define INTERFACE_WEAK_FUNCTION(Name) INTERCEPT_SANITIZER_WEAK_FUNCTION(Name)
+#include "sanitizer_coverage_interface.inc"
+#endif // SANITIZER_DLL_THUNK
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_runtime_thunk.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_dynamic_runtime_thunk.cpp
similarity index 59%
rename from compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_runtime_thunk.cpp
rename to compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_dynamic_runtime_thunk.cpp
index 281944643f216f..0bdf0c5aed418d 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_runtime_thunk.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_dynamic_runtime_thunk.cpp
@@ -1,4 +1,4 @@
-//===-- sanitizer_coverage_win_runtime_thunk.cpp --------------------------===//
+//===-- sanitizer_coverage_win_dynamic_runtime_thunk.cpp ------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,20 +10,17 @@
 // to interact with Sanitizer Coverage, when it is included in a dll.
 //
 //===----------------------------------------------------------------------===//
-#if defined(SANITIZER_DYNAMIC_RUNTIME_THUNK) || \
-    defined(SANITIZER_STATIC_RUNTIME_THUNK)
-#  define SANITIZER_IMPORT_INTERFACE 1
-#  include "sanitizer_win_defs.h"
-#  include "sanitizer_win_thunk_interception.h"
+#ifdef SANITIZER_DYNAMIC_RUNTIME_THUNK
+#define SANITIZER_IMPORT_INTERFACE 1
+#include "sanitizer_win_defs.h"
 // Define weak alias for all weak functions imported from sanitizer coverage.
-#  define INTERFACE_FUNCTION(Name)
-#  define INTERFACE_WEAK_FUNCTION(Name) REGISTER_WEAK_FUNCTION(Name)
-#  include "sanitizer_coverage_interface.inc"
-#endif  // defined(SANITIZER_DYNAMIC_RUNTIME_THUNK) ||
-        // defined(SANITIZER_STATIC_RUNTIME_THUNK)
+#define INTERFACE_FUNCTION(Name)
+#define INTERFACE_WEAK_FUNCTION(Name) WIN_WEAK_IMPORT_DEF(Name)
+#include "sanitizer_coverage_interface.inc"
+#endif // SANITIZER_DYNAMIC_RUNTIME_THUNK
 
 namespace __sanitizer {
 // Add one, otherwise unused, external symbol to this object file so that the
 // Visual C++ linker includes it and reads the .drective section.
 void ForceWholeArchiveIncludeForSanCov() {}
-}  // namespace __sanitizer
+}
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_weak_interception.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_weak_interception.cpp
new file mode 100644
index 00000000000000..55263981705fa6
--- /dev/null
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_coverage_win_weak_interception.cpp
@@ -0,0 +1,23 @@
+//===-- sanitizer_coverage_win_weak_interception.cpp ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This module should be included in Sanitizer Coverage when it is implemented
+// as a shared library on Windows (dll), in order to delegate the calls of weak
+// functions to the implementation in the main executable when a strong
+// definition is provided.
+//===----------------------------------------------------------------------===//
+#ifdef SANITIZER_DYNAMIC
+#include "sanitizer_win_weak_interception.h"
+#include "sanitizer_interface_internal.h"
+#include "sancov_flags.h"
+// Check if strong definitions for weak functions are present in the main
+// executable. If that is the case, override dll functions to point to strong
+// implementations.
+#define INTERFACE_FUNCTION(Name)
+#define INTERFACE_WEAK_FUNCTION(Name) INTERCEPT_SANITIZER_WEAK_FUNCTION(Name)
+#include "sanitizer_coverage_interface.inc"
+#endif // SANITIZER_DYNAMIC
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win_dll_thunk.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_win_dll_thunk.cpp
new file mode 100644
index 00000000000000..1562c161a76260
--- /dev/null
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_win_dll_thunk.cpp
@@ -0,0 +1,101 @@
+//===-- sanitizer_win_dll_thunk.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file defines a family of thunks that should be statically linked into
+// the DLLs that have instrumentation in order to delegate the calls to the
+// shared runtime that lives in the main binary.
+// See https://github.com/google/sanitizers/issues/209 for the details.
+//===----------------------------------------------------------------------===//
+
+#ifdef SANITIZER_DLL_THUNK
+#include "sanitizer_win_defs.h"
+#include "sanitizer_win_dll_thunk.h"
+#include "interception/interception.h"
+
+extern "C" {
+void *WINAPI GetModuleHandleA(const char *module_name);
+void abort();
+}
+
+namespace __sanitizer {
+uptr dllThunkGetRealAddrOrDie(const char *name) {
+  uptr ret =
+      __interception::InternalGetProcAddress((void *)GetModuleHandleA(0), name);
+  if (!ret)
+    abort();
+  return ret;
+}
+
+int dllThunkIntercept(const char* main_function, uptr dll_function) {
+  uptr wrapper = dllThunkGetRealAddrOrDie(main_function);
+  if (!__interception::OverrideFunction(dll_function, wrapper, 0))
+    abort();
+  return 0;
+}
+
+int dllThunkInterceptWhenPossible(const char* main_function,
+    const char* default_function, uptr dll_function) {
+  uptr wrapper = __interception::InternalGetProcAddress(
+    (void *)GetModuleHandleA(0), main_function);
+  if (!wrapper)
+    wrapper = dllThunkGetRealAddrOrDie(default_function);
+  if (!__interception::OverrideFunction(dll_function, wrapper, 0))
+    abort();
+  return 0;
+}
+} // namespace __sanitizer
+
+// Include Sanitizer Common interface.
+#define INTERFACE_FUNCTION(Name) INTERCEPT_SANITIZER_FUNCTION(Name)
+#define INTERFACE_WEAK_FUNCTION(Name) INTERCEPT_SANITIZER_WEAK_FUNCTION(Name)
+#include "sanitizer_common_interface.inc"
+
+#pragma section(".DLLTH$A", read)
+#pragma section(".DLLTH$Z", read)
+
+typedef void (*DllThunkCB)();
+extern "C" {
+__declspec(allocate(".DLLTH$A")) DllThunkCB __start_dll_thunk;
+__declspec(allocate(".DLLTH$Z")) DllThunkCB __stop_dll_thunk;
+}
+
+// Disable compiler warnings that show up if we declare our own version
+// of a compiler intrinsic (e.g. strlen).
+#pragma warning(disable: 4391)
+#pragma warning(disable: 4392)
+
+extern "C" int __dll_thunk_init() {
+  static bool flag = false;
+  // __dll_thunk_init is expected to be called by only one thread.
+  if (flag) return 0;
+  flag = true;
+
+  for (DllThunkCB *it = &__start_dll_thunk; it < &__stop_dll_thunk; ++it)
+    if (*it)
+      (*it)();
+
+  // In DLLs, the callbacks are expected to return 0,
+  // otherwise CRT initialization fails.
+  return 0;
+}
+
+// We want to call dll_thunk_init before C/C++ initializers / constructors are
+// executed, otherwise functions like memset might be invoked.
+#pragma section(".CRT$XIB", long, read)
+__declspec(allocate(".CRT$XIB")) int (*__dll_thunk_preinit)() =
+    __dll_thunk_init;
+
+static void WINAPI dll_thunk_thread_init(void *mod, unsigned long reason,
+                                         void *reserved) {
+  if (reason == /*DLL_PROCESS_ATTACH=*/1) __dll_thunk_init();
+}
+
+#pragma section(".CRT$XLAB", long, read)
+__declspec(allocate(".CRT$XLAB")) void (WINAPI *__dll_thunk_tls_init)(void *,
+    unsigned long, void *) = dll_thunk_thread_init;
+
+#endif // SANITIZER_DLL_THUNK
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win_dll_thunk.h b/compiler-rt/lib/sanitizer_common/sanitizer_win_dll_thunk.h
new file mode 100644
index 00000000000000..639d91a2edaec4
--- /dev/null
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_win_dll_thunk.h
@@ -0,0 +1,181 @@
+//===-- sanitizer_win_dll_thunk.h -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This header provides helper macros to delegate calls to the shared runtime
+// that lives in the main executable. It should be included in dll_thunks that
+// will be linked to the dlls, when the sanitizer is a static library included
+// in the main executable.
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_WIN_DLL_THUNK_H
+#define SANITIZER_WIN_DLL_THUNK_H
+#include "sanitizer_internal_defs.h"
+
+namespace __sanitizer {
+uptr dllThunkGetRealAddrOrDie(const char *name);
+
+int dllThunkIntercept(const char* main_function, uptr dll_function);
+
+int dllThunkInterceptWhenPossible(const char* main_function,
+    const char* default_function, uptr dll_function);
+}
+
+extern "C" int __dll_thunk_init();
+
+// ----------------- Function interception helper macros -------------------- //
+// Override dll_function with main_function from main executable.
+#define INTERCEPT_OR_DIE(main_function, dll_function)                          \
+  static int intercept_##dll_function() {                                      \
+    return __sanitizer::dllThunkIntercept(main_function, (__sanitizer::uptr)   \
+        dll_function);                                                         \
+  }                                                                            \
+  __pragma(section(".DLLTH$M", long, read))                                    \
+  __declspec(allocate(".DLLTH$M")) int (*__dll_thunk_##dll_function)() =       \
+    intercept_##dll_function;
+
+// Try to override dll_function with main_function from main executable.
+// If main_function is not present, override dll_function with default_function.
+#define INTERCEPT_WHEN_POSSIBLE(main_function, default_function, dll_function) \
+  static int intercept_##dll_function() {                                      \
+    return __sanitizer::dllThunkInterceptWhenPossible(main_function,           \
+        default_function, (__sanitizer::uptr)dll_function);                    \
+  }                                                                            \
+  __pragma(section(".DLLTH$M", long, read))                                    \
+  __declspec(allocate(".DLLTH$M")) int (*__dll_thunk_##dll_function)() =       \
+    intercept_##dll_function;
+
+// -------------------- Function interception macros ------------------------ //
+// Special case of hooks -- ASan own interface functions.  Those are only called
+// after __asan_init, thus an empty implementation is sufficient.
+#define INTERCEPT_SANITIZER_FUNCTION(name)                                     \
+  extern "C" __declspec(noinline) void name() {                                \
+    volatile int prevent_icf = (__LINE__ << 8) ^ __COUNTER__;                  \
+    static const char function_name[] = #name;                                 \
+    for (const char* ptr = &function_name[0]; *ptr; ++ptr)                     \
+      prevent_icf ^= *ptr;                                                     \
+    (void)prevent_icf;                                                         \
+    __debugbreak();                                                            \
+  }                                                                            \
+  INTERCEPT_OR_DIE(#name, name)
+
+// Special case of hooks -- Weak functions, could be redefined in the main
+// executable, but that is not necessary, so we shouldn't die if we can not find
+// a reference. Instead, when the function is not present in the main executable
+// we consider the default impl provided by asan library.
+#define INTERCEPT_SANITIZER_WEAK_FUNCTION(name)                                \
+  extern "C" __declspec(noinline) void name() {                                \
+    volatile int prevent_icf = (__LINE__ << 8) ^ __COUNTER__;                  \
+    static const char function_name[] = #name;                                 \
+    for (const char* ptr = &function_name[0]; *ptr; ++ptr)                     \
+      prevent_icf ^= *ptr;                                                     \
+    (void)prevent_icf;                                                         \
+    __debugbreak();                                                            \
+  }                                                                            \
+  INTERCEPT_WHEN_POSSIBLE(#name, STRINGIFY(WEAK_EXPORT_NAME(name)), name)
+
+// We can't define our own version of strlen etc. because that would lead to
+// link-time or even type mismatch errors.  Instead, we can declare a function
+// just to be able to get its address.  We may miss the first few calls to the
+// functions since it can be called before __dll_thunk_init, but that would lead
+// to false negatives in the startup code before user's global initializers,
+// which isn't a big deal.
+#define INTERCEPT_LIBRARY_FUNCTION(name)                                       \
+  extern "C" void name();                                                      \
+  INTERCEPT_OR_DIE(STRINGIFY(WRAP(name)), name)
+
+// Use these macros for functions that could be called before __dll_thunk_init()
+// is executed and don't lead to errors if defined (free, malloc, etc).
+#define INTERCEPT_WRAP_V_V(name)                                               \
+  extern "C" void name() {                                                     \
+    typedef decltype(name) *fntype;                                            \
+    static fntype fn = (fntype)__sanitizer::dllThunkGetRealAddrOrDie(#name);   \
+    fn();                                                                      \
+  }                                                                            \
+  INTERCEPT_OR_DIE(#name, name);
+
+#define INTERCEPT_WRAP_V_W(name)                                               \
+  extern "C" void name(void *arg) {                                            \
+    typedef decltype(name) *fntype;                                            \
+    static fntype fn = (fntype)__sanitizer::dllThunkGetRealAddrOrDie(#name);   \
+    fn(arg);                                                                   \
+  }                                                                            \
+  INTERCEPT_OR_DIE(#name, name);
+
+#define INTERCEPT_WRAP_V_WW(name)                                              \
+  extern "C" void name(void *arg1, void *arg2) {                               \
+    typedef decltype(name) *fntype;                                            \
+    static fntype fn = (fntype)__sanitizer::dllThunkGetRealAddrOrDie(#name);   \
+    fn(arg1, arg2);                                                            \
+  }                                                                            \
+  INTERCEPT_OR_DIE(#name, name);
+
+#define INTERCEPT_WRAP_V_WWW(name)                                             \
+  extern "C" void name(void *arg1, void *arg2, void *arg3) {                   \
+    typedef decltype(name) *fntype;                                            \
+    static fntype fn = (fntype)__sanitizer::dllThunkGetRealAddrOrDie(#name);   \
+    fn(arg1, arg2, arg3);                                                      \
+  }                                                                            \
+  INTERCEPT_OR_DIE(#name, name);
+
+#define INTERCEPT_WRAP_W_V(name)                                               \
+  extern "C" void *name() {                                                    \
+    typedef decltype(name) *fntype;                                            \
+    static fntype fn = (fntype)__sanitizer::dllThunkGetRealAddrOrDie(#name);   \
+    return fn();                                                               \
+  }                                                                            \
+  INTERCEPT_OR_DIE(#name, name);
+
+#define INTERCEPT_WRAP_W_W(name)                                               \
+  extern "C" void *name(void *arg) {                                           \
+    typedef decltype(name) *fntype;                                            \
+    static fntype fn = (fntype)__sanitizer::dllThunkGetRealAddrOrDie(#name);   \
+    return fn(arg);                                                            \
+  }                                                                            \
+  INTERCEPT_OR_DIE(#name, name);
+
+#define INTERCEPT_WRAP_W_WW(name)                                              \
+  extern "C" void *name(void *arg1, void *arg2) {                              \
+    typedef decltype(name) *fntype;                                            \
+    static fntype fn = (fntype)__sanitizer::dllThunkGetRealAddrOrDie(#name);   \
+    return fn(arg1, arg2);                                                     \
+  }                                                                            \
+  INTERCEPT_OR_DIE(#name, name);
+
+#define INTERCEPT_WRAP_W_WWW(name)                                             \
+  extern "C" void *name(void *arg1, void *arg2, void *arg3) {                  \
+    typedef decltype(name) *fntype;                                            \
+    static fntype fn = (fntype)__sanitizer::dllThunkGetRealAddrOrDie(#name);   \
+    return fn(arg1, arg2, arg3);                                               \
+  }                                                                            \
+  INTERCEPT_OR_DIE(#name, name);
+
+#define INTERCEPT_WRAP_W_WWWW(name)                                            \
+  extern "C" void *name(void *arg1, void *arg2, void *arg3, void *arg4) {      \
+    typedef decltype(name) *fntype;                                            \
+    static fntype fn = (fntype)__sanitizer::dllThunkGetRealAddrOrDie(#name);   \
+    return fn(arg1, arg2, arg3, arg4);                                         \
+  }                                                                            \
+  INTERCEPT_OR_DIE(#name, name);
+
+#define INTERCEPT_WRAP_W_WWWWW(name)                                           \
+  extern "C" void *name(void *arg1, void *arg2, void *arg3, void *arg4,        \
+                        void *arg5) {                                          \
+    typedef decltype(name) *fntype;                                            \
+    static fntype fn = (fntype)__sanitizer::dllThunkGetRealAddrOrDie(#name);   \
+    return fn(arg1, arg2, arg3, arg4, arg5);                                   \
+  }                                                                            \
+  INTERCEPT_OR_DIE(#name, name);
+
+#define INTERCEPT_WRAP_W_WWWWWW(name)                                          \
+  extern "C" void *name(void *arg1, void *arg2, void *arg3, void *arg4,        \
+                        void *arg5, void *arg6) {                              \
+    typedef decltype(name) *fntype;                                            \
+    static fntype fn = (fntype)__sanitizer::dllThunkGetRealAddrOrDie(#name);   \
+    return fn(arg1, arg2, arg3, arg4, arg5, arg6);                             \
+  }                                                                            \
+  INTERCEPT_OR_DIE(#name, name);
+
+#endif // SANITIZER_WIN_DLL_THUNK_H
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win_dynamic_runtime_thunk.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_win_dynamic_runtime_thunk.cpp
new file mode 100644
index 00000000000000..87c032c6e61bc9
--- /dev/null
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_win_dynamic_runtime_thunk.cpp
@@ -0,0 +1,26 @@
+//===-- sanitizer_win_dynamic_runtime_thunk.cpp ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines things that need to be present in the application modules
+// to interact with Sanitizer Common, when it is included in a dll.
+//
+//===----------------------------------------------------------------------===//
+#ifdef SANITIZER_DYNAMIC_RUNTIME_THUNK
+#define SANITIZER_IMPORT_INTERFACE 1
+#include "sanitizer_win_defs.h"
+// Define weak alias for all weak functions imported from sanitizer common.
+#define INTERFACE_FUNCTION(Name)
+#define INTERFACE_WEAK_FUNCTION(Name) WIN_WEAK_IMPORT_DEF(Name)
+#include "sanitizer_common_interface.inc"
+#endif // SANITIZER_DYNAMIC_RUNTIME_THUNK
+
+namespace __sanitizer {
+// Add one, otherwise unused, external symbol to this object file so that the
+// Visual C++ linker includes it and reads the .drectve section.
+void ForceWholeArchiveIncludeForSanitizerCommon() {}
+}
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win_immortalize.h b/compiler-rt/lib/sanitizer_common/sanitizer_win_immortalize.h
deleted file mode 100644
index 808cd2f771fe1e..00000000000000
--- a/compiler-rt/lib/sanitizer_common/sanitizer_win_immortalize.h
+++ /dev/null
@@ -1,71 +0,0 @@
-//===-- sanitizer_win_immortalize.h ---------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is shared between AddressSanitizer, and interception.
-//
-// Windows-specific thread-safe and pre-CRT global initialization safe
-// infrastructure to create an object whose destructor is never called.
-//===----------------------------------------------------------------------===//
-#if SANITIZER_WINDOWS
-#  pragma once
-// Requires including sanitizer_placement_new.h (which is not allowed to be
-// included in headers).
-
-#  include "sanitizer_win_defs.h"
-// These types are required to satisfy XFG which requires that the names of the
-// types for indirect calls to be correct as well as the name of the original
-// type for any typedefs.
-
-// TODO: There must be a better way to do this
-#  ifndef _WINDOWS_
-typedef void* PVOID;
-typedef int BOOL;
-typedef union _RTL_RUN_ONCE {
-  PVOID ptr;
-} INIT_ONCE, *PINIT_ONCE;
-
-extern "C" {
-__declspec(dllimport) int WINAPI InitOnceExecuteOnce(
-    PINIT_ONCE, BOOL(WINAPI*)(PINIT_ONCE, PVOID, PVOID*), void*, void*);
-}
-#  endif
-
-namespace __sanitizer {
-template <class Ty>
-BOOL WINAPI immortalize_impl(PINIT_ONCE, PVOID storage_ptr, PVOID*) noexcept {
-  // Ty must provide a placement new operator
-  new (storage_ptr) Ty();
-  return 1;
-}
-
-template <class Ty, typename Arg>
-BOOL WINAPI immortalize_impl(PINIT_ONCE, PVOID storage_ptr,
-                             PVOID* param) noexcept {
-  // Ty must provide a placement new operator
-  new (storage_ptr) Ty(*((Arg*)param));
-  return 1;
-}
-
-template <class Ty>
-Ty& immortalize() {  // return a reference to an object that will live forever
-  static INIT_ONCE flag;
-  alignas(Ty) static unsigned char storage[sizeof(Ty)];
-  InitOnceExecuteOnce(&flag, immortalize_impl<Ty>, &storage, nullptr);
-  return reinterpret_cast<Ty&>(storage);
-}
-
-template <class Ty, typename Arg>
-Ty& immortalize(
-    Arg arg) {  // return a reference to an object that will live forever
-  static INIT_ONCE flag;
-  alignas(Ty) static unsigned char storage[sizeof(Ty)];
-  InitOnceExecuteOnce(&flag, immortalize_impl<Ty, Arg>, &storage, &arg);
-  return reinterpret_cast<Ty&>(storage);
-}
-}  // namespace __sanitizer
-#endif  // SANITIZER_WINDOWS
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win_interception.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_win_interception.cpp
deleted file mode 100644
index 75a1545d00d8b5..00000000000000
--- a/compiler-rt/lib/sanitizer_common/sanitizer_win_interception.cpp
+++ /dev/null
@@ -1,156 +0,0 @@
-//===-- sanitizer_win_interception.cpp --------------------    --*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Windows-specific export surface to provide interception for parts of the
-// runtime that are always statically linked, both for overriding user-defined
-// functions as well as registering weak functions that the ASAN runtime should
-// use over defaults.
-//
-//===----------------------------------------------------------------------===//
-
-#include "sanitizer_platform.h"
-#if SANITIZER_WINDOWS
-#  include <stddef.h>
-
-#  include "interception/interception.h"
-#  include "sanitizer_addrhashmap.h"
-#  include "sanitizer_common.h"
-#  include "sanitizer_internal_defs.h"
-#  include "sanitizer_placement_new.h"
-#  include "sanitizer_win_immortalize.h"
-#  include "sanitizer_win_interception.h"
-
-using namespace __sanitizer;
-
-extern "C" void *__ImageBase;
-
-namespace __sanitizer {
-
-static uptr GetSanitizerDllExport(const char *export_name) {
-  const uptr function_address =
-      __interception::InternalGetProcAddress(&__ImageBase, export_name);
-  if (function_address == 0) {
-    Report("ERROR: Failed to find sanitizer DLL export '%s'\n", export_name);
-    CHECK("Failed to find sanitizer DLL export" && 0);
-  }
-  return function_address;
-}
-
-struct WeakCallbackList {
-  explicit constexpr WeakCallbackList(RegisterWeakFunctionCallback cb)
-      : callback(cb), next(nullptr) {}
-
-  static void *operator new(size_t size) { return InternalAlloc(size); }
-
-  static void operator delete(void *p) { InternalFree(p); }
-
-  RegisterWeakFunctionCallback callback;
-  WeakCallbackList *next;
-};
-using WeakCallbackMap = AddrHashMap<WeakCallbackList *, 11>;
-
-static WeakCallbackMap *GetWeakCallbackMap() {
-  return &immortalize<WeakCallbackMap>();
-}
-
-void AddRegisterWeakFunctionCallback(uptr export_address,
-                                     RegisterWeakFunctionCallback cb) {
-  WeakCallbackMap::Handle h_find_or_create(GetWeakCallbackMap(), export_address,
-                                           false, true);
-  CHECK(h_find_or_create.exists());
-  if (h_find_or_create.created()) {
-    *h_find_or_create = new WeakCallbackList(cb);
-  } else {
-    (*h_find_or_create)->next = new WeakCallbackList(cb);
-  }
-}
-
-static void RunWeakFunctionCallbacks(uptr export_address) {
-  WeakCallbackMap::Handle h_find(GetWeakCallbackMap(), export_address, false,
-                                 false);
-  if (!h_find.exists()) {
-    return;
-  }
-
-  WeakCallbackList *list = *h_find;
-  do {
-    list->callback();
-  } while ((list = list->next));
-}
-
-}  // namespace __sanitizer
-
-extern "C" __declspec(dllexport) bool __cdecl __sanitizer_override_function(
-    const char *export_name, const uptr user_function,
-    uptr *const old_user_function) {
-  CHECK(export_name);
-  CHECK(user_function);
-
-  const uptr sanitizer_function = GetSanitizerDllExport(export_name);
-
-  const bool function_overridden = __interception::OverrideFunction(
-      user_function, sanitizer_function, old_user_function);
-  if (!function_overridden) {
-    Report(
-        "ERROR: Failed to override local function at '%p' with sanitizer "
-        "function '%s'\n",
-        user_function, export_name);
-    CHECK("Failed to replace local function with sanitizer version." && 0);
-  }
-
-  return function_overridden;
-}
-
-extern "C"
-    __declspec(dllexport) bool __cdecl __sanitizer_override_function_by_addr(
-        const uptr source_function, const uptr target_function,
-        uptr *const old_target_function) {
-  CHECK(source_function);
-  CHECK(target_function);
-
-  const bool function_overridden = __interception::OverrideFunction(
-      target_function, source_function, old_target_function);
-  if (!function_overridden) {
-    Report(
-        "ERROR: Failed to override function at '%p' with function at "
-        "'%p'\n",
-        target_function, source_function);
-    CHECK("Failed to apply function override." && 0);
-  }
-
-  return function_overridden;
-}
-
-extern "C"
-    __declspec(dllexport) bool __cdecl __sanitizer_register_weak_function(
-        const char *export_name, const uptr user_function,
-        uptr *const old_user_function) {
-  CHECK(export_name);
-  CHECK(user_function);
-
-  const uptr sanitizer_function = GetSanitizerDllExport(export_name);
-
-  const bool function_overridden = __interception::OverrideFunction(
-      sanitizer_function, user_function, old_user_function);
-  if (!function_overridden) {
-    Report(
-        "ERROR: Failed to register local function at '%p' to be used in "
-        "place of sanitizer function '%s'\n.",
-        user_function, export_name);
-    CHECK("Failed to register weak function." && 0);
-  }
-
-  // Note that thread-safety of RunWeakFunctionCallbacks in InitializeFlags
-  // depends on __sanitizer_register_weak_functions being called during the
-  // loader lock.
-  RunWeakFunctionCallbacks(sanitizer_function);
-
-  return function_overridden;
-}
-
-#endif  // SANITIZER_WINDOWS
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win_interception.h b/compiler-rt/lib/sanitizer_common/sanitizer_win_interception.h
deleted file mode 100644
index 70ae3d6bf31f2a..00000000000000
--- a/compiler-rt/lib/sanitizer_common/sanitizer_win_interception.h
+++ /dev/null
@@ -1,32 +0,0 @@
-//===-- sanitizer_win_interception.h ----------------------    --*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Windows-specific export surface to provide interception for parts of the
-// runtime that are always statically linked, both for overriding user-defined
-// functions as well as registering weak functions that the ASAN runtime should
-// use over defaults.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef SANITIZER_WIN_INTERCEPTION_H
-#define SANITIZER_WIN_INTERCEPTION_H
-
-#include "sanitizer_platform.h"
-#if SANITIZER_WINDOWS
-
-#  include "sanitizer_common.h"
-#  include "sanitizer_internal_defs.h"
-
-namespace __sanitizer {
-using RegisterWeakFunctionCallback = void (*)();
-void AddRegisterWeakFunctionCallback(uptr export_address,
-                                     RegisterWeakFunctionCallback cb);
-}  // namespace __sanitizer
-
-#endif  // SANITIZER_WINDOWS
-#endif  // SANITIZER_WIN_INTERCEPTION_H
\ No newline at end of file
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win_thunk_interception.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_win_thunk_interception.cpp
deleted file mode 100644
index 13db8869abadd5..00000000000000
--- a/compiler-rt/lib/sanitizer_common/sanitizer_win_thunk_interception.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-//===-- sanitizer_win_thunk_interception.cpp -----------------------  -----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines things that need to be present in the application modules
-// to interact with sanitizer DLL correctly and cannot be implemented using the
-// default "import library" generated when linking the DLL.
-//
-// This includes the common infrastructure required to intercept local functions
-// that must be replaced with sanitizer-aware versions, as well as the
-// registration of weak functions with the sanitizer DLL. With this in-place,
-// other sanitizer components can simply write to the .INTR and .WEAK sections.
-//
-//===----------------------------------------------------------------------===//
-
-#if defined(SANITIZER_STATIC_RUNTIME_THUNK) || \
-    defined(SANITIZER_DYNAMIC_RUNTIME_THUNK)
-#  include "sanitizer_win_thunk_interception.h"
-
-extern "C" void abort();
-
-namespace __sanitizer {
-
-int override_function(const char *export_name, const uptr user_function) {
-  if (!__sanitizer_override_function(export_name, user_function)) {
-    abort();
-  }
-
-  return 0;
-}
-
-int register_weak(const char *export_name, const uptr user_function) {
-  if (!__sanitizer_register_weak_function(export_name, user_function)) {
-    abort();
-  }
-
-  return 0;
-}
-
-void initialize_thunks(const sanitizer_thunk *first,
-                       const sanitizer_thunk *last) {
-  for (const sanitizer_thunk *it = first; it < last; ++it) {
-    if (*it) {
-      (*it)();
-    }
-  }
-}
-}  // namespace __sanitizer
-
-#  define INTERFACE_FUNCTION(Name)
-#  define INTERFACE_WEAK_FUNCTION(Name) REGISTER_WEAK_FUNCTION(Name)
-#  include "sanitizer_common_interface.inc"
-
-#  pragma section(".INTR$A", read)  // intercept begin
-#  pragma section(".INTR$Z", read)  // intercept end
-#  pragma section(".WEAK$A", read)  // weak begin
-#  pragma section(".WEAK$Z", read)  // weak end
-
-extern "C" {
-__declspec(allocate(
-    ".INTR$A")) sanitizer_thunk __sanitizer_intercept_thunk_begin;
-__declspec(allocate(".INTR$Z")) sanitizer_thunk __sanitizer_intercept_thunk_end;
-
-__declspec(allocate(
-    ".WEAK$A")) sanitizer_thunk __sanitizer_register_weak_thunk_begin;
-__declspec(allocate(
-    ".WEAK$Z")) sanitizer_thunk __sanitizer_register_weak_thunk_end;
-}
-
-extern "C" int __sanitizer_thunk_init() {
-  // __sanitizer_static_thunk_init is expected to be called by only one thread.
-  static bool flag = false;
-  if (flag) {
-    return 0;
-  }
-  flag = true;
-
-  __sanitizer::initialize_thunks(&__sanitizer_intercept_thunk_begin,
-                                 &__sanitizer_intercept_thunk_end);
-  __sanitizer::initialize_thunks(&__sanitizer_register_weak_thunk_begin,
-                                 &__sanitizer_register_weak_thunk_end);
-
-  // In DLLs, the callbacks are expected to return 0,
-  // otherwise CRT initialization fails.
-  return 0;
-}
-
-// We want to call dll_thunk_init before C/C++ initializers / constructors are
-// executed, otherwise functions like memset might be invoked.
-#  pragma section(".CRT$XIB", long, read)
-__declspec(allocate(".CRT$XIB")) int (*__sanitizer_thunk_init_ptr)() =
-    __sanitizer_thunk_init;
-
-static void WINAPI sanitizer_thunk_thread_init(void *mod, unsigned long reason,
-                                               void *reserved) {
-  if (reason == /*DLL_PROCESS_ATTACH=*/1)
-    __sanitizer_thunk_init();
-}
-
-#  pragma section(".CRT$XLAB", long, read)
-__declspec(allocate(".CRT$XLAB")) void(
-    WINAPI *__sanitizer_thunk_thread_init_ptr)(void *, unsigned long, void *) =
-    sanitizer_thunk_thread_init;
-
-#endif  // defined(SANITIZER_STATIC_RUNTIME_THUNK) ||
-        // defined(SANITIZER_DYNAMIC_RUNTIME_THUNK)
\ No newline at end of file
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win_thunk_interception.h b/compiler-rt/lib/sanitizer_common/sanitizer_win_thunk_interception.h
deleted file mode 100644
index 70177d68aa8e65..00000000000000
--- a/compiler-rt/lib/sanitizer_common/sanitizer_win_thunk_interception.h
+++ /dev/null
@@ -1,81 +0,0 @@
-//===-- sanitizer_win_thunk_interception.h -------------------------  -----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// This header provide helper macros and functions to delegate calls to the
-// shared runtime that lives in the sanitizer DLL.
-//===----------------------------------------------------------------------===//
-
-#ifndef SANITIZER_WIN_THUNK_INTERCEPTION_H
-#define SANITIZER_WIN_THUNK_INTERCEPTION_H
-#include <stdint.h>
-
-#include "sanitizer_internal_defs.h"
-
-extern "C" {
-__declspec(dllimport) bool __cdecl __sanitizer_override_function(
-    const char *export_name, __sanitizer::uptr user_function,
-    __sanitizer::uptr *old_function = nullptr);
-__declspec(dllimport) bool __cdecl __sanitizer_override_function_by_addr(
-    __sanitizer::uptr source_function, __sanitizer::uptr target_function,
-    __sanitizer::uptr *old_target_function = nullptr);
-__declspec(dllimport) bool __cdecl __sanitizer_register_weak_function(
-    const char *export_name, __sanitizer::uptr user_function,
-    __sanitizer::uptr *old_function = nullptr);
-}
-
-using sanitizer_thunk = int (*)();
-
-namespace __sanitizer {
-int override_function(const char *export_name, uptr user_function);
-int register_weak(const char *export_name, uptr user_function);
-void initialize_thunks(const sanitizer_thunk *begin,
-                       const sanitizer_thunk *end);
-}  // namespace __sanitizer
-
-// -------------------- Function interception macros ------------------------ //
-// We can't define our own version of strlen etc. because that would lead to
-// link-time or even type mismatch errors.  Instead, we can declare a function
-// just to be able to get its address.  Me may miss the first few calls to the
-// functions since it can be called before __dll_thunk_init, but that would lead
-// to false negatives in the startup code before user's global initializers,
-// which isn't a big deal.
-// Use .INTR segment to register function pointers that are iterated over during
-// startup that will replace local_function with sanitizer_export.
-
-#define INTERCEPT_LIBRARY_FUNCTION(local_function, sanitizer_export)   \
-  extern "C" void local_function();                                    \
-  static int intercept_##local_function() {                            \
-    return __sanitizer::override_function(                             \
-        sanitizer_export,                                              \
-        reinterpret_cast<__sanitizer::uptr>(local_function));          \
-  }                                                                    \
-  __pragma(section(".INTR$M", long, read)) __declspec(allocate(        \
-      ".INTR$M")) int (*__sanitizer_static_thunk_##local_function)() = \
-      intercept_##local_function;
-
-// ------------------ Weak symbol registration macros ---------------------- //
-// Use .WEAK segment to register function pointers that are iterated over during
-// startup that will replace sanitizer_export with local_function
-
-#define REGISTER_WEAK_FUNCTION(local_function)                           \
-  extern "C" void local_function();                                      \
-  extern "C" void WEAK_EXPORT_NAME(local_function)();                    \
-  WIN_WEAK_IMPORT_DEF(local_function)                                    \
-  __attribute__((optnone)) static int register_weak_##local_function() { \
-    if ((uintptr_t) & local_function != (uintptr_t) &                    \
-        WEAK_EXPORT_NAME(local_function)) {                              \
-      return __sanitizer::register_weak(                                 \
-          SANITIZER_STRINGIFY(WEAK_EXPORT_NAME(local_function)),         \
-          reinterpret_cast<__sanitizer::uptr>(local_function));          \
-    }                                                                    \
-    return 0;                                                            \
-  }                                                                      \
-  __pragma(section(".WEAK$M", long, read)) __declspec(allocate(          \
-      ".WEAK$M")) int (*__sanitizer_register_weak_##local_function)() =  \
-      register_weak_##local_function;
-
-#endif  // SANITIZER_WIN_STATIC_RUNTIME_THUNK_H
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win_weak_interception.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_win_weak_interception.cpp
new file mode 100644
index 00000000000000..b14bbf76d9a765
--- /dev/null
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_win_weak_interception.cpp
@@ -0,0 +1,94 @@
+//===-- sanitizer_win_weak_interception.cpp -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This module should be included in the sanitizer when it is implemented as a
+// shared library on Windows (dll), in order to delegate the calls of weak
+// functions to the implementation in the main executable when a strong
+// definition is provided.
+//===----------------------------------------------------------------------===//
+
+#include "sanitizer_platform.h"
+#if SANITIZER_WINDOWS && SANITIZER_DYNAMIC
+#include "sanitizer_win_weak_interception.h"
+#include "sanitizer_allocator_interface.h"
+#include "sanitizer_interface_internal.h"
+#include "sanitizer_win_defs.h"
+#include "interception/interception.h"
+
+extern "C" {
+void *WINAPI GetModuleHandleA(const char *module_name);
+void abort();
+}
+
+namespace __sanitizer {
+// Try to get a pointer to real_function in the main module and override
+// dll_function with that pointer. If the function isn't found, nothing changes.
+int interceptWhenPossible(uptr dll_function, const char *real_function) {
+  uptr real = __interception::InternalGetProcAddress(
+      (void *)GetModuleHandleA(0), real_function);
+  if (real && !__interception::OverrideFunction((uptr)dll_function, real, 0))
+    abort();
+  return 0;
+}
+} // namespace __sanitizer
+
+// Declare weak hooks.
+extern "C" {
+void __sanitizer_on_print(const char *str);
+void __sanitizer_weak_hook_memcmp(uptr called_pc, const void *s1,
+                                  const void *s2, uptr n, int result);
+void __sanitizer_weak_hook_strcmp(uptr called_pc, const char *s1,
+                                  const char *s2, int result);
+void __sanitizer_weak_hook_strncmp(uptr called_pc, const char *s1,
+                                   const char *s2, uptr n, int result);
+void __sanitizer_weak_hook_strstr(uptr called_pc, const char *s1,
+                                  const char *s2, char *result);
+}
+
+// Include Sanitizer Common interface.
+#define INTERFACE_FUNCTION(Name)
+#define INTERFACE_WEAK_FUNCTION(Name) INTERCEPT_SANITIZER_WEAK_FUNCTION(Name)
+#include "sanitizer_common_interface.inc"
+
+#pragma section(".WEAK$A", read)
+#pragma section(".WEAK$Z", read)
+
+typedef void (*InterceptCB)();
+extern "C" {
+__declspec(allocate(".WEAK$A")) InterceptCB __start_weak_list;
+__declspec(allocate(".WEAK$Z")) InterceptCB __stop_weak_list;
+}
+
+static int weak_intercept_init() {
+  static bool flag = false;
+  // weak_intercept_init is expected to be called by only one thread.
+  if (flag) return 0;
+  flag = true;
+
+  for (InterceptCB *it = &__start_weak_list; it < &__stop_weak_list; ++it)
+    if (*it)
+      (*it)();
+
+  // In DLLs, the callbacks are expected to return 0,
+  // otherwise CRT initialization fails.
+  return 0;
+}
+
+#pragma section(".CRT$XIB", long, read)
+__declspec(allocate(".CRT$XIB")) int (*__weak_intercept_preinit)() =
+    weak_intercept_init;
+
+static void WINAPI weak_intercept_thread_init(void *mod, unsigned long reason,
+                                              void *reserved) {
+  if (reason == /*DLL_PROCESS_ATTACH=*/1) weak_intercept_init();
+}
+
+#pragma section(".CRT$XLAB", long, read)
+__declspec(allocate(".CRT$XLAB")) void(WINAPI *__weak_intercept_tls_init)(
+    void *, unsigned long, void *) = weak_intercept_thread_init;
+
+#endif // SANITIZER_WINDOWS && SANITIZER_DYNAMIC
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win_weak_interception.h b/compiler-rt/lib/sanitizer_common/sanitizer_win_weak_interception.h
new file mode 100644
index 00000000000000..5e4d8b8def3e7d
--- /dev/null
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_win_weak_interception.h
@@ -0,0 +1,32 @@
+//===-- sanitizer_win_weak_interception.h ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This header provides helper macros to delegate calls of weak functions to the
+// implementation in the main executable when a strong definition is present.
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_WIN_WEAK_INTERCEPTION_H
+#define SANITIZER_WIN_WEAK_INTERCEPTION_H
+#include "sanitizer_internal_defs.h"
+
+namespace __sanitizer {
+int interceptWhenPossible(uptr dll_function, const char *real_function);
+}
+
+// ----------------- Function interception helper macros -------------------- //
+// Weak functions could be redefined in the main executable, but that is not
+// necessary, so we shouldn't die if we cannot find a reference.
+#define INTERCEPT_WEAK(Name) interceptWhenPossible((uptr) Name, #Name);
+
+#define INTERCEPT_SANITIZER_WEAK_FUNCTION(Name)                                \
+  static int intercept_##Name() {                                              \
+    return __sanitizer::interceptWhenPossible((__sanitizer::uptr) Name, #Name);\
+  }                                                                            \
+  __pragma(section(".WEAK$M", long, read))                                     \
+  __declspec(allocate(".WEAK$M")) int (*__weak_intercept_##Name)() =           \
+      intercept_##Name;
+
+#endif // SANITIZER_WIN_WEAK_INTERCEPTION_H
diff --git a/compiler-rt/lib/ubsan/CMakeLists.txt b/compiler-rt/lib/ubsan/CMakeLists.txt
index 5d45a53d02dbd3..db0b33f1276ef2 100644
--- a/compiler-rt/lib/ubsan/CMakeLists.txt
+++ b/compiler-rt/lib/ubsan/CMakeLists.txt
@@ -159,12 +159,33 @@ else()
     CFLAGS ${UBSAN_CXXFLAGS})
 
   if (WIN32)
-    set(RUNTIME_THUNK_CFLAGS -DSANITIZER_DYNAMIC_RUNTIME_THUNK -DSANITIZER_STATIC_RUNTIME_THUNK)
-    add_compiler_rt_object_libraries(UbsanRuntimeThunk
+    add_compiler_rt_object_libraries(UbsanWeakInterception
       ${SANITIZER_COMMON_SUPPORTED_OS}
       ARCHS ${UBSAN_SUPPORTED_ARCH}
       SOURCES
-        ubsan_win_runtime_thunk.cpp
+        ubsan_win_weak_interception.cpp
+      CFLAGS ${UBSAN_CFLAGS} -DSANITIZER_DYNAMIC
+      DEFS ${UBSAN_COMMON_DEFINITIONS})
+
+    add_compiler_rt_object_libraries(UbsanDllThunk
+      ${SANITIZER_COMMON_SUPPORTED_OS}
+      ARCHS ${UBSAN_SUPPORTED_ARCH}
+      SOURCES
+        ubsan_win_dll_thunk.cpp
+      CFLAGS ${UBSAN_CFLAGS} -DSANITIZER_DLL_THUNK
+      DEFS ${UBSAN_COMMON_DEFINITIONS})
+
+    set(DYNAMIC_RUNTIME_THUNK_CFLAGS "-DSANITIZER_DYNAMIC_RUNTIME_THUNK")
+    if(MSVC)
+      list(APPEND DYNAMIC_RUNTIME_THUNK_CFLAGS "-Zl")
+    elseif(CMAKE_C_COMPILER_ID MATCHES Clang)
+      list(APPEND DYNAMIC_RUNTIME_THUNK_CFLAGS "-nodefaultlibs")
+    endif()
+    add_compiler_rt_object_libraries(UbsanDynamicRuntimeThunk
+      ${SANITIZER_COMMON_SUPPORTED_OS}
+      ARCHS ${UBSAN_SUPPORTED_ARCH}
+      SOURCES
+        ubsan_win_dynamic_runtime_thunk.cpp
       CFLAGS ${UBSAN_CFLAGS} ${DYNAMIC_RUNTIME_THUNK_CFLAGS}
       DEFS ${UBSAN_COMMON_DEFINITIONS})
   endif()
diff --git a/compiler-rt/lib/ubsan/ubsan_win_dll_thunk.cpp b/compiler-rt/lib/ubsan/ubsan_win_dll_thunk.cpp
new file mode 100644
index 00000000000000..5ac7fc3e08e4c7
--- /dev/null
+++ b/compiler-rt/lib/ubsan/ubsan_win_dll_thunk.cpp
@@ -0,0 +1,20 @@
+//===-- ubsan_win_dll_thunk.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a family of thunks that should be statically linked into
+// the DLLs that have instrumentation in order to delegate the calls to the
+// shared runtime that lives in the main binary.
+// See https://github.com/google/sanitizers/issues/209 for the details.
+//===----------------------------------------------------------------------===//
+#ifdef SANITIZER_DLL_THUNK
+#include "sanitizer_common/sanitizer_win_dll_thunk.h"
+// Ubsan interface functions.
+#define INTERFACE_FUNCTION(Name) INTERCEPT_SANITIZER_FUNCTION(Name)
+#define INTERFACE_WEAK_FUNCTION(Name) INTERCEPT_SANITIZER_WEAK_FUNCTION(Name)
+#include "ubsan_interface.inc"
+#endif // SANITIZER_DLL_THUNK
diff --git a/compiler-rt/lib/ubsan/ubsan_win_runtime_thunk.cpp b/compiler-rt/lib/ubsan/ubsan_win_dynamic_runtime_thunk.cpp
similarity index 62%
rename from compiler-rt/lib/ubsan/ubsan_win_runtime_thunk.cpp
rename to compiler-rt/lib/ubsan/ubsan_win_dynamic_runtime_thunk.cpp
index 5ca7d6f385cf27..00722b4033a53f 100644
--- a/compiler-rt/lib/ubsan/ubsan_win_runtime_thunk.cpp
+++ b/compiler-rt/lib/ubsan/ubsan_win_dynamic_runtime_thunk.cpp
@@ -1,4 +1,4 @@
-//===-- ubsan_win_runtime_thunk.cpp -----------------------------        --===//
+//===-- ubsan_win_dynamic_runtime_thunk.cpp -------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,14 +10,11 @@
 // to interact with Ubsan, when it is included in a dll.
 //
 //===----------------------------------------------------------------------===//
-#if defined(SANITIZER_DYNAMIC_RUNTIME_THUNK) ||                                \
-    defined(SANITIZER_STATIC_RUNTIME_THUNK)
+#ifdef SANITIZER_DYNAMIC_RUNTIME_THUNK
 #define SANITIZER_IMPORT_INTERFACE 1
 #include "sanitizer_common/sanitizer_win_defs.h"
-#include "sanitizer_common/sanitizer_win_thunk_interception.h"
 // Define weak alias for all weak functions imported from ubsan.
 #define INTERFACE_FUNCTION(Name)
-#define INTERFACE_WEAK_FUNCTION(Name) REGISTER_WEAK_FUNCTION(Name)
+#define INTERFACE_WEAK_FUNCTION(Name) WIN_WEAK_IMPORT_DEF(Name)
 #include "ubsan_interface.inc"
-#endif // defined(SANITIZER_DYNAMIC_RUNTIME_THUNK) ||
-       // defined(SANITIZER_STATIC_RUNTIME_THUNK)
+#endif // SANITIZER_DYNAMIC_RUNTIME_THUNK
diff --git a/compiler-rt/lib/ubsan/ubsan_win_weak_interception.cpp b/compiler-rt/lib/ubsan/ubsan_win_weak_interception.cpp
new file mode 100644
index 00000000000000..01db0c0ce78abe
--- /dev/null
+++ b/compiler-rt/lib/ubsan/ubsan_win_weak_interception.cpp
@@ -0,0 +1,23 @@
+//===-- ubsan_win_weak_interception.cpp -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This module should be included in Ubsan when it is implemented as a shared
+// library on Windows (dll), in order to delegate the calls of weak functions to
+// the implementation in the main executable when a strong definition is
+// provided.
+//===----------------------------------------------------------------------===//
+#ifdef SANITIZER_DYNAMIC
+#include "sanitizer_common/sanitizer_win_weak_interception.h"
+#include "ubsan_flags.h"
+#include "ubsan_monitor.h"
+// Check if strong definitions for weak functions are present in the main
+// executable. If that is the case, override dll functions to point to strong
+// implementations.
+#define INTERFACE_FUNCTION(Name)
+#define INTERFACE_WEAK_FUNCTION(Name) INTERCEPT_SANITIZER_WEAK_FUNCTION(Name)
+#include "ubsan_interface.inc"
+#endif // SANITIZER_DYNAMIC
diff --git a/compiler-rt/test/asan/TestCases/Darwin/interface_symbols_darwin.cpp b/compiler-rt/test/asan/TestCases/Darwin/interface_symbols_darwin.cpp
index 59dca32672901a..2cedbc722c4635 100644
--- a/compiler-rt/test/asan/TestCases/Darwin/interface_symbols_darwin.cpp
+++ b/compiler-rt/test/asan/TestCases/Darwin/interface_symbols_darwin.cpp
@@ -35,9 +35,6 @@
 // RUN:  %p/../../../../lib/sanitizer_common/sanitizer_coverage_interface.inc     \
 // RUN:  | grep -e "INTERFACE_\(WEAK_\)\?FUNCTION"                                \
 // RUN:  | grep -v "__sanitizer_weak_hook"                                        \
-// RUN:  | grep -v "__sanitizer_override_function"                                \
-// RUN:  | grep -v "__sanitizer_override_function_by_addr"                        \
-// RUN:  | grep -v "__sanitizer_register_weak_function"                           \
 // RUN:  | sed -e "s/.*(//" -e "s/).*//" > %t.imports
 //
 // RUN: cat %t.imports | sort | uniq > %t.imports-sorted
diff --git a/compiler-rt/test/asan/TestCases/Linux/interface_symbols_linux.cpp b/compiler-rt/test/asan/TestCases/Linux/interface_symbols_linux.cpp
index 2d729497548d90..ce1255c9578317 100644
--- a/compiler-rt/test/asan/TestCases/Linux/interface_symbols_linux.cpp
+++ b/compiler-rt/test/asan/TestCases/Linux/interface_symbols_linux.cpp
@@ -21,9 +21,6 @@
 // RUN:  %p/../../../../lib/sanitizer_common/sanitizer_coverage_interface.inc      \
 // RUN:  | grep -e "INTERFACE_\(WEAK_\)\?FUNCTION"                                 \
 // RUN:  | grep -v "__sanitizer_weak_hook"                                         \
-// RUN:  | grep -v "__sanitizer_override_function"                                 \
-// RUN:  | grep -v "__sanitizer_override_function_by_addr"                         \
-// RUN:  | grep -v "__sanitizer_register_weak_function"                            \
 // RUN:  | sed -e "s/.*(//" -e "s/).*//" > %t.imports
 //
 // RUN: cat %t.imports | sort | uniq > %t.imports-sorted
diff --git a/compiler-rt/test/asan/TestCases/Windows/double_free.cpp b/compiler-rt/test/asan/TestCases/Windows/double_free.cpp
index 71c45e7e889a22..e288b40fac47a3 100644
--- a/compiler-rt/test/asan/TestCases/Windows/double_free.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/double_free.cpp
@@ -9,13 +9,13 @@ int main() {
   free(x);
   // CHECK: AddressSanitizer: attempting double-free on [[ADDR:0x[0-9a-f]+]]
   // CHECK-NEXT: {{#0 .* free }}
-  // CHECK: {{ #[1-3] .* main .*double_free.cpp}}:[[@LINE-3]]
+  // CHECK: {{ #[1-2] .* main .*double_free.cpp}}:[[@LINE-3]]
   // CHECK: [[ADDR]] is located 0 bytes inside of 168-byte region
   // CHECK-LABEL: freed by thread T0 here:
   // CHECK-NEXT: {{#0 .* free }}
-  // CHECK: {{ #[1-3] .* main .*double_free.cpp}}:[[@LINE-8]]
+  // CHECK: {{ #[1-2] .* main .*double_free.cpp}}:[[@LINE-8]]
   // CHECK-LABEL: previously allocated by thread T0 here:
   // CHECK-NEXT: {{#0 .* malloc }}
-  // CHECK: {{ #[1-3] .* main .*double_free.cpp}}:[[@LINE-12]]
+  // CHECK: {{ #[1-2] .* main .*double_free.cpp}}:[[@LINE-12]]
   return 0;
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/free_hook_realloc.cpp b/compiler-rt/test/asan/TestCases/Windows/free_hook_realloc.cpp
index 297218bf8e99f1..11e8c9975cf3bf 100644
--- a/compiler-rt/test/asan/TestCases/Windows/free_hook_realloc.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/free_hook_realloc.cpp
@@ -5,6 +5,9 @@
 // FIXME: merge this with the common free_hook_realloc test when we can run
 // common tests on Windows.
 
+// FIXME: Doesn't work with DLLs
+// XFAIL: win32-dynamic-asan
+
 #include <stdlib.h>
 #include <io.h>
 #include <sanitizer/allocator_interface.h>
diff --git a/compiler-rt/test/asan/TestCases/Windows/malloc_left_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/malloc_left_oob.cpp
index e5de2269ffee04..7ea95d2b2184a0 100644
--- a/compiler-rt/test/asan/TestCases/Windows/malloc_left_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/malloc_left_oob.cpp
@@ -12,6 +12,6 @@ int main() {
   // CHECK: [[ADDR]] is located 1 bytes before 42-byte region
   // CHECK: allocated by thread T0 here:
   // CHECK-NEXT: {{#0 .* malloc }}
-  // CHECK: {{ #[1-3] .* main .*malloc_left_oob.cpp}}:[[@LINE-8]]
+  // CHECK: {{ #[1-2] .* main .*malloc_left_oob.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/malloc_right_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/malloc_right_oob.cpp
index 6007345755d88e..1495632456e081 100644
--- a/compiler-rt/test/asan/TestCases/Windows/malloc_right_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/malloc_right_oob.cpp
@@ -12,6 +12,6 @@ int main() {
   // CHECK: [[ADDR]] is located 0 bytes after 42-byte region
   // CHECK: allocated by thread T0 here:
   // CHECK-NEXT: {{#0 .* malloc }}
-  // CHECK: {{ #[1-3] .* main .*malloc_right_oob.cpp}}:[[@LINE-8]]
+  // CHECK: {{ #[1-2] .* main .*malloc_right_oob.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/malloc_uaf.cpp b/compiler-rt/test/asan/TestCases/Windows/malloc_uaf.cpp
index 59a944c75b60db..d1eac7e55f6010 100644
--- a/compiler-rt/test/asan/TestCases/Windows/malloc_uaf.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/malloc_uaf.cpp
@@ -13,8 +13,8 @@ int main() {
   // CHECK: [[ADDR]] is located 0 bytes inside of 42-byte region
   // CHECK: freed by thread T0 here:
   // CHECK-NEXT: {{#0 .* free }}
-  // CHECK: {{ #[1-3] .* main .*malloc_uaf.cpp}}:[[@LINE-8]]
+  // CHECK: {{ #[1-2] .* main .*malloc_uaf.cpp}}:[[@LINE-8]]
   // CHECK: previously allocated by thread T0 here:
   // CHECK-NEXT: {{#0 .* malloc }}
-  // CHECK: {{ #[1-3] .* main .*malloc_uaf.cpp}}:[[@LINE-12]]
+  // CHECK: {{ #[1-2] .* main .*malloc_uaf.cpp}}:[[@LINE-12]]
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/msvc/dll_and_lib.cpp b/compiler-rt/test/asan/TestCases/Windows/msvc/dll_and_lib.cpp
index 175bdefa7c995d..96fae6b1d60392 100644
--- a/compiler-rt/test/asan/TestCases/Windows/msvc/dll_and_lib.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/msvc/dll_and_lib.cpp
@@ -1,9 +1,10 @@
 // Just make sure we can link an implib into another DLL
 // This used to fail between r212699 and r212814.
 // RUN: %clang_cl_asan -DCONFIG=1 %s -c -Fo%t.1.obj
-// RUN: lld-link /nologo /DLL /OUT:%t.1.dll %t.1.obj %asan_lib %asan_thunk
+// RUN: lld-link /nologo /DLL /OUT:%t.1.dll %t.1.obj %asan_dll_thunk
 // RUN: %clang_cl_asan -DCONFIG=2 %s -c -Fo%t.2.obj
-// RUN: lld-link /nologo /DLL /OUT:%t.2.dll %t.2.obj %t.1.lib %asan_lib %asan_thunk
+// RUN: lld-link /nologo /DLL /OUT:%t.2.dll %t.2.obj %t.1.lib %asan_dll_thunk
+// REQUIRES: asan-static-runtime
 // REQUIRES: lld-available
 
 #if CONFIG==1
diff --git a/compiler-rt/test/asan/TestCases/Windows/msvc/dll_large_function.cpp b/compiler-rt/test/asan/TestCases/Windows/msvc/dll_large_function.cpp
index f0c3deabbcf970..788488dbb8ed82 100644
--- a/compiler-rt/test/asan/TestCases/Windows/msvc/dll_large_function.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/msvc/dll_large_function.cpp
@@ -3,7 +3,8 @@
 // from the DLL.  We simulate the large function with
 // -mllvm -asan-instrumentation-with-call-threshold=0.
 // RUN: %clang_cl_asan %s -c -Fo%t.obj -mllvm -asan-instrumentation-with-call-threshold=0
-// RUN: lld-link /nologo /DLL /OUT:%t.dll %t.obj  %asan_lib %asan_thunk
+// RUN: lld-link /nologo /DLL /OUT:%t.dll %t.obj %asan_dll_thunk
+// REQUIRES: asan-static-runtime
 // REQUIRES: lld-available
 
 void f(long* foo, long* bar) {
diff --git a/compiler-rt/test/asan/TestCases/Windows/realloc_left_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/realloc_left_oob.cpp
index f1fd139c582511..ebde5f159ae383 100644
--- a/compiler-rt/test/asan/TestCases/Windows/realloc_left_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/realloc_left_oob.cpp
@@ -12,6 +12,6 @@ int main() {
   // CHECK: [[ADDR]] is located 1 bytes before 42-byte region
   // CHECK: allocated by thread T0 here:
   // CHECK-NEXT: {{#0 .* realloc }}
-  // CHECK: {{ #[1-3] .* main .*realloc_left_oob.cpp}}:[[@LINE-8]]
+  // CHECK: {{ #[1-2] .* main .*realloc_left_oob.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/realloc_right_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/realloc_right_oob.cpp
index ea674f53def793..281efed5d30740 100644
--- a/compiler-rt/test/asan/TestCases/Windows/realloc_right_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/realloc_right_oob.cpp
@@ -12,6 +12,6 @@ int main() {
   // CHECK: [[ADDR]] is located 0 bytes after 42-byte region
   // CHECK: allocated by thread T0 here:
   // CHECK-NEXT: {{#0 .* realloc }}
-  // CHECK: {{ #[1-3] .* main .*realloc_right_oob.cpp}}:[[@LINE-8]]
+  // CHECK: {{ #[1-2] .* main .*realloc_right_oob.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/realloc_uaf.cpp b/compiler-rt/test/asan/TestCases/Windows/realloc_uaf.cpp
index 7d9c41ef0f4621..6ff2217b11a257 100644
--- a/compiler-rt/test/asan/TestCases/Windows/realloc_uaf.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/realloc_uaf.cpp
@@ -13,8 +13,8 @@ int main() {
   // CHECK: [[ADDR]] is located 0 bytes inside of 42-byte region
   // CHECK: freed by thread T0 here:
   // CHECK-NEXT: {{#0 .* free }}
-  // CHECK: {{ #[1-3] .* main .*realloc_uaf.cpp}}:[[@LINE-8]]
+  // CHECK: {{ #[1-2] .* main .*realloc_uaf.cpp}}:[[@LINE-8]]
   // CHECK: previously allocated by thread T0 here:
   // CHECK-NEXT: {{#0 .* realloc }}
-  // CHECK: {{ #[1-3] .* main .*realloc_uaf.cpp}}:[[@LINE-12]]
+  // CHECK: {{ #[1-2] .* main .*realloc_uaf.cpp}}:[[@LINE-12]]
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/symbols_path.cpp b/compiler-rt/test/asan/TestCases/Windows/symbols_path.cpp
index 05437abc07c829..be99c89e7083ef 100644
--- a/compiler-rt/test/asan/TestCases/Windows/symbols_path.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/symbols_path.cpp
@@ -17,6 +17,6 @@ int main() {
   // CHECK: [[ADDR]] is located 1 bytes before 42-byte region
   // CHECK: allocated by thread T0 here:
   // CHECK-NEXT: {{#0 .* malloc}}
-  // CHECK: {{ #[1-3] .* main .*symbols_path.cpp}}:[[@LINE-8]]
+  // CHECK: {{ #[1-2] .* main .*symbols_path.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/unsymbolized.cpp b/compiler-rt/test/asan/TestCases/Windows/unsymbolized.cpp
index 00428b809fccd7..0eb1e9ee91b0a7 100644
--- a/compiler-rt/test/asan/TestCases/Windows/unsymbolized.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/unsymbolized.cpp
@@ -4,7 +4,7 @@
 
 // RUN: rm -f %t.pdb
 // RUN: %clangxx_asan -c -O2 %s -o %t.obj
-// RUN: lld-link /nologo /OUT:%t.exe %t.obj -defaultlib:libcmt -nodefaultlib:msvcrt -defaultlib:oldnames %asan_static_runtime_thunk %asan_lib
+// RUN: lld-link /nologo /OUT:%t.exe %t.obj %asan_lib %asan_cxx_lib
 // RUN: not %run %t.exe 2>&1 | FileCheck %s
 // REQUIRES: lld-available
 
diff --git a/compiler-rt/test/asan/TestCases/Windows/use_after_realloc.cpp b/compiler-rt/test/asan/TestCases/Windows/use_after_realloc.cpp
index 35947b3253857c..4c32c63c38fa1f 100644
--- a/compiler-rt/test/asan/TestCases/Windows/use_after_realloc.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/use_after_realloc.cpp
@@ -15,9 +15,9 @@ int main() {
   // CHECK: [[ADDR]] is located 0 bytes inside of 32-byte region
   // CHECK: freed by thread T0 here:
   // CHECK-NEXT: {{#0 .* realloc }}
-  // CHECK: {{ #[1-3] .* main .*use_after_realloc.cpp}}:[[@LINE-9]]
+  // CHECK: {{ #[1-2] .* main .*use_after_realloc.cpp}}:[[@LINE-9]]
   // CHECK: previously allocated by thread T0 here:
   // CHECK-NEXT: {{#0 .* realloc }}
-  // CHECK: {{ #[1-3] .* main .*use_after_realloc.cpp}}:[[@LINE-14]]
+  // CHECK: {{ #[1-2] .* main .*use_after_realloc.cpp}}:[[@LINE-14]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/debug_double_free.cpp b/compiler-rt/test/asan/TestCases/debug_double_free.cpp
index 8a2ce40bc561f6..de5ac7b0c8d5cd 100644
--- a/compiler-rt/test/asan/TestCases/debug_double_free.cpp
+++ b/compiler-rt/test/asan/TestCases/debug_double_free.cpp
@@ -4,6 +4,9 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+// FIXME: Doesn't work with DLLs
+// XFAIL: win32-dynamic-asan
+
 // If we use %p with MSVC, it comes out all upper case. Use %08x to get
 // lowercase hex.
 #ifdef _MSC_VER
diff --git a/compiler-rt/test/asan/TestCases/debug_report.cpp b/compiler-rt/test/asan/TestCases/debug_report.cpp
index 855642bdc0d3bb..617b7ee91e18d7 100644
--- a/compiler-rt/test/asan/TestCases/debug_report.cpp
+++ b/compiler-rt/test/asan/TestCases/debug_report.cpp
@@ -6,6 +6,9 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+// FIXME: Doesn't work with DLLs
+// XFAIL: win32-dynamic-asan
+
 int main() {
   // Disable stderr buffering. Needed on Windows.
   setvbuf(stderr, NULL, _IONBF, 0);
diff --git a/compiler-rt/test/asan/TestCases/default_options.cpp b/compiler-rt/test/asan/TestCases/default_options.cpp
index 845e8a5f1793e4..526dab6450e9bd 100644
--- a/compiler-rt/test/asan/TestCases/default_options.cpp
+++ b/compiler-rt/test/asan/TestCases/default_options.cpp
@@ -1,7 +1,11 @@
 // RUN: %clangxx_asan -O2 %s -o %t
 // RUN: %run %t 2>&1 | FileCheck %s
 
+// FIXME: Doesn't work with DLLs
+// XFAIL: win32-dynamic-asan
+
 const char *kAsanDefaultOptions = "verbosity=1 help=1";
+
 // Required for dyld macOS 12.0+
 #if (__APPLE__)
 __attribute__((weak))
diff --git a/compiler-rt/test/asan/TestCases/on_error_callback.cpp b/compiler-rt/test/asan/TestCases/on_error_callback.cpp
index c38a36f0e33bda..f65a8f1abe8310 100644
--- a/compiler-rt/test/asan/TestCases/on_error_callback.cpp
+++ b/compiler-rt/test/asan/TestCases/on_error_callback.cpp
@@ -1,5 +1,8 @@
 // RUN: %clangxx_asan -O2 %s -o %t && not %run %t 2>&1 | FileCheck %s
 
+// FIXME: Doesn't work with DLLs
+// XFAIL: win32-dynamic-asan
+
 #include <stdio.h>
 #include <stdlib.h>
 
diff --git a/compiler-rt/test/asan/TestCases/report_error_summary.cpp b/compiler-rt/test/asan/TestCases/report_error_summary.cpp
index 9e024e35bed864..d565d2add77934 100644
--- a/compiler-rt/test/asan/TestCases/report_error_summary.cpp
+++ b/compiler-rt/test/asan/TestCases/report_error_summary.cpp
@@ -1,5 +1,8 @@
 // RUN: %clangxx_asan -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s
 
+// FIXME: Doesn't work with DLLs
+// XFAIL: win32-dynamic-asan
+
 #include <stdio.h>
 
 // Required for ld64 macOS 12.0+
diff --git a/compiler-rt/test/asan/lit.cfg.py b/compiler-rt/test/asan/lit.cfg.py
index 05ed7e8dd294e3..83b3cbe789cacc 100644
--- a/compiler-rt/test/asan/lit.cfg.py
+++ b/compiler-rt/test/asan/lit.cfg.py
@@ -130,11 +130,6 @@ def build_invocation(compile_flags, with_lto=False):
             config.compiler_rt_libdir,
             "libclang_rt.asan_{}_dynamic.dylib".format(config.apple_platform),
         )
-    elif config.host_os == "Windows":
-        shared_libasan_path = os.path.join(
-            config.compiler_rt_libdir,
-            "clang_rt.asan_dynamic-{}.lib".format(config.target_suffix),
-        )
     else:
         lit_config.warning(
             "%shared_libasan substitution not set but dynamic ASan is available."
@@ -183,22 +178,8 @@ def build_invocation(compile_flags, with_lto=False):
         base_lib = os.path.join(
             config.compiler_rt_libdir, "clang_rt.asan%%s%s.lib" % config.target_suffix
         )
-        config.substitutions.append(("%asan_lib", base_lib % "_dynamic"))
-        if config.asan_dynamic:
-            config.substitutions.append(
-                ("%asan_thunk", base_lib % "_dynamic_runtime_thunk")
-            )
-        else:
-            config.substitutions.append(
-                ("%asan_thunk", base_lib % "_static_runtime_thunk")
-            )
+        config.substitutions.append(("%asan_lib", base_lib % ""))
         config.substitutions.append(("%asan_cxx_lib", base_lib % "_cxx"))
-        config.substitutions.append(
-            ("%asan_dynamic_runtime_thunk", base_lib % "_dynamic_runtime_thunk")
-        )
-        config.substitutions.append(
-            ("%asan_static_runtime_thunk", base_lib % "_static_runtime_thunk")
-        )
         config.substitutions.append(("%asan_dll_thunk", base_lib % "_dll_thunk"))
     else:
         # To make some of these tests work on MinGW target without changing their
@@ -281,7 +262,7 @@ def build_invocation(compile_flags, with_lto=False):
 
 # Add the RT libdir to PATH directly so that we can successfully run the gtest
 # binary to list its tests.
-if config.host_os == "Windows":
+if config.host_os == "Windows" and config.asan_dynamic:
     os.environ["PATH"] = os.path.pathsep.join(
         [config.compiler_rt_libdir, os.environ.get("PATH", "")]
     )

From f32ebabc27655a1bd26ccdede1610d8d1a05315f Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean@nvidia.com>
Date: Wed, 29 May 2024 18:09:21 -0700
Subject: [PATCH 211/230] [NVPTX] Improve folding to mad with immediate 1
 (#93628)

Extend NVPTX DAG combining logic to distribute a mul instruction across
an add of 1 into a mad where possible. In addition, add support for
transposing a mul through a select with an option of 1, if that would
allow further mul folding.
---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp |  98 +++++++++++++-
 llvm/test/CodeGen/NVPTX/combine-mad.ll      | 136 ++++++++++++++++++++
 2 files changed, 228 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/combine-mad.ll

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 1e7477cf9d60e3..f4ef7c9914f131 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5614,17 +5614,103 @@ static SDValue TryMULWIDECombine(SDNode *N,
   return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
 }
 
+static bool isConstOne(const SDValue &Operand) {
+  const auto *Const = dyn_cast<ConstantSDNode>(Operand);
+  return Const && Const->getZExtValue() == 1;
+}
+
+static SDValue matchMADConstOnePattern(SDValue Add) {
+  if (Add->getOpcode() != ISD::ADD)
+    return SDValue();
+
+  if (isConstOne(Add->getOperand(0)))
+    return Add->getOperand(1);
+
+  if (isConstOne(Add->getOperand(1)))
+    return Add->getOperand(0);
+
+  return SDValue();
+}
+
+static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL,
+                                  TargetLowering::DAGCombinerInfo &DCI) {
+
+  if (SDValue Y = matchMADConstOnePattern(Add))
+    return DCI.DAG.getNode(NVPTXISD::IMAD, DL, VT, X, Y, X);
+
+  return SDValue();
+}
+
+static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT,
+                                        SDLoc DL,
+                                        TargetLowering::DAGCombinerInfo &DCI) {
+  if (Select->getOpcode() != ISD::SELECT)
+    return SDValue();
+
+  SDValue Cond = Select->getOperand(0);
+
+  unsigned ConstOpNo;
+  if (isConstOne(Select->getOperand(1)))
+    ConstOpNo = 1;
+  else if (isConstOne(Select->getOperand(2)))
+    ConstOpNo = 2;
+  else
+    return SDValue();
+
+  SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1);
+
+  // Do not combine if the resulting sequence is not obviously profitable.
+  if (!matchMADConstOnePattern(Y))
+    return SDValue();
+
+  SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
+
+  return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond,
+                         (ConstOpNo == 1) ? X : NewMul,
+                         (ConstOpNo == 1) ? NewMul : X);
+}
+
+static SDValue
+PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
+                              TargetLowering::DAGCombinerInfo &DCI) {
+
+  EVT VT = N0.getValueType();
+  if (VT.isVector())
+    return SDValue();
+
+  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  SDLoc DL(N);
+
+  // (mul x, (add y, 1)) -> (mad x, y, x)
+  if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI))
+    return Res;
+  if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI))
+    return Res;
+
+  // (mul x, (select y, 1)) -> (select (mul x, y), x)
+  if (SDValue Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI))
+    return Res;
+  if (SDValue Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI))
+    return Res;
+
+  return SDValue();
+}
+
 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
 static SDValue PerformMULCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  CodeGenOptLevel OptLevel) {
-  if (OptLevel > CodeGenOptLevel::None) {
-    // Try mul.wide combining at OptLevel > 0
-    if (SDValue Ret = TryMULWIDECombine(N, DCI))
-      return Ret;
-  }
+  if (OptLevel == CodeGenOptLevel::None)
+    return SDValue();
 
-  return SDValue();
+  if (SDValue Ret = TryMULWIDECombine(N, DCI))
+    return Ret;
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  return PerformMULCombineWithOperands(N, N0, N1, DCI);
 }
 
 /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
diff --git a/llvm/test/CodeGen/NVPTX/combine-mad.ll b/llvm/test/CodeGen/NVPTX/combine-mad.ll
new file mode 100644
index 00000000000000..fba389afdca392
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/combine-mad.ll
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -O1 | FileCheck %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -O1 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -O1 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -O1 | %ptxas-verify %}
+
+define i32 @test1(i32 %n, i32 %m) {
+;
+; CHECK-LABEL: test1(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [test1_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [test1_param_1];
+; CHECK-NEXT:    mad.lo.s32 %r3, %r2, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; CHECK-NEXT:    ret;
+  %add = add i32 %n, 1
+  %mul = mul i32 %add, %m
+  ret i32 %mul
+}
+
+define i32 @test1_rev(i32 %n, i32 %m) {
+;
+; CHECK-LABEL: test1_rev(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [test1_rev_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [test1_rev_param_1];
+; CHECK-NEXT:    mad.lo.s32 %r3, %r2, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; CHECK-NEXT:    ret;
+  %add = add i32 %n, 1
+  %mul = mul i32 %m, %add
+  ret i32 %mul
+}
+
+; Transpose (mul (select)) if it can then be folded to mad
+define i32 @test2(i32 %n, i32 %m, i32 %s) {
+;
+; CHECK-LABEL: test2(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [test2_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [test2_param_1];
+; CHECK-NEXT:    ld.param.u32 %r3, [test2_param_2];
+; CHECK-NEXT:    setp.lt.s32 %p1, %r3, 1;
+; CHECK-NEXT:    mad.lo.s32 %r4, %r2, %r1, %r2;
+; CHECK-NEXT:    selp.b32 %r5, %r2, %r4, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r5;
+; CHECK-NEXT:    ret;
+  %add = add i32 %n, 1
+  %cond = icmp slt i32 %s, 1
+  %sel = select i1 %cond, i32 1, i32 %add
+  %mul = mul i32 %sel, %m
+  ret i32 %mul
+}
+
+;; Transpose (mul (select)) if it can then be folded to mad
+define i32 @test2_rev1(i32 %n, i32 %m, i32 %s) {
+;
+; CHECK-LABEL: test2_rev1(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [test2_rev1_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [test2_rev1_param_1];
+; CHECK-NEXT:    ld.param.u32 %r3, [test2_rev1_param_2];
+; CHECK-NEXT:    setp.lt.s32 %p1, %r3, 1;
+; CHECK-NEXT:    mad.lo.s32 %r4, %r2, %r1, %r2;
+; CHECK-NEXT:    selp.b32 %r5, %r4, %r2, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r5;
+; CHECK-NEXT:    ret;
+  %add = add i32 %n, 1
+  %cond = icmp slt i32 %s, 1
+  %sel = select i1 %cond, i32 %add, i32 1
+  %mul = mul i32 %sel, %m
+  ret i32 %mul
+}
+
+;; Transpose (mul (select)) if it can then be folded to mad
+define i32 @test2_rev2(i32 %n, i32 %m, i32 %s) {
+;
+; CHECK-LABEL: test2_rev2(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [test2_rev2_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [test2_rev2_param_1];
+; CHECK-NEXT:    ld.param.u32 %r3, [test2_rev2_param_2];
+; CHECK-NEXT:    setp.lt.s32 %p1, %r3, 1;
+; CHECK-NEXT:    mad.lo.s32 %r4, %r2, %r1, %r2;
+; CHECK-NEXT:    selp.b32 %r5, %r4, %r2, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r5;
+; CHECK-NEXT:    ret;
+  %add = add i32 %n, 1
+  %cond = icmp slt i32 %s, 1
+  %sel = select i1 %cond, i32 %add, i32 1
+  %mul = mul i32  %m, %sel
+  ret i32 %mul
+}
+
+;; Leave (mul (select)) intact if it transposing is not profitable
+define i32 @test3(i32 %n, i32 %m, i32 %s) {
+;
+; CHECK-LABEL: test3(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [test3_param_0];
+; CHECK-NEXT:    add.s32 %r2, %r1, 3;
+; CHECK-NEXT:    ld.param.u32 %r3, [test3_param_1];
+; CHECK-NEXT:    ld.param.u32 %r4, [test3_param_2];
+; CHECK-NEXT:    setp.lt.s32 %p1, %r4, 1;
+; CHECK-NEXT:    selp.b32 %r5, 1, %r2, %p1;
+; CHECK-NEXT:    mul.lo.s32 %r6, %r5, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r6;
+; CHECK-NEXT:    ret;
+  %add = add i32 %n, 3
+  %cond = icmp slt i32 %s, 1
+  %sel = select i1 %cond, i32 1, i32 %add
+  %mul = mul i32 %sel, %m
+  ret i32 %mul
+}

From 2b1d1c51f6e321267cc86e9db7808298c59caf0e Mon Sep 17 00:00:00 2001
From: Chen Zheng <czhengsz@cn.ibm.com>
Date: Wed, 29 May 2024 21:47:20 -0400
Subject: [PATCH 212/230] [NFC] Fix PPC buildbot failure
 https://lab.llvm.org/buildbot/#/builders/230/builds/29066

Failure was introduced in https://github.com/llvm/llvm-project/pull/81545

On 64-bit targets, an i32 return type gets an extension attribute in the
function prototype, so the CHECK lines must allow text between `noundef` and
`i32`.
---
 clang/test/CodeGenCXX/debug-info-ptr-to-ptr.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/test/CodeGenCXX/debug-info-ptr-to-ptr.cpp b/clang/test/CodeGenCXX/debug-info-ptr-to-ptr.cpp
index 0885e7076d51c5..9f2a3f9e69197b 100644
--- a/clang/test/CodeGenCXX/debug-info-ptr-to-ptr.cpp
+++ b/clang/test/CodeGenCXX/debug-info-ptr-to-ptr.cpp
@@ -22,7 +22,7 @@ class C {
   A arr[10];
 };
 
-// CHECK-LABEL: define dso_local noundef i32 @{{.*}}func1{{.*}}(
+// CHECK-LABEL: define dso_local noundef{{.*}}i32 @{{.*}}func1{{.*}}(
 // CHECK:         [[A_ADDR:%.*]] = getelementptr inbounds %class.B, ptr {{%.*}}, i32 0, i32 0, !dbg [[DBG1:![0-9]+]]
 // CHECK-NEXT:    [[A:%.*]] = load ptr, ptr [[A_ADDR]], align {{.*}}, !dbg [[DBG1]]
 // CHECK-NEXT:    [[PSEUDO1:%.*]] = alloca ptr, align {{.*}}, !dbg [[DBG1]]
@@ -48,7 +48,7 @@ A* func2(void *b) {
 }
 
 // Should not generate pseudo variable in this case.
-// CHECK-LABEL: define dso_local noundef i32 @{{.*}}func3{{.*}}(
+// CHECK-LABEL: define dso_local noundef{{.*}}i32 @{{.*}}func3{{.*}}(
 // CHECK:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR:%.*]], metadata [[META4:![0-9]+]], metadata !DIExpression())
 // CHECK:    call void @llvm.dbg.declare(metadata ptr [[LOCAL1:%.*]], metadata [[META5:![0-9]+]], metadata !DIExpression())
 // CHECK-NOT: call void @llvm.dbg.declare(metadata ptr
@@ -89,7 +89,7 @@ char func5(void *arr, int n) {
   return ((A*)arr)[n].c;
 }
 
-// CHECK-LABEL: define dso_local noundef i32 @{{.*}}func6{{.*}}(
+// CHECK-LABEL: define dso_local noundef{{.*}}i32 @{{.*}}func6{{.*}}(
 // CHECK:         call void @llvm.dbg.declare(metadata ptr {{%.*}}, metadata [[META10:![0-9]+]], metadata !DIExpression())
 // CHECK:         call void @llvm.dbg.declare(metadata ptr {{%.*}}, metadata [[META11:![0-9]+]], metadata !DIExpression())
 int func6(B &b) {

From 30c10fda2ba539e70bff4f05625ec6358c0f7502 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Wed, 29 May 2024 19:04:08 -0700
Subject: [PATCH 213/230] Revert "[ELF] Simplify getSectionRank"

This reverts commit 2e0cfe69d0d705e9c5d5f217625bf7e3a0e90871.

Buildbots are broken.
---
 lld/ELF/Writer.cpp | 43 ++++++++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 19 deletions(-)

diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index c498153f3348b1..d2cc6d8ff5f2cb 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -618,6 +618,7 @@ enum RankFlags {
   RF_NOT_ADDR_SET = 1 << 27,
   RF_NOT_ALLOC = 1 << 26,
   RF_PARTITION = 1 << 18, // Partition number (8 bits)
+  RF_NOT_SPECIAL = 1 << 17,
   RF_LARGE_ALT = 1 << 15,
   RF_WRITE = 1 << 14,
   RF_EXEC_WRITE = 1 << 13,
@@ -643,6 +644,24 @@ unsigned elf::getSectionRank(OutputSection &osec) {
   if (!(osec.flags & SHF_ALLOC))
     return rank | RF_NOT_ALLOC;
 
+  if (osec.type == SHT_LLVM_PART_EHDR)
+    return rank;
+  if (osec.type == SHT_LLVM_PART_PHDR)
+    return rank | 1;
+
+  // Put .interp first because some loaders want to see that section
+  // on the first page of the executable file when loaded into memory.
+  if (osec.name == ".interp")
+    return rank | 2;
+
+  // Put .note sections at the beginning so that they are likely to be included
+  // in a truncate core file. In particular, .note.gnu.build-id, if available,
+  // can identify the object file.
+  if (osec.type == SHT_NOTE)
+    return rank | 3;
+
+  rank |= RF_NOT_SPECIAL;
+
   // Sort sections based on their access permission in the following
   // order: R, RX, RXW, RW(RELRO), RW(non-RELRO).
   //
@@ -658,6 +677,11 @@ unsigned elf::getSectionRank(OutputSection &osec) {
   bool isWrite = osec.flags & SHF_WRITE;
 
   if (!isWrite && !isExec) {
+    // Make PROGBITS sections (e.g .rodata .eh_frame) closer to .text to
+    // alleviate relocation overflow pressure. Large special sections such as
+    // .dynstr and .dynsym can be away from .text.
+    if (osec.type == SHT_PROGBITS)
+      rank |= RF_RODATA;
     // Among PROGBITS sections, place .lrodata further from .text.
     // For -z lrodata-after-bss, place .lrodata after .lbss like GNU ld. This
     // layout has one extra PT_LOAD, but alleviates relocation overflow
@@ -667,25 +691,6 @@ unsigned elf::getSectionRank(OutputSection &osec) {
       rank |= config->zLrodataAfterBss ? RF_LARGE_ALT : 0;
     else
       rank |= config->zLrodataAfterBss ? 0 : RF_LARGE;
-
-    if (osec.type == SHT_LLVM_PART_EHDR)
-      ;
-    else if (osec.type == SHT_LLVM_PART_PHDR)
-      rank |= 1;
-    else if (osec.name == ".interp")
-      rank |= 2;
-    // Put .note sections at the beginning so that they are likely to be
-    // included in a truncate core file. In particular, .note.gnu.build-id, if
-    // available, can identify the object file.
-    else if (osec.type == SHT_NOTE)
-      rank |= 3;
-    // Make PROGBITS sections (e.g .rodata .eh_frame) closer to .text to
-    // alleviate relocation overflow pressure. Large special sections such as
-    // .dynstr and .dynsym can be away from .text.
-    else if (osec.type != SHT_PROGBITS)
-      rank |= 4;
-    else
-      rank |= RF_RODATA;
   } else if (isExec) {
     rank |= isWrite ? RF_EXEC_WRITE : RF_EXEC;
   } else {

From 96d2dc7210db3ed3a4c9f6aa93c14d2ea90e67cc Mon Sep 17 00:00:00 2001
From: csstormq <swust_xiaoqiangxu@163.com>
Date: Thu, 30 May 2024 10:50:32 +0800
Subject: [PATCH 214/230] [SCEVAA] Enhance SCEVAAResult::alias() to handle two
 pointers with different pointer bases (#91453)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch enhances the SCEVAAResult::alias() interface to handle two
pointers with different pointer bases.

Before calling getMinusSCEV(), we first try to explicitly convert both
pointers into ptrtoint expressions so that different bases can be handled.

Either both pointers are used with ptrtoint or neither, so we can't
end up with a ptr + int mix.
---
 .../Analysis/ScalarEvolutionAliasAnalysis.cpp | 13 ++++++++++
 llvm/test/Analysis/ScalarEvolution/scev-aa.ll | 26 +++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp b/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp
index af8232b03f1ed6..7bcec7931b2191 100644
--- a/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp
@@ -61,6 +61,19 @@ AliasResult SCEVAAResult::alias(const MemoryLocation &LocA,
                                  ? static_cast<uint64_t>(LocB.Size.getValue())
                                  : MemoryLocation::UnknownSize);
 
+    // Firstly, try to convert the two pointers into ptrtoint expressions to
+    // handle two pointers with different pointer bases.
+    // Either both pointers are used with ptrtoint or neither, so we can't end
+    // up with a ptr + int mix.
+    const SCEV *AInt =
+        SE.getPtrToIntExpr(AS, SE.getEffectiveSCEVType(AS->getType()));
+    const SCEV *BInt =
+        SE.getPtrToIntExpr(BS, SE.getEffectiveSCEVType(BS->getType()));
+    if (!isa<SCEVCouldNotCompute>(AInt) && !isa<SCEVCouldNotCompute>(BInt)) {
+      AS = AInt;
+      BS = BInt;
+    }
+
     // Compute the difference between the two pointers.
     const SCEV *BA = SE.getMinusSCEV(BS, AS);
 
diff --git a/llvm/test/Analysis/ScalarEvolution/scev-aa.ll b/llvm/test/Analysis/ScalarEvolution/scev-aa.ll
index a81baa73a93bd3..5610833e9c4745 100644
--- a/llvm/test/Analysis/ScalarEvolution/scev-aa.ll
+++ b/llvm/test/Analysis/ScalarEvolution/scev-aa.ll
@@ -340,3 +340,29 @@ for.latch:
 for.end:
   ret void
 }
+
+; CHECK-LABEL: Function: test_different_pointer_bases_of_inttoptr: 2 pointers, 0 call sites
+; CHECK:   NoAlias:	<16 x i8>* %tmp5, <16 x i8>* %tmp7
+
+define void @test_different_pointer_bases_of_inttoptr() {
+entry:
+  br label %for.body
+
+for.body:
+  %tmp = phi i32 [ %next, %for.body ], [ 1, %entry ]
+  %tmp1 = shl nsw i32 %tmp, 1
+  %tmp2 = add nuw nsw i32 %tmp1, %tmp1
+  %tmp3 = mul nsw i32 %tmp2, 1408
+  %tmp4 = add nsw i32 %tmp3, 1408
+  %tmp5 = getelementptr inbounds i8, ptr inttoptr (i32 1024 to ptr), i32 %tmp1
+  %tmp6 = load <16 x i8>, ptr %tmp5, align 1
+  %tmp7 = getelementptr inbounds i8, ptr inttoptr (i32 4096 to ptr), i32 %tmp4
+  store <16 x i8> %tmp6, ptr %tmp7, align 1
+
+  %next = add i32 %tmp, 2
+  %exitcond = icmp slt i32 %next, 10
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}

From 8c33b3380b8044824f6adb48cc8d2076aecae566 Mon Sep 17 00:00:00 2001
From: Pavel Samolysov <samolisov@gmail.com>
Date: Thu, 30 May 2024 05:51:29 +0300
Subject: [PATCH 215/230] [PGO] Add a unit test for the PGOInstrumentationGen
 pass (#93636)

The patch introduces the gmock-based unittest infrastructure for PGO
Instrumentation and adds some test cases to check whether the
instrumentation has taken place. The testing infrastructure for analysis
modules was borrowed from the LoopPassManagerTest unittest and
simplified a bit to handle module analysis passes only. Actually, we are
testing whether the result of a trivial analysis pass was invalidated by
the PGOInstrumentationGen one: we exploit the fact that the pass invalidates
all the analysis results after a module was instrumented.

NFC.
---
 llvm/unittests/Transforms/CMakeLists.txt      |   1 +
 .../Transforms/Instrumentation/CMakeLists.txt |  16 ++
 .../PGOInstrumentationTest.cpp                | 192 ++++++++++++++++++
 3 files changed, 209 insertions(+)
 create mode 100644 llvm/unittests/Transforms/Instrumentation/CMakeLists.txt
 create mode 100644 llvm/unittests/Transforms/Instrumentation/PGOInstrumentationTest.cpp

diff --git a/llvm/unittests/Transforms/CMakeLists.txt b/llvm/unittests/Transforms/CMakeLists.txt
index 98c821acde3a51..320cdf56741496 100644
--- a/llvm/unittests/Transforms/CMakeLists.txt
+++ b/llvm/unittests/Transforms/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_subdirectory(Coroutines)
+add_subdirectory(Instrumentation)
 add_subdirectory(IPO)
 add_subdirectory(Scalar)
 add_subdirectory(Utils)
diff --git a/llvm/unittests/Transforms/Instrumentation/CMakeLists.txt b/llvm/unittests/Transforms/Instrumentation/CMakeLists.txt
new file mode 100644
index 00000000000000..1f249b0049d062
--- /dev/null
+++ b/llvm/unittests/Transforms/Instrumentation/CMakeLists.txt
@@ -0,0 +1,16 @@
+set(LLVM_LINK_COMPONENTS
+  Analysis
+  AsmParser
+  Core
+  Instrumentation
+  Passes
+  Support
+)
+
+add_llvm_unittest(InstrumentationTests
+  PGOInstrumentationTest.cpp
+  )
+
+target_link_libraries(InstrumentationTests PRIVATE LLVMTestingSupport)
+
+set_property(TARGET InstrumentationTests PROPERTY FOLDER "Tests/UnitTests/TransformTests")
diff --git a/llvm/unittests/Transforms/Instrumentation/PGOInstrumentationTest.cpp b/llvm/unittests/Transforms/Instrumentation/PGOInstrumentationTest.cpp
new file mode 100644
index 00000000000000..02c2df2a138b02
--- /dev/null
+++ b/llvm/unittests/Transforms/Instrumentation/PGOInstrumentationTest.cpp
@@ -0,0 +1,192 @@
+//===- PGOInstrumentationTest.cpp - Instrumentation unit tests ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/ProfileData/InstrProf.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+#include <tuple>
+
+namespace {
+
+using namespace llvm;
+
+using testing::_;
+using ::testing::DoDefault;
+using ::testing::Invoke;
+using ::testing::IsNull;
+using ::testing::NotNull;
+using ::testing::Ref;
+using ::testing::Return;
+using ::testing::Sequence;
+using ::testing::Test;
+using ::testing::TestParamInfo;
+using ::testing::Values;
+using ::testing::WithParamInterface;
+
+template <typename Derived> class MockAnalysisHandleBase {
+public:
+  class Analysis : public AnalysisInfoMixin<Analysis> {
+  public:
+    class Result {
+    public:
+      // Forward invalidation events to the mock handle.
+      bool invalidate(Module &M, const PreservedAnalyses &PA,
+                      ModuleAnalysisManager::Invalidator &Inv) {
+        return Handle->invalidate(M, PA, Inv);
+      }
+
+    private:
+      explicit Result(Derived *Handle) : Handle(Handle) {}
+
+      friend MockAnalysisHandleBase;
+      Derived *Handle;
+    };
+
+    Result run(Module &M, ModuleAnalysisManager &AM) {
+      return Handle->run(M, AM);
+    }
+
+  private:
+    friend AnalysisInfoMixin<Analysis>;
+    friend MockAnalysisHandleBase;
+    static inline AnalysisKey Key;
+
+    Derived *Handle;
+
+    explicit Analysis(Derived *Handle) : Handle(Handle) {}
+  };
+
+  Analysis getAnalysis() { return Analysis(static_cast<Derived *>(this)); }
+
+  typename Analysis::Result getResult() {
+    return typename Analysis::Result(static_cast<Derived *>(this));
+  }
+
+protected:
+  void setDefaults() {
+    ON_CALL(static_cast<Derived &>(*this), run(_, _))
+        .WillByDefault(Return(this->getResult()));
+    ON_CALL(static_cast<Derived &>(*this), invalidate(_, _, _))
+        .WillByDefault(Invoke([](Module &M, const PreservedAnalyses &PA,
+                                 ModuleAnalysisManager::Invalidator &) {
+          auto PAC = PA.template getChecker<Analysis>();
+          return !PAC.preserved() &&
+                 !PAC.template preservedSet<AllAnalysesOn<Module>>();
+        }));
+  }
+
+private:
+  friend Derived;
+  MockAnalysisHandleBase() = default;
+};
+
+class MockModuleAnalysisHandle
+    : public MockAnalysisHandleBase<MockModuleAnalysisHandle> {
+public:
+  MockModuleAnalysisHandle() { setDefaults(); }
+
+  MOCK_METHOD(typename Analysis::Result, run,
+              (Module &, ModuleAnalysisManager &));
+
+  MOCK_METHOD(bool, invalidate,
+              (Module &, const PreservedAnalyses &,
+               ModuleAnalysisManager::Invalidator &));
+};
+
+struct PGOInstrumentationGenTest
+    : public Test,
+      WithParamInterface<std::tuple<StringRef, StringRef>> {
+  LLVMContext Ctx;
+  ModulePassManager MPM;
+  PassBuilder PB;
+  MockModuleAnalysisHandle MMAHandle;
+  LoopAnalysisManager LAM;
+  FunctionAnalysisManager FAM;
+  CGSCCAnalysisManager CGAM;
+  ModuleAnalysisManager MAM;
+  LLVMContext Context;
+  std::unique_ptr<Module> M;
+
+  PGOInstrumentationGenTest() {
+    MAM.registerPass([&] { return MMAHandle.getAnalysis(); });
+    PB.registerModuleAnalyses(MAM);
+    PB.registerCGSCCAnalyses(CGAM);
+    PB.registerFunctionAnalyses(FAM);
+    PB.registerLoopAnalyses(LAM);
+    PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+    MPM.addPass(
+        RequireAnalysisPass<MockModuleAnalysisHandle::Analysis, Module>());
+    MPM.addPass(PGOInstrumentationGen());
+  }
+
+  void parseAssembly(const StringRef IR) {
+    SMDiagnostic Error;
+    M = parseAssemblyString(IR, Error, Context);
+    std::string ErrMsg;
+    raw_string_ostream OS(ErrMsg);
+    Error.print("", OS);
+
+    // A failure here means that the test itself is buggy.
+    if (!M)
+      report_fatal_error(OS.str().c_str());
+  }
+};
+
+static constexpr StringRef CodeWithFuncDefs = R"(
+  define i32 @f(i32 %n) {
+  entry:
+    ret i32 0
+  })";
+
+static constexpr StringRef CodeWithFuncDecls = R"(
+  declare i32 @f(i32);
+)";
+
+static constexpr StringRef CodeWithGlobals = R"(
+  @foo.table = internal unnamed_addr constant [1 x ptr] [ptr @f]
+  declare i32 @f(i32);
+)";
+
+INSTANTIATE_TEST_SUITE_P(
+    PGOInstrumetationGenTestSuite, PGOInstrumentationGenTest,
+    Values(std::make_tuple(CodeWithFuncDefs, "instrument_function_defs"),
+           std::make_tuple(CodeWithFuncDecls, "instrument_function_decls"),
+           std::make_tuple(CodeWithGlobals, "instrument_globals")),
+    [](const TestParamInfo<PGOInstrumentationGenTest::ParamType> &Info) {
+      return std::get<1>(Info.param).str();
+    });
+
+TEST_P(PGOInstrumentationGenTest, Instrumented) {
+  const StringRef Code = std::get<0>(GetParam());
+  parseAssembly(Code);
+
+  ASSERT_THAT(M, NotNull());
+
+  Sequence PassSequence;
+  EXPECT_CALL(MMAHandle, run(Ref(*M), _))
+      .InSequence(PassSequence)
+      .WillOnce(DoDefault());
+  EXPECT_CALL(MMAHandle, invalidate(Ref(*M), _, _))
+      .InSequence(PassSequence)
+      .WillOnce(DoDefault());
+
+  MPM.run(*M, MAM);
+
+  const auto *IRInstrVar =
+      M->getNamedGlobal(INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR));
+  ASSERT_THAT(IRInstrVar, NotNull()); // Fatal: dereferenced below.
+  EXPECT_FALSE(IRInstrVar->isDeclaration());
+}
+
+} // end anonymous namespace

From f639b57f7993cadb82ee9c36f04703ae4430ed85 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Wed, 29 May 2024 20:08:05 -0700
Subject: [PATCH 216/230] [ELF] Simplify getSectionRank

Follow-up to a previous simplification
2473b1af085ad54e89666cedf684fdf10a84f058.

The xor difference between a SHT_NOTE and a read-only SHT_PROGBITS
(previously >=NOT_SPECIAL) should be smaller than RF_EXEC. Otherwise,
for the following section layout, `findOrphanPos` would place .text
before note.

```
// simplified from linkerscript/custom-section-type.s
non orphans:
progbits 0x8060c00 NOT_SPECIAL
note     0x8040003

orphan:
.text    0x8061000 NOT_SPECIAL
```

---

Identical to 2e0cfe69d0d705e9c5d5f217625bf7e3a0e90871.
The revert 30c10fda2ba539e70bff4f05625ec6358c0f7502 is wrong.
---
 lld/ELF/Writer.cpp | 43 +++++++++++++++++++------------------------
 1 file changed, 19 insertions(+), 24 deletions(-)

diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index d2cc6d8ff5f2cb..c498153f3348b1 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -618,7 +618,6 @@ enum RankFlags {
   RF_NOT_ADDR_SET = 1 << 27,
   RF_NOT_ALLOC = 1 << 26,
   RF_PARTITION = 1 << 18, // Partition number (8 bits)
-  RF_NOT_SPECIAL = 1 << 17,
   RF_LARGE_ALT = 1 << 15,
   RF_WRITE = 1 << 14,
   RF_EXEC_WRITE = 1 << 13,
@@ -644,24 +643,6 @@ unsigned elf::getSectionRank(OutputSection &osec) {
   if (!(osec.flags & SHF_ALLOC))
     return rank | RF_NOT_ALLOC;
 
-  if (osec.type == SHT_LLVM_PART_EHDR)
-    return rank;
-  if (osec.type == SHT_LLVM_PART_PHDR)
-    return rank | 1;
-
-  // Put .interp first because some loaders want to see that section
-  // on the first page of the executable file when loaded into memory.
-  if (osec.name == ".interp")
-    return rank | 2;
-
-  // Put .note sections at the beginning so that they are likely to be included
-  // in a truncate core file. In particular, .note.gnu.build-id, if available,
-  // can identify the object file.
-  if (osec.type == SHT_NOTE)
-    return rank | 3;
-
-  rank |= RF_NOT_SPECIAL;
-
   // Sort sections based on their access permission in the following
   // order: R, RX, RXW, RW(RELRO), RW(non-RELRO).
   //
@@ -677,11 +658,6 @@ unsigned elf::getSectionRank(OutputSection &osec) {
   bool isWrite = osec.flags & SHF_WRITE;
 
   if (!isWrite && !isExec) {
-    // Make PROGBITS sections (e.g .rodata .eh_frame) closer to .text to
-    // alleviate relocation overflow pressure. Large special sections such as
-    // .dynstr and .dynsym can be away from .text.
-    if (osec.type == SHT_PROGBITS)
-      rank |= RF_RODATA;
     // Among PROGBITS sections, place .lrodata further from .text.
     // For -z lrodata-after-bss, place .lrodata after .lbss like GNU ld. This
     // layout has one extra PT_LOAD, but alleviates relocation overflow
@@ -691,6 +667,25 @@ unsigned elf::getSectionRank(OutputSection &osec) {
       rank |= config->zLrodataAfterBss ? RF_LARGE_ALT : 0;
     else
       rank |= config->zLrodataAfterBss ? 0 : RF_LARGE;
+
+    if (osec.type == SHT_LLVM_PART_EHDR)
+      ;
+    else if (osec.type == SHT_LLVM_PART_PHDR)
+      rank |= 1;
+    else if (osec.name == ".interp")
+      rank |= 2;
+    // Put .note sections at the beginning so that they are likely to be
+    // included in a truncate core file. In particular, .note.gnu.build-id, if
+    // available, can identify the object file.
+    else if (osec.type == SHT_NOTE)
+      rank |= 3;
+    // Make PROGBITS sections (e.g .rodata .eh_frame) closer to .text to
+    // alleviate relocation overflow pressure. Large special sections such as
+    // .dynstr and .dynsym can be away from .text.
+    else if (osec.type != SHT_PROGBITS)
+      rank |= 4;
+    else
+      rank |= RF_RODATA;
   } else if (isExec) {
     rank |= isWrite ? RF_EXEC_WRITE : RF_EXEC;
   } else {

From d38d0a0d1bd219555f130dd63e2599f5126e1bdd Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Wed, 29 May 2024 20:32:34 -0700
Subject: [PATCH 217/230] Revert "[ELF] Simplify getSectionRank"

This reverts commit f639b57f7993cadb82ee9c36f04703ae4430ed85.

The premerge bot is still broken by a failing BOLT test.
---
 lld/ELF/Writer.cpp | 43 ++++++++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 19 deletions(-)

diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index c498153f3348b1..d2cc6d8ff5f2cb 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -618,6 +618,7 @@ enum RankFlags {
   RF_NOT_ADDR_SET = 1 << 27,
   RF_NOT_ALLOC = 1 << 26,
   RF_PARTITION = 1 << 18, // Partition number (8 bits)
+  RF_NOT_SPECIAL = 1 << 17,
   RF_LARGE_ALT = 1 << 15,
   RF_WRITE = 1 << 14,
   RF_EXEC_WRITE = 1 << 13,
@@ -643,6 +644,24 @@ unsigned elf::getSectionRank(OutputSection &osec) {
   if (!(osec.flags & SHF_ALLOC))
     return rank | RF_NOT_ALLOC;
 
+  if (osec.type == SHT_LLVM_PART_EHDR)
+    return rank;
+  if (osec.type == SHT_LLVM_PART_PHDR)
+    return rank | 1;
+
+  // Put .interp first because some loaders want to see that section
+  // on the first page of the executable file when loaded into memory.
+  if (osec.name == ".interp")
+    return rank | 2;
+
+  // Put .note sections at the beginning so that they are likely to be included
+  // in a truncate core file. In particular, .note.gnu.build-id, if available,
+  // can identify the object file.
+  if (osec.type == SHT_NOTE)
+    return rank | 3;
+
+  rank |= RF_NOT_SPECIAL;
+
   // Sort sections based on their access permission in the following
   // order: R, RX, RXW, RW(RELRO), RW(non-RELRO).
   //
@@ -658,6 +677,11 @@ unsigned elf::getSectionRank(OutputSection &osec) {
   bool isWrite = osec.flags & SHF_WRITE;
 
   if (!isWrite && !isExec) {
+    // Make PROGBITS sections (e.g .rodata .eh_frame) closer to .text to
+    // alleviate relocation overflow pressure. Large special sections such as
+    // .dynstr and .dynsym can be away from .text.
+    if (osec.type == SHT_PROGBITS)
+      rank |= RF_RODATA;
     // Among PROGBITS sections, place .lrodata further from .text.
     // For -z lrodata-after-bss, place .lrodata after .lbss like GNU ld. This
     // layout has one extra PT_LOAD, but alleviates relocation overflow
@@ -667,25 +691,6 @@ unsigned elf::getSectionRank(OutputSection &osec) {
       rank |= config->zLrodataAfterBss ? RF_LARGE_ALT : 0;
     else
       rank |= config->zLrodataAfterBss ? 0 : RF_LARGE;
-
-    if (osec.type == SHT_LLVM_PART_EHDR)
-      ;
-    else if (osec.type == SHT_LLVM_PART_PHDR)
-      rank |= 1;
-    else if (osec.name == ".interp")
-      rank |= 2;
-    // Put .note sections at the beginning so that they are likely to be
-    // included in a truncate core file. In particular, .note.gnu.build-id, if
-    // available, can identify the object file.
-    else if (osec.type == SHT_NOTE)
-      rank |= 3;
-    // Make PROGBITS sections (e.g .rodata .eh_frame) closer to .text to
-    // alleviate relocation overflow pressure. Large special sections such as
-    // .dynstr and .dynsym can be away from .text.
-    else if (osec.type != SHT_PROGBITS)
-      rank |= 4;
-    else
-      rank |= RF_RODATA;
   } else if (isExec) {
     rank |= isWrite ? RF_EXEC_WRITE : RF_EXEC;
   } else {

From 815250b219a04966e4ea5de3a09965bea4d4cc41 Mon Sep 17 00:00:00 2001
From: Mark Rowe <mrowe@bdash.net.nz>
Date: Wed, 29 May 2024 20:56:05 -0700
Subject: [PATCH 218/230] [compiler-rt] Don't rely on automatic codesigning
 with Apple's linker (#91681)

In https://github.com/llvm/llvm-project/pull/88323, I changed the logic
within `add_compiler_rt_runtime` to only explicitly code sign the
resulting library if an older version of Apple's ld64 was in use. This
was based on the assumption that newer versions of ld64 and the new
Apple linker always ad-hoc sign their output binaries. This is true in
most cases, but not when using Apple's new linker with the
`-darwin-target-variant` flag to build Mac binaries that are compatible
with Catalyst.

Rather than adding increasingly complicated logic to detect the exact
scenarios that require explicit code signing, I've opted to always
explicitly code sign when using any Apple linker. We instead detect and
use the 'linker-signed' codesigning option when possible to match the
signatures that the linker would otherwise create. This avoids having
non-'linker-signed' ad-hoc signatures which was the underlying problem
that https://github.com/llvm/llvm-project/pull/88323 was intended to
address.

Co-authored-by: Mark Rowe <markrowe@chromium.org>
---
 compiler-rt/cmake/Modules/AddCompilerRT.cmake | 52 +++++++++----------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/compiler-rt/cmake/Modules/AddCompilerRT.cmake b/compiler-rt/cmake/Modules/AddCompilerRT.cmake
index 75b34c8e27e000..9ec2eecf801bcd 100644
--- a/compiler-rt/cmake/Modules/AddCompilerRT.cmake
+++ b/compiler-rt/cmake/Modules/AddCompilerRT.cmake
@@ -387,35 +387,35 @@ function(add_compiler_rt_runtime name type)
         set_target_properties(${libname} PROPERTIES IMPORT_SUFFIX ".lib")
       endif()
       if (APPLE AND NOT CMAKE_LINKER MATCHES ".*lld.*")
-        # Ad-hoc sign the dylibs when using Xcode versions older than 12.
-        # Xcode 12 shipped with ld64-609.
-        # FIXME: Remove whole conditional block once everything uses Xcode 12+.
-        set(LD_V_OUTPUT)
+        # Apple's linker signs the resulting dylib with an ad-hoc code signature in
+        # most situations, except:
+        # 1. Versions of ld64 prior to ld64-609 in Xcode 12 predate this behavior.
+        # 2. Apple's new linker does not when building with `-darwin-target-variant`
+        #    to support macOS Catalyst.
+        #
+        # Explicitly re-signing the dylib works around both of these issues. The
+        # signature is marked as `linker-signed` when that is supported so that it
+        # behaves as expected when processed by subsequent tooling.
+        #
+        # Detect whether `codesign` supports `-o linker-signed` by passing it as an
+        # argument and looking for `invalid argument "linker-signed"` in its output.
+        # FIXME: Remove this once all supported toolchains support `-o linker-signed`.
         execute_process(
-          COMMAND sh -c "${CMAKE_LINKER} -v 2>&1 | head -1"
-          RESULT_VARIABLE HAD_ERROR
-          OUTPUT_VARIABLE LD_V_OUTPUT
+          COMMAND sh -c "codesign -f -s - -o linker-signed this-does-not-exist 2>&1 | grep -q linker-signed"
+          RESULT_VARIABLE CODESIGN_SUPPORTS_LINKER_SIGNED
         )
-        if (HAD_ERROR)
-          message(FATAL_ERROR "${CMAKE_LINKER} failed with status ${HAD_ERROR}")
-        endif()
-        set(NEED_EXPLICIT_ADHOC_CODESIGN 0)
-        # Apple introduced a new linker by default in Xcode 15. This linker reports itself as ld
-        # rather than ld64 and does not match this version regex. That's ok since it never needs
-        # the explicit ad-hoc code signature.
-        if ("${LD_V_OUTPUT}" MATCHES ".*ld64-([0-9.]+).*")
-          string(REGEX REPLACE ".*ld64-([0-9.]+).*" "\\1" HOST_LINK_VERSION ${LD_V_OUTPUT})
-          if (HOST_LINK_VERSION VERSION_LESS 609)
-            set(NEED_EXPLICIT_ADHOC_CODESIGN 1)
-          endif()
-        endif()
-        if (NEED_EXPLICIT_ADHOC_CODESIGN)
-          add_custom_command(TARGET ${libname}
-            POST_BUILD
-            COMMAND codesign --sign - $<TARGET_FILE:${libname}>
-            WORKING_DIRECTORY ${COMPILER_RT_OUTPUT_LIBRARY_DIR}
-          )
+
+        set(EXTRA_CODESIGN_ARGUMENTS)
+        if (CODESIGN_SUPPORTS_LINKER_SIGNED)
+          list(APPEND EXTRA_CODESIGN_ARGUMENTS -o linker-signed)
         endif()
+
+        add_custom_command(TARGET ${libname}
+          POST_BUILD
+          COMMAND codesign --sign - ${EXTRA_CODESIGN_ARGUMENTS} $<TARGET_FILE:${libname}>
+          WORKING_DIRECTORY ${COMPILER_RT_OUTPUT_LIBRARY_DIR}
+          COMMAND_EXPAND_LISTS
+        )
       endif()
     endif()
 

From 17940465364e0ad66fa364c5bef8abec4e34ac5b Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Wed, 29 May 2024 21:01:28 -0700
Subject: [PATCH 219/230] [VPlan] Move verifier to class to reduce need to pass
 via args. (NFC)

Move the VPlan verification functions into a class to avoid the need to
pass VPDT across multiple calls. This also allows easier extensions in
the future.
---
 .../Transforms/Vectorize/VPlanVerifier.cpp    | 91 ++++++++++++-------
 1 file changed, 57 insertions(+), 34 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 7ebdb914fb852f..2fe487f972bb9d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -23,10 +23,41 @@
 
 using namespace llvm;
 
-// Verify that phi-like recipes are at the beginning of \p VPBB, with no
-// other recipes in between. Also check that only header blocks contain
-// VPHeaderPHIRecipes.
-static bool verifyPhiRecipes(const VPBasicBlock *VPBB) {
+namespace {
+class VPlanVerifier {
+  const VPDominatorTree &VPDT;
+
+  // Verify that phi-like recipes are at the beginning of \p VPBB, with no
+  // other recipes in between. Also check that only header blocks contain
+  // VPHeaderPHIRecipes.
+  bool verifyPhiRecipes(const VPBasicBlock *VPBB);
+
+  bool verifyVPBasicBlock(const VPBasicBlock *VPBB);
+
+  bool verifyBlock(const VPBlockBase *VPB);
+
+  /// Helper function that verifies the CFG invariants of the VPBlockBases
+  /// within
+  /// \p Region. Checks in this function are generic for VPBlockBases. They are
+  /// not specific for VPBasicBlocks or VPRegionBlocks.
+  bool verifyBlocksInRegion(const VPRegionBlock *Region);
+
+  /// Verify the CFG invariants of VPRegionBlock \p Region and its nested
+  /// VPBlockBases. Do not recurse inside nested VPRegionBlocks.
+  bool verifyRegion(const VPRegionBlock *Region);
+
+  /// Verify the CFG invariants of VPRegionBlock \p Region and its nested
+  /// VPBlockBases. Recurse inside nested VPRegionBlocks.
+  bool verifyRegionRec(const VPRegionBlock *Region);
+
+public:
+  VPlanVerifier(VPDominatorTree &VPDT) : VPDT(VPDT) {}
+
+  bool verify(const VPlan &Plan);
+};
+} // namespace
+
+bool VPlanVerifier::verifyPhiRecipes(const VPBasicBlock *VPBB) {
   auto RecipeI = VPBB->begin();
   auto End = VPBB->end();
   unsigned NumActiveLaneMaskPhiRecipes = 0;
@@ -80,8 +111,7 @@ static bool verifyPhiRecipes(const VPBasicBlock *VPBB) {
   return true;
 }
 
-static bool verifyVPBasicBlock(const VPBasicBlock *VPBB,
-                               const VPDominatorTree &VPDT) {
+bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) {
   if (!verifyPhiRecipes(VPBB))
     return false;
 
@@ -133,7 +163,7 @@ static bool hasDuplicates(const SmallVectorImpl<VPBlockBase *> &VPBlockVec) {
   return false;
 }
 
-static bool verifyBlock(const VPBlockBase *VPB, const VPDominatorTree &VPDT) {
+bool VPlanVerifier::verifyBlock(const VPBlockBase *VPB) {
   auto *VPBB = dyn_cast<VPBasicBlock>(VPB);
   // Check block's condition bit.
   if (VPB->getNumSuccessors() > 1 ||
@@ -193,14 +223,10 @@ static bool verifyBlock(const VPBlockBase *VPB, const VPDominatorTree &VPDT) {
       return false;
     }
   }
-  return !VPBB || verifyVPBasicBlock(VPBB, VPDT);
+  return !VPBB || verifyVPBasicBlock(VPBB);
 }
 
-/// Helper function that verifies the CFG invariants of the VPBlockBases within
-/// \p Region. Checks in this function are generic for VPBlockBases. They are
-/// not specific for VPBasicBlocks or VPRegionBlocks.
-static bool verifyBlocksInRegion(const VPRegionBlock *Region,
-                                 const VPDominatorTree &VPDT) {
+bool VPlanVerifier::verifyBlocksInRegion(const VPRegionBlock *Region) {
   for (const VPBlockBase *VPB : vp_depth_first_shallow(Region->getEntry())) {
     // Check block's parent.
     if (VPB->getParent() != Region) {
@@ -208,16 +234,13 @@ static bool verifyBlocksInRegion(const VPRegionBlock *Region,
       return false;
     }
 
-    if (!verifyBlock(VPB, VPDT))
+    if (!verifyBlock(VPB))
       return false;
   }
   return true;
 }
 
-/// Verify the CFG invariants of VPRegionBlock \p Region and its nested
-/// VPBlockBases. Do not recurse inside nested VPRegionBlocks.
-static bool verifyRegion(const VPRegionBlock *Region,
-                         const VPDominatorTree &VPDT) {
+bool VPlanVerifier::verifyRegion(const VPRegionBlock *Region) {
   const VPBlockBase *Entry = Region->getEntry();
   const VPBlockBase *Exiting = Region->getExiting();
 
@@ -231,33 +254,26 @@ static bool verifyRegion(const VPRegionBlock *Region,
     return false;
   }
 
-  return verifyBlocksInRegion(Region, VPDT);
+  return verifyBlocksInRegion(Region);
 }
 
-/// Verify the CFG invariants of VPRegionBlock \p Region and its nested
-/// VPBlockBases. Recurse inside nested VPRegionBlocks.
-static bool verifyRegionRec(const VPRegionBlock *Region,
-                            const VPDominatorTree &VPDT) {
+bool VPlanVerifier::verifyRegionRec(const VPRegionBlock *Region) {
   // Recurse inside nested regions and check all blocks inside the region.
-  return verifyRegion(Region, VPDT) &&
+  return verifyRegion(Region) &&
          all_of(vp_depth_first_shallow(Region->getEntry()),
-                [&VPDT](const VPBlockBase *VPB) {
+                [this](const VPBlockBase *VPB) {
                   const auto *SubRegion = dyn_cast<VPRegionBlock>(VPB);
-                  return !SubRegion || verifyRegionRec(SubRegion, VPDT);
+                  return !SubRegion || verifyRegionRec(SubRegion);
                 });
 }
 
-bool llvm::verifyVPlanIsValid(const VPlan &Plan) {
-  VPDominatorTree VPDT;
-  VPDT.recalculate(const_cast<VPlan &>(Plan));
-
-  if (any_of(
-          vp_depth_first_shallow(Plan.getEntry()),
-          [&VPDT](const VPBlockBase *VPB) { return !verifyBlock(VPB, VPDT); }))
+bool VPlanVerifier::verify(const VPlan &Plan) {
+  if (any_of(vp_depth_first_shallow(Plan.getEntry()),
+             [this](const VPBlockBase *VPB) { return !verifyBlock(VPB); }))
     return false;
 
   const VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
-  if (!verifyRegionRec(TopRegion, VPDT))
+  if (!verifyRegionRec(TopRegion))
     return false;
 
   if (TopRegion->getParent()) {
@@ -305,3 +321,10 @@ bool llvm::verifyVPlanIsValid(const VPlan &Plan) {
 
   return true;
 }
+
+bool llvm::verifyVPlanIsValid(const VPlan &Plan) {
+  VPDominatorTree VPDT;
+  VPDT.recalculate(const_cast<VPlan &>(Plan));
+  VPlanVerifier Verifier(VPDT);
+  return Verifier.verify(Plan);
+}

From 3db1f3110e714ad24f7d72114b3a2c14f6c63651 Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano@gmail.com>
Date: Wed, 29 May 2024 21:05:32 -0700
Subject: [PATCH 220/230] [clang-format] Fix a regression in annotating class
 decl braces (#93657)

Fixes #93604.
---
 clang/lib/Format/UnwrappedLineParser.cpp      | 3 +++
 clang/unittests/Format/TokenAnnotatorTest.cpp | 5 +++++
 2 files changed, 8 insertions(+)

diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index b6f7567adc1401..bf89def05bb2dd 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -4026,6 +4026,9 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr) {
       if (AngleNestingLevel == 0) {
         if (FormatTok->is(tok::colon)) {
           IsDerived = true;
+        } else if (FormatTok->is(tok::identifier) &&
+                   FormatTok->Previous->is(tok::coloncolon)) {
+          ClassName = FormatTok;
         } else if (FormatTok->is(tok::l_paren) &&
                    IsNonMacroIdentifier(FormatTok->Previous)) {
           break;
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 6ea9c4a241dc51..3339a749df3a5f 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -2914,6 +2914,11 @@ TEST_F(TokenAnnotatorTest, BraceKind) {
   EXPECT_BRACE_KIND(Tokens[5], BK_Block);
   EXPECT_BRACE_KIND(Tokens[6], BK_Block);
 
+  Tokens = annotate("struct Foo<int>::Bar {};");
+  ASSERT_EQ(Tokens.size(), 11u) << Tokens;
+  EXPECT_BRACE_KIND(Tokens[7], BK_Block);
+  EXPECT_BRACE_KIND(Tokens[8], BK_Block);
+
   Tokens = annotate("struct Foo<int> : Base {};");
   ASSERT_EQ(Tokens.size(), 11u) << Tokens;
   EXPECT_BRACE_KIND(Tokens[7], BK_Block);

From 32f1f5ee39985bbd0c8f21bf264a45cd5d4335f6 Mon Sep 17 00:00:00 2001
From: Pavel Samolysov <samolisov@gmail.com>
Date: Thu, 30 May 2024 07:10:26 +0300
Subject: [PATCH 221/230] [PGO] Add tests for modules with only globals and
 function declarations (#93764)

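When a module contains globals and/or function declarations only, the
'__llvm_profile_raw_version' variable should not be generated because
the module was not instrumented at all. The tests added here record the
current behavior, in which the variable is still emitted for such
modules.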

NFC
---
 .../available_externally_functions.ll           | 17 +++++++++++++++++
 .../Transforms/PGOProfile/declarations_only.ll  | 13 +++++++++++++
 .../PGOProfile/global_variables_only.ll         |  9 +++++++++
 3 files changed, 39 insertions(+)
 create mode 100644 llvm/test/Transforms/PGOProfile/available_externally_functions.ll
 create mode 100644 llvm/test/Transforms/PGOProfile/declarations_only.ll
 create mode 100644 llvm/test/Transforms/PGOProfile/global_variables_only.ll

diff --git a/llvm/test/Transforms/PGOProfile/available_externally_functions.ll b/llvm/test/Transforms/PGOProfile/available_externally_functions.ll
new file mode 100644
index 00000000000000..f455ca066aa7a4
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/available_externally_functions.ll
@@ -0,0 +1,17 @@
+; RUN: opt < %s -passes=pgo-instr-gen -S | FileCheck %s --check-prefix=GEN --check-prefix=GEN-COMDAT
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; GEN-COMDAT: $__llvm_profile_raw_version = comdat any
+; GEN-COMDAT: @__llvm_profile_raw_version = hidden constant i64 {{[0-9]+}}, comdat
+; GEN: @__profn_foo = linkonce_odr hidden constant [3 x i8] c"foo"
+; GEN: @__profn_bar = linkonce_odr hidden constant [3 x i8] c"bar"
+
+define available_externally hidden void @foo() {
+  ret void
+}
+
+define available_externally i32 @bar() {
+  ret i32 42
+}
diff --git a/llvm/test/Transforms/PGOProfile/declarations_only.ll b/llvm/test/Transforms/PGOProfile/declarations_only.ll
new file mode 100644
index 00000000000000..e7208fc264c7cb
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/declarations_only.ll
@@ -0,0 +1,13 @@
+; RUN: opt < %s -passes=pgo-instr-gen -S | FileCheck %s --check-prefix=GEN --check-prefix=GEN-COMDAT
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; GEN-COMDAT: $__llvm_profile_raw_version = comdat any
+; GEN-COMDAT: @__llvm_profile_raw_version = hidden constant i64 {{[0-9]+}}, comdat
+; GEN-NOT: @__profn_test_1 = private constant [6 x i8] c"test_1"
+; GEN-NOT: @__profn_test_2 = private constant [6 x i8] c"test_2"
+
+declare i32 @test_1(i32 %i)
+
+declare i32 @test_2(i32 %i)
diff --git a/llvm/test/Transforms/PGOProfile/global_variables_only.ll b/llvm/test/Transforms/PGOProfile/global_variables_only.ll
new file mode 100644
index 00000000000000..3bfa29af5d34f0
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/global_variables_only.ll
@@ -0,0 +1,9 @@
+; RUN: opt < %s -passes=pgo-instr-gen -S | FileCheck %s --check-prefix=GEN-COMDAT
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; GEN-COMDAT: $__llvm_profile_raw_version = comdat any
+; GEN-COMDAT: @__llvm_profile_raw_version = hidden constant i64 {{[0-9]+}}, comdat
+
+@var = internal unnamed_addr global [35 x ptr] zeroinitializer, align 16

From 02c6845c762dfd0a19d4a2f997990e160f392dae Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Wed, 29 May 2024 22:48:15 -0600
Subject: [PATCH 222/230] Revert "[DebugInfo] Add flag to only emit referenced
 member functions" (#93767)

Reverts llvm/llvm-project#87018

MacOS and Windows bots are broken.
---
 clang/include/clang/Basic/DebugOptions.def        |  2 --
 clang/include/clang/Driver/Options.td             |  4 ----
 clang/lib/CodeGen/CGDebugInfo.cpp                 |  2 +-
 clang/lib/Driver/ToolChains/Clang.cpp             | 15 ---------------
 .../CodeGenCXX/debug-info-incomplete-types.cpp    | 12 ------------
 clang/test/Driver/debug-options.c                 |  8 --------
 6 files changed, 1 insertion(+), 42 deletions(-)
 delete mode 100644 clang/test/CodeGenCXX/debug-info-incomplete-types.cpp

diff --git a/clang/include/clang/Basic/DebugOptions.def b/clang/include/clang/Basic/DebugOptions.def
index bc96d5dfdf890b..b94f6aef9ac60b 100644
--- a/clang/include/clang/Basic/DebugOptions.def
+++ b/clang/include/clang/Basic/DebugOptions.def
@@ -68,8 +68,6 @@ BENIGN_DEBUGOPT(NoInlineLineTables, 1, 0) ///< Whether debug info should contain
                                           ///< inline line tables.
 
 DEBUGOPT(DebugStrictDwarf, 1, 1) ///< Whether or not to use strict DWARF info.
-DEBUGOPT(DebugOmitUnreferencedMethods, 1, 0) ///< Omit unreferenced member
-					     ///< functions in type debug info.
 
 /// Control the Assignment Tracking debug info feature.
 BENIGN_ENUM_DEBUGOPT(AssignmentTrackingMode, AssignmentTrackingOpts, 2,
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index f64d7c60783e9d..4119e69c85540e 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4345,10 +4345,6 @@ defm strict_dwarf : BoolOption<"g", "strict-dwarf",
           "the specified version, avoiding features from later versions.">,
   NegFlag<SetFalse>, BothFlags<[], [ClangOption, CLOption, DXCOption]>>,
   Group<g_flags_Group>;
-defm omit_unreferenced_methods : BoolGOption<"omit-unreferenced-methods",
-  CodeGenOpts<"DebugOmitUnreferencedMethods">, DefaultFalse,
-  NegFlag<SetFalse>,
-  PosFlag<SetTrue, [], [CC1Option]>, BothFlags<[], [ClangOption, CLOption, DXCOption]>>;
 defm column_info : BoolOption<"g", "column-info",
   CodeGenOpts<"DebugColumnInfo">, DefaultTrue,
   NegFlag<SetFalse, [], [ClangOption, CC1Option]>,
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 5f6f911c7a6d69..9d7107abf8a6fe 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -2836,7 +2836,7 @@ CGDebugInfo::CreateTypeDefinition(const RecordType *Ty) {
 
   // Collect data fields (including static variables and any initializers).
   CollectRecordFields(RD, DefUnit, EltTys, FwdDecl);
-  if (CXXDecl && !CGM.getCodeGenOpts().DebugOmitUnreferencedMethods)
+  if (CXXDecl)
     CollectCXXMemberFunctions(CXXDecl, DefUnit, EltTys, FwdDecl);
 
   LexicalBlockStack.pop_back();
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 4e1c52462e5842..97e451cfe2acb4 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -45,7 +45,6 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/BinaryFormat/Magic.h"
 #include "llvm/Config/llvm-config.h"
-#include "llvm/Frontend/Debug/Options.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/CodeGen.h"
@@ -4643,7 +4642,6 @@ renderDebugOptions(const ToolChain &TC, const Driver &D, const llvm::Triple &T,
   Args.addOptInFlag(CmdArgs, options::OPT_fforce_dwarf_frame,
                     options::OPT_fno_force_dwarf_frame);
 
-  bool EnableTypeUnits = false;
   if (Args.hasFlag(options::OPT_fdebug_types_section,
                    options::OPT_fno_debug_types_section, false)) {
     if (!(T.isOSBinFormatELF() || T.isOSBinFormatWasm())) {
@@ -4654,24 +4652,11 @@ renderDebugOptions(const ToolChain &TC, const Driver &D, const llvm::Triple &T,
     } else if (checkDebugInfoOption(
                    Args.getLastArg(options::OPT_fdebug_types_section), Args, D,
                    TC)) {
-      EnableTypeUnits = true;
       CmdArgs.push_back("-mllvm");
       CmdArgs.push_back("-generate-type-units");
     }
   }
 
-  if (const Arg *A =
-          Args.getLastArg(options::OPT_gomit_unreferenced_methods,
-                          options::OPT_gno_omit_unreferenced_methods))
-    (void)checkDebugInfoOption(A, Args, D, TC);
-  if (Args.hasFlag(options::OPT_gomit_unreferenced_methods,
-                   options::OPT_gno_omit_unreferenced_methods, false) &&
-      (DebugInfoKind == llvm::codegenoptions::DebugInfoConstructor ||
-       DebugInfoKind == llvm::codegenoptions::LimitedDebugInfo) &&
-      !EnableTypeUnits) {
-    CmdArgs.push_back("-gomit-unreferenced-methods");
-  }
-
   // To avoid join/split of directory+filename, the integrated assembler prefers
   // the directory form of .file on all DWARF versions. GNU as doesn't allow the
   // form before DWARF v5.
diff --git a/clang/test/CodeGenCXX/debug-info-incomplete-types.cpp b/clang/test/CodeGenCXX/debug-info-incomplete-types.cpp
deleted file mode 100644
index 0bf59233b4e2eb..00000000000000
--- a/clang/test/CodeGenCXX/debug-info-incomplete-types.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-// RUN: %clang_cc1 -debug-info-kind=limited -gomit-unreferenced-methods %s -emit-llvm -o - | FileCheck %s
-
-struct t1 {
-  void f1();
-  void f2();
-};
-
-void t1::f1() { }
-
-// CHECK: distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t1"
-// CHECK-SAME: elements: [[ELEMENTS:![0-9]+]]
-// CHECK: [[ELEMENTS]] = !{}
diff --git a/clang/test/Driver/debug-options.c b/clang/test/Driver/debug-options.c
index b09238d7b6bb66..7d061410a229f0 100644
--- a/clang/test/Driver/debug-options.c
+++ b/clang/test/Driver/debug-options.c
@@ -242,11 +242,6 @@
 // RUN: %clang -### -c %s 2>&1 | FileCheck -check-prefix=NORNGBSE %s
 // RUN: %clang -### -c -fdebug-ranges-base-address -fno-debug-ranges-base-address %s 2>&1 | FileCheck -check-prefix=NORNGBSE %s
 //
-// RUN: %clang -### -c -gomit-unreferenced-methods %s 2>&1 | FileCheck -check-prefix=INCTYPES %s
-// RUN: %clang -### -c %s 2>&1 | FileCheck -check-prefix=NOINCTYPES %s
-// RUN: %clang -### -c -gomit-unreferenced-methods -fdebug-types-section %s 2>&1 | FileCheck -check-prefix=NOINCTYPES %s
-// RUN: %clang -### -c -gomit-unreferenced-methods -fstandalone-debug %s 2>&1 | FileCheck -check-prefix=NOINCTYPES %s
-//
 // RUN: %clang -### -c -glldb %s 2>&1 | FileCheck -check-prefix=NOPUB %s
 // RUN: %clang -### -c -glldb -gno-pubnames %s 2>&1 | FileCheck -check-prefix=NOPUB %s
 //
@@ -386,9 +381,6 @@
 // RNGBSE: -fdebug-ranges-base-address
 // NORNGBSE-NOT: -fdebug-ranges-base-address
 //
-// INCTYPES: -gomit-unreferenced-methods
-// NOINCTYPES-NOT: -gomit-unreferenced-methods
-//
 // GARANGE-DAG: -generate-arange-section
 //
 // FDTS: "-mllvm" "-generate-type-units"

From 8890209ead2246461985f49c4c9c01cc2371ac09 Mon Sep 17 00:00:00 2001
From: Helena Kotas <hekotas@microsoft.com>
Date: Wed, 29 May 2024 21:52:20 -0700
Subject: [PATCH 223/230] [HLSL] Default and Relaxed Availability Diagnostics
 (#92704)

Implements HLSL availability diagnostics' default and relaxed mode.

HLSL availability diagnostics emit errors or warnings when unavailable
shader APIs are used. Unavailable shader APIs are APIs that are exposed
in HLSL code but are not available in the target shader stage or shader
model version.

In the default mode the compiler emits an error when an unavailable API
is found in code that is reachable from the shader entry point
function. In the future this check will also be extended to exported
library functions (#92073). The relaxed diagnostic mode is the same
except the compiler emits a warning. This mode is enabled by
``-Wno-error=hlsl-availability``.

See HLSL Availability Diagnostics design doc
[here](https://github.com/llvm/llvm-project/blob/main/clang/docs/HLSL/AvailabilityDiagnostics.rst)
for more details.

Fixes #90095
---
 clang/include/clang/Basic/Attr.td             |  45 ++-
 clang/include/clang/Basic/DiagnosticGroups.td |   3 +
 .../clang/Basic/DiagnosticSemaKinds.td        |   7 +
 clang/include/clang/Sema/SemaHLSL.h           |   1 +
 clang/lib/AST/DeclBase.cpp                    |   3 +-
 clang/lib/Sema/Sema.cpp                       |   4 +
 clang/lib/Sema/SemaAvailability.cpp           |  24 +-
 clang/lib/Sema/SemaHLSL.cpp                   | 297 ++++++++++++++++++
 .../attr-availability-compute.hlsl            |  19 +-
 .../Availability/attr-availability-mesh.hlsl  |  19 +-
 .../Availability/attr-availability-pixel.hlsl |   6 +-
 .../avail-diag-default-compute.hlsl           | 119 +++++++
 .../Availability/avail-diag-default-lib.hlsl  | 130 ++++++++
 .../avail-diag-relaxed-compute.hlsl           | 119 +++++++
 .../Availability/avail-diag-relaxed-lib.hlsl  | 130 ++++++++
 .../avail-lib-multiple-stages.hlsl            |  57 ++++
 .../SemaHLSL/WaveBuiltinAvailability.hlsl     |   9 +-
 17 files changed, 941 insertions(+), 51 deletions(-)
 create mode 100644 clang/test/SemaHLSL/Availability/avail-diag-default-compute.hlsl
 create mode 100644 clang/test/SemaHLSL/Availability/avail-diag-default-lib.hlsl
 create mode 100644 clang/test/SemaHLSL/Availability/avail-diag-relaxed-compute.hlsl
 create mode 100644 clang/test/SemaHLSL/Availability/avail-diag-relaxed-lib.hlsl
 create mode 100644 clang/test/SemaHLSL/Availability/avail-lib-multiple-stages.hlsl

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index ef9df1e9d8b4aa..2665b7353ca4a5 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1060,18 +1060,10 @@ static llvm::StringRef canonicalizePlatformName(llvm::StringRef Platform) {
              .Case("ShaderModel", "shadermodel")
              .Default(Platform);
 }
-static llvm::StringRef getPrettyEnviromentName(llvm::StringRef Environment) {
-    return llvm::StringSwitch<llvm::StringRef>(Environment)
-             .Case("pixel", "pixel shader")
-             .Case("vertex", "vertex shader")
-             .Case("geometry", "geometry shader")
-             .Case("hull", "hull shader")
-             .Case("domain", "domain shader")
-             .Case("compute", "compute shader")
-             .Case("mesh", "mesh shader")
-             .Case("amplification", "amplification shader")
-             .Case("library", "shader library")
-             .Default(Environment);
+static llvm::StringRef getPrettyEnviromentName(llvm::Triple::EnvironmentType EnvironmentType) {
+  if (EnvironmentType >= llvm::Triple::Pixel && EnvironmentType <= llvm::Triple::Amplification)
+    return llvm::Triple::getEnvironmentTypeName(EnvironmentType);
+  return "";
 }
 static llvm::Triple::EnvironmentType getEnvironmentType(llvm::StringRef Environment) {
     return llvm::StringSwitch<llvm::Triple::EnvironmentType>(Environment)
@@ -1081,6 +1073,12 @@ static llvm::Triple::EnvironmentType getEnvironmentType(llvm::StringRef Environm
              .Case("hull", llvm::Triple::Hull)
              .Case("domain", llvm::Triple::Domain)
              .Case("compute", llvm::Triple::Compute)
+             .Case("raygeneration", llvm::Triple::RayGeneration)
+             .Case("intersection", llvm::Triple::Intersection)
+             .Case("anyhit", llvm::Triple::AnyHit)
+             .Case("closesthit", llvm::Triple::ClosestHit)
+             .Case("miss", llvm::Triple::Miss)
+             .Case("callable", llvm::Triple::Callable)
              .Case("mesh", llvm::Triple::Mesh)
              .Case("amplification", llvm::Triple::Amplification)
              .Case("library", llvm::Triple::Library)
@@ -4480,6 +4478,29 @@ def HLSLShader : InheritableAttr {
                   "Miss", "Callable", "Mesh", "Amplification"]>
   ];
   let Documentation = [HLSLSV_ShaderTypeAttrDocs];
+  let AdditionalMembers =
+[{
+  static const unsigned ShaderTypeMaxValue = (unsigned)HLSLShaderAttr::Amplification;
+
+  static llvm::Triple::EnvironmentType getTypeAsEnvironment(HLSLShaderAttr::ShaderType ShaderType) {
+    switch (ShaderType) {
+      case HLSLShaderAttr::Pixel:         return llvm::Triple::Pixel;
+      case HLSLShaderAttr::Vertex:        return llvm::Triple::Vertex;
+      case HLSLShaderAttr::Geometry:      return llvm::Triple::Geometry;
+      case HLSLShaderAttr::Hull:          return llvm::Triple::Hull;
+      case HLSLShaderAttr::Domain:        return llvm::Triple::Domain;
+      case HLSLShaderAttr::Compute:       return llvm::Triple::Compute;
+      case HLSLShaderAttr::RayGeneration: return llvm::Triple::RayGeneration;
+      case HLSLShaderAttr::Intersection:  return llvm::Triple::Intersection;
+      case HLSLShaderAttr::AnyHit:        return llvm::Triple::AnyHit;
+      case HLSLShaderAttr::ClosestHit:    return llvm::Triple::ClosestHit;
+      case HLSLShaderAttr::Miss:          return llvm::Triple::Miss;
+      case HLSLShaderAttr::Callable:      return llvm::Triple::Callable;
+      case HLSLShaderAttr::Mesh:          return llvm::Triple::Mesh;
+      case HLSLShaderAttr::Amplification: return llvm::Triple::Amplification;
+    }
+  }
+}];
 }
 
 def HLSLResource : InheritableAttr {
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 6b595a35679329..7d5ba7869ec340 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -1517,6 +1517,9 @@ def HLSLMixPackOffset : DiagGroup<"mix-packoffset">;
 // Warnings for DXIL validation
 def DXILValidation : DiagGroup<"dxil-validation">;
 
+// Warning for HLSL API availability
+def HLSLAvailability : DiagGroup<"hlsl-availability">;
+
 // Warnings and notes related to const_var_decl_type attribute checks
 def ReadOnlyPlacementChecks : DiagGroup<"read-only-types">;
 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index f15cba63624ea6..e34eb692941b4f 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -12239,6 +12239,13 @@ def err_hlsl_param_qualifier_mismatch :
 def warn_hlsl_impcast_vector_truncation : Warning<
   "implicit conversion truncates vector: %0 to %1">, InGroup<Conversion>;
 
+def warn_hlsl_availability : Warning<
+  "%0 is only available %select{|in %4 environment }3on %1 %2 or newer">,
+  InGroup<HLSLAvailability>, DefaultError;
+def warn_hlsl_availability_unavailable :
+  Warning<err_unavailable.Summary>,
+  InGroup<HLSLAvailability>, DefaultError;
+
 // Layout randomization diagnostics.
 def err_non_designated_init_used : Error<
   "a randomized struct can only be initialized with a designated initializer">;
diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h
index 34acaf19517f2a..eac1f7c07c85de 100644
--- a/clang/include/clang/Sema/SemaHLSL.h
+++ b/clang/include/clang/Sema/SemaHLSL.h
@@ -49,6 +49,7 @@ class SemaHLSL : public SemaBase {
   void DiagnoseAttrStageMismatch(
       const Attr *A, HLSLShaderAttr::ShaderType Stage,
       std::initializer_list<HLSLShaderAttr::ShaderType> AllowedStages);
+  void DiagnoseAvailabilityViolations(TranslationUnitDecl *TU);
 };
 
 } // namespace clang
diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp
index 65d5eeb6354eba..ffb22194bce529 100644
--- a/clang/lib/AST/DeclBase.cpp
+++ b/clang/lib/AST/DeclBase.cpp
@@ -669,7 +669,8 @@ static AvailabilityResult CheckAvailability(ASTContext &Context,
     IdentifierInfo *IIEnv = A->getEnvironment();
     StringRef TargetEnv =
         Context.getTargetInfo().getTriple().getEnvironmentName();
-    StringRef EnvName = AvailabilityAttr::getPrettyEnviromentName(TargetEnv);
+    StringRef EnvName = AvailabilityAttr::getPrettyEnviromentName(
+        Context.getTargetInfo().getTriple().getEnvironment());
     // Matching environment or no environment on attribute
     if (!IIEnv || (!TargetEnv.empty() && IIEnv->getName() == TargetEnv)) {
       if (Message) {
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index d1fb21bb1ae1d6..39a9a431728ff7 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -1357,6 +1357,10 @@ void Sema::ActOnEndOfTranslationUnit() {
     Consumer.CompleteExternalDeclaration(D);
   }
 
+  if (LangOpts.HLSL)
+    HLSL().DiagnoseAvailabilityViolations(
+        getASTContext().getTranslationUnitDecl());
+
   // If there were errors, disable 'unused' warnings since they will mostly be
   // noise. Don't warn for a use from a module: either we should warn on all
   // file-scope declarations in modules or not at all, but whether the
diff --git a/clang/lib/Sema/SemaAvailability.cpp b/clang/lib/Sema/SemaAvailability.cpp
index 22f5a2f6634778..330cd602297d46 100644
--- a/clang/lib/Sema/SemaAvailability.cpp
+++ b/clang/lib/Sema/SemaAvailability.cpp
@@ -15,6 +15,7 @@
 #include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/Basic/DiagnosticSema.h"
 #include "clang/Basic/IdentifierTable.h"
+#include "clang/Basic/LangOptions.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Sema/DelayedDiagnostic.h"
@@ -228,8 +229,9 @@ shouldDiagnoseAvailabilityByDefault(const ASTContext &Context,
     ForceAvailabilityFromVersion = VersionTuple(/*Major=*/10, /*Minor=*/13);
     break;
   case llvm::Triple::ShaderModel:
-    // Always enable availability diagnostics for shader models.
-    return true;
+    // FIXME: This will be updated when HLSL strict diagnostic mode
+    // is implemented (issue #90096)
+    return false;
   default:
     // New targets should always warn about availability.
     return Triple.getVendor() == llvm::Triple::Apple;
@@ -409,10 +411,11 @@ static void DoEmitAvailabilityWarning(Sema &S, AvailabilityResult K,
     std::string PlatformName(
         AvailabilityAttr::getPrettyPlatformName(TI.getPlatformName()));
     llvm::StringRef TargetEnvironment(AvailabilityAttr::getPrettyEnviromentName(
-        TI.getTriple().getEnvironmentName()));
+        TI.getTriple().getEnvironment()));
     llvm::StringRef AttrEnvironment =
         AA->getEnvironment() ? AvailabilityAttr::getPrettyEnviromentName(
-                                   AA->getEnvironment()->getName())
+                                   AvailabilityAttr::getEnvironmentType(
+                                       AA->getEnvironment()->getName()))
                              : "";
     bool UseEnvironment =
         (!AttrEnvironment.empty() && !TargetEnvironment.empty());
@@ -438,6 +441,10 @@ static void DoEmitAvailabilityWarning(Sema &S, AvailabilityResult K,
         << S.Context.getTargetInfo().getPlatformMinVersion().getAsString()
         << UseEnvironment << AttrEnvironment << TargetEnvironment;
 
+    // Do not offer to silence the warning or fixits for HLSL
+    if (S.getLangOpts().HLSL)
+      return;
+
     if (const auto *Enclosing = findEnclosingDeclToAnnotate(Ctx)) {
       if (const auto *TD = dyn_cast<TagDecl>(Enclosing))
         if (TD->getDeclName().isEmpty()) {
@@ -839,10 +846,11 @@ void DiagnoseUnguardedAvailability::DiagnoseDeclAvailability(
     std::string PlatformName(
         AvailabilityAttr::getPrettyPlatformName(TI.getPlatformName()));
     llvm::StringRef TargetEnvironment(AvailabilityAttr::getPrettyEnviromentName(
-        TI.getTriple().getEnvironmentName()));
+        TI.getTriple().getEnvironment()));
     llvm::StringRef AttrEnvironment =
         AA->getEnvironment() ? AvailabilityAttr::getPrettyEnviromentName(
-                                   AA->getEnvironment()->getName())
+                                   AvailabilityAttr::getEnvironmentType(
+                                       AA->getEnvironment()->getName()))
                              : "";
     bool UseEnvironment =
         (!AttrEnvironment.empty() && !TargetEnvironment.empty());
@@ -865,6 +873,10 @@ void DiagnoseUnguardedAvailability::DiagnoseDeclAvailability(
         << SemaRef.Context.getTargetInfo().getPlatformMinVersion().getAsString()
         << UseEnvironment << AttrEnvironment << TargetEnvironment;
 
+    // Do not offer to silence the warning or fixits for HLSL
+    if (SemaRef.getLangOpts().HLSL)
+      return;
+
     auto FixitDiag =
         SemaRef.Diag(Range.getBegin(), diag::note_unguarded_available_silence)
         << Range << D
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 6a12c417e2f3a4..9e614ae99f37d2 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -9,6 +9,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Sema/SemaHLSL.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/Expr.h"
+#include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/Basic/DiagnosticSema.h"
 #include "clang/Basic/LLVM.h"
 #include "clang/Basic/TargetInfo.h"
@@ -16,6 +19,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/TargetParser/Triple.h"
 #include <iterator>
@@ -290,3 +294,296 @@ void SemaHLSL::DiagnoseAttrStageMismatch(
       << A << HLSLShaderAttr::ConvertShaderTypeToStr(Stage)
       << (AllowedStages.size() != 1) << join(StageStrings, ", ");
 }
+
+namespace {
+
+/// This class implements HLSL availability diagnostics for default
+/// and relaxed mode
+///
+/// The goal of this diagnostic is to emit an error or warning when an
+/// unavailable API is found in code that is reachable from the shader
+/// entry function or from an exported function (when compiling a shader
+/// library).
+///
+/// This is done by traversing the AST of all shader entry point functions
+/// and of all exported functions, and any functions that are referenced
+/// from this AST. In other words, any functions that are reachable from
+/// the entry points.
+class DiagnoseHLSLAvailability
+    : public RecursiveASTVisitor<DiagnoseHLSLAvailability> {
+
+  Sema &SemaRef;
+
+  // Stack of functions to be scanned
+  llvm::SmallVector<const FunctionDecl *, 8> DeclsToScan;
+
+  // Tracks which environments functions have been scanned in.
+  //
+  // Maps FunctionDecl to an unsigned number that represents the set of shader
+  // environments the function has been scanned for.
+  // Since HLSLShaderAttr::ShaderType enum is generated from Attr.td and is
+  // defined without any assigned values, it is guaranteed to be numbered
+  // sequentially from 0 up and we can use it to 'index' individual bits
+  // in the set.
+  // The N'th bit in the set will be set if the function has been scanned
+  // in shader environment whose ShaderType integer value equals N.
+  // For example, if a function has been scanned in compute and pixel stage
+  // environment, the value will be 0x21 (100001 binary) because
+  // (int)HLSLShaderAttr::ShaderType::Pixel == 1 and
+  // (int)HLSLShaderAttr::ShaderType::Compute == 5.
+  // A FunctionDecl is mapped to 0 (or not included in the map) if it has not
+  // been scanned in any environment.
+  llvm::DenseMap<const FunctionDecl *, unsigned> ScannedDecls;
+
+  // Do not access these directly, use the get/set methods below to make
+  // sure the values are in sync
+  llvm::Triple::EnvironmentType CurrentShaderEnvironment;
+  unsigned CurrentShaderStageBit;
+
+  // True if scanning a function that was already scanned in a different
+  // shader stage context, and therefore we should not report issues that
+  // depend only on shader model version because they would be duplicate.
+  bool ReportOnlyShaderStageIssues;
+
+  // Helper methods for dealing with current stage context / environment
+  void SetShaderStageContext(HLSLShaderAttr::ShaderType ShaderType) {
+    static_assert(sizeof(unsigned) >= 4);
+    assert((unsigned)ShaderType < 31); // 31 is reserved for "unknown"
+
+    CurrentShaderEnvironment = HLSLShaderAttr::getTypeAsEnvironment(ShaderType);
+    CurrentShaderStageBit = (1 << ShaderType);
+  }
+
+  void SetUnknownShaderStageContext() {
+    CurrentShaderEnvironment = llvm::Triple::UnknownEnvironment;
+    CurrentShaderStageBit = (1 << 31);
+  }
+
+  llvm::Triple::EnvironmentType GetCurrentShaderEnvironment() const {
+    return CurrentShaderEnvironment;
+  }
+
+  bool InUnknownShaderStageContext() const {
+    return CurrentShaderEnvironment == llvm::Triple::UnknownEnvironment;
+  }
+
+  // Helper methods for dealing with shader stage bitmap
+  void AddToScannedFunctions(const FunctionDecl *FD) {
+    unsigned &ScannedStages = ScannedDecls.getOrInsertDefault(FD);
+    ScannedStages |= CurrentShaderStageBit;
+  }
+
+  unsigned GetScannedStages(const FunctionDecl *FD) {
+    return ScannedDecls.getOrInsertDefault(FD);
+  }
+
+  bool WasAlreadyScannedInCurrentStage(const FunctionDecl *FD) {
+    return WasAlreadyScannedInCurrentStage(GetScannedStages(FD));
+  }
+
+  bool WasAlreadyScannedInCurrentStage(unsigned ScannerStages) {
+    return ScannerStages & CurrentShaderStageBit;
+  }
+
+  static bool NeverBeenScanned(unsigned ScannedStages) {
+    return ScannedStages == 0;
+  }
+
+  // Scanning methods
+  void HandleFunctionOrMethodRef(FunctionDecl *FD, Expr *RefExpr);
+  void CheckDeclAvailability(NamedDecl *D, const AvailabilityAttr *AA,
+                             SourceRange Range);
+  const AvailabilityAttr *FindAvailabilityAttr(const Decl *D);
+  bool HasMatchingEnvironmentOrNone(const AvailabilityAttr *AA);
+
+public:
+  DiagnoseHLSLAvailability(Sema &SemaRef) : SemaRef(SemaRef) {}
+
+  // AST traversal methods
+  void RunOnTranslationUnit(const TranslationUnitDecl *TU);
+  void RunOnFunction(const FunctionDecl *FD);
+
+  bool VisitDeclRefExpr(DeclRefExpr *DRE) {
+    FunctionDecl *FD = llvm::dyn_cast<FunctionDecl>(DRE->getDecl());
+    if (FD)
+      HandleFunctionOrMethodRef(FD, DRE);
+    return true;
+  }
+
+  bool VisitMemberExpr(MemberExpr *ME) {
+    FunctionDecl *FD = llvm::dyn_cast<FunctionDecl>(ME->getMemberDecl());
+    if (FD)
+      HandleFunctionOrMethodRef(FD, ME);
+    return true;
+  }
+};
+
+void DiagnoseHLSLAvailability::HandleFunctionOrMethodRef(FunctionDecl *FD,
+                                                         Expr *RefExpr) {
+  assert((isa<DeclRefExpr>(RefExpr) || isa<MemberExpr>(RefExpr)) &&
+         "expected DeclRefExpr or MemberExpr");
+
+  // has a definition -> add to stack to be scanned
+  const FunctionDecl *FDWithBody = nullptr;
+  if (FD->hasBody(FDWithBody)) {
+    if (!WasAlreadyScannedInCurrentStage(FDWithBody))
+      DeclsToScan.push_back(FDWithBody);
+    return;
+  }
+
+  // no body -> diagnose availability
+  const AvailabilityAttr *AA = FindAvailabilityAttr(FD);
+  if (AA)
+    CheckDeclAvailability(
+        FD, AA, SourceRange(RefExpr->getBeginLoc(), RefExpr->getEndLoc()));
+}
+
+void DiagnoseHLSLAvailability::RunOnTranslationUnit(
+    const TranslationUnitDecl *TU) {
+  // Iterate over all shader entry functions and library exports, and for those
+  // that have a body (definition), run diag scan on each, setting appropriate
+  // shader environment context based on whether it is a shader entry function
+  // or an exported function.
+  for (auto &D : TU->decls()) {
+    const FunctionDecl *FD = llvm::dyn_cast<FunctionDecl>(D);
+    if (!FD || !FD->isThisDeclarationADefinition())
+      continue;
+
+    // shader entry point
+    auto ShaderAttr = FD->getAttr<HLSLShaderAttr>();
+    if (ShaderAttr) {
+      SetShaderStageContext(ShaderAttr->getType());
+      RunOnFunction(FD);
+      continue;
+    }
+    // exported library function with definition
+    // FIXME: tracking issue #92073
+#if 0
+    if (FD->getFormalLinkage() == Linkage::External) {
+      SetUnknownShaderStageContext();
+      RunOnFunction(FD);
+    }
+#endif
+  }
+}
+
+void DiagnoseHLSLAvailability::RunOnFunction(const FunctionDecl *FD) {
+  assert(DeclsToScan.empty() && "DeclsToScan should be empty");
+  DeclsToScan.push_back(FD);
+
+  while (!DeclsToScan.empty()) {
+    // Take one decl from the stack and check it by traversing its AST.
+    // For any CallExpr found during the traversal add its callee to the top of
+    // the stack to be processed next. Functions already processed are stored in
+    // ScannedDecls.
+    const FunctionDecl *FD = DeclsToScan.back();
+    DeclsToScan.pop_back();
+
+    // Decl was already scanned
+    const unsigned ScannedStages = GetScannedStages(FD);
+    if (WasAlreadyScannedInCurrentStage(ScannedStages))
+      continue;
+
+    ReportOnlyShaderStageIssues = !NeverBeenScanned(ScannedStages);
+
+    AddToScannedFunctions(FD);
+    TraverseStmt(FD->getBody());
+  }
+}
+
+bool DiagnoseHLSLAvailability::HasMatchingEnvironmentOrNone(
+    const AvailabilityAttr *AA) {
+  IdentifierInfo *IIEnvironment = AA->getEnvironment();
+  if (!IIEnvironment)
+    return true;
+
+  llvm::Triple::EnvironmentType CurrentEnv = GetCurrentShaderEnvironment();
+  if (CurrentEnv == llvm::Triple::UnknownEnvironment)
+    return false;
+
+  llvm::Triple::EnvironmentType AttrEnv =
+      AvailabilityAttr::getEnvironmentType(IIEnvironment->getName());
+
+  return CurrentEnv == AttrEnv;
+}
+
+const AvailabilityAttr *
+DiagnoseHLSLAvailability::FindAvailabilityAttr(const Decl *D) {
+  AvailabilityAttr const *PartialMatch = nullptr;
+  // Check each AvailabilityAttr to find the one for this platform.
+  // For multiple attributes with the same platform try to find one for this
+  // environment.
+  for (const auto *A : D->attrs()) {
+    if (const auto *Avail = dyn_cast<AvailabilityAttr>(A)) {
+      StringRef AttrPlatform = Avail->getPlatform()->getName();
+      StringRef TargetPlatform =
+          SemaRef.getASTContext().getTargetInfo().getPlatformName();
+
+      // Match the platform name.
+      if (AttrPlatform == TargetPlatform) {
+        // Find the best matching attribute for this environment
+        if (HasMatchingEnvironmentOrNone(Avail))
+          return Avail;
+        PartialMatch = Avail;
+      }
+    }
+  }
+  return PartialMatch;
+}
+
+// Check availability against target shader model version and current shader
+// stage and emit diagnostic
+void DiagnoseHLSLAvailability::CheckDeclAvailability(NamedDecl *D,
+                                                     const AvailabilityAttr *AA,
+                                                     SourceRange Range) {
+  if (ReportOnlyShaderStageIssues && !AA->getEnvironment())
+    return;
+
+  bool EnvironmentMatches = HasMatchingEnvironmentOrNone(AA);
+  VersionTuple Introduced = AA->getIntroduced();
+  VersionTuple TargetVersion =
+      SemaRef.Context.getTargetInfo().getPlatformMinVersion();
+
+  if (TargetVersion >= Introduced && EnvironmentMatches)
+    return;
+
+  // Do not diagnose shader-stage-specific availability when the shader stage
+  // context is unknown
+  if (InUnknownShaderStageContext() && AA->getEnvironment() != nullptr)
+    return;
+
+  // Emit diagnostic message
+  const TargetInfo &TI = SemaRef.getASTContext().getTargetInfo();
+  llvm::StringRef PlatformName(
+      AvailabilityAttr::getPrettyPlatformName(TI.getPlatformName()));
+
+  llvm::StringRef CurrentEnvStr =
+      AvailabilityAttr::getPrettyEnviromentName(GetCurrentShaderEnvironment());
+
+  llvm::StringRef AttrEnvStr = AA->getEnvironment()
+                                   ? AvailabilityAttr::getPrettyEnviromentName(
+                                         AvailabilityAttr::getEnvironmentType(
+                                             AA->getEnvironment()->getName()))
+                                   : "";
+  bool UseEnvironment = !AttrEnvStr.empty();
+
+  if (EnvironmentMatches) {
+    SemaRef.Diag(Range.getBegin(), diag::warn_hlsl_availability)
+        << Range << D << PlatformName << Introduced.getAsString()
+        << UseEnvironment << CurrentEnvStr;
+  } else {
+    SemaRef.Diag(Range.getBegin(), diag::warn_hlsl_availability_unavailable)
+        << Range << D;
+  }
+
+  SemaRef.Diag(D->getLocation(), diag::note_partial_availability_specified_here)
+      << D << PlatformName << Introduced.getAsString()
+      << SemaRef.Context.getTargetInfo().getPlatformMinVersion().getAsString()
+      << UseEnvironment << AttrEnvStr << CurrentEnvStr;
+}
+
+} // namespace
+
+void SemaHLSL::DiagnoseAvailabilityViolations(TranslationUnitDecl *TU) {
+  DiagnoseHLSLAvailability(SemaRef).RunOnTranslationUnit(TU);
+}
diff --git a/clang/test/SemaHLSL/Availability/attr-availability-compute.hlsl b/clang/test/SemaHLSL/Availability/attr-availability-compute.hlsl
index 8fa696ea116498..2f488a8d7c3571 100644
--- a/clang/test/SemaHLSL/Availability/attr-availability-compute.hlsl
+++ b/clang/test/SemaHLSL/Availability/attr-availability-compute.hlsl
@@ -38,33 +38,28 @@ unsigned f8();
 
 [numthreads(4,1,1)]
 int main() {
-    // expected-warning@#f1_call {{'f1' is only available on Shader Model 6.0 or newer}}
+    // expected-error@#f1_call {{'f1' is only available on Shader Model 6.0 or newer}}
     // expected-note@#f1 {{'f1' has been marked as being introduced in Shader Model 6.0 here, but the deployment target is Shader Model 5.0}}
-    // expected-note@#f1_call {{enclose 'f1' in a __builtin_available check to silence this warning}}
     unsigned A = f1(); // #f1_call
 
-    // expected-warning@#f2_call {{'f2' is only available on Shader Model 5.1 or newer}}
+    // expected-error@#f2_call {{'f2' is only available on Shader Model 5.1 or newer}}
     // expected-note@#f2 {{'f2' has been marked as being introduced in Shader Model 5.1 here, but the deployment target is Shader Model 5.0}}
-    // expected-note@#f2_call {{enclose 'f2' in a __builtin_available check to silence this warning}}
     unsigned B = f2(); // #f2_call
 
     unsigned C = f3();
 
-    // expected-warning@#f4_call {{'f4' is only available on Shader Model 6.0 or newer}}
+    // expected-error@#f4_call {{'f4' is only available on Shader Model 6.0 or newer}}
     // expected-note@#f4 {{'f4' has been marked as being introduced in Shader Model 6.0 here, but the deployment target is Shader Model 5.0}}
-    // expected-note@#f4_call {{enclose 'f4' in a __builtin_available check to silence this warning}}
     unsigned D = f4(); // #f4_call
 
     unsigned E = f5();
 
-    // expected-warning@#f6_call {{'f6' is only available in compute shader environment on Shader Model 6.0 or newer}}
-    // expected-note@#f6 {{'f6' has been marked as being introduced in Shader Model 6.0 in compute shader environment here, but the deployment target is Shader Model 5.0}}
-    // expected-note@#f6_call {{enclose 'f6' in a __builtin_available check to silence this warning}}
+    // expected-error@#f6_call {{'f6' is only available in compute environment on Shader Model 6.0 or newer}}
+    // expected-note@#f6 {{'f6' has been marked as being introduced in Shader Model 6.0 in compute environment here, but the deployment target is Shader Model 5.0}}
     unsigned F = f6(); // #f6_call
 
-    // expected-warning@#f7_call {{'f7' is unavailable}}
-    // expected-note@#f7 {{'f7' has been marked as being introduced in Shader Model 6.0 in mesh shader environment here, but the deployment target is Shader Model 5.0 compute shader environment}}
-    // expected-note@#f7_call {{enclose 'f7' in a __builtin_available check to silence this warning}}
+    // expected-error@#f7_call {{'f7' is unavailable}}
+    // expected-note@#f7 {{'f7' has been marked as being introduced in Shader Model 6.0 in mesh environment here, but the deployment target is Shader Model 5.0 compute environment}}
     unsigned G = f7(); // #f7_call
 
     unsigned H = f8();
diff --git a/clang/test/SemaHLSL/Availability/attr-availability-mesh.hlsl b/clang/test/SemaHLSL/Availability/attr-availability-mesh.hlsl
index 40a7ddbb1de988..07da116d403ce7 100644
--- a/clang/test/SemaHLSL/Availability/attr-availability-mesh.hlsl
+++ b/clang/test/SemaHLSL/Availability/attr-availability-mesh.hlsl
@@ -38,35 +38,30 @@ unsigned f8(); // #f8
 
 [numthreads(4,1,1)]
 int main() {
-    // expected-warning@#f1_call {{'f1' is only available on Shader Model 6.0 or newer}}
+    // expected-error@#f1_call {{'f1' is only available on Shader Model 6.0 or newer}}
     // expected-note@#f1 {{'f1' has been marked as being introduced in Shader Model 6.0 here, but the deployment target is Shader Model 5.0}}
-    // expected-note@#f1_call {{enclose 'f1' in a __builtin_available check to silence this warning}}
     unsigned A = f1(); // #f1_call
 
-    // expected-warning@#f2_call {{'f2' is only available on Shader Model 5.1 or newer}}
+    // expected-error@#f2_call {{'f2' is only available on Shader Model 5.1 or newer}}
     // expected-note@#f2 {{'f2' has been marked as being introduced in Shader Model 5.1 here, but the deployment target is Shader Model 5.0}}
-    // expected-note@#f2_call {{enclose 'f2' in a __builtin_available check to silence this warning}}
     unsigned B = f2(); // #f2_call
 
     unsigned C = f3();
 
-    // expected-warning@#f4_call {{'f4' is only available on Shader Model 6.0 or newer}}
+    // expected-error@#f4_call {{'f4' is only available on Shader Model 6.0 or newer}}
     // expected-note@#f4 {{'f4' has been marked as being introduced in Shader Model 6.0 here, but the deployment target is Shader Model 5.0}}
-    // expected-note@#f4_call {{enclose 'f4' in a __builtin_available check to silence this warning}}
     unsigned D = f4(); // #f4_call
 
     unsigned E = f5(); // #f5_call
 
     unsigned F = f6(); // #f6_call
 
-    // expected-warning@#f7_call {{'f7' is only available in mesh shader environment on Shader Model 6.0 or newer}}
-    // expected-note@#f7 {{'f7' has been marked as being introduced in Shader Model 6.0 in mesh shader environment here, but the deployment target is Shader Model 5.0 mesh shader environment}}
-    // expected-note@#f7_call {{enclose 'f7' in a __builtin_available check to silence this warning}}
+    // expected-error@#f7_call {{'f7' is only available in mesh environment on Shader Model 6.0 or newer}}
+    // expected-note@#f7 {{'f7' has been marked as being introduced in Shader Model 6.0 in mesh environment here, but the deployment target is Shader Model 5.0 mesh environment}}
     unsigned G = f7(); // #f7_call
 
-    // expected-warning@#f8_call {{'f8' is only available in mesh shader environment on Shader Model 6.0 or newer}}
-    // expected-note@#f8 {{'f8' has been marked as being introduced in Shader Model 6.0 in mesh shader environment here, but the deployment target is Shader Model 5.0 mesh shader environment}}
-    // expected-note@#f8_call {{enclose 'f8' in a __builtin_available check to silence this warning}}
+    // expected-error@#f8_call {{'f8' is only available in mesh environment on Shader Model 6.0 or newer}}
+    // expected-note@#f8 {{'f8' has been marked as being introduced in Shader Model 6.0 in mesh environment here, but the deployment target is Shader Model 5.0 mesh environment}}
     unsigned H = f8(); // #f8_call
 
     return 0;
diff --git a/clang/test/SemaHLSL/Availability/attr-availability-pixel.hlsl b/clang/test/SemaHLSL/Availability/attr-availability-pixel.hlsl
index 59d09a9cd276f9..7cd13e653ed5a3 100644
--- a/clang/test/SemaHLSL/Availability/attr-availability-pixel.hlsl
+++ b/clang/test/SemaHLSL/Availability/attr-availability-pixel.hlsl
@@ -37,14 +37,12 @@ __attribute__((availability(shadermodel, introduced = 6.0, environment = mesh)))
 unsigned f8();
 
 int main() {
-    // expected-warning@#f1_call {{'f1' is only available on Shader Model 6.0 or newer}}
+    // expected-error@#f1_call {{'f1' is only available on Shader Model 6.0 or newer}}
     // expected-note@#f1 {{'f1' has been marked as being introduced in Shader Model 6.0 here, but the deployment target is Shader Model 5.0}}
-    // expected-note@#f1_call {{enclose 'f1' in a __builtin_available check to silence this warning}}
     unsigned A = f1(); // #f1_call
 
-    // expected-warning@#f2_call {{'f2' is only available on Shader Model 5.1 or newer}}
+    // expected-error@#f2_call {{'f2' is only available on Shader Model 5.1 or newer}}
     // expected-note@#f2 {{'f2' has been marked as being introduced in Shader Model 5.1 here, but the deployment target is Shader Model 5.0}}
-    // expected-note@#f2_call {{enclose 'f2' in a __builtin_available check to silence this warning}}
     unsigned B = f2(); // #f2_call
 
     unsigned C = f3();
diff --git a/clang/test/SemaHLSL/Availability/avail-diag-default-compute.hlsl b/clang/test/SemaHLSL/Availability/avail-diag-default-compute.hlsl
new file mode 100644
index 00000000000000..764b9e843f7f1c
--- /dev/null
+++ b/clang/test/SemaHLSL/Availability/avail-diag-default-compute.hlsl
@@ -0,0 +1,119 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute \
+// RUN: -fsyntax-only -verify %s
+
+__attribute__((availability(shadermodel, introduced = 6.5)))
+float fx(float);  // #fx
+
+__attribute__((availability(shadermodel, introduced = 6.6)))
+half fx(half);  // #fx_half
+
+__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.5, environment = compute)))
+float fy(float); // #fy
+
+__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh)))
+float fz(float); // #fz
+
+float also_alive(float f) {
+  // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #also_alive_fx_call
+  // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float B = fy(f); // #also_alive_fy_call
+  // expected-error@#also_alive_fz_call {{'fz' is unavailable}}
+  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float C = fz(f); // #also_alive_fz_call
+  return 0;
+}
+
+float alive(float f) {
+  // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #alive_fx_call
+  // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float B = fy(f); // #alive_fy_call
+  // expected-error@#alive_fz_call {{'fz' is unavailable}}
+  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float C = fz(f); // #alive_fz_call
+
+  return also_alive(f);
+}
+
+float also_dead(float f) {
+  // unreachable code - no errors expected
+  float A = fx(f);
+  float B = fy(f);
+  float C = fz(f);
+  return 0;
+}
+
+float dead(float f) {
+  // unreachable code - no errors expected
+  float A = fx(f);
+  float B = fy(f);
+  float C = fz(f);
+
+  return also_dead(f);
+}
+
+template<typename T>
+T aliveTemp(T f) {
+  // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #aliveTemp_fx_call
+  // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float B = fy(f); // #aliveTemp_fy_call
+  // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}}
+  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float C = fz(f); // #aliveTemp_fz_call
+  return 0;
+}
+
+template<typename T> T aliveTemp2(T f) {
+  // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}}
+  // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}}
+  // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  return fx(f); // #aliveTemp2_fx_call
+}
+
+half test(half x) {
+  return aliveTemp2(x);
+}
+
+float test(float x) {
+  return aliveTemp2(x);
+}
+
+class MyClass
+{
+  float F;
+  float makeF() {
+    // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+    // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+    float A = fx(F); // #MyClass_makeF_fx_call
+    // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+    // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+    float B = fy(F); // #MyClass_makeF_fy_call
+    // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}}
+    // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+    float C = fz(F); // #MyClass_makeF_fz_call
+    return 0;
+  }
+};
+
+[numthreads(4,1,1)]
+float main() {
+  float f = 3;
+  MyClass C = { 1.0f };
+  float a = alive(f);
+  float b = aliveTemp<float>(f); // #aliveTemp_inst
+  float c = C.makeF();
+  float d = test((float)1.0);
+  float e = test((half)1.0);
+  return a * b * c;
+}
diff --git a/clang/test/SemaHLSL/Availability/avail-diag-default-lib.hlsl b/clang/test/SemaHLSL/Availability/avail-diag-default-lib.hlsl
new file mode 100644
index 00000000000000..515e4c5f9df03d
--- /dev/null
+++ b/clang/test/SemaHLSL/Availability/avail-diag-default-lib.hlsl
@@ -0,0 +1,130 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \
+// RUN: -fsyntax-only -verify %s
+
+__attribute__((availability(shadermodel, introduced = 6.5)))
+float fx(float);  // #fx
+
+__attribute__((availability(shadermodel, introduced = 6.6)))
+half fx(half);  // #fx_half
+
+__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.5, environment = compute)))
+float fy(float); // #fy
+
+__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh)))
+float fz(float); // #fz
+
+float also_alive(float f) {
+  // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #also_alive_fx_call
+  
+  // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float B = fy(f); // #also_alive_fy_call
+
+  // expected-error@#also_alive_fz_call {{'fz' is unavailable}}
+  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float C = fz(f); // #also_alive_fz_call
+
+  return 0;
+}
+
+float alive(float f) {
+  // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #alive_fx_call
+
+  // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float B = fy(f); // #alive_fy_call
+
+  // expected-error@#alive_fz_call {{'fz' is unavailable}}
+  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float C = fz(f); // #alive_fz_call
+
+  return also_alive(f);
+}
+
+float also_dead(float f) {
+  // unreachable code - no errors expected
+  float A = fx(f);
+  float B = fy(f);
+  float C = fz(f);
+  return 0;
+}
+
+float dead(float f) {
+  // unreachable code - no errors expected
+  float A = fx(f);
+  float B = fy(f);
+  float C = fz(f);
+  return also_dead(f);
+}
+
+template<typename T>
+T aliveTemp(T f) {
+  // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #aliveTemp_fx_call
+  // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float B = fy(f); // #aliveTemp_fy_call
+  // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}}
+  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float C = fz(f); // #aliveTemp_fz_call
+  return 0;
+}
+
+template<typename T> T aliveTemp2(T f) {
+  // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}}
+  // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}}
+  // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  return fx(f); // #aliveTemp2_fx_call
+}
+
+half test(half x) {
+  return aliveTemp2(x);
+}
+
+float test(float x) {
+  return aliveTemp2(x);
+}
+
+class MyClass
+{
+  float F;
+  float makeF() {
+    // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+    // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+    float A = fx(F); // #MyClass_makeF_fx_call
+    // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+    // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+    float B = fy(F); // #MyClass_makeF_fy_call
+    // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}}
+    // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+    float C = fz(F); // #MyClass_makeF_fz_call
+    return 0;
+  }
+};
+
+// Shader entry point without body
+[shader("compute")]
+[numthreads(4,1,1)]
+float main();
+
+// Shader entry point with body
+[shader("compute")]
+[numthreads(4,1,1)]
+float main() {
+  float f = 3;
+  MyClass C = { 1.0f };
+  float a = alive(f);
+  float b = aliveTemp<float>(f); // #aliveTemp_inst
+  float c = C.makeF();
+  float d = test((float)1.0);
+  float e = test((half)1.0);
+  return a * b * c;
+}
diff --git a/clang/test/SemaHLSL/Availability/avail-diag-relaxed-compute.hlsl b/clang/test/SemaHLSL/Availability/avail-diag-relaxed-compute.hlsl
new file mode 100644
index 00000000000000..65836c55821d77
--- /dev/null
+++ b/clang/test/SemaHLSL/Availability/avail-diag-relaxed-compute.hlsl
@@ -0,0 +1,119 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute \
+// RUN: -fsyntax-only -Wno-error=hlsl-availability -verify %s
+
+__attribute__((availability(shadermodel, introduced = 6.5)))
+float fx(float);  // #fx
+
+__attribute__((availability(shadermodel, introduced = 6.6)))
+half fx(half);  // #fx_half
+
+__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.5, environment = compute)))
+float fy(float); // #fy
+
+__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh)))
+float fz(float); // #fz
+
+float also_alive(float f) {
+  // expected-warning@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #also_alive_fx_call
+  // expected-warning@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float B = fy(f); // #also_alive_fy_call
+  // expected-warning@#also_alive_fz_call {{'fz' is unavailable}}
+  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float C = fz(f); // #also_alive_fz_call
+  return 0;
+}
+
+float alive(float f) {
+  // expected-warning@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #alive_fx_call
+  // expected-warning@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float B = fy(f); // #alive_fy_call
+  // expected-warning@#alive_fz_call {{'fz' is unavailable}}
+  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float C = fz(f); // #alive_fz_call
+
+  return also_alive(f);
+}
+
+float also_dead(float f) {
+  // unreachable code - no errors expected
+  float A = fx(f);
+  float B = fy(f);
+  float C = fz(f);
+  return 0;
+}
+
+float dead(float f) {
+  // unreachable code - no errors expected
+  float A = fx(f);
+  float B = fy(f);
+  float C = fz(f);
+
+  return also_dead(f);
+}
+
+template<typename T>
+T aliveTemp(T f) {
+  // expected-warning@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #aliveTemp_fx_call
+  // expected-warning@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float B = fy(f); // #aliveTemp_fy_call
+  // expected-warning@#aliveTemp_fz_call {{'fz' is unavailable}}
+  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float C = fz(f); // #aliveTemp_fz_call
+  return 0;
+}
+
+template<typename T> T aliveTemp2(T f) {
+  // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}}
+  // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}}
+  // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  return fx(f); // #aliveTemp2_fx_call
+}
+
+half test(half x) {
+  return aliveTemp2(x);
+}
+
+float test(float x) {
+  return aliveTemp2(x);
+}
+
+class MyClass
+{
+  float F;
+  float makeF() {
+    // expected-warning@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+    // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+    float A = fx(F); // #MyClass_makeF_fx_call
+    // expected-warning@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+    // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+    float B = fy(F); // #MyClass_makeF_fy_call
+    // expected-warning@#MyClass_makeF_fz_call {{'fz' is unavailable}}
+    // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+    float C = fz(F); // #MyClass_makeF_fz_call
+    return 0;
+  }
+};
+
+[numthreads(4,1,1)]
+float main() {
+  float f = 3;
+  MyClass C = { 1.0f };
+  float a = alive(f);
+  float b = aliveTemp<float>(f); // #aliveTemp_inst
+  float c = C.makeF();
+  float d = test((float)1.0);
+  float e = test((half)1.0);
+  return a * b * c;
+}
diff --git a/clang/test/SemaHLSL/Availability/avail-diag-relaxed-lib.hlsl b/clang/test/SemaHLSL/Availability/avail-diag-relaxed-lib.hlsl
new file mode 100644
index 00000000000000..6bd20450f8bfa4
--- /dev/null
+++ b/clang/test/SemaHLSL/Availability/avail-diag-relaxed-lib.hlsl
@@ -0,0 +1,130 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \
+// RUN: -fsyntax-only -Wno-error=hlsl-availability -verify %s
+
+__attribute__((availability(shadermodel, introduced = 6.5)))
+float fx(float);  // #fx
+
+__attribute__((availability(shadermodel, introduced = 6.6)))
+half fx(half);  // #fx_half
+
+__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.5, environment = compute)))
+float fy(float); // #fy
+
+__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh)))
+float fz(float); // #fz
+
+float also_alive(float f) {
+  // expected-warning@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #also_alive_fx_call
+  
+  // expected-warning@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float B = fy(f); // #also_alive_fy_call
+
+  // expected-warning@#also_alive_fz_call {{'fz' is unavailable}}
+  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float C = fz(f); // #also_alive_fz_call
+
+  return 0;
+}
+
+float alive(float f) {
+  // expected-warning@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #alive_fx_call
+
+  // expected-warning@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float B = fy(f); // #alive_fy_call
+
+  // expected-warning@#alive_fz_call {{'fz' is unavailable}}
+  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float C = fz(f); // #alive_fz_call
+
+  return also_alive(f);
+}
+
+float also_dead(float f) {
+  // unreachable code - no errors expected
+  float A = fx(f);
+  float B = fy(f);
+  float C = fz(f);
+  return 0;
+}
+
+float dead(float f) {
+  // unreachable code - no errors expected
+  float A = fx(f);
+  float B = fy(f);
+  float C = fz(f);
+  return also_dead(f);
+}
+
+template<typename T>
+T aliveTemp(T f) {
+  // expected-warning@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #aliveTemp_fx_call
+  // expected-warning@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float B = fy(f); // #aliveTemp_fy_call
+  // expected-warning@#aliveTemp_fz_call {{'fz' is unavailable}}
+  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float C = fz(f); // #aliveTemp_fz_call
+  return 0;
+}
+
+template<typename T> T aliveTemp2(T f) {
+  // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}}
+  // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}}
+  // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  return fx(f); // #aliveTemp2_fx_call
+}
+
+half test(half x) {
+  return aliveTemp2(x);
+}
+
+float test(float x) {
+  return aliveTemp2(x);
+}
+
+class MyClass
+{
+  float F;
+  float makeF() {
+    // expected-warning@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+    // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+    float A = fx(F); // #MyClass_makeF_fx_call
+    // expected-warning@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+    // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+    float B = fy(F); // #MyClass_makeF_fy_call
+    // expected-warning@#MyClass_makeF_fz_call {{'fz' is unavailable}}
+    // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}}
+    float C = fz(F); // #MyClass_makeF_fz_call
+    return 0;
+  }
+};
+
+// Shader entry point without body
+[shader("compute")]
+[numthreads(4,1,1)]
+float main();
+
+// Shader entry point with body
+[shader("compute")]
+[numthreads(4,1,1)]
+float main() {
+  float f = 3;
+  MyClass C = { 1.0f };
+  float a = alive(f);
+  float b = aliveTemp<float>(f); // #aliveTemp_inst
+  float c = C.makeF();
+  float d = test((float)1.0);
+  float e = test((half)1.0);
+  return a * b * c;
+}
diff --git a/clang/test/SemaHLSL/Availability/avail-lib-multiple-stages.hlsl b/clang/test/SemaHLSL/Availability/avail-lib-multiple-stages.hlsl
new file mode 100644
index 00000000000000..b56ab8fe4526ba
--- /dev/null
+++ b/clang/test/SemaHLSL/Availability/avail-lib-multiple-stages.hlsl
@@ -0,0 +1,57 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \
+// RUN: -fsyntax-only -verify %s
+
+__attribute__((availability(shadermodel, introduced = 6.5)))
+float fx(float);  // #fx
+
+__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.5, environment = compute)))
+float fy(float); // #fy
+
+__attribute__((availability(shadermodel, introduced = 5.0, environment = compute)))
+float fz(float); // #fz
+
+
+void F(float f) {
+  // Make sure we only get this error once, even though this function is scanned twice - once
+  // in compute shader context and once in pixel shader context.
+  // expected-error@#fx_call {{'fx' is only available on Shader Model 6.5 or newer}}
+  // expected-note@#fx {{fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}}
+  float A = fx(f); // #fx_call
+  
+  // expected-error@#fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}}
+  // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}}
+  float B = fy(f); // #fy_call
+
+  // expected-error@#fz_call {{'fz' is unavailable}}
+  // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 5.0 in compute environment here, but the deployment target is Shader Model 6.0 pixel environment}}
+  float X = fz(f); // #fz_call
+}
+
+void deadCode(float f) {
+  // no diagnostics expected under default diagnostic mode
+  float A = fx(f);
+  float B = fy(f);
+  float X = fz(f);
+}
+
+// Pixel shader
+[shader("pixel")]
+void mainPixel() {
+  F(1.0);
+}
+
+// First Compute shader
+[shader("compute")]
+[numthreads(4,1,1)]
+void mainCompute1() {
+  F(2.0);
+}
+
+// Second compute shader to make sure we do not get duplicate messages if F is called
+// from multiple entry points.
+[shader("compute")]
+[numthreads(4,1,1)]
+void mainCompute2() {
+  F(3.0);
+}
diff --git a/clang/test/SemaHLSL/WaveBuiltinAvailability.hlsl b/clang/test/SemaHLSL/WaveBuiltinAvailability.hlsl
index 185b79be37be5b..6333c635693272 100644
--- a/clang/test/SemaHLSL/WaveBuiltinAvailability.hlsl
+++ b/clang/test/SemaHLSL/WaveBuiltinAvailability.hlsl
@@ -1,9 +1,10 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel5.0-library -verify %s
 // WaveActiveCountBits is unavailable before ShaderModel 6.0.
 
-unsigned foo(bool b) {
-    // expected-warning@#site {{'WaveActiveCountBits' is only available on Shader Model 6.0 or newer}}
+[shader("compute")]
+[numthreads(8,8,1)]
+unsigned foo() {
+    // expected-error@#site {{'WaveActiveCountBits' is only available on Shader Model 6.0 or newer}}
     // expected-note@hlsl/hlsl_intrinsics.h:* {{'WaveActiveCountBits' has been marked as being introduced in Shader Model 6.0 here, but the deployment target is Shader Model 5.0}}
-    // expected-note@#site {{enclose 'WaveActiveCountBits' in a __builtin_available check to silence this warning}}
-    return hlsl::WaveActiveCountBits(b); // #site
+    return hlsl::WaveActiveCountBits(1); // #site
 }

From 7d4a45d98275e669bda40410f064891beb3480ce Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Wed, 29 May 2024 21:55:39 -0700
Subject: [PATCH 224/230] Revert "Add option to generate additional debug info
 for expression dereferencing pointer to pointers. (#81545)"

This reverts commit aeccfee348c717165541d8d895b9b0cdfe31415c, and dependents:

Revert "[NFC] Fix PPC buildbot failure https://lab.llvm.org/buildbot/#/builders/230/builds/29066"
This reverts commit 2b1d1c51f6e321267cc86e9db7808298c59caf0e.

Revert "Fix test - remove unnecessary/incorrect `-S`, in favor of `-emit-llvm`"
This reverts commit ea1ecb50fa831583241fc531153bd2c072955d29.

The test is failing on macOS and Windows.
---
 clang/lib/CodeGen/CGDebugInfo.cpp             |  84 ------------
 clang/lib/CodeGen/CGDebugInfo.h               |   6 -
 clang/lib/CodeGen/CGExprScalar.cpp            |  21 +--
 .../test/CodeGenCXX/debug-info-ptr-to-ptr.cpp | 120 ------------------
 4 files changed, 1 insertion(+), 230 deletions(-)
 delete mode 100644 clang/test/CodeGenCXX/debug-info-ptr-to-ptr.cpp

diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 9d7107abf8a6fe..fac278f0e20a43 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -5737,90 +5737,6 @@ void CGDebugInfo::EmitExternalVariable(llvm::GlobalVariable *Var,
   Var->addDebugInfo(GVE);
 }
 
-void CGDebugInfo::EmitPseudoVariable(CGBuilderTy &Builder,
-                                     llvm::Instruction *Value, QualType Ty) {
-  // Only when -g2 or above is specified, debug info for variables will be
-  // generated.
-  if (CGM.getCodeGenOpts().getDebugInfo() <=
-      llvm::codegenoptions::DebugLineTablesOnly)
-    return;
-
-  llvm::DebugLoc SaveDebugLoc = Builder.getCurrentDebugLocation();
-  if (!SaveDebugLoc.get())
-    return;
-
-  llvm::DIFile *Unit = SaveDebugLoc->getFile();
-  llvm::DIType *Type = getOrCreateType(Ty, Unit);
-
-  // Check if Value is already a declared variable and has debug info, in this
-  // case we have nothing to do. Clang emits declared variable as alloca, and
-  // it is loaded upon use, so we identify such pattern here.
-  if (llvm::LoadInst *Load = dyn_cast<llvm::LoadInst>(Value)) {
-    llvm::Value *Var = Load->getPointerOperand();
-    if (llvm::Metadata *MDValue = llvm::ValueAsMetadata::getIfExists(Var)) {
-      if (llvm::Value *DbgValue = llvm::MetadataAsValue::getIfExists(
-              CGM.getLLVMContext(), MDValue)) {
-        for (llvm::User *U : DbgValue->users()) {
-          if (llvm::CallInst *DbgDeclare = dyn_cast<llvm::CallInst>(U)) {
-            if (DbgDeclare->getCalledFunction()->getIntrinsicID() ==
-                    llvm::Intrinsic::dbg_declare &&
-                DbgDeclare->getArgOperand(0) == DbgValue) {
-              // There can be implicit type cast applied on a variable if it is
-              // an opaque ptr, in this case its debug info may not match the
-              // actual type of object being used as in the next instruction, so
-              // we will need to emit a pseudo variable for type-casted value.
-              llvm::DILocalVariable *MDNode = cast<llvm::DILocalVariable>(
-                  cast<llvm::MetadataAsValue>(DbgDeclare->getOperand(1))
-                      ->getMetadata());
-              if (MDNode->getType() == Type)
-                return;
-            }
-          }
-        }
-      }
-    }
-  }
-
-  // Find the correct location to insert a sequence of instructions to
-  // materialize Value on the stack.
-  auto SaveInsertionPoint = Builder.saveIP();
-  if (llvm::InvokeInst *Invoke = dyn_cast<llvm::InvokeInst>(Value))
-    Builder.SetInsertPoint(Invoke->getNormalDest()->begin());
-  else if (llvm::Instruction *Next = Value->getIterator()->getNextNode())
-    Builder.SetInsertPoint(Next);
-  else
-    Builder.SetInsertPoint(Value->getParent());
-  llvm::DebugLoc DL = Value->getDebugLoc();
-  if (DL.get())
-    Builder.SetCurrentDebugLocation(DL);
-  else if (!Builder.getCurrentDebugLocation().get())
-    Builder.SetCurrentDebugLocation(SaveDebugLoc);
-
-  llvm::AllocaInst *PseudoVar = Builder.CreateAlloca(Value->getType());
-  Address PseudoVarAddr(PseudoVar, Value->getType(),
-                        CharUnits::fromQuantity(PseudoVar->getAlign()));
-  llvm::LoadInst *Load = Builder.CreateLoad(PseudoVarAddr);
-  Value->replaceAllUsesWith(Load);
-  Builder.SetInsertPoint(Load);
-  Builder.CreateStore(Value, PseudoVarAddr);
-
-  // Emit debug info for materialized Value.
-  unsigned Line = Builder.getCurrentDebugLocation().getLine();
-  unsigned Column = Builder.getCurrentDebugLocation().getCol();
-  llvm::DILocalVariable *D = DBuilder.createAutoVariable(
-      LexicalBlockStack.back(), "", nullptr, 0, Type, false,
-      llvm::DINode::FlagArtificial);
-  llvm::DILocation *DIL =
-      llvm::DILocation::get(CGM.getLLVMContext(), Line, Column,
-                            LexicalBlockStack.back(), CurInlinedAt);
-  SmallVector<uint64_t> Expr;
-  DBuilder.insertDeclare(PseudoVar, D, DBuilder.createExpression(Expr), DIL,
-                         Load);
-
-  Builder.restoreIP(SaveInsertionPoint);
-  Builder.SetCurrentDebugLocation(SaveDebugLoc);
-}
-
 void CGDebugInfo::EmitGlobalAlias(const llvm::GlobalValue *GV,
                                   const GlobalDecl GD) {
 
diff --git a/clang/lib/CodeGen/CGDebugInfo.h b/clang/lib/CodeGen/CGDebugInfo.h
index 614316f3fc7fd8..d6db4d711366ac 100644
--- a/clang/lib/CodeGen/CGDebugInfo.h
+++ b/clang/lib/CodeGen/CGDebugInfo.h
@@ -529,12 +529,6 @@ class CGDebugInfo {
   /// Emit information about an external variable.
   void EmitExternalVariable(llvm::GlobalVariable *GV, const VarDecl *Decl);
 
-  /// Emit a pseudo variable and debug info for an intermediate value if it does
-  /// not correspond to a variable in the source code, so that a profiler can
-  /// track more accurate usage of certain instructions of interest.
-  void EmitPseudoVariable(CGBuilderTy &Builder, llvm::Instruction *Value,
-                          QualType Ty);
-
   /// Emit information about global variable alias.
   void EmitGlobalAlias(const llvm::GlobalValue *GV, const GlobalDecl Decl);
 
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 58f0a3113b4f81..1b144c178ce960 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -1937,26 +1937,7 @@ Value *ScalarExprEmitter::VisitMemberExpr(MemberExpr *E) {
     }
   }
 
-  llvm::Value *Result = EmitLoadOfLValue(E);
-
-  // If -fdebug-info-for-profiling is specified, emit a pseudo variable and its
-  // debug info for the pointer, even if there is no variable associated with
-  // the pointer's expression.
-  if (CGF.CGM.getCodeGenOpts().DebugInfoForProfiling && CGF.getDebugInfo()) {
-    if (llvm::LoadInst *Load = dyn_cast<llvm::LoadInst>(Result)) {
-      if (llvm::GetElementPtrInst *GEP =
-              dyn_cast<llvm::GetElementPtrInst>(Load->getPointerOperand())) {
-        if (llvm::Instruction *Pointer =
-                dyn_cast<llvm::Instruction>(GEP->getPointerOperand())) {
-          QualType Ty = E->getBase()->getType();
-          if (!E->isArrow())
-            Ty = CGF.getContext().getPointerType(Ty);
-          CGF.getDebugInfo()->EmitPseudoVariable(Builder, Pointer, Ty);
-        }
-      }
-    }
-  }
-  return Result;
+  return EmitLoadOfLValue(E);
 }
 
 Value *ScalarExprEmitter::VisitArraySubscriptExpr(ArraySubscriptExpr *E) {
diff --git a/clang/test/CodeGenCXX/debug-info-ptr-to-ptr.cpp b/clang/test/CodeGenCXX/debug-info-ptr-to-ptr.cpp
deleted file mode 100644
index 9f2a3f9e69197b..00000000000000
--- a/clang/test/CodeGenCXX/debug-info-ptr-to-ptr.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-// Test debug info for intermediate value of a chained pointer deferencing
-// expression when the flag -fdebug-info-for-pointer-type is enabled.
-// RUN: %clang_cc1 %s -fdebug-info-for-profiling -debug-info-kind=constructor -emit-llvm -o - | FileCheck %s
-
-class A {
-public:
-  int i;
-  char c;
-  void *p;
-  int arr[3];
-};
-
-class B {
-public:
-  A* a;
-};
-
-class C {
-public:
-  B* b;
-  A* a;
-  A arr[10];
-};
-
-// CHECK-LABEL: define dso_local noundef{{.*}}i32 @{{.*}}func1{{.*}}(
-// CHECK:         [[A_ADDR:%.*]] = getelementptr inbounds %class.B, ptr {{%.*}}, i32 0, i32 0, !dbg [[DBG1:![0-9]+]]
-// CHECK-NEXT:    [[A:%.*]] = load ptr, ptr [[A_ADDR]], align {{.*}}, !dbg [[DBG1]]
-// CHECK-NEXT:    [[PSEUDO1:%.*]] = alloca ptr, align {{.*}}, !dbg [[DBG1]]
-// CHECK-NEXT:    store ptr [[A]], ptr [[PSEUDO1]], align {{.*}}, !dbg [[DBG1]]
-// CHECK-NEXT:    call void @llvm.dbg.declare(metadata ptr [[PSEUDO1]], metadata [[META1:![0-9]+]], metadata !DIExpression()), !dbg [[DBG1]]
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PSEUDO1]], align {{.*}}, !dbg [[DBG1]]
-// CHECK-NEXT:    {{%.*}} = getelementptr inbounds %class.A, ptr [[TMP1]], i32 0, i32 0,
-int func1(B *b) {
-  return b->a->i;
-}
-
-// Should generate a pseudo variable when pointer is type-casted.
-// CHECK-LABEL: define dso_local noundef ptr @{{.*}}func2{{.*}}(
-// CHECK:         call void @llvm.dbg.declare(metadata ptr [[B_ADDR:%.*]], metadata [[META2:![0-9]+]], metadata !DIExpression())
-// CHECK-NEXT:    [[B:%.*]] = load ptr, ptr [[B_ADDR]],
-// CHECK-NEXT:    [[PSEUDO1:%.*]] = alloca ptr,
-// CHECK-NEXT:    store ptr [[B]], ptr [[PSEUDO1]],
-// CHECK-NEXT:    call void @llvm.dbg.declare(metadata ptr [[PSEUDO1]], metadata [[META3:![0-9]+]], metadata !DIExpression())
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PSEUDO1]],
-// CHECK-NEXT:    {{%.*}} = getelementptr inbounds %class.B, ptr [[TMP1]], i32 0,
-A* func2(void *b) {
-  return ((B*)b)->a;
-}
-
-// Should not generate pseudo variable in this case.
-// CHECK-LABEL: define dso_local noundef{{.*}}i32 @{{.*}}func3{{.*}}(
-// CHECK:    call void @llvm.dbg.declare(metadata ptr [[B_ADDR:%.*]], metadata [[META4:![0-9]+]], metadata !DIExpression())
-// CHECK:    call void @llvm.dbg.declare(metadata ptr [[LOCAL1:%.*]], metadata [[META5:![0-9]+]], metadata !DIExpression())
-// CHECK-NOT: call void @llvm.dbg.declare(metadata ptr
-int func3(B *b) {
-  A *local1 = b->a;
-  return local1->i;
-}
-
-// CHECK-LABEL: define dso_local noundef signext i8 @{{.*}}func4{{.*}}(
-// CHECK:         [[A_ADDR:%.*]] = getelementptr inbounds %class.C, ptr {{%.*}}, i32 0, i32 1
-// CHECK-NEXT:    [[A:%.*]] = load ptr, ptr [[A_ADDR]],
-// CHECK-NEXT:    [[PSEUDO1:%.*]] = alloca ptr,
-// CHECK-NEXT:    store ptr [[A]], ptr [[PSEUDO1]],
-// CHECK-NEXT:    call void @llvm.dbg.declare(metadata ptr [[PSEUDO1]], metadata [[META6:![0-9]+]], metadata !DIExpression())
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PSEUDO1]],
-// CHECK-NEXT:    {{%.*}} = getelementptr inbounds %class.A, ptr [[TMP1]], i32 0, i32 0,
-// CHECK:         [[CALL:%.*]] = call noundef ptr @{{.*}}foo{{.*}}(
-// CHECK-NEXT:    [[PSEUDO2:%.*]] = alloca ptr,
-// CHECK-NEXT:    store ptr [[CALL]], ptr [[PSEUDO2]]
-// CHECK-NEXT:    call void @llvm.dbg.declare(metadata ptr [[PSEUDO2]], metadata [[META6]], metadata !DIExpression())
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[PSEUDO2]]
-// CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds %class.A, ptr [[TMP2]], i32 0, i32 1
-char func4(C *c) {
-  extern A* foo(int x);
-  return foo(c->a->i)->c;
-}
-
-// CHECK-LABEL: define dso_local noundef signext i8 @{{.*}}func5{{.*}}(
-// CHECK:         call void @llvm.dbg.declare(metadata ptr {{%.*}}, metadata [[META7:![0-9]+]], metadata !DIExpression())
-// CHECK:         call void @llvm.dbg.declare(metadata ptr {{%.*}}, metadata [[META8:![0-9]+]], metadata !DIExpression())
-// CHECK:         [[A_ADDR:%.*]] = getelementptr inbounds %class.A, ptr {{%.*}}, i64 {{%.*}},
-// CHECK-NEXT:    [[PSEUDO1:%.*]] = alloca ptr,
-// CHECK-NEXT:    store ptr [[A_ADDR]], ptr [[PSEUDO1]],
-// CHECK-NEXT:    call void @llvm.dbg.declare(metadata ptr [[PSEUDO1]], metadata [[META9:![0-9]+]], metadata !DIExpression())
-// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PSEUDO1]],
-// CHECK-NEXT:    {{%.*}} = getelementptr inbounds %class.A, ptr [[TMP1]], i32 0, i32 1,
-char func5(void *arr, int n) {
-  return ((A*)arr)[n].c;
-}
-
-// CHECK-LABEL: define dso_local noundef{{.*}}i32 @{{.*}}func6{{.*}}(
-// CHECK:         call void @llvm.dbg.declare(metadata ptr {{%.*}}, metadata [[META10:![0-9]+]], metadata !DIExpression())
-// CHECK:         call void @llvm.dbg.declare(metadata ptr {{%.*}}, metadata [[META11:![0-9]+]], metadata !DIExpression())
-int func6(B &b) {
-  return reinterpret_cast<A&>(b).i;
-}
-
-// CHECK-DAG: [[META_A:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_class_type, name: "A",
-// CHECK-DAG: [[META_AP:![0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[META_A]],
-// CHECK-DAG: [[META_B:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_class_type, name: "B",
-// CHECK-DAG: [[META_BP:![0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[META_B]],
-// CHECK-DAG: [[META_C:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_class_type, name: "C",
-// CHECK-DAG: [[META_CP:![0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[META_C]],
-// CHECK-DAG: [[META_VP:![0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null,
-// CHECK-DAG: [[META_I32:![0-9]+]] = !DIBasicType(name: "int", size: 32,
-// CHECK-DAG: [[META_BR:![0-9]+]] = !DIDerivedType(tag: DW_TAG_reference_type, baseType: [[META_B]],
-
-// CHECK-DAG: [[DBG1]] = !DILocation(line: 34, column: 13,
-// CHECK-DAG: [[META1]] = !DILocalVariable(scope: {{.*}}, type: [[META_AP]], flags: DIFlagArtificial)
-// CHECK-DAG: [[META2]] = !DILocalVariable(name: "b", arg: 1, scope: {{.*}}, file: {{.*}}, line: 46, type: [[META_VP]])
-// CHECK-DAG: [[META3]] = !DILocalVariable(scope: {{.*}}, type: [[META_BP]], flags: DIFlagArtificial)
-// CHECK-DAG: [[META4]] = !DILocalVariable(name: "b", arg: 1, scope: {{.*}}, file: {{.*}}, line: 55, type: [[META_BP]])
-// CHECK-DAG: [[META5]] = !DILocalVariable(name: "local1", scope: {{.*}}, file: {{.*}}, line: 56, type: [[META_AP]])
-// CHECK-DAG: [[META6]] = !DILocalVariable(scope: {{.*}}, type: [[META_AP]], flags: DIFlagArtificial)
-// CHECK-DAG: [[META7]] = !DILocalVariable(name: "arr", arg: 1, scope: {{.*}}, file: {{.*}}, line: 88, type: [[META_VP]])
-// CHECK-DAG: [[META8]] = !DILocalVariable(name: "n", arg: 2, scope: {{.*}}, file: {{.*}}, line: 88, type: [[META_I32]])
-// CHECK-DAG: [[META9]] = !DILocalVariable(scope: {{.*}}, type: [[META_AP]], flags: DIFlagArtificial)
-// CHECK-DAG: [[META10]] = !DILocalVariable(name: "b", arg: 1, scope: {{.*}}, file: {{.*}}, line: 95, type: [[META_BR]])
-// CHECK-DAG: [[META11]] = !DILocalVariable(scope: {{.*}}, type: [[META_AP]], flags: DIFlagArtificial)

From 89801c74c3e25f5a1eaa3999863be398f6a82abb Mon Sep 17 00:00:00 2001
From: Bimo <rui.xu@intel.com>
Date: Thu, 30 May 2024 13:01:40 +0800
Subject: [PATCH 225/230] [MLIR][Python] add ctype python binding support for
 bf16 (#92489)

Since bf16 is supported by MLIR, similar to
complex128/complex64/float16, we need an implementation of a bf16 ctype in
the Python bindings. Furthermore, to work around the absence of bf16 support
in NumPy, the third-party package
[ml_dtypes](https://github.com/jax-ml/ml_dtypes) is introduced to add the
bf16 extension; the same approach is used in the `torch-mlir` project.

See motivation and discussion in:
https://discourse.llvm.org/t/how-to-run-executionengine-with-bf16-dtype-in-mlir-python-bindings/79025
---
 mlir/python/mlir/runtime/np_to_memref.py | 19 +++++++++++
 mlir/python/requirements.txt             |  3 +-
 mlir/test/python/execution_engine.py     | 40 ++++++++++++++++++++++++
 3 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/mlir/python/mlir/runtime/np_to_memref.py b/mlir/python/mlir/runtime/np_to_memref.py
index f6b706f9bc8ae2..882b2751921bfd 100644
--- a/mlir/python/mlir/runtime/np_to_memref.py
+++ b/mlir/python/mlir/runtime/np_to_memref.py
@@ -7,6 +7,12 @@
 import numpy as np
 import ctypes
 
+try:
+    import ml_dtypes
+except ModuleNotFoundError:
+    # The third-party ml_dtypes provides some optional low precision data-types for NumPy.
+    ml_dtypes = None
+
 
 class C128(ctypes.Structure):
     """A ctype representation for MLIR's Double Complex."""
@@ -26,6 +32,12 @@ class F16(ctypes.Structure):
     _fields_ = [("f16", ctypes.c_int16)]
 
 
+class BF16(ctypes.Structure):
+    """A ctype representation for MLIR's BFloat16."""
+
+    _fields_ = [("bf16", ctypes.c_int16)]
+
+
 # https://stackoverflow.com/questions/26921836/correct-way-to-test-for-numpy-dtype
 def as_ctype(dtp):
     """Converts dtype to ctype."""
@@ -35,6 +47,8 @@ def as_ctype(dtp):
         return C64
     if dtp == np.dtype(np.float16):
         return F16
+    if ml_dtypes is not None and dtp == ml_dtypes.bfloat16:
+        return BF16
     return np.ctypeslib.as_ctypes_type(dtp)
 
 
@@ -46,6 +60,11 @@ def to_numpy(array):
         return array.view("complex64")
     if array.dtype == F16:
         return array.view("float16")
+    assert not (
+        array.dtype == BF16 and ml_dtypes is None
+    ), f"bfloat16 requires the ml_dtypes package, please run:\n\npip install ml_dtypes\n"
+    if array.dtype == BF16:
+        return array.view("bfloat16")
     return array
 
 
diff --git a/mlir/python/requirements.txt b/mlir/python/requirements.txt
index acd6dbb25edaf5..6ec63e43adf896 100644
--- a/mlir/python/requirements.txt
+++ b/mlir/python/requirements.txt
@@ -1,3 +1,4 @@
 numpy>=1.19.5, <=1.26
 pybind11>=2.9.0, <=2.10.3
-PyYAML>=5.3.1, <=6.0.1
\ No newline at end of file
+PyYAML>=5.3.1, <=6.0.1
+ml_dtypes   # provides several NumPy dtype extensions, including the bf16
\ No newline at end of file
diff --git a/mlir/test/python/execution_engine.py b/mlir/test/python/execution_engine.py
index e8b47007a8907d..8125bf3fb8fc92 100644
--- a/mlir/test/python/execution_engine.py
+++ b/mlir/test/python/execution_engine.py
@@ -5,6 +5,7 @@
 from mlir.passmanager import *
 from mlir.execution_engine import *
 from mlir.runtime import *
+from ml_dtypes import bfloat16
 
 
 # Log everything to stderr and flush so that we have a unified stream to match
@@ -521,6 +522,45 @@ def testComplexUnrankedMemrefAdd():
 run(testComplexUnrankedMemrefAdd)
 
 
+# Test bf16 memrefs
+# CHECK-LABEL: TEST: testBF16Memref
+def testBF16Memref():
+    with Context():
+        module = Module.parse(
+            """
+    module  {
+      func.func @main(%arg0: memref<1xbf16>,
+                      %arg1: memref<1xbf16>) attributes { llvm.emit_c_interface } {
+        %0 = arith.constant 0 : index
+        %1 = memref.load %arg0[%0] : memref<1xbf16>
+        memref.store %1, %arg1[%0] : memref<1xbf16>
+        return
+      }
+    } """
+        )
+
+        arg1 = np.array([0.5]).astype(bfloat16)
+        arg2 = np.array([0.0]).astype(bfloat16)
+
+        arg1_memref_ptr = ctypes.pointer(
+            ctypes.pointer(get_ranked_memref_descriptor(arg1))
+        )
+        arg2_memref_ptr = ctypes.pointer(
+            ctypes.pointer(get_ranked_memref_descriptor(arg2))
+        )
+
+        execution_engine = ExecutionEngine(lowerToLLVM(module))
+        execution_engine.invoke("main", arg1_memref_ptr, arg2_memref_ptr)
+
+        # test to-numpy utility
+        # CHECK: [0.5]
+        npout = ranked_memref_to_numpy(arg2_memref_ptr[0])
+        log(npout)
+
+
+run(testBF16Memref)
+
+
 #  Test addition of two 2d_memref
 # CHECK-LABEL: TEST: testDynamicMemrefAdd2D
 def testDynamicMemrefAdd2D():

From 49ef21d7674fa8267d674879e21b69d9ca4e6203 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Wed, 29 May 2024 22:02:02 -0700
Subject: [PATCH 226/230] Remove debug print from CI generation script (NFC)

---
 .ci/generate-buildkite-pipeline-premerge | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.ci/generate-buildkite-pipeline-premerge b/.ci/generate-buildkite-pipeline-premerge
index bb7d2117e277fb..033ab804b165ea 100755
--- a/.ci/generate-buildkite-pipeline-premerge
+++ b/.ci/generate-buildkite-pipeline-premerge
@@ -56,7 +56,6 @@ function compute-projects-to-test() {
   isForWindows=$1
   shift
   projects=${@}
-  echo "isForWindows : $isForWindows ; projects: $projects " >&2
   for project in ${projects}; do
     echo "${project}"
     case ${project} in

From e6821dd8c8cdd0279000f9a8eb57caf7977d68db Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Wed, 29 May 2024 23:21:04 -0600
Subject: [PATCH 227/230] Revert "[MLIR][Python] add ctype python binding
 support for bf16" (#93771)

Reverts llvm/llvm-project#92489

This broke the bots.
---
 mlir/python/mlir/runtime/np_to_memref.py | 19 -----------
 mlir/python/requirements.txt             |  3 +-
 mlir/test/python/execution_engine.py     | 40 ------------------------
 3 files changed, 1 insertion(+), 61 deletions(-)

diff --git a/mlir/python/mlir/runtime/np_to_memref.py b/mlir/python/mlir/runtime/np_to_memref.py
index 882b2751921bfd..f6b706f9bc8ae2 100644
--- a/mlir/python/mlir/runtime/np_to_memref.py
+++ b/mlir/python/mlir/runtime/np_to_memref.py
@@ -7,12 +7,6 @@
 import numpy as np
 import ctypes
 
-try:
-    import ml_dtypes
-except ModuleNotFoundError:
-    # The third-party ml_dtypes provides some optional low precision data-types for NumPy.
-    ml_dtypes = None
-
 
 class C128(ctypes.Structure):
     """A ctype representation for MLIR's Double Complex."""
@@ -32,12 +26,6 @@ class F16(ctypes.Structure):
     _fields_ = [("f16", ctypes.c_int16)]
 
 
-class BF16(ctypes.Structure):
-    """A ctype representation for MLIR's BFloat16."""
-
-    _fields_ = [("bf16", ctypes.c_int16)]
-
-
 # https://stackoverflow.com/questions/26921836/correct-way-to-test-for-numpy-dtype
 def as_ctype(dtp):
     """Converts dtype to ctype."""
@@ -47,8 +35,6 @@ def as_ctype(dtp):
         return C64
     if dtp == np.dtype(np.float16):
         return F16
-    if ml_dtypes is not None and dtp == ml_dtypes.bfloat16:
-        return BF16
     return np.ctypeslib.as_ctypes_type(dtp)
 
 
@@ -60,11 +46,6 @@ def to_numpy(array):
         return array.view("complex64")
     if array.dtype == F16:
         return array.view("float16")
-    assert not (
-        array.dtype == BF16 and ml_dtypes is None
-    ), f"bfloat16 requires the ml_dtypes package, please run:\n\npip install ml_dtypes\n"
-    if array.dtype == BF16:
-        return array.view("bfloat16")
     return array
 
 
diff --git a/mlir/python/requirements.txt b/mlir/python/requirements.txt
index 6ec63e43adf896..acd6dbb25edaf5 100644
--- a/mlir/python/requirements.txt
+++ b/mlir/python/requirements.txt
@@ -1,4 +1,3 @@
 numpy>=1.19.5, <=1.26
 pybind11>=2.9.0, <=2.10.3
-PyYAML>=5.3.1, <=6.0.1
-ml_dtypes   # provides several NumPy dtype extensions, including the bf16
\ No newline at end of file
+PyYAML>=5.3.1, <=6.0.1
\ No newline at end of file
diff --git a/mlir/test/python/execution_engine.py b/mlir/test/python/execution_engine.py
index 8125bf3fb8fc92..e8b47007a8907d 100644
--- a/mlir/test/python/execution_engine.py
+++ b/mlir/test/python/execution_engine.py
@@ -5,7 +5,6 @@
 from mlir.passmanager import *
 from mlir.execution_engine import *
 from mlir.runtime import *
-from ml_dtypes import bfloat16
 
 
 # Log everything to stderr and flush so that we have a unified stream to match
@@ -522,45 +521,6 @@ def testComplexUnrankedMemrefAdd():
 run(testComplexUnrankedMemrefAdd)
 
 
-# Test bf16 memrefs
-# CHECK-LABEL: TEST: testBF16Memref
-def testBF16Memref():
-    with Context():
-        module = Module.parse(
-            """
-    module  {
-      func.func @main(%arg0: memref<1xbf16>,
-                      %arg1: memref<1xbf16>) attributes { llvm.emit_c_interface } {
-        %0 = arith.constant 0 : index
-        %1 = memref.load %arg0[%0] : memref<1xbf16>
-        memref.store %1, %arg1[%0] : memref<1xbf16>
-        return
-      }
-    } """
-        )
-
-        arg1 = np.array([0.5]).astype(bfloat16)
-        arg2 = np.array([0.0]).astype(bfloat16)
-
-        arg1_memref_ptr = ctypes.pointer(
-            ctypes.pointer(get_ranked_memref_descriptor(arg1))
-        )
-        arg2_memref_ptr = ctypes.pointer(
-            ctypes.pointer(get_ranked_memref_descriptor(arg2))
-        )
-
-        execution_engine = ExecutionEngine(lowerToLLVM(module))
-        execution_engine.invoke("main", arg1_memref_ptr, arg2_memref_ptr)
-
-        # test to-numpy utility
-        # CHECK: [0.5]
-        npout = ranked_memref_to_numpy(arg2_memref_ptr[0])
-        log(npout)
-
-
-run(testBF16Memref)
-
-
 #  Test addition of two 2d_memref
 # CHECK-LABEL: TEST: testDynamicMemrefAdd2D
 def testDynamicMemrefAdd2D():

From 3e023d87d8e9a7bcf0a2feb2cee9b9ca47643a7e Mon Sep 17 00:00:00 2001
From: Pavel Labath <pavel@labath.sk>
Date: Thu, 30 May 2024 07:48:59 +0200
Subject: [PATCH 228/230] [lldb] Remove DWARFDebugInfo DIERef footguns (#92894)

DWARFDebugInfo doesn't know how to resolve the "file_index" component of
a DIERef. This patch removes GetUnit (in favor of existing
GetUnitContainingDIEOffset) and changes GetDIE to take only the
components it actually uses.
---
 .../Plugins/SymbolFile/DWARF/DWARFDebugInfo.cpp       | 11 +++--------
 lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfo.h |  3 +--
 .../Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp      |  7 ++++---
 .../Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp   |  2 +-
 4 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfo.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfo.cpp
index d28da728728e5f..c37cc91e08ed12 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfo.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfo.cpp
@@ -222,10 +222,6 @@ DWARFUnit *DWARFDebugInfo::GetUnitAtOffset(DIERef::Section section,
   return result;
 }
 
-DWARFUnit *DWARFDebugInfo::GetUnit(const DIERef &die_ref) {
-  return GetUnitContainingDIEOffset(die_ref.section(), die_ref.die_offset());
-}
-
 DWARFUnit *
 DWARFDebugInfo::GetUnitContainingDIEOffset(DIERef::Section section,
                                            dw_offset_t die_offset) {
@@ -253,9 +249,8 @@ bool DWARFDebugInfo::ContainsTypeUnits() {
 //
 // Get the DIE (Debug Information Entry) with the specified offset.
 DWARFDIE
-DWARFDebugInfo::GetDIE(const DIERef &die_ref) {
-  DWARFUnit *cu = GetUnit(die_ref);
-  if (cu)
-    return cu->GetNonSkeletonUnit().GetDIE(die_ref.die_offset());
+DWARFDebugInfo::GetDIE(DIERef::Section section, dw_offset_t die_offset) {
+  if (DWARFUnit *cu = GetUnitContainingDIEOffset(section, die_offset))
+    return cu->GetNonSkeletonUnit().GetDIE(die_offset);
   return DWARFDIE(); // Not found
 }
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfo.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfo.h
index 456ebd908ccb22..4706b55d38ea98 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfo.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfo.h
@@ -38,11 +38,10 @@ class DWARFDebugInfo {
                              uint32_t *idx_ptr = nullptr);
   DWARFUnit *GetUnitContainingDIEOffset(DIERef::Section section,
                                         dw_offset_t die_offset);
-  DWARFUnit *GetUnit(const DIERef &die_ref);
   DWARFUnit *GetSkeletonUnit(DWARFUnit *dwo_unit);
   DWARFTypeUnit *GetTypeUnitForHash(uint64_t hash);
   bool ContainsTypeUnits();
-  DWARFDIE GetDIE(const DIERef &die_ref);
+  DWARFDIE GetDIE(DIERef::Section section, dw_offset_t die_offset);
 
   enum {
     eDumpFlag_Verbose = (1 << 0),  // Verbose dumping
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
index bc489e5b8ad465..661e4a78a02159 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
@@ -1761,7 +1761,8 @@ SymbolFileDWARF::GetDIE(const DIERef &die_ref) {
     if (SymbolFileDWARFDebugMap *debug_map = GetDebugMapSymfile()) {
       symbol_file = debug_map->GetSymbolFileByOSOIndex(*file_index); // OSO case
       if (symbol_file)
-        return symbol_file->DebugInfo().GetDIE(die_ref);
+        return symbol_file->DebugInfo().GetDIE(die_ref.section(),
+                                               die_ref.die_offset());
       return DWARFDIE();
     }
 
@@ -1778,7 +1779,7 @@ SymbolFileDWARF::GetDIE(const DIERef &die_ref) {
   if (symbol_file)
     return symbol_file->GetDIE(die_ref);
 
-  return DebugInfo().GetDIE(die_ref);
+  return DebugInfo().GetDIE(die_ref.section(), die_ref.die_offset());
 }
 
 /// Return the DW_AT_(GNU_)dwo_id.
@@ -3786,7 +3787,7 @@ SymbolFileDWARF::FindBlockContainingSpecification(
   // Give the concrete function die specified by "func_die_offset", find the
   // concrete block whose DW_AT_specification or DW_AT_abstract_origin points
   // to "spec_block_die_offset"
-  return FindBlockContainingSpecification(DebugInfo().GetDIE(func_die_ref),
+  return FindBlockContainingSpecification(GetDIE(func_die_ref),
                                           spec_block_die_offset);
 }
 
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp
index 8fd369c65f86b6..e4db39cabf6fee 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp
@@ -145,7 +145,7 @@ SymbolFileDWARFDwo::GetTypeSystemForLanguage(LanguageType language) {
 DWARFDIE
 SymbolFileDWARFDwo::GetDIE(const DIERef &die_ref) {
   if (die_ref.file_index() == GetFileIndex())
-    return DebugInfo().GetDIE(die_ref);
+    return DebugInfo().GetDIE(die_ref.section(), die_ref.die_offset());
   return GetBaseSymbolFile().GetDIE(die_ref);
 }
 

From 498da62088b22ef1d4e90d6021a80ae7bab6abae Mon Sep 17 00:00:00 2001
From: Matheus Izvekov <mizvekov@gmail.com>
Date: Thu, 1 Feb 2024 02:07:16 -0300
Subject: [PATCH 229/230] [NFC] [clang] add tests for merging of
 UsingShadowDecl

---
 clang/test/Modules/cxx20-decls.cppm | 35 +++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 clang/test/Modules/cxx20-decls.cppm

diff --git a/clang/test/Modules/cxx20-decls.cppm b/clang/test/Modules/cxx20-decls.cppm
new file mode 100644
index 00000000000000..9f0c40685b68f5
--- /dev/null
+++ b/clang/test/Modules/cxx20-decls.cppm
@@ -0,0 +1,35 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: cd %t
+//
+// RUN: %clang_cc1 -std=c++20 -I %t %t/A.cppm -emit-module-interface -o %t/A.pcm -verify
+// RUN: %clang_cc1 -std=c++20 -I %t %t/B.cpp -fmodule-file=A=%t/A.pcm -fsyntax-only -verify -ast-dump-all -ast-dump-filter baz | FileCheck %s
+
+//--- foo.h
+namespace baz {
+  using foo = char;
+  using baz::foo;
+}
+
+//--- A.cppm
+// expected-no-diagnostics
+module;
+#include "foo.h"
+export module A;
+
+//--- B.cpp
+// expected-no-diagnostics
+#include "foo.h"
+import A;
+// Since modules are loaded lazily, force loading by performing a lookup.
+using xxx = baz::foo;
+
+// CHECK-LABEL: Dumping baz:
+// CHECK-NEXT: NamespaceDecl 0x[[BAZ_REDECL_ADDR:[^ ]*]] prev 0x[[BAZ_ADDR:[^ ]*]]
+// CHECK:      TypeAliasDecl 0x[[ALIAS_REDECL_ADDR:[^ ]*]] prev 0x[[ALIAS_ADDR:[^ ]*]]
+// FIXME: UsingShadowDecl should have been merged
+// CHECK:      UsingShadowDecl 0x{{[^ ]*}} <{{.*}}> col:{{.*}} imported in A.<global> hidden implicit TypeAlias 0x[[ALIAS_REDECL_ADDR]] 'foo'
+
+// CHECK-LABEL: Dumping baz:
+// CHECK-NEXT: NamespaceDecl 0x[[BAZ_ADDR]] <{{.*}}> line:{{.*}} baz
+// CHECK:      UsingShadowDecl 0x[[SHADOW_ADDR:[^ ]*]] <{{.*}}> col:{{.*}} implicit TypeAlias 0x[[ALIAS_ADDR]] 'foo'

From 6a3982f8b7e37987659706cb3e6427c54c9bc7ce Mon Sep 17 00:00:00 2001
From: Christian Ulmann <christianulmann@gmail.com>
Date: Thu, 30 May 2024 07:58:13 +0200
Subject: [PATCH 230/230] [MLIR][LLVM] Relax the LLVM dialect's inliner
 assuming UCF (#93514)

This commit changes the LLVM dialect's inliner interface to stop
assuming that the inlined function only contained unstructured control
flow. This is not necessarily true, and it led to not properly
propagating the noalias information.
---
 mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp   | 23 ++++----
 .../Dialect/LLVMIR/inlining-alias-scopes.mlir | 59 +++++++++++++++----
 2 files changed, 58 insertions(+), 24 deletions(-)

diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp
index 4a6154ea6d3004..5552dc5e244b84 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp
@@ -187,7 +187,7 @@ deepCloneAliasScopes(iterator_range<Region::iterator> inlinedBlocks) {
   };
 
   for (Block &block : inlinedBlocks) {
-    for (Operation &op : block) {
+    block.walk([&](Operation *op) {
       if (auto aliasInterface = dyn_cast<LLVM::AliasAnalysisOpInterface>(op)) {
         aliasInterface.setAliasScopes(
             convertScopeList(aliasInterface.getAliasScopesOrNull()));
@@ -202,7 +202,7 @@ deepCloneAliasScopes(iterator_range<Region::iterator> inlinedBlocks) {
         noAliasScope.setScopeAttr(cast<LLVM::AliasScopeAttr>(
             mapping.lookup(noAliasScope.getScopeAttr())));
       }
-    }
+    });
   }
 }
 
@@ -357,9 +357,7 @@ static void createNewAliasScopesFromNoAliasParameter(
   // Go through every instruction and attempt to find which noalias parameters
   // it is definitely based on and definitely not based on.
   for (Block &inlinedBlock : inlinedBlocks) {
-    for (auto aliasInterface :
-         inlinedBlock.getOps<LLVM::AliasAnalysisOpInterface>()) {
-
+    inlinedBlock.walk([&](LLVM::AliasAnalysisOpInterface aliasInterface) {
       // Collect the pointer arguments affected by the alias scopes.
       SmallVector<Value> pointerArgs = aliasInterface.getAccessedOperands();
 
@@ -395,7 +393,7 @@ static void createNewAliasScopesFromNoAliasParameter(
             }
             return true;
           }))
-        continue;
+        return;
 
       // Add all noalias parameter scopes to the noalias scope list that we are
       // not based on.
@@ -438,7 +436,7 @@ static void createNewAliasScopesFromNoAliasParameter(
       // arguments.
       if (aliasesOtherKnownObject ||
           isa<LLVM::CallOp>(aliasInterface.getOperation()))
-        continue;
+        return;
 
       SmallVector<Attribute> aliasScopes;
       for (LLVM::SSACopyOp noAlias : noAliasParams)
@@ -449,7 +447,7 @@ static void createNewAliasScopesFromNoAliasParameter(
         aliasInterface.setAliasScopes(
             concatArrayAttr(aliasInterface.getAliasScopesOrNull(),
                             ArrayAttr::get(call->getContext(), aliasScopes)));
-    }
+    });
   }
 }
 
@@ -472,7 +470,7 @@ appendCallOpAliasScopes(Operation *call,
   // Simply append the call op's alias and noalias scopes to any operation
   // implementing AliasAnalysisOpInterface.
   for (Block &block : inlinedBlocks) {
-    for (auto aliasInterface : block.getOps<LLVM::AliasAnalysisOpInterface>()) {
+    block.walk([&](LLVM::AliasAnalysisOpInterface aliasInterface) {
       if (aliasScopes)
         aliasInterface.setAliasScopes(concatArrayAttr(
             aliasInterface.getAliasScopesOrNull(), aliasScopes));
@@ -480,7 +478,7 @@ appendCallOpAliasScopes(Operation *call,
       if (noAliasScopes)
         aliasInterface.setNoAliasScopes(concatArrayAttr(
             aliasInterface.getNoAliasScopesOrNull(), noAliasScopes));
-    }
+    });
   }
 }
 
@@ -667,7 +665,7 @@ struct LLVMInlinerInterface : public DialectInlinerInterface {
       LLVM_DEBUG(llvm::dbgs() << "Cannot inline: callable is variadic\n");
       return false;
     }
-    // TODO: Generate aliasing metadata from noalias argument/result attributes.
+    // TODO: Generate aliasing metadata from noalias result attributes.
     if (auto attrs = funcOp.getArgAttrs()) {
       for (DictionaryAttr attrDict : attrs->getAsRange<DictionaryAttr>()) {
         if (attrDict.contains(LLVM::LLVMDialect::getInAllocaAttrName())) {
@@ -755,8 +753,7 @@ struct LLVMInlinerInterface : public DialectInlinerInterface {
       return handleByValArgument(builder, callable, argument, elementType,
                                  requestedAlignment);
     }
-    if ([[maybe_unused]] std::optional<NamedAttribute> attr =
-            argumentAttrs.getNamed(LLVM::LLVMDialect::getNoAliasAttrName())) {
+    if (argumentAttrs.contains(LLVM::LLVMDialect::getNoAliasAttrName())) {
       if (argument.use_empty())
         return argument;
 
diff --git a/mlir/test/Dialect/LLVMIR/inlining-alias-scopes.mlir b/mlir/test/Dialect/LLVMIR/inlining-alias-scopes.mlir
index 29450833bee598..0b8b60e963bb01 100644
--- a/mlir/test/Dialect/LLVMIR/inlining-alias-scopes.mlir
+++ b/mlir/test/Dialect/LLVMIR/inlining-alias-scopes.mlir
@@ -24,12 +24,15 @@ llvm.func @foo(%arg0: !llvm.ptr, %arg1: !llvm.ptr) {
   %0 = llvm.mlir.constant(5 : i64) : i64
   llvm.intr.experimental.noalias.scope.decl #alias_scope
   %2 = llvm.load %arg1 {alias_scopes = [#alias_scope], alignment = 4 : i64, noalias_scopes = [#alias_scope1]} : !llvm.ptr -> f32
-  %3 = llvm.getelementptr inbounds %arg0[%0] : (!llvm.ptr, i64) -> !llvm.ptr, f32
-  llvm.store %2, %3 {alias_scopes = [#alias_scope1], alignment = 4 : i64, noalias_scopes = [#alias_scope]} : f32, !llvm.ptr
+  "test.one_region_op"() ({
+    %3 = llvm.getelementptr inbounds %arg0[%0] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %2, %3 {alias_scopes = [#alias_scope1], alignment = 4 : i64, noalias_scopes = [#alias_scope]} : f32, !llvm.ptr
+    "test.terminator"() : () -> ()
+  }) : () -> ()
   llvm.return
 }
 
-// CHECK-LABEL: llvm.func @bar
+// CHECK-LABEL: llvm.func @clone_alias_scopes
 // CHECK: llvm.intr.experimental.noalias.scope.decl #[[$BAR_LOAD]]
 // CHECK: llvm.load
 // CHECK-SAME: alias_scopes = [#[[$BAR_LOAD]]]
@@ -37,8 +40,8 @@ llvm.func @foo(%arg0: !llvm.ptr, %arg1: !llvm.ptr) {
 // CHECK: llvm.store
 // CHECK-SAME: alias_scopes = [#[[$BAR_STORE]]]
 // CHECK-SAME: noalias_scopes = [#[[$BAR_LOAD]]]
-llvm.func @bar(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr) {
-  llvm.call @foo(%arg0, %arg2) : (!llvm.ptr, !llvm.ptr) -> ()
+llvm.func @clone_alias_scopes(%arg0: !llvm.ptr, %arg1: !llvm.ptr) {
+  llvm.call @foo(%arg0, %arg1) : (!llvm.ptr, !llvm.ptr) -> ()
   llvm.return
 }
 
@@ -87,9 +90,12 @@ llvm.func @callee_with_metadata(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm
   llvm.store %3, %4 {alias_scopes = [#alias_scope], alignment = 4 : i64, noalias_scopes = [#alias_scope1]} : f32, !llvm.ptr
   %5 = llvm.getelementptr inbounds %arg1[%1] : (!llvm.ptr, i64) -> !llvm.ptr, f32
   llvm.store %3, %5 {alias_scopes = [#alias_scope1], alignment = 4 : i64, noalias_scopes = [#alias_scope]} : f32, !llvm.ptr
-  %6 = llvm.load %arg2 {alignment = 4 : i64} : !llvm.ptr -> f32
-  %7 = llvm.getelementptr inbounds %arg0[%2] : (!llvm.ptr, i64) -> !llvm.ptr, f32
-  llvm.store %6, %7 {alignment = 4 : i64} : f32, !llvm.ptr
+  "test.one_region_op"() ({
+    %6 = llvm.load %arg2 {alignment = 4 : i64} : !llvm.ptr -> f32
+    %7 = llvm.getelementptr inbounds %arg0[%2] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %6, %7 {alignment = 4 : i64} : f32, !llvm.ptr
+    "test.terminator"() : () -> ()
+  }) : () -> ()
   llvm.return
 }
 
@@ -105,9 +111,13 @@ llvm.func @callee_without_metadata(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !l
   llvm.store %3, %4 {alignment = 4 : i64} : f32, !llvm.ptr
   %5 = llvm.getelementptr inbounds %arg1[%1] : (!llvm.ptr, i64) -> !llvm.ptr, f32
   llvm.store %3, %5 {alignment = 4 : i64} : f32, !llvm.ptr
-  %6 = llvm.load %arg2 {alignment = 4 : i64} : !llvm.ptr -> f32
-  %7 = llvm.getelementptr inbounds %arg0[%2] : (!llvm.ptr, i64) -> !llvm.ptr, f32
-  llvm.store %6, %7 {alignment = 4 : i64} : f32, !llvm.ptr
+  "test.one_region_op"() ({
+    %6 = llvm.load %arg2 {alignment = 4 : i64} : !llvm.ptr -> f32
+    %7 = llvm.getelementptr inbounds %arg0[%2] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %6, %7 {alignment = 4 : i64} : f32, !llvm.ptr
+    "test.terminator"() : () -> ()
+  }) : () -> ()
+
   llvm.return
 }
 
@@ -394,3 +404,30 @@ llvm.func @bar(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr) {
   llvm.call @supported_operations(%arg0, %arg2) : (!llvm.ptr, !llvm.ptr) -> ()
   llvm.return
 }
+
+// -----
+
+// CHECK-DAG: #[[DOMAIN:.*]] = #llvm.alias_scope_domain<{{.*}}>
+// CHECK-DAG: #[[$ARG_SCOPE:.*]] = #llvm.alias_scope<id = {{.*}}, domain = #[[DOMAIN]]{{(,.*)?}}>
+
+llvm.func @foo(%arg: i32)
+
+llvm.func @region(%arg0: !llvm.ptr {llvm.noalias}) {
+  "test.one_region_op"() ({
+    %1 = llvm.load %arg0 : !llvm.ptr -> i32
+    llvm.call @foo(%1) : (i32) -> ()
+    "test.terminator"() : () -> ()
+  }) : () -> ()
+  llvm.return
+}
+
+// CHECK-LABEL: llvm.func @noalias_with_region
+// CHECK: llvm.load
+// CHECK-SAME: alias_scopes = [#[[$ARG_SCOPE]]]
+// CHECK: llvm.call
+// CHECK-NOT: alias_scopes
+// CHECK-SAME: noalias_scopes = [#[[$ARG_SCOPE]]]
+llvm.func @noalias_with_region(%arg0: !llvm.ptr) {
+  llvm.call @region(%arg0) : (!llvm.ptr) -> ()
+  llvm.return
+}