diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 807f726b583800..86be15b72fb64a 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -93,3 +93,4 @@ b32931c5b32eb0d2cf37d688b34f8548c9674c19 b6262880b34629e9d7a72b5a42f315a3c9ed8139 39c7dc7207e76e72da21cf4fedda21b5311bf62d e80bc777749331e9519575f416c342f7626dd14d +7e5cd8f1b6c5263ed5e2cc03d60c8779a8d3e9f7 diff --git a/clang-tools-extra/include-cleaner/lib/WalkAST.cpp b/clang-tools-extra/include-cleaner/lib/WalkAST.cpp index f7cc9d19123635..b15d428326ac12 100644 --- a/clang-tools-extra/include-cleaner/lib/WalkAST.cpp +++ b/clang-tools-extra/include-cleaner/lib/WalkAST.cpp @@ -271,7 +271,6 @@ class ASTWalker : public RecursiveASTVisitor { // specialized template. Implicit ones are filtered out by RAV. bool VisitClassTemplateSpecializationDecl(ClassTemplateSpecializationDecl *CTSD) { - // if (CTSD->isExplicitSpecialization()) if (clang::isTemplateExplicitInstantiationOrSpecialization( CTSD->getTemplateSpecializationKind())) report(CTSD->getLocation(), @@ -279,7 +278,6 @@ class ASTWalker : public RecursiveASTVisitor { return true; } bool VisitVarTemplateSpecializationDecl(VarTemplateSpecializationDecl *VTSD) { - // if (VTSD->isExplicitSpecialization()) if (clang::isTemplateExplicitInstantiationOrSpecialization( VTSD->getTemplateSpecializationKind())) report(VTSD->getLocation(), diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index be07f81cc41b00..4679dbb68b25e1 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -1547,6 +1547,7 @@ The following type trait primitives are supported by Clang. Those traits marked * ``__array_extent(type, dim)`` (Embarcadero): The ``dim``'th array bound in the type ``type``, or ``0`` if ``dim >= __array_rank(type)``. 
+* ``__builtin_is_implicit_lifetime`` (C++, GNU, Microsoft) * ``__builtin_is_virtual_base_of`` (C++, GNU, Microsoft) * ``__can_pass_in_regs`` (C++) Returns whether a class can be passed in registers under the current diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 39e1b0fcb09bbd..b1864901e7bddb 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -49,9 +49,36 @@ C++ Specific Potentially Breaking Changes few users and can be written as ``__is_same(__remove_cv(T), decltype(nullptr))``, which GCC supports as well. +- Clang will now correctly diagnose as ill-formed a constant expression where an + enum without a fixed underlying type is set to a value outside the range of + the enumeration's values. + + .. code-block:: c++ + + enum E { Zero, One, Two, Three, Four }; + constexpr E Val1 = (E)3; // Ok + constexpr E Val2 = (E)7; // Ok + constexpr E Val3 = (E)8; // Now ill-formed, out of the range [0, 7] + constexpr E Val4 = (E)-1; // Now ill-formed, out of the range [0, 7] + + Since Clang 16, it has been possible to suppress the diagnostic via + `-Wno-enum-constexpr-conversion`, to allow for a transition period for users. + Now, in Clang 20, **it is no longer possible to suppress the diagnostic**. + +- Extraneous template headers are now ill-formed by default. + This error can be disabled with ``-Wno-error=extraneous-template-head``. + + .. code-block:: c++ + + template <> // error: extraneous template head + template + void f(); + ABI Changes in This Version --------------------------- +- Fixed Microsoft name mangling of placeholder, auto and decltype(auto), return types for MSVC 1920+. This change resolves incompatibilities with code compiled by MSVC 1920+ but will introduce incompatibilities with code compiled by earlier versions of Clang unless such code is built with the compiler option -fms-compatibility-version=19.14 to imitate the MSVC 1914 mangling behavior. 
+ AST Dumping Potentially Breaking Changes ---------------------------------------- @@ -95,6 +122,9 @@ C++23 Feature Support C++2c Feature Support ^^^^^^^^^^^^^^^^^^^^^ +- Add ``__builtin_is_implicit_lifetime`` intrinsic, which supports + `P2647R1 A trait for implicit lifetime types `_ + - Add ``__builtin_is_virtual_base_of`` intrinsic, which supports `P2985R0 A type trait for detecting virtual base classes `_ @@ -140,6 +170,11 @@ Modified Compiler Flags Removed Compiler Flags ------------------------- +- The compiler flag `-Wenum-constexpr-conversion` (and the `Wno-`, `Wno-error-` + derivatives) is now removed, since it's no longer possible to suppress the + diagnostic (see above). Users can expect an `unknown warning` diagnostic if + it's still in use. + Attribute Changes in Clang -------------------------- @@ -217,8 +252,10 @@ Bug Fixes to C++ Support - Clang now preserves the unexpanded flag in a lambda transform used for pack expansion. (#GH56852), (#GH85667), (#GH99877). - Fixed a bug when diagnosing ambiguous explicit specializations of constrained member functions. -- Fixed an assertion failure when selecting a function from an overload set that includes a +- Fixed an assertion failure when selecting a function from an overload set that includes a specialization of a conversion function template. +- Correctly diagnose attempts to use a concept name in its own definition; + A concept name is introduced to its scope sooner to match the C++ standard. (#GH55875) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -347,11 +384,46 @@ Improvements Moved checkers ^^^^^^^^^^^^^^ +- The checker ``alpha.security.MallocOverflow`` was deleted because it was + badly implemented and its aggressive logic produced too many false positives. + To detect too large arguments passed to malloc, consider using the checker + ``alpha.taint.TaintedAlloc``. + .. 
_release-notes-sanitizers: Sanitizers ---------- +- Added the ``-fsanitize-overflow-pattern-exclusion=`` flag which can be used + to disable specific overflow-dependent code patterns. The supported patterns + are: ``add-overflow-test``, ``negated-unsigned-const``, and + ``post-decr-while``. The sanitizer instrumentation can be toggled off for all + available patterns by specifying ``all``. Conversely, you can disable all + exclusions with ``none``. + + .. code-block:: c++ + + /// specified with ``-fsanitize-overflow-pattern-exclusion=add-overflow-test`` + int common_overflow_check_pattern(unsigned base, unsigned offset) { + if (base + offset < base) { /* ... */ } // The pattern of `a + b < a`, and other re-orderings, won't be instrumented + } + + /// specified with ``-fsanitize-overflow-pattern-exclusion=negated-unsigned-const`` + void negation_overflow() { + unsigned long foo = -1UL; // No longer causes a negation overflow warning + unsigned long bar = -2UL; // and so on... + } + + /// specified with ``-fsanitize-overflow-pattern-exclusion=post-decr-while`` + void while_post_decrement() { + unsigned char count = 16; + while (count--) { /* ... */} // No longer causes unsigned-integer-overflow sanitizer to trip + } + + Many existing projects have a large amount of these code patterns present. + This new flag should allow those projects to enable integer sanitizers with + less noise. + Python Binding Changes ---------------------- - Fixed an issue that led to crashes when calling ``Type.get_exception_specification_kind``. diff --git a/clang/docs/UndefinedBehaviorSanitizer.rst b/clang/docs/UndefinedBehaviorSanitizer.rst index 531d56e313826c..9f3d980eefbea7 100644 --- a/clang/docs/UndefinedBehaviorSanitizer.rst +++ b/clang/docs/UndefinedBehaviorSanitizer.rst @@ -293,6 +293,48 @@ To silence reports from unsigned integer overflow, you can set ``-fsanitize-recover=unsigned-integer-overflow``, is particularly useful for providing fuzzing signal without blowing up logs. 
+Disabling instrumentation for common overflow patterns +------------------------------------------------------ + +There are certain overflow-dependent or overflow-prone code patterns which +produce a lot of noise for integer overflow/truncation sanitizers. Negated +unsigned constants, post-decrements in a while loop condition and simple +overflow checks are accepted and pervasive code patterns. However, the signal +received from sanitizers instrumenting these code patterns may be too noisy for +some projects. To disable instrumentation for these common patterns one should +use ``-fsanitize-overflow-pattern-exclusion=``. + +Currently, this option supports three overflow-dependent code idioms: + +``negated-unsigned-const`` + +.. code-block:: c++ + + /// -fsanitize-overflow-pattern-exclusion=negated-unsigned-const + unsigned long foo = -1UL; // No longer causes a negation overflow warning + unsigned long bar = -2UL; // and so on... + +``post-decr-while`` + +.. code-block:: c++ + + /// -fsanitize-overflow-pattern-exclusion=post-decr-while + unsigned char count = 16; + while (count--) { /* ... */ } // No longer causes unsigned-integer-overflow sanitizer to trip + +``add-overflow-test`` + +.. code-block:: c++ + + /// -fsanitize-overflow-pattern-exclusion=add-overflow-test + if (base + offset < base) { /* ... */ } // The pattern of `a + b < a`, and other re-orderings, + // won't be instrumented (same for signed types) + +You can enable all exclusions with +``-fsanitize-overflow-pattern-exclusion=all`` or disable all exclusions with +``-fsanitize-overflow-pattern-exclusion=none``. Specifying ``none`` has +precedence over other values. + Issue Suppression ================= diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst index 46b0b7b9c82376..0bfbc995579d41 100644 --- a/clang/docs/analyzer/checkers.rst +++ b/clang/docs/analyzer/checkers.rst @@ -2951,49 +2951,6 @@ Warn about buffer overflows (newer checker). 
char c = s[x]; // warn: index is tainted } -.. _alpha-security-MallocOverflow: - -alpha.security.MallocOverflow (C) -""""""""""""""""""""""""""""""""" -Check for overflows in the arguments to ``malloc()``. -It tries to catch ``malloc(n * c)`` patterns, where: - - - ``n``: a variable or member access of an object - - ``c``: a constant foldable integral - -This checker was designed for code audits, so expect false-positive reports. -One is supposed to silence this checker by ensuring proper bounds checking on -the variable in question using e.g. an ``assert()`` or a branch. - -.. code-block:: c - - void test(int n) { - void *p = malloc(n * sizeof(int)); // warn - } - - void test2(int n) { - if (n > 100) // gives an upper-bound - return; - void *p = malloc(n * sizeof(int)); // no warning - } - - void test3(int n) { - assert(n <= 100 && "Contract violated."); - void *p = malloc(n * sizeof(int)); // no warning - } - -Limitations: - - - The checker won't warn for variables involved in explicit casts, - since that might limit the variable's domain. - E.g.: ``(unsigned char)int x`` would limit the domain to ``[0,255]``. - The checker will miss the true-positive cases when the explicit cast would - not tighten the domain to prevent the overflow in the subsequent - multiplication operation. - - - It is an AST-based checker, thus it does not make use of the - path-sensitive taint-analysis. - .. _alpha-security-MmapWriteExec: alpha.security.MmapWriteExec (C) diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index 561a9d872acfb0..6d84bd03de810a 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -3206,6 +3206,10 @@ class FieldDecl : public DeclaratorDecl, public Mergeable { /// Set the C++11 in-class initializer for this member. void setInClassInitializer(Expr *NewInit); + /// Find the FieldDecl specified in a FAM's "counted_by" attribute. Returns + /// \p nullptr if either the attribute or the field doesn't exist. 
+ const FieldDecl *findCountedByField() const; + private: void setLazyInClassInitializer(LazyDeclStmtPtr NewInit); diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h index 5b6a6b40b28ef8..687715a22e9fd3 100644 --- a/clang/include/clang/AST/DeclTemplate.h +++ b/clang/include/clang/AST/DeclTemplate.h @@ -3146,19 +3146,24 @@ class ConceptDecl : public TemplateDecl, public Mergeable { : TemplateDecl(Concept, DC, L, Name, Params), ConstraintExpr(ConstraintExpr) {}; public: - static ConceptDecl *Create(ASTContext &C, DeclContext *DC, - SourceLocation L, DeclarationName Name, + static ConceptDecl *Create(ASTContext &C, DeclContext *DC, SourceLocation L, + DeclarationName Name, TemplateParameterList *Params, - Expr *ConstraintExpr); + Expr *ConstraintExpr = nullptr); static ConceptDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID); Expr *getConstraintExpr() const { return ConstraintExpr; } + bool hasDefinition() const { return ConstraintExpr != nullptr; } + + void setDefinition(Expr *E) { ConstraintExpr = E; } + SourceRange getSourceRange() const override LLVM_READONLY { return SourceRange(getTemplateParameters()->getTemplateLoc(), - ConstraintExpr->getEndLoc()); + ConstraintExpr ? 
ConstraintExpr->getEndLoc() + : SourceLocation()); } bool isTypeConcept() const { diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index 5b813bfc2faf90..f5863524723a2e 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -4043,6 +4043,15 @@ class BinaryOperator : public Expr { void setHasStoredFPFeatures(bool B) { BinaryOperatorBits.HasFPFeatures = B; } bool hasStoredFPFeatures() const { return BinaryOperatorBits.HasFPFeatures; } + /// Set and get the bit that informs arithmetic overflow sanitizers whether + /// or not they should exclude certain BinaryOperators from instrumentation + void setExcludedOverflowPattern(bool B) { + BinaryOperatorBits.ExcludedOverflowPattern = B; + } + bool hasExcludedOverflowPattern() const { + return BinaryOperatorBits.ExcludedOverflowPattern; + } + /// Get FPFeatures from trailing storage FPOptionsOverride getStoredFPFeatures() const { assert(hasStoredFPFeatures()); diff --git a/clang/include/clang/AST/Stmt.h b/clang/include/clang/AST/Stmt.h index bbd7634bcc3bfb..f1a2aac0a8b2f8 100644 --- a/clang/include/clang/AST/Stmt.h +++ b/clang/include/clang/AST/Stmt.h @@ -650,6 +650,11 @@ class alignas(void *) Stmt { LLVM_PREFERRED_TYPE(bool) unsigned HasFPFeatures : 1; + /// Whether or not this BinaryOperator should be excluded from integer + /// overflow sanitization. 
+ LLVM_PREFERRED_TYPE(bool) + unsigned ExcludedOverflowPattern : 1; + SourceLocation OpLoc; }; diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 8ac2079099c854..10a9d9e899e007 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -4467,7 +4467,7 @@ def ReleaseHandle : InheritableParamAttr { def UnsafeBufferUsage : InheritableAttr { let Spellings = [Clang<"unsafe_buffer_usage">]; - let Subjects = SubjectList<[Function]>; + let Subjects = SubjectList<[Function, Field]>; let Documentation = [UnsafeBufferUsageDocs]; } @@ -4599,12 +4599,18 @@ def HLSLResource : InheritableAttr { "CBuffer", "Sampler", "TBuffer", "RTAccelerationStructure", "FeedbackTexture2D", "FeedbackTexture2DArray" ], - /*opt=*/0, /*fake=*/0, /*isExternalType=*/1, /*isCovered=*/0>, - DefaultBoolArgument<"isROV", /*default=*/0> + /*opt=*/0, /*fake=*/0, /*isExternalType=*/1, /*isCovered=*/0> ]; let Documentation = [InternalOnly]; } +def HLSLROV : InheritableAttr { + let Spellings = [CXX11<"hlsl", "is_rov">]; + let Subjects = SubjectList<[Struct]>; + let LangOpts = [HLSL]; + let Documentation = [InternalOnly]; +} + def HLSLResourceClass : InheritableAttr { let Spellings = [CXX11<"hlsl", "resource_class">]; let Subjects = SubjectList<[Struct]>; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 94c284fc731589..19cbb9a0111a28 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -6802,77 +6802,103 @@ def UnsafeBufferUsageDocs : Documentation { let Category = DocCatFunction; let Content = [{ The attribute ``[[clang::unsafe_buffer_usage]]`` should be placed on functions -that need to be avoided as they are prone to buffer overflows. 
It is designed to -work together with the off-by-default compiler warning ``-Wunsafe-buffer-usage`` -to help codebases transition away from raw pointer based buffer management, -in favor of safer abstractions such as C++20 ``std::span``. The attribute causes -``-Wunsafe-buffer-usage`` to warn on every use of the function, and it may -enable ``-Wunsafe-buffer-usage`` to emit automatic fix-it hints -which would help the user replace such unsafe functions with safe +that need to be avoided as they are prone to buffer overflows or unsafe buffer +struct fields. It is designed to work together with the off-by-default compiler +warning ``-Wunsafe-buffer-usage`` to help codebases transition away from raw pointer +based buffer management, in favor of safer abstractions such as C++20 ``std::span``. +The attribute causes ``-Wunsafe-buffer-usage`` to warn on every use of the function or +the field it is attached to, and it may also lead to emission of automatic fix-it +hints which would help the user replace the use of unsafe functions(/fields) with safe alternatives, though the attribute can be used even when the fix can't be automated. -The attribute does not suppress ``-Wunsafe-buffer-usage`` inside the function -to which it is attached. These warnings still need to be addressed. +* Attribute attached to functions: The attribute does not suppress + ``-Wunsafe-buffer-usage`` inside the function to which it is attached. + These warnings still need to be addressed. -The attribute is warranted even if the only way a function can overflow -the buffer is by violating the function's preconditions. For example, it -would make sense to put the attribute on function ``foo()`` below because -passing an incorrect size parameter would cause a buffer overflow: + The attribute is warranted even if the only way a function can overflow + the buffer is by violating the function's preconditions. 
For example, it + would make sense to put the attribute on function ``foo()`` below because + passing an incorrect size parameter would cause a buffer overflow: -.. code-block:: c++ + .. code-block:: c++ - [[clang::unsafe_buffer_usage]] - void foo(int *buf, size_t size) { - for (size_t i = 0; i < size; ++i) { - buf[i] = i; + [[clang::unsafe_buffer_usage]] + void foo(int *buf, size_t size) { + for (size_t i = 0; i < size; ++i) { + buf[i] = i; + } } - } -The attribute is NOT warranted when the function uses safe abstractions, -assuming that these abstractions weren't misused outside the function. -For example, function ``bar()`` below doesn't need the attribute, -because assuming that the container ``buf`` is well-formed (has size that -fits the original buffer it refers to), overflow cannot occur: + The attribute is NOT warranted when the function uses safe abstractions, + assuming that these abstractions weren't misused outside the function. + For example, function ``bar()`` below doesn't need the attribute, + because assuming that the container ``buf`` is well-formed (has size that + fits the original buffer it refers to), overflow cannot occur: -.. code-block:: c++ + .. code-block:: c++ - void bar(std::span buf) { - for (size_t i = 0; i < buf.size(); ++i) { - buf[i] = i; + void bar(std::span buf) { + for (size_t i = 0; i < buf.size(); ++i) { + buf[i] = i; + } } - } -In this case function ``bar()`` enables the user to keep the buffer -"containerized" in a span for as long as possible. On the other hand, -Function ``foo()`` in the previous example may have internal -consistency, but by accepting a raw buffer it requires the user to unwrap -their span, which is undesirable according to the programming model -behind ``-Wunsafe-buffer-usage``. + In this case function ``bar()`` enables the user to keep the buffer + "containerized" in a span for as long as possible. 
On the other hand, + Function ``foo()`` in the previous example may have internal + consistency, but by accepting a raw buffer it requires the user to unwrap + their span, which is undesirable according to the programming model + behind ``-Wunsafe-buffer-usage``. -The attribute is warranted when a function accepts a raw buffer only to -immediately put it into a span: + The attribute is warranted when a function accepts a raw buffer only to + immediately put it into a span: -.. code-block:: c++ + .. code-block:: c++ - [[clang::unsafe_buffer_usage]] - void baz(int *buf, size_t size) { - std::span sp{ buf, size }; - for (size_t i = 0; i < sp.size(); ++i) { - sp[i] = i; + [[clang::unsafe_buffer_usage]] + void baz(int *buf, size_t size) { + std::span sp{ buf, size }; + for (size_t i = 0; i < sp.size(); ++i) { + sp[i] = i; + } } - } -In this case ``baz()`` does not contain any unsafe operations, but the awkward -parameter type causes the caller to unwrap the span unnecessarily. -Note that regardless of the attribute, code inside ``baz()`` isn't flagged -by ``-Wunsafe-buffer-usage`` as unsafe. It is definitely undesirable, -but if ``baz()`` is on an API surface, there is no way to improve it -to make it as safe as ``bar()`` without breaking the source and binary -compatibility with existing users of the function. In such cases -the proper solution would be to create a different function (possibly -an overload of ``baz()``) that accepts a safe container like ``bar()``, -and then use the attribute on the original ``baz()`` to help the users -update their code to use the new function. + In this case ``baz()`` does not contain any unsafe operations, but the awkward + parameter type causes the caller to unwrap the span unnecessarily. + Note that regardless of the attribute, code inside ``baz()`` isn't flagged + by ``-Wunsafe-buffer-usage`` as unsafe. 
It is definitely undesirable, + but if ``baz()`` is on an API surface, there is no way to improve it + to make it as safe as ``bar()`` without breaking the source and binary + compatibility with existing users of the function. In such cases + the proper solution would be to create a different function (possibly + an overload of ``baz()``) that accepts a safe container like ``bar()``, + and then use the attribute on the original ``baz()`` to help the users + update their code to use the new function. + +* Attribute attached to fields: The attribute should only be attached to + struct fields, if the fields can not be updated to a safe type with bounds + check, such as std::span. In other words, the buffers prone to unsafe accesses + should always be updated to use safe containers/views and attaching the attribute + must be last resort when such an update is infeasible. + + The attribute can be placed on individual fields or a set of them as shown below. + + .. code-block:: c++ + + struct A { + [[clang::unsafe_buffer_usage]] + int *ptr1; + + [[clang::unsafe_buffer_usage]] + int *ptr2, buf[10]; + + [[clang::unsafe_buffer_usage]] + size_t sz; + }; + + Here, every read/write to the fields ptr1, ptr2, buf and sz will trigger a warning + that the field has been explicitly marked as unsafe due to unsafe-buffer operations. 
+ }]; } diff --git a/clang/include/clang/Basic/DiagnosticASTKinds.td b/clang/include/clang/Basic/DiagnosticASTKinds.td index eb82e0159b56ed..f317c5ac44f32b 100644 --- a/clang/include/clang/Basic/DiagnosticASTKinds.td +++ b/clang/include/clang/Basic/DiagnosticASTKinds.td @@ -394,10 +394,9 @@ def warn_integer_constant_overflow : Warning< def warn_fixedpoint_constant_overflow : Warning< "overflow in expression; result is %0 with type %1">, InGroup>; -def warn_constexpr_unscoped_enum_out_of_range : Warning< +def note_constexpr_unscoped_enum_out_of_range : Note< "integer value %0 is outside the valid range of values [%1, %2] for the " - "enumeration type %3">, DefaultError, ShowInSystemHeader, ShowInSystemMacro, - InGroup>; + "enumeration type %3">; // This is a temporary diagnostic, and shall be removed once our // implementation is complete, and like the preceding constexpr notes belongs diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 554dbaff2ce0d8..da2f939067bfab 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -3012,6 +3012,8 @@ def err_concept_no_parameters : Error< "specialization of concepts is not allowed">; def err_concept_extra_headers : Error< "extraneous template parameter list in concept definition">; +def err_recursive_concept : Error< + "a concept definition cannot refer to itself">; def err_concept_no_associated_constraints : Error< "concept cannot have associated constraints">; def err_non_constant_constraint_expression : Error< @@ -5426,7 +5428,8 @@ def err_template_spec_extra_headers : Error< "extraneous template parameter list in template specialization or " "out-of-line template definition">; def ext_template_spec_extra_headers : ExtWarn< - "extraneous template parameter list in template specialization">; + "extraneous template parameter list in template specialization">, + InGroup>, DefaultError; def 
note_explicit_template_spec_does_not_need_header : Note< "'template<>' header not required for explicitly-specialized class %0 " "declared here">; @@ -8968,6 +8971,8 @@ def err_atomic_op_has_invalid_synch_scope : Error< def warn_atomic_implicit_seq_cst : Warning< "implicit use of sequentially-consistent atomic may incur stronger memory barriers than necessary">, InGroup>, DefaultIgnore; +def err_atomic_unsupported : Error< + "atomic types are not supported in '%0'">; def err_overflow_builtin_must_be_int : Error< "operand argument to %select{overflow builtin|checked integer operation}0 " @@ -12381,7 +12386,8 @@ def warn_unsafe_buffer_variable : Warning< InGroup, DefaultIgnore; def warn_unsafe_buffer_operation : Warning< "%select{unsafe pointer operation|unsafe pointer arithmetic|" - "unsafe buffer access|function introduces unsafe buffer manipulation|unsafe invocation of span::data}0">, + "unsafe buffer access|function introduces unsafe buffer manipulation|unsafe invocation of span::data|" + "field %1 prone to unsafe buffer manipulation}0">, InGroup, DefaultIgnore; def note_unsafe_buffer_operation : Note< "used%select{| in pointer arithmetic| in buffer access}0 here">; diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index d454a7ff2f8cf4..2e9f2c552aad8a 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -406,6 +406,8 @@ VALUE_LANGOPT(TrivialAutoVarInitMaxSize, 32, 0, "stop trivial automatic variable initialization if var size exceeds the specified size (in bytes). 
Must be greater than 0.") ENUM_LANGOPT(SignedOverflowBehavior, SignedOverflowBehaviorTy, 2, SOB_Undefined, "signed integer overflow handling") +LANGOPT(IgnoreNegationOverflow, 1, 0, "ignore overflow caused by negation") +LANGOPT(SanitizeOverflowIdioms, 1, 1, "enable instrumentation for common overflow idioms") ENUM_LANGOPT(ThreadModel , ThreadModelKind, 2, ThreadModelKind::POSIX, "Thread Model") BENIGN_LANGOPT(ArrowDepth, 32, 256, diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 91f1c2f2e6239e..eb4cb4b5a7e93f 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -367,6 +367,21 @@ class LangOptionsBase { PerThread, }; + /// Exclude certain code patterns from being instrumented by arithmetic + /// overflow sanitizers + enum OverflowPatternExclusionKind { + /// Don't exclude any overflow patterns from sanitizers + None = 1 << 0, + /// Exclude all overflow patterns (below) + All = 1 << 1, + /// if (a + b < a) + AddOverflowTest = 1 << 2, + /// -1UL + NegUnsignedConst = 1 << 3, + /// while (count--) + PostDecrInWhile = 1 << 4, + }; + enum class DefaultVisiblityExportMapping { None, /// map only explicit default visibilities to exported @@ -555,6 +570,11 @@ class LangOptions : public LangOptionsBase { /// The default stream kind used for HIP kernel launching. GPUDefaultStreamKind GPUDefaultStream; + /// Which overflow patterns should be excluded from sanitizer instrumentation + unsigned OverflowPatternExclusionMask = 0; + + std::vector OverflowPatternExclusionValues; + /// The seed used by the randomize structure layout feature. 
std::string RandstructSeed; @@ -630,6 +650,14 @@ class LangOptions : public LangOptionsBase { return MSCompatibilityVersion >= MajorVersion * 100000U; } + bool isOverflowPatternExcluded(OverflowPatternExclusionKind Kind) const { + if (OverflowPatternExclusionMask & OverflowPatternExclusionKind::None) + return false; + if (OverflowPatternExclusionMask & OverflowPatternExclusionKind::All) + return true; + return OverflowPatternExclusionMask & Kind; + } + /// Reset all of the options that are not considered when building a /// module. void resetNonModularOptions(); diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def index 421dbb413fed93..d683106bb0e298 100644 --- a/clang/include/clang/Basic/TokenKinds.def +++ b/clang/include/clang/Basic/TokenKinds.def @@ -502,6 +502,7 @@ TYPE_TRAIT_1(__has_trivial_move_assign, HasTrivialMoveAssign, KEYCXX) TYPE_TRAIT_1(__has_trivial_move_constructor, HasTrivialMoveConstructor, KEYCXX) // GNU and MS Type Traits +TYPE_TRAIT_1(__builtin_is_implicit_lifetime, IsImplicitLifetime, KEYCXX) TYPE_TRAIT_2(__builtin_is_virtual_base_of, IsVirtualBaseOf, KEYCXX) TYPE_TRAIT_1(__has_nothrow_assign, HasNothrowAssign, KEYCXX) TYPE_TRAIT_1(__has_nothrow_copy, HasNothrowCopy, KEYCXX) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 6df3a6a5943a97..acc1f2fde53979 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2565,6 +2565,11 @@ defm sanitize_stats : BoolOption<"f", "sanitize-stats", "Disable">, BothFlags<[], [ClangOption], " sanitizer statistics gathering.">>, Group; +def fsanitize_overflow_pattern_exclusion_EQ : CommaJoined<["-"], "fsanitize-overflow-pattern-exclusion=">, + HelpText<"Specify the overflow patterns to exclude from arithmetic sanitizer instrumentation">, + Visibility<[ClangOption, CC1Option]>, + Values<"none,all,add-overflow-test,negated-unsigned-const,post-decr-while">, + 
MarshallingInfoStringVector>; def fsanitize_thread_memory_access : Flag<["-"], "fsanitize-thread-memory-access">, Group, HelpText<"Enable memory access instrumentation in ThreadSanitizer (default)">; diff --git a/clang/include/clang/Driver/SanitizerArgs.h b/clang/include/clang/Driver/SanitizerArgs.h index 47ef175302679f..e64ec463ca8907 100644 --- a/clang/include/clang/Driver/SanitizerArgs.h +++ b/clang/include/clang/Driver/SanitizerArgs.h @@ -33,6 +33,7 @@ class SanitizerArgs { std::vector BinaryMetadataIgnorelistFiles; int CoverageFeatures = 0; int BinaryMetadataFeatures = 0; + int OverflowPatternExclusions = 0; int MsanTrackOrigins = 0; bool MsanUseAfterDtor = true; bool MsanParamRetval = true; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 25cb6c8fbf6104..a025ff6fc13f36 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -12033,14 +12033,17 @@ class Sema final : public SemaBase { void CheckDeductionGuideTemplate(FunctionTemplateDecl *TD); - Decl *ActOnConceptDefinition(Scope *S, - MultiTemplateParamsArg TemplateParameterLists, - const IdentifierInfo *Name, - SourceLocation NameLoc, Expr *ConstraintExpr, - const ParsedAttributesView &Attrs); + ConceptDecl *ActOnStartConceptDefinition( + Scope *S, MultiTemplateParamsArg TemplateParameterLists, + const IdentifierInfo *Name, SourceLocation NameLoc); + + ConceptDecl *ActOnFinishConceptDefinition(Scope *S, ConceptDecl *C, + Expr *ConstraintExpr, + const ParsedAttributesView &Attrs); void CheckConceptRedefinition(ConceptDecl *NewDecl, LookupResult &Previous, bool &AddToScope); + bool CheckConceptUseInDefinition(ConceptDecl *Concept, SourceLocation Loc); TypeResult ActOnDependentTag(Scope *S, unsigned TagSpec, TagUseKind TUK, const CXXScopeSpec &SS, diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h index 2ddbee67c414bb..d60cb2a57d4918 100644 --- a/clang/include/clang/Sema/SemaHLSL.h +++ 
b/clang/include/clang/Sema/SemaHLSL.h @@ -56,6 +56,7 @@ class SemaHLSL : public SemaBase { void handleSV_DispatchThreadIDAttr(Decl *D, const ParsedAttr &AL); void handlePackOffsetAttr(Decl *D, const ParsedAttr &AL); void handleShaderAttr(Decl *D, const ParsedAttr &AL); + void handleROVAttr(Decl *D, const ParsedAttr &AL); void handleResourceClassAttr(Decl *D, const ParsedAttr &AL); void handleResourceBindingAttr(Decl *D, const ParsedAttr &AL); void handleParamModifierAttr(Decl *D, const ParsedAttr &AL); diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td index 38b55a0eb0a7b0..fb4114619ac3d3 100644 --- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td +++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td @@ -1039,10 +1039,6 @@ def ArrayBoundCheckerV2 : Checker<"ArrayBoundV2">, HelpText<"Warn about buffer overflows (newer checker)">, Documentation; -def MallocOverflowSecurityChecker : Checker<"MallocOverflow">, - HelpText<"Check for overflows in the arguments to malloc()">, - Documentation; - def MmapWriteExecChecker : Checker<"MmapWriteExec">, HelpText<"Warn on mmap() calls that are both writable and executable">, Documentation; diff --git a/clang/lib/AST/CMakeLists.txt b/clang/lib/AST/CMakeLists.txt index 44d944d4e948cb..70aecb781c2ff2 100644 --- a/clang/lib/AST/CMakeLists.txt +++ b/clang/lib/AST/CMakeLists.txt @@ -66,7 +66,6 @@ add_clang_library(clangAST InheritViz.cpp Interp/ByteCodeEmitter.cpp Interp/Compiler.cpp - Interp/CompilerComplex.cpp Interp/Context.cpp Interp/Descriptor.cpp Interp/Disasm.cpp diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index e125143bc1b270..90caf81757ac96 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -4678,6 +4678,19 @@ void FieldDecl::printName(raw_ostream &OS, const PrintingPolicy &Policy) const { DeclaratorDecl::printName(OS, Policy); } +const FieldDecl *FieldDecl::findCountedByField() const { + const auto 
*CAT = getType()->getAs(); + if (!CAT) + return nullptr; + + const auto *CountDRE = cast(CAT->getCountExpr()); + const auto *CountDecl = CountDRE->getDecl(); + if (const auto *IFD = dyn_cast(CountDecl)) + CountDecl = IFD->getAnonField(); + + return dyn_cast(CountDecl); +} + //===----------------------------------------------------------------------===// // TagDecl Implementation //===----------------------------------------------------------------------===// diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 9d5b8167d0ee62..57475c66a94e35 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -4759,6 +4759,53 @@ ParenListExpr *ParenListExpr::CreateEmpty(const ASTContext &Ctx, return new (Mem) ParenListExpr(EmptyShell(), NumExprs); } +/// Certain overflow-dependent code patterns can have their integer overflow +/// sanitization disabled. Check for the common pattern `if (a + b < a)` and +/// return the resulting BinaryOperator responsible for the addition so we can +/// elide overflow checks during codegen. 
+static std::optional +getOverflowPatternBinOp(const BinaryOperator *E) { + Expr *Addition, *ComparedTo; + if (E->getOpcode() == BO_LT) { + Addition = E->getLHS(); + ComparedTo = E->getRHS(); + } else if (E->getOpcode() == BO_GT) { + Addition = E->getRHS(); + ComparedTo = E->getLHS(); + } else { + return {}; + } + + const Expr *AddLHS = nullptr, *AddRHS = nullptr; + BinaryOperator *BO = dyn_cast(Addition); + + if (BO && BO->getOpcode() == clang::BO_Add) { + // now store addends for lookup on other side of '>' + AddLHS = BO->getLHS(); + AddRHS = BO->getRHS(); + } + + if (!AddLHS || !AddRHS) + return {}; + + const Decl *LHSDecl, *RHSDecl, *OtherDecl; + + LHSDecl = AddLHS->IgnoreParenImpCasts()->getReferencedDeclOfCallee(); + RHSDecl = AddRHS->IgnoreParenImpCasts()->getReferencedDeclOfCallee(); + OtherDecl = ComparedTo->IgnoreParenImpCasts()->getReferencedDeclOfCallee(); + + if (!OtherDecl) + return {}; + + if (!LHSDecl && !RHSDecl) + return {}; + + if ((LHSDecl && LHSDecl == OtherDecl && LHSDecl != RHSDecl) || + (RHSDecl && RHSDecl == OtherDecl && RHSDecl != LHSDecl)) + return BO; + return {}; +} + BinaryOperator::BinaryOperator(const ASTContext &Ctx, Expr *lhs, Expr *rhs, Opcode opc, QualType ResTy, ExprValueKind VK, ExprObjectKind OK, SourceLocation opLoc, @@ -4768,8 +4815,15 @@ BinaryOperator::BinaryOperator(const ASTContext &Ctx, Expr *lhs, Expr *rhs, assert(!isCompoundAssignmentOp() && "Use CompoundAssignOperator for compound assignments"); BinaryOperatorBits.OpLoc = opLoc; + BinaryOperatorBits.ExcludedOverflowPattern = 0; SubExprs[LHS] = lhs; SubExprs[RHS] = rhs; + if (Ctx.getLangOpts().isOverflowPatternExcluded( + LangOptions::OverflowPatternExclusionKind::AddOverflowTest)) { + std::optional Result = getOverflowPatternBinOp(this); + if (Result.has_value()) + Result.value()->BinaryOperatorBits.ExcludedOverflowPattern = 1; + } BinaryOperatorBits.HasFPFeatures = FPFeatures.requiresTrailingStorage(); if (hasStoredFPFeatures()) setStoredFPFeatures(FPFeatures); diff 
--git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 4d2d05307a6de9..09edbb6641650a 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -14508,14 +14508,12 @@ bool IntExprEvaluator::VisitCastExpr(const CastExpr *E) { if (ED->getNumNegativeBits() && ConstexprVar && (Max.slt(Result.getInt().getSExtValue()) || Min.sgt(Result.getInt().getSExtValue()))) - Info.Ctx.getDiagnostics().Report( - E->getExprLoc(), diag::warn_constexpr_unscoped_enum_out_of_range) + Info.CCEDiag(E, diag::note_constexpr_unscoped_enum_out_of_range) << llvm::toString(Result.getInt(), 10) << Min.getSExtValue() << Max.getSExtValue() << ED; else if (!ED->getNumNegativeBits() && ConstexprVar && Max.ult(Result.getInt().getZExtValue())) - Info.Ctx.getDiagnostics().Report( - E->getExprLoc(), diag::warn_constexpr_unscoped_enum_out_of_range) + Info.CCEDiag(E, diag::note_constexpr_unscoped_enum_out_of_range) << llvm::toString(Result.getInt(), 10) << Min.getZExtValue() << Max.getZExtValue() << ED; } diff --git a/clang/lib/AST/Interp/Compiler.cpp b/clang/lib/AST/Interp/Compiler.cpp index 3b42590a61eb50..d32b595f9f0724 100644 --- a/clang/lib/AST/Interp/Compiler.cpp +++ b/clang/lib/AST/Interp/Compiler.cpp @@ -982,6 +982,234 @@ bool Compiler::VisitLogicalBinOp(const BinaryOperator *E) { return true; } +template +bool Compiler::VisitComplexBinOp(const BinaryOperator *E) { + // Prepare storage for result. + if (!Initializing) { + std::optional LocalIndex = allocateLocal(E); + if (!LocalIndex) + return false; + if (!this->emitGetPtrLocal(*LocalIndex, E)) + return false; + } + + // Both LHS and RHS might _not_ be of complex type, but one of them + // needs to be. 
+ const Expr *LHS = E->getLHS(); + const Expr *RHS = E->getRHS(); + + PrimType ResultElemT = this->classifyComplexElementType(E->getType()); + unsigned ResultOffset = ~0u; + if (!DiscardResult) + ResultOffset = this->allocateLocalPrimitive(E, PT_Ptr, true, false); + + // Save result pointer in ResultOffset + if (!this->DiscardResult) { + if (!this->emitDupPtr(E)) + return false; + if (!this->emitSetLocal(PT_Ptr, ResultOffset, E)) + return false; + } + QualType LHSType = LHS->getType(); + if (const auto *AT = LHSType->getAs()) + LHSType = AT->getValueType(); + QualType RHSType = RHS->getType(); + if (const auto *AT = RHSType->getAs()) + RHSType = AT->getValueType(); + + bool LHSIsComplex = LHSType->isAnyComplexType(); + unsigned LHSOffset; + bool RHSIsComplex = RHSType->isAnyComplexType(); + + // For ComplexComplex Mul, we have special ops to make their implementation + // easier. + BinaryOperatorKind Op = E->getOpcode(); + if (Op == BO_Mul && LHSIsComplex && RHSIsComplex) { + assert(classifyPrim(LHSType->getAs()->getElementType()) == + classifyPrim(RHSType->getAs()->getElementType())); + PrimType ElemT = + classifyPrim(LHSType->getAs()->getElementType()); + if (!this->visit(LHS)) + return false; + if (!this->visit(RHS)) + return false; + return this->emitMulc(ElemT, E); + } + + if (Op == BO_Div && RHSIsComplex) { + QualType ElemQT = RHSType->getAs()->getElementType(); + PrimType ElemT = classifyPrim(ElemQT); + // If the LHS is not complex, we still need to do the full complex + // division, so just stub create a complex value and stub it out with + // the LHS and a zero. + + if (!LHSIsComplex) { + // This is using the RHS type for the fake-complex LHS. 
+ if (auto LHSO = allocateLocal(RHS)) + LHSOffset = *LHSO; + else + return false; + + if (!this->emitGetPtrLocal(LHSOffset, E)) + return false; + + if (!this->visit(LHS)) + return false; + // real is LHS + if (!this->emitInitElem(ElemT, 0, E)) + return false; + // imag is zero + if (!this->visitZeroInitializer(ElemT, ElemQT, E)) + return false; + if (!this->emitInitElem(ElemT, 1, E)) + return false; + } else { + if (!this->visit(LHS)) + return false; + } + + if (!this->visit(RHS)) + return false; + return this->emitDivc(ElemT, E); + } + + // Evaluate LHS and save value to LHSOffset. + if (LHSType->isAnyComplexType()) { + LHSOffset = this->allocateLocalPrimitive(LHS, PT_Ptr, true, false); + if (!this->visit(LHS)) + return false; + if (!this->emitSetLocal(PT_Ptr, LHSOffset, E)) + return false; + } else { + PrimType LHST = classifyPrim(LHSType); + LHSOffset = this->allocateLocalPrimitive(LHS, LHST, true, false); + if (!this->visit(LHS)) + return false; + if (!this->emitSetLocal(LHST, LHSOffset, E)) + return false; + } + + // Same with RHS. + unsigned RHSOffset; + if (RHSType->isAnyComplexType()) { + RHSOffset = this->allocateLocalPrimitive(RHS, PT_Ptr, true, false); + if (!this->visit(RHS)) + return false; + if (!this->emitSetLocal(PT_Ptr, RHSOffset, E)) + return false; + } else { + PrimType RHST = classifyPrim(RHSType); + RHSOffset = this->allocateLocalPrimitive(RHS, RHST, true, false); + if (!this->visit(RHS)) + return false; + if (!this->emitSetLocal(RHST, RHSOffset, E)) + return false; + } + + // For both LHS and RHS, either load the value from the complex pointer, or + // directly from the local variable. For index 1 (i.e. the imaginary part), + // just load 0 and do the operation anyway. 
+ auto loadComplexValue = [this](bool IsComplex, bool LoadZero, + unsigned ElemIndex, unsigned Offset, + const Expr *E) -> bool { + if (IsComplex) { + if (!this->emitGetLocal(PT_Ptr, Offset, E)) + return false; + return this->emitArrayElemPop(classifyComplexElementType(E->getType()), + ElemIndex, E); + } + if (ElemIndex == 0 || !LoadZero) + return this->emitGetLocal(classifyPrim(E->getType()), Offset, E); + return this->visitZeroInitializer(classifyPrim(E->getType()), E->getType(), + E); + }; + + // Now we can get pointers to the LHS and RHS from the offsets above. + for (unsigned ElemIndex = 0; ElemIndex != 2; ++ElemIndex) { + // Result pointer for the store later. + if (!this->DiscardResult) { + if (!this->emitGetLocal(PT_Ptr, ResultOffset, E)) + return false; + } + + // The actual operation. + switch (Op) { + case BO_Add: + if (!loadComplexValue(LHSIsComplex, true, ElemIndex, LHSOffset, LHS)) + return false; + + if (!loadComplexValue(RHSIsComplex, true, ElemIndex, RHSOffset, RHS)) + return false; + if (ResultElemT == PT_Float) { + if (!this->emitAddf(getRoundingMode(E), E)) + return false; + } else { + if (!this->emitAdd(ResultElemT, E)) + return false; + } + break; + case BO_Sub: + if (!loadComplexValue(LHSIsComplex, true, ElemIndex, LHSOffset, LHS)) + return false; + + if (!loadComplexValue(RHSIsComplex, true, ElemIndex, RHSOffset, RHS)) + return false; + if (ResultElemT == PT_Float) { + if (!this->emitSubf(getRoundingMode(E), E)) + return false; + } else { + if (!this->emitSub(ResultElemT, E)) + return false; + } + break; + case BO_Mul: + if (!loadComplexValue(LHSIsComplex, false, ElemIndex, LHSOffset, LHS)) + return false; + + if (!loadComplexValue(RHSIsComplex, false, ElemIndex, RHSOffset, RHS)) + return false; + + if (ResultElemT == PT_Float) { + if (!this->emitMulf(getRoundingMode(E), E)) + return false; + } else { + if (!this->emitMul(ResultElemT, E)) + return false; + } + break; + case BO_Div: + assert(!RHSIsComplex); + if 
(!loadComplexValue(LHSIsComplex, false, ElemIndex, LHSOffset, LHS)) + return false; + + if (!loadComplexValue(RHSIsComplex, false, ElemIndex, RHSOffset, RHS)) + return false; + + if (ResultElemT == PT_Float) { + if (!this->emitDivf(getRoundingMode(E), E)) + return false; + } else { + if (!this->emitDiv(ResultElemT, E)) + return false; + } + break; + + default: + return false; + } + + if (!this->DiscardResult) { + // Initialize array element with the value we just computed. + if (!this->emitInitElemPop(ResultElemT, ElemIndex, E)) + return false; + } else { + if (!this->emitPop(ResultElemT, E)) + return false; + } + } + return true; +} + template bool Compiler::VisitImplicitValueInitExpr( const ImplicitValueInitExpr *E) { @@ -4917,6 +5145,113 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { return false; } +template +bool Compiler::VisitComplexUnaryOperator(const UnaryOperator *E) { + const Expr *SubExpr = E->getSubExpr(); + assert(SubExpr->getType()->isAnyComplexType()); + + if (DiscardResult) + return this->discard(SubExpr); + + std::optional ResT = classify(E); + auto prepareResult = [=]() -> bool { + if (!ResT && !Initializing) { + std::optional LocalIndex = allocateLocal(SubExpr); + if (!LocalIndex) + return false; + return this->emitGetPtrLocal(*LocalIndex, E); + } + + return true; + }; + + // The offset of the temporary, if we created one. 
+ unsigned SubExprOffset = ~0u; + auto createTemp = [=, &SubExprOffset]() -> bool { + SubExprOffset = this->allocateLocalPrimitive(SubExpr, PT_Ptr, true, false); + if (!this->visit(SubExpr)) + return false; + return this->emitSetLocal(PT_Ptr, SubExprOffset, E); + }; + + PrimType ElemT = classifyComplexElementType(SubExpr->getType()); + auto getElem = [=](unsigned Offset, unsigned Index) -> bool { + if (!this->emitGetLocal(PT_Ptr, Offset, E)) + return false; + return this->emitArrayElemPop(ElemT, Index, E); + }; + + switch (E->getOpcode()) { + case UO_Minus: + if (!prepareResult()) + return false; + if (!createTemp()) + return false; + for (unsigned I = 0; I != 2; ++I) { + if (!getElem(SubExprOffset, I)) + return false; + if (!this->emitNeg(ElemT, E)) + return false; + if (!this->emitInitElem(ElemT, I, E)) + return false; + } + break; + + case UO_Plus: // +x + case UO_AddrOf: // &x + case UO_Deref: // *x + return this->delegate(SubExpr); + + case UO_LNot: + if (!this->visit(SubExpr)) + return false; + if (!this->emitComplexBoolCast(SubExpr)) + return false; + if (!this->emitInvBool(E)) + return false; + if (PrimType ET = classifyPrim(E->getType()); ET != PT_Bool) + return this->emitCast(PT_Bool, ET, E); + return true; + + case UO_Real: + return this->emitComplexReal(SubExpr); + + case UO_Imag: + if (!this->visit(SubExpr)) + return false; + + if (SubExpr->isLValue()) { + if (!this->emitConstUint8(1, E)) + return false; + return this->emitArrayElemPtrPopUint8(E); + } + + // Since our _Complex implementation does not map to a primitive type, + // we sometimes have to do the lvalue-to-rvalue conversion here manually. + return this->emitArrayElemPop(classifyPrim(E->getType()), 1, E); + + case UO_Not: // ~x + if (!this->visit(SubExpr)) + return false; + // Negate the imaginary component. + if (!this->emitArrayElem(ElemT, 1, E)) + return false; + if (!this->emitNeg(ElemT, E)) + return false; + if (!this->emitInitElem(ElemT, 1, E)) + return false; + return DiscardResult ? 
this->emitPopPtr(E) : true; + + case UO_Extension: + return this->delegate(SubExpr); + + default: + return this->emitInvalid(E); + } + + return true; +} + template bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { if (DiscardResult) @@ -5116,6 +5451,168 @@ bool Compiler::emitPrimCast(PrimType FromT, PrimType ToT, return false; } +/// Emits __real(SubExpr) +template +bool Compiler::emitComplexReal(const Expr *SubExpr) { + assert(SubExpr->getType()->isAnyComplexType()); + + if (DiscardResult) + return this->discard(SubExpr); + + if (!this->visit(SubExpr)) + return false; + if (SubExpr->isLValue()) { + if (!this->emitConstUint8(0, SubExpr)) + return false; + return this->emitArrayElemPtrPopUint8(SubExpr); + } + + // Rvalue, load the actual element. + return this->emitArrayElemPop(classifyComplexElementType(SubExpr->getType()), + 0, SubExpr); +} + +template +bool Compiler::emitComplexBoolCast(const Expr *E) { + assert(!DiscardResult); + PrimType ElemT = classifyComplexElementType(E->getType()); + // We emit the expression (__real(E) != 0 || __imag(E) != 0) + // for us, that means (bool)E[0] || (bool)E[1] + if (!this->emitArrayElem(ElemT, 0, E)) + return false; + if (ElemT == PT_Float) { + if (!this->emitCastFloatingIntegral(PT_Bool, E)) + return false; + } else { + if (!this->emitCast(ElemT, PT_Bool, E)) + return false; + } + + // We now have the bool value of E[0] on the stack. + LabelTy LabelTrue = this->getLabel(); + if (!this->jumpTrue(LabelTrue)) + return false; + + if (!this->emitArrayElemPop(ElemT, 1, E)) + return false; + if (ElemT == PT_Float) { + if (!this->emitCastFloatingIntegral(PT_Bool, E)) + return false; + } else { + if (!this->emitCast(ElemT, PT_Bool, E)) + return false; + } + // Leave the boolean value of E[1] on the stack. 
+ LabelTy EndLabel = this->getLabel(); + this->jump(EndLabel); + + this->emitLabel(LabelTrue); + if (!this->emitPopPtr(E)) + return false; + if (!this->emitConstBool(true, E)) + return false; + + this->fallthrough(EndLabel); + this->emitLabel(EndLabel); + + return true; +} + +template +bool Compiler::emitComplexComparison(const Expr *LHS, const Expr *RHS, + const BinaryOperator *E) { + assert(E->isComparisonOp()); + assert(!Initializing); + assert(!DiscardResult); + + PrimType ElemT; + bool LHSIsComplex; + unsigned LHSOffset; + if (LHS->getType()->isAnyComplexType()) { + LHSIsComplex = true; + ElemT = classifyComplexElementType(LHS->getType()); + LHSOffset = allocateLocalPrimitive(LHS, PT_Ptr, /*IsConst=*/true, + /*IsExtended=*/false); + if (!this->visit(LHS)) + return false; + if (!this->emitSetLocal(PT_Ptr, LHSOffset, E)) + return false; + } else { + LHSIsComplex = false; + PrimType LHST = classifyPrim(LHS->getType()); + LHSOffset = this->allocateLocalPrimitive(LHS, LHST, true, false); + if (!this->visit(LHS)) + return false; + if (!this->emitSetLocal(LHST, LHSOffset, E)) + return false; + } + + bool RHSIsComplex; + unsigned RHSOffset; + if (RHS->getType()->isAnyComplexType()) { + RHSIsComplex = true; + ElemT = classifyComplexElementType(RHS->getType()); + RHSOffset = allocateLocalPrimitive(RHS, PT_Ptr, /*IsConst=*/true, + /*IsExtended=*/false); + if (!this->visit(RHS)) + return false; + if (!this->emitSetLocal(PT_Ptr, RHSOffset, E)) + return false; + } else { + RHSIsComplex = false; + PrimType RHST = classifyPrim(RHS->getType()); + RHSOffset = this->allocateLocalPrimitive(RHS, RHST, true, false); + if (!this->visit(RHS)) + return false; + if (!this->emitSetLocal(RHST, RHSOffset, E)) + return false; + } + + auto getElem = [&](unsigned LocalOffset, unsigned Index, + bool IsComplex) -> bool { + if (IsComplex) { + if (!this->emitGetLocal(PT_Ptr, LocalOffset, E)) + return false; + return this->emitArrayElemPop(ElemT, Index, E); + } + return this->emitGetLocal(ElemT, 
LocalOffset, E); + }; + + for (unsigned I = 0; I != 2; ++I) { + // Get both values. + if (!getElem(LHSOffset, I, LHSIsComplex)) + return false; + if (!getElem(RHSOffset, I, RHSIsComplex)) + return false; + // And compare them. + if (!this->emitEQ(ElemT, E)) + return false; + + if (!this->emitCastBoolUint8(E)) + return false; + } + + // We now have two bool values on the stack. Compare those. + if (!this->emitAddUint8(E)) + return false; + if (!this->emitConstUint8(2, E)) + return false; + + if (E->getOpcode() == BO_EQ) { + if (!this->emitEQUint8(E)) + return false; + } else if (E->getOpcode() == BO_NE) { + if (!this->emitNEUint8(E)) + return false; + } else + return false; + + // In C, this returns an int. + if (PrimType ResT = classifyPrim(E->getType()); ResT != PT_Bool) + return this->emitCast(PT_Bool, ResT, E); + return true; +} + /// When calling this, we have a pointer of the local-to-destroy /// on the stack. /// Emit destruction of record types (or arrays of record types). diff --git a/clang/lib/AST/Interp/CompilerComplex.cpp b/clang/lib/AST/Interp/CompilerComplex.cpp deleted file mode 100644 index e22c72785373d1..00000000000000 --- a/clang/lib/AST/Interp/CompilerComplex.cpp +++ /dev/null @@ -1,526 +0,0 @@ -//===--- CompilerComplex.cpp.cpp --------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "ByteCodeEmitter.h" -#include "Compiler.h" -#include "Context.h" -#include "Floating.h" -#include "Function.h" -#include "InterpShared.h" -#include "PrimType.h" -#include "Program.h" -#include "clang/AST/Attr.h" - -using namespace clang; -using namespace clang::interp; - -template -bool Compiler::VisitComplexBinOp(const BinaryOperator *E) { - // Prepare storage for result. 
- if (!Initializing) { - std::optional LocalIndex = allocateLocal(E); - if (!LocalIndex) - return false; - if (!this->emitGetPtrLocal(*LocalIndex, E)) - return false; - } - - // Both LHS and RHS might _not_ be of complex type, but one of them - // needs to be. - const Expr *LHS = E->getLHS(); - const Expr *RHS = E->getRHS(); - - PrimType ResultElemT = this->classifyComplexElementType(E->getType()); - unsigned ResultOffset = ~0u; - if (!DiscardResult) - ResultOffset = this->allocateLocalPrimitive(E, PT_Ptr, true, false); - - // Save result pointer in ResultOffset - if (!this->DiscardResult) { - if (!this->emitDupPtr(E)) - return false; - if (!this->emitSetLocal(PT_Ptr, ResultOffset, E)) - return false; - } - QualType LHSType = LHS->getType(); - if (const auto *AT = LHSType->getAs()) - LHSType = AT->getValueType(); - QualType RHSType = RHS->getType(); - if (const auto *AT = RHSType->getAs()) - RHSType = AT->getValueType(); - - bool LHSIsComplex = LHSType->isAnyComplexType(); - unsigned LHSOffset; - bool RHSIsComplex = RHSType->isAnyComplexType(); - - // For ComplexComplex Mul, we have special ops to make their implementation - // easier. - BinaryOperatorKind Op = E->getOpcode(); - if (Op == BO_Mul && LHSIsComplex && RHSIsComplex) { - assert(classifyPrim(LHSType->getAs()->getElementType()) == - classifyPrim(RHSType->getAs()->getElementType())); - PrimType ElemT = - classifyPrim(LHSType->getAs()->getElementType()); - if (!this->visit(LHS)) - return false; - if (!this->visit(RHS)) - return false; - return this->emitMulc(ElemT, E); - } - - if (Op == BO_Div && RHSIsComplex) { - QualType ElemQT = RHSType->getAs()->getElementType(); - PrimType ElemT = classifyPrim(ElemQT); - // If the LHS is not complex, we still need to do the full complex - // division, so just stub create a complex value and stub it out with - // the LHS and a zero. - - if (!LHSIsComplex) { - // This is using the RHS type for the fake-complex LHS. 
- if (auto LHSO = allocateLocal(RHS)) - LHSOffset = *LHSO; - else - return false; - - if (!this->emitGetPtrLocal(LHSOffset, E)) - return false; - - if (!this->visit(LHS)) - return false; - // real is LHS - if (!this->emitInitElem(ElemT, 0, E)) - return false; - // imag is zero - if (!this->visitZeroInitializer(ElemT, ElemQT, E)) - return false; - if (!this->emitInitElem(ElemT, 1, E)) - return false; - } else { - if (!this->visit(LHS)) - return false; - } - - if (!this->visit(RHS)) - return false; - return this->emitDivc(ElemT, E); - } - - // Evaluate LHS and save value to LHSOffset. - if (LHSType->isAnyComplexType()) { - LHSOffset = this->allocateLocalPrimitive(LHS, PT_Ptr, true, false); - if (!this->visit(LHS)) - return false; - if (!this->emitSetLocal(PT_Ptr, LHSOffset, E)) - return false; - } else { - PrimType LHST = classifyPrim(LHSType); - LHSOffset = this->allocateLocalPrimitive(LHS, LHST, true, false); - if (!this->visit(LHS)) - return false; - if (!this->emitSetLocal(LHST, LHSOffset, E)) - return false; - } - - // Same with RHS. - unsigned RHSOffset; - if (RHSType->isAnyComplexType()) { - RHSOffset = this->allocateLocalPrimitive(RHS, PT_Ptr, true, false); - if (!this->visit(RHS)) - return false; - if (!this->emitSetLocal(PT_Ptr, RHSOffset, E)) - return false; - } else { - PrimType RHST = classifyPrim(RHSType); - RHSOffset = this->allocateLocalPrimitive(RHS, RHST, true, false); - if (!this->visit(RHS)) - return false; - if (!this->emitSetLocal(RHST, RHSOffset, E)) - return false; - } - - // For both LHS and RHS, either load the value from the complex pointer, or - // directly from the local variable. For index 1 (i.e. the imaginary part), - // just load 0 and do the operation anyway. 
- auto loadComplexValue = [this](bool IsComplex, bool LoadZero, - unsigned ElemIndex, unsigned Offset, - const Expr *E) -> bool { - if (IsComplex) { - if (!this->emitGetLocal(PT_Ptr, Offset, E)) - return false; - return this->emitArrayElemPop(classifyComplexElementType(E->getType()), - ElemIndex, E); - } - if (ElemIndex == 0 || !LoadZero) - return this->emitGetLocal(classifyPrim(E->getType()), Offset, E); - return this->visitZeroInitializer(classifyPrim(E->getType()), E->getType(), - E); - }; - - // Now we can get pointers to the LHS and RHS from the offsets above. - for (unsigned ElemIndex = 0; ElemIndex != 2; ++ElemIndex) { - // Result pointer for the store later. - if (!this->DiscardResult) { - if (!this->emitGetLocal(PT_Ptr, ResultOffset, E)) - return false; - } - - // The actual operation. - switch (Op) { - case BO_Add: - if (!loadComplexValue(LHSIsComplex, true, ElemIndex, LHSOffset, LHS)) - return false; - - if (!loadComplexValue(RHSIsComplex, true, ElemIndex, RHSOffset, RHS)) - return false; - if (ResultElemT == PT_Float) { - if (!this->emitAddf(getRoundingMode(E), E)) - return false; - } else { - if (!this->emitAdd(ResultElemT, E)) - return false; - } - break; - case BO_Sub: - if (!loadComplexValue(LHSIsComplex, true, ElemIndex, LHSOffset, LHS)) - return false; - - if (!loadComplexValue(RHSIsComplex, true, ElemIndex, RHSOffset, RHS)) - return false; - if (ResultElemT == PT_Float) { - if (!this->emitSubf(getRoundingMode(E), E)) - return false; - } else { - if (!this->emitSub(ResultElemT, E)) - return false; - } - break; - case BO_Mul: - if (!loadComplexValue(LHSIsComplex, false, ElemIndex, LHSOffset, LHS)) - return false; - - if (!loadComplexValue(RHSIsComplex, false, ElemIndex, RHSOffset, RHS)) - return false; - - if (ResultElemT == PT_Float) { - if (!this->emitMulf(getRoundingMode(E), E)) - return false; - } else { - if (!this->emitMul(ResultElemT, E)) - return false; - } - break; - case BO_Div: - assert(!RHSIsComplex); - if 
(!loadComplexValue(LHSIsComplex, false, ElemIndex, LHSOffset, LHS)) - return false; - - if (!loadComplexValue(RHSIsComplex, false, ElemIndex, RHSOffset, RHS)) - return false; - - if (ResultElemT == PT_Float) { - if (!this->emitDivf(getRoundingMode(E), E)) - return false; - } else { - if (!this->emitDiv(ResultElemT, E)) - return false; - } - break; - - default: - return false; - } - - if (!this->DiscardResult) { - // Initialize array element with the value we just computed. - if (!this->emitInitElemPop(ResultElemT, ElemIndex, E)) - return false; - } else { - if (!this->emitPop(ResultElemT, E)) - return false; - } - } - return true; -} - -template -bool Compiler::emitComplexComparison(const Expr *LHS, const Expr *RHS, - const BinaryOperator *E) { - assert(E->isComparisonOp()); - assert(!Initializing); - assert(!DiscardResult); - - PrimType ElemT; - bool LHSIsComplex; - unsigned LHSOffset; - if (LHS->getType()->isAnyComplexType()) { - LHSIsComplex = true; - ElemT = classifyComplexElementType(LHS->getType()); - LHSOffset = allocateLocalPrimitive(LHS, PT_Ptr, /*IsConst=*/true, - /*IsExtended=*/false); - if (!this->visit(LHS)) - return false; - if (!this->emitSetLocal(PT_Ptr, LHSOffset, E)) - return false; - } else { - LHSIsComplex = false; - PrimType LHST = classifyPrim(LHS->getType()); - LHSOffset = this->allocateLocalPrimitive(LHS, LHST, true, false); - if (!this->visit(LHS)) - return false; - if (!this->emitSetLocal(LHST, LHSOffset, E)) - return false; - } - - bool RHSIsComplex; - unsigned RHSOffset; - if (RHS->getType()->isAnyComplexType()) { - RHSIsComplex = true; - ElemT = classifyComplexElementType(RHS->getType()); - RHSOffset = allocateLocalPrimitive(RHS, PT_Ptr, /*IsConst=*/true, - /*IsExtended=*/false); - if (!this->visit(RHS)) - return false; - if (!this->emitSetLocal(PT_Ptr, RHSOffset, E)) - return false; - } else { - RHSIsComplex = false; - PrimType RHST = classifyPrim(RHS->getType()); - RHSOffset = this->allocateLocalPrimitive(RHS, RHST, true, false); - if 
(!this->visit(RHS)) - return false; - if (!this->emitSetLocal(RHST, RHSOffset, E)) - return false; - } - - auto getElem = [&](unsigned LocalOffset, unsigned Index, - bool IsComplex) -> bool { - if (IsComplex) { - if (!this->emitGetLocal(PT_Ptr, LocalOffset, E)) - return false; - return this->emitArrayElemPop(ElemT, Index, E); - } - return this->emitGetLocal(ElemT, LocalOffset, E); - }; - - for (unsigned I = 0; I != 2; ++I) { - // Get both values. - if (!getElem(LHSOffset, I, LHSIsComplex)) - return false; - if (!getElem(RHSOffset, I, RHSIsComplex)) - return false; - // And compare them. - if (!this->emitEQ(ElemT, E)) - return false; - - if (!this->emitCastBoolUint8(E)) - return false; - } - - // We now have two bool values on the stack. Compare those. - if (!this->emitAddUint8(E)) - return false; - if (!this->emitConstUint8(2, E)) - return false; - - if (E->getOpcode() == BO_EQ) { - if (!this->emitEQUint8(E)) - return false; - } else if (E->getOpcode() == BO_NE) { - if (!this->emitNEUint8(E)) - return false; - } else - return false; - - // In C, this returns an int. - if (PrimType ResT = classifyPrim(E->getType()); ResT != PT_Bool) - return this->emitCast(PT_Bool, ResT, E); - return true; -} - -/// Emits __real(SubExpr) -template -bool Compiler::emitComplexReal(const Expr *SubExpr) { - assert(SubExpr->getType()->isAnyComplexType()); - - if (DiscardResult) - return this->discard(SubExpr); - - if (!this->visit(SubExpr)) - return false; - if (SubExpr->isLValue()) { - if (!this->emitConstUint8(0, SubExpr)) - return false; - return this->emitArrayElemPtrPopUint8(SubExpr); - } - - // Rvalue, load the actual element. 
- return this->emitArrayElemPop(classifyComplexElementType(SubExpr->getType()), - 0, SubExpr); -} - -template -bool Compiler::emitComplexBoolCast(const Expr *E) { - assert(!DiscardResult); - PrimType ElemT = classifyComplexElementType(E->getType()); - // We emit the expression (__real(E) != 0 || __imag(E) != 0) - // for us, that means (bool)E[0] || (bool)E[1] - if (!this->emitArrayElem(ElemT, 0, E)) - return false; - if (ElemT == PT_Float) { - if (!this->emitCastFloatingIntegral(PT_Bool, E)) - return false; - } else { - if (!this->emitCast(ElemT, PT_Bool, E)) - return false; - } - - // We now have the bool value of E[0] on the stack. - LabelTy LabelTrue = this->getLabel(); - if (!this->jumpTrue(LabelTrue)) - return false; - - if (!this->emitArrayElemPop(ElemT, 1, E)) - return false; - if (ElemT == PT_Float) { - if (!this->emitCastFloatingIntegral(PT_Bool, E)) - return false; - } else { - if (!this->emitCast(ElemT, PT_Bool, E)) - return false; - } - // Leave the boolean value of E[1] on the stack. - LabelTy EndLabel = this->getLabel(); - this->jump(EndLabel); - - this->emitLabel(LabelTrue); - if (!this->emitPopPtr(E)) - return false; - if (!this->emitConstBool(true, E)) - return false; - - this->fallthrough(EndLabel); - this->emitLabel(EndLabel); - - return true; -} - -template -bool Compiler::VisitComplexUnaryOperator(const UnaryOperator *E) { - const Expr *SubExpr = E->getSubExpr(); - assert(SubExpr->getType()->isAnyComplexType()); - - if (DiscardResult) - return this->discard(SubExpr); - - std::optional ResT = classify(E); - auto prepareResult = [=]() -> bool { - if (!ResT && !Initializing) { - std::optional LocalIndex = allocateLocal(SubExpr); - if (!LocalIndex) - return false; - return this->emitGetPtrLocal(*LocalIndex, E); - } - - return true; - }; - - // The offset of the temporary, if we created one. 
- unsigned SubExprOffset = ~0u; - auto createTemp = [=, &SubExprOffset]() -> bool { - SubExprOffset = this->allocateLocalPrimitive(SubExpr, PT_Ptr, true, false); - if (!this->visit(SubExpr)) - return false; - return this->emitSetLocal(PT_Ptr, SubExprOffset, E); - }; - - PrimType ElemT = classifyComplexElementType(SubExpr->getType()); - auto getElem = [=](unsigned Offset, unsigned Index) -> bool { - if (!this->emitGetLocal(PT_Ptr, Offset, E)) - return false; - return this->emitArrayElemPop(ElemT, Index, E); - }; - - switch (E->getOpcode()) { - case UO_Minus: - if (!prepareResult()) - return false; - if (!createTemp()) - return false; - for (unsigned I = 0; I != 2; ++I) { - if (!getElem(SubExprOffset, I)) - return false; - if (!this->emitNeg(ElemT, E)) - return false; - if (!this->emitInitElem(ElemT, I, E)) - return false; - } - break; - - case UO_Plus: // +x - case UO_AddrOf: // &x - case UO_Deref: // *x - return this->delegate(SubExpr); - - case UO_LNot: - if (!this->visit(SubExpr)) - return false; - if (!this->emitComplexBoolCast(SubExpr)) - return false; - if (!this->emitInvBool(E)) - return false; - if (PrimType ET = classifyPrim(E->getType()); ET != PT_Bool) - return this->emitCast(PT_Bool, ET, E); - return true; - - case UO_Real: - return this->emitComplexReal(SubExpr); - - case UO_Imag: - if (!this->visit(SubExpr)) - return false; - - if (SubExpr->isLValue()) { - if (!this->emitConstUint8(1, E)) - return false; - return this->emitArrayElemPtrPopUint8(E); - } - - // Since our _Complex implementation does not map to a primitive type, - // we sometimes have to do the lvalue-to-rvalue conversion here manually. - return this->emitArrayElemPop(classifyPrim(E->getType()), 1, E); - - case UO_Not: // ~x - if (!this->visit(SubExpr)) - return false; - // Negate the imaginary component. - if (!this->emitArrayElem(ElemT, 1, E)) - return false; - if (!this->emitNeg(ElemT, E)) - return false; - if (!this->emitInitElem(ElemT, 1, E)) - return false; - return DiscardResult ? 
this->emitPopPtr(E) : true; - - case UO_Extension: - return this->delegate(SubExpr); - - default: - return this->emitInvalid(E); - } - - return true; -} - -namespace clang { -namespace interp { - -template class Compiler; -template class Compiler; - -} // namespace interp -} // namespace clang diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp index 94d613352ba227..a0571728570d3f 100644 --- a/clang/lib/AST/Interp/Interp.cpp +++ b/clang/lib/AST/Interp/Interp.cpp @@ -924,12 +924,12 @@ void diagnoseEnumValue(InterpState &S, CodePtr OpPC, const EnumDecl *ED, if (ED->getNumNegativeBits() && (Max.slt(Value.getSExtValue()) || Min.sgt(Value.getSExtValue()))) { const SourceLocation &Loc = S.Current->getLocation(OpPC); - S.report(Loc, diag::warn_constexpr_unscoped_enum_out_of_range) + S.CCEDiag(Loc, diag::note_constexpr_unscoped_enum_out_of_range) << llvm::toString(Value, 10) << Min.getSExtValue() << Max.getSExtValue() << ED; } else if (!ED->getNumNegativeBits() && Max.ult(Value.getZExtValue())) { const SourceLocation &Loc = S.Current->getLocation(OpPC); - S.report(Loc, diag::warn_constexpr_unscoped_enum_out_of_range) + S.CCEDiag(Loc, diag::note_constexpr_unscoped_enum_out_of_range) << llvm::toString(Value, 10) << Min.getZExtValue() << Max.getZExtValue() << ED; } diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp index ed8d1cf1b98dd8..db8000e25dc7cc 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -408,6 +408,8 @@ class MicrosoftCXXNameMangler { void mangleSourceName(StringRef Name); void mangleNestedName(GlobalDecl GD); + void mangleAutoReturnType(QualType T, QualifierMangleMode QMM); + private: bool isStructorDecl(const NamedDecl *ND) const { return ND == Structor || getStructor(ND) == Structor; @@ -477,6 +479,11 @@ class MicrosoftCXXNameMangler { SourceRange Range); void mangleObjCKindOfType(const ObjCObjectType *T, Qualifiers Quals, SourceRange Range); + + void 
mangleAutoReturnType(const MemberPointerType *T, Qualifiers Quals); + void mangleAutoReturnType(const PointerType *T, Qualifiers Quals); + void mangleAutoReturnType(const LValueReferenceType *T, Qualifiers Quals); + void mangleAutoReturnType(const RValueReferenceType *T, Qualifiers Quals); }; } @@ -2494,6 +2501,57 @@ void MicrosoftCXXNameMangler::mangleAddressSpaceType(QualType T, mangleArtificialTagType(TagTypeKind::Struct, ASMangling, {"__clang"}); } +void MicrosoftCXXNameMangler::mangleAutoReturnType(QualType T, + QualifierMangleMode QMM) { + assert(getASTContext().getLangOpts().isCompatibleWithMSVC( + LangOptions::MSVC2019) && + "Cannot mangle MSVC 2017 auto return types!"); + + if (isa(T)) { + const auto *AT = T->getContainedAutoType(); + Qualifiers Quals = T.getLocalQualifiers(); + + if (QMM == QMM_Result) + Out << '?'; + if (QMM != QMM_Drop) + mangleQualifiers(Quals, false); + Out << (AT->isDecltypeAuto() ? "_T" : "_P"); + return; + } + + T = T.getDesugaredType(getASTContext()); + Qualifiers Quals = T.getLocalQualifiers(); + + switch (QMM) { + case QMM_Drop: + case QMM_Result: + break; + case QMM_Mangle: + mangleQualifiers(Quals, false); + break; + default: + llvm_unreachable("QMM_Escape unexpected"); + } + + const Type *ty = T.getTypePtr(); + switch (ty->getTypeClass()) { + case Type::MemberPointer: + mangleAutoReturnType(cast(ty), Quals); + break; + case Type::Pointer: + mangleAutoReturnType(cast(ty), Quals); + break; + case Type::LValueReference: + mangleAutoReturnType(cast(ty), Quals); + break; + case Type::RValueReference: + mangleAutoReturnType(cast(ty), Quals); + break; + default: + llvm_unreachable("Invalid type expected"); + } +} + void MicrosoftCXXNameMangler::mangleType(QualType T, SourceRange Range, QualifierMangleMode QMM) { // Don't use the canonical types. 
MSVC includes things like 'const' on @@ -2907,17 +2965,52 @@ void MicrosoftCXXNameMangler::mangleFunctionType(const FunctionType *T, // can differ by their calling convention and are typically deduced. So // we make sure that this type gets mangled properly. mangleType(ResultType, Range, QMM_Result); - } else if (const auto *AT = dyn_cast_or_null( - ResultType->getContainedAutoType())) { - Out << '?'; - mangleQualifiers(ResultType.getLocalQualifiers(), /*IsMember=*/false); - Out << '?'; + } else if (IsInLambda) { + if (const auto *AT = ResultType->getContainedAutoType()) { + assert(AT->getKeyword() == AutoTypeKeyword::Auto && + "should only need to mangle auto!"); + (void)AT; + Out << '?'; + mangleQualifiers(ResultType.getLocalQualifiers(), /*IsMember=*/false); + Out << '?'; + mangleSourceName(""); + Out << '@'; + } else { + Out << '@'; + } + } else if (const auto *AT = ResultType->getContainedAutoType()) { assert(AT->getKeyword() != AutoTypeKeyword::GNUAutoType && "shouldn't need to mangle __auto_type!"); - mangleSourceName(AT->isDecltypeAuto() ? "" : ""); - Out << '@'; - } else if (IsInLambda) { - Out << '@'; + + // If we have any pointer types with the clang address space extension + // then defer to the custom clang mangling to keep backwards + // compatibility. See `mangleType(const PointerType *T, Qualifiers Quals, + // SourceRange Range)` for details. + auto UseClangMangling = [](QualType ResultType) { + QualType T = ResultType; + while (isa(T.getTypePtr())) { + T = T->getPointeeType(); + if (T.getQualifiers().hasAddressSpace()) + return true; + } + return false; + }; + + if (getASTContext().getLangOpts().isCompatibleWithMSVC( + LangOptions::MSVC2019) && + !UseClangMangling(ResultType)) { + if (D && !D->getPrimaryTemplate()) { + Out << '@'; + } else { + mangleAutoReturnType(ResultType, QMM_Result); + } + } else { + Out << '?'; + mangleQualifiers(ResultType.getLocalQualifiers(), /*IsMember=*/false); + Out << '?'; + mangleSourceName(AT->isDecltypeAuto() ? 
"" : ""); + Out << '@'; + } } else { if (ResultType->isVoidType()) ResultType = ResultType.getUnqualifiedType(); @@ -4220,6 +4313,57 @@ void MicrosoftMangleContextImpl::mangleStringLiteral(const StringLiteral *SL, Mangler.getStream() << '@'; } +void MicrosoftCXXNameMangler::mangleAutoReturnType(const MemberPointerType *T, + Qualifiers Quals) { + QualType PointeeType = T->getPointeeType(); + manglePointerCVQualifiers(Quals); + manglePointerExtQualifiers(Quals, PointeeType); + if (const FunctionProtoType *FPT = PointeeType->getAs()) { + Out << '8'; + mangleName(T->getClass()->castAs()->getDecl()); + mangleFunctionType(FPT, nullptr, true); + } else { + mangleQualifiers(PointeeType.getQualifiers(), true); + mangleName(T->getClass()->castAs()->getDecl()); + mangleAutoReturnType(PointeeType, QMM_Drop); + } +} + +void MicrosoftCXXNameMangler::mangleAutoReturnType(const PointerType *T, + Qualifiers Quals) { + QualType PointeeType = T->getPointeeType(); + assert(!PointeeType.getQualifiers().hasAddressSpace() && + "Unexpected address space mangling required"); + + manglePointerCVQualifiers(Quals); + manglePointerExtQualifiers(Quals, PointeeType); + + if (const FunctionProtoType *FPT = PointeeType->getAs()) { + Out << '6'; + mangleFunctionType(FPT); + } else { + mangleAutoReturnType(PointeeType, QMM_Mangle); + } +} + +void MicrosoftCXXNameMangler::mangleAutoReturnType(const LValueReferenceType *T, + Qualifiers Quals) { + QualType PointeeType = T->getPointeeType(); + assert(!Quals.hasConst() && !Quals.hasVolatile() && "unexpected qualifier!"); + Out << 'A'; + manglePointerExtQualifiers(Quals, PointeeType); + mangleAutoReturnType(PointeeType, QMM_Mangle); +} + +void MicrosoftCXXNameMangler::mangleAutoReturnType(const RValueReferenceType *T, + Qualifiers Quals) { + QualType PointeeType = T->getPointeeType(); + assert(!Quals.hasConst() && !Quals.hasVolatile() && "unexpected qualifier!"); + Out << "$$Q"; + manglePointerExtQualifiers(Quals, PointeeType); + 
mangleAutoReturnType(PointeeType, QMM_Mangle); +} + MicrosoftMangleContext *MicrosoftMangleContext::create(ASTContext &Context, DiagnosticsEngine &Diags, bool IsAux) { diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp index 866222380974b6..051381edabf0b2 100644 --- a/clang/lib/Analysis/UnsafeBufferUsage.cpp +++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp @@ -926,22 +926,27 @@ class CArrayToPtrAssignmentGadget : public FixableGadget { /// A call of a function or method that performs unchecked buffer operations /// over one of its pointer parameters. class UnsafeBufferUsageAttrGadget : public WarningGadget { - constexpr static const char *const OpTag = "call_expr"; - const CallExpr *Op; + constexpr static const char *const OpTag = "attr_expr"; + const Expr *Op; public: UnsafeBufferUsageAttrGadget(const MatchFinder::MatchResult &Result) : WarningGadget(Kind::UnsafeBufferUsageAttr), - Op(Result.Nodes.getNodeAs(OpTag)) {} + Op(Result.Nodes.getNodeAs(OpTag)) {} static bool classof(const Gadget *G) { return G->getKind() == Kind::UnsafeBufferUsageAttr; } static Matcher matcher() { + auto HasUnsafeFieldDecl = + member(fieldDecl(hasAttr(attr::UnsafeBufferUsage))); + auto HasUnsafeFnDecl = callee(functionDecl(hasAttr(attr::UnsafeBufferUsage))); - return stmt(callExpr(HasUnsafeFnDecl).bind(OpTag)); + + return stmt(anyOf(callExpr(HasUnsafeFnDecl).bind(OpTag), + memberExpr(HasUnsafeFieldDecl).bind(OpTag))); } void handleUnsafeOperation(UnsafeBufferUsageHandler &Handler, diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index b5e5240e55be3f..1c0baeaee03632 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -987,7 +987,7 @@ CodeGenFunction::emitFlexibleArrayMemberSize(const Expr *E, unsigned Type, // attribute. 
return nullptr; - const FieldDecl *CountedByFD = FindCountedByField(FAMDecl); + const FieldDecl *CountedByFD = FAMDecl->findCountedByField(); if (!CountedByFD) // Can't find the field referenced by the "counted_by" attribute. return nullptr; diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index f93f8dda0bd29a..0672861790633b 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -1150,22 +1150,6 @@ llvm::Value *CodeGenFunction::EmitLoadOfCountedByField( getIntAlign(), "..counted_by.load"); } -const FieldDecl *CodeGenFunction::FindCountedByField(const FieldDecl *FD) { - if (!FD) - return nullptr; - - const auto *CAT = FD->getType()->getAs(); - if (!CAT) - return nullptr; - - const auto *CountDRE = cast(CAT->getCountExpr()); - const auto *CountDecl = CountDRE->getDecl(); - if (const auto *IFD = dyn_cast(CountDecl)) - CountDecl = IFD->getAnonField(); - - return dyn_cast(CountDecl); -} - void CodeGenFunction::EmitBoundsCheck(const Expr *E, const Expr *Base, llvm::Value *Index, QualType IndexType, bool Accessed) { @@ -4305,7 +4289,7 @@ LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E, ME->isFlexibleArrayMemberLike(getContext(), StrictFlexArraysLevel) && ME->getMemberDecl()->getType()->isCountAttributedType()) { const FieldDecl *FAMDecl = dyn_cast(ME->getMemberDecl()); - if (const FieldDecl *CountFD = FindCountedByField(FAMDecl)) { + if (const FieldDecl *CountFD = FAMDecl->findCountedByField()) { if (std::optional Diff = getOffsetDifferenceInBits(*this, CountFD, FAMDecl)) { CharUnits OffsetDiff = CGM.getContext().toCharUnitsFromBits(*Diff); diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 84392745ea6144..6eac2b4c54e1ba 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -24,6 +24,7 @@ #include "clang/AST/Attr.h" #include "clang/AST/DeclObjC.h" #include "clang/AST/Expr.h" +#include "clang/AST/ParentMapContext.h" 
#include "clang/AST/RecordLayout.h" #include "clang/AST/StmtVisitor.h" #include "clang/Basic/CodeGenOptions.h" @@ -195,13 +196,24 @@ static bool CanElideOverflowCheck(const ASTContext &Ctx, const BinOpInfo &Op) { if (!Op.mayHaveIntegerOverflow()) return true; + const UnaryOperator *UO = dyn_cast(Op.E); + + if (UO && UO->getOpcode() == UO_Minus && + Ctx.getLangOpts().isOverflowPatternExcluded( + LangOptions::OverflowPatternExclusionKind::NegUnsignedConst) && + UO->isIntegerConstantExpr(Ctx)) + return true; + // If a unary op has a widened operand, the op cannot overflow. - if (const auto *UO = dyn_cast(Op.E)) + if (UO) return !UO->canOverflow(); // We usually don't need overflow checks for binops with widened operands. // Multiplication with promoted unsigned operands is a special case. const auto *BO = cast(Op.E); + if (BO->hasExcludedOverflowPattern()) + return true; + auto OptionalLHSTy = getUnwidenedIntegerType(Ctx, BO->getLHS()); if (!OptionalLHSTy) return false; @@ -2766,6 +2778,26 @@ llvm::Value *ScalarExprEmitter::EmitIncDecConsiderOverflowBehavior( llvm_unreachable("Unknown SignedOverflowBehaviorTy"); } +/// For the purposes of overflow pattern exclusion, does this match the +/// "while(i--)" pattern? +static bool matchesPostDecrInWhile(const UnaryOperator *UO, bool isInc, + bool isPre, ASTContext &Ctx) { + if (isInc || isPre) + return false; + + // -fsanitize-overflow-pattern-exclusion=post-decr-while + if (!Ctx.getLangOpts().isOverflowPatternExcluded( + LangOptions::OverflowPatternExclusionKind::PostDecrInWhile)) + return false; + + // all Parents (usually just one) must be a WhileStmt + for (const auto &Parent : Ctx.getParentMapContext().getParents(*UO)) + if (!Parent.get()) + return false; + + return true; +} + namespace { /// Handles check and update for lastprivate conditional variables. 
class OMPLastprivateConditionalUpdateRAII { @@ -2877,6 +2909,10 @@ ScalarExprEmitter::EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV, } else if (type->isIntegerType()) { QualType promotedType; bool canPerformLossyDemotionCheck = false; + + bool excludeOverflowPattern = + matchesPostDecrInWhile(E, isInc, isPre, CGF.getContext()); + if (CGF.getContext().isPromotableIntegerType(type)) { promotedType = CGF.getContext().getPromotedIntegerType(type); assert(promotedType != type && "Shouldn't promote to the same type."); @@ -2936,7 +2972,8 @@ ScalarExprEmitter::EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV, } else if (E->canOverflow() && type->isSignedIntegerOrEnumerationType()) { value = EmitIncDecConsiderOverflowBehavior(E, value, isInc); } else if (E->canOverflow() && type->isUnsignedIntegerType() && - CGF.SanOpts.has(SanitizerKind::UnsignedIntegerOverflow)) { + CGF.SanOpts.has(SanitizerKind::UnsignedIntegerOverflow) && + !excludeOverflowPattern) { value = EmitOverflowCheckedBinOp(createBinOpInfoFromIncDec( E, value, isInc, E->getFPFeaturesInEffect(CGF.getLangOpts()))); } else { diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index 5e59b0f00ebd64..4bd7b6ba58de0d 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -301,7 +301,7 @@ void CGHLSLRuntime::annotateHLSLResource(const VarDecl *D, GlobalVariable *GV) { llvm::hlsl::ResourceClass RC = HLSLResClassAttr->getResourceClass(); llvm::hlsl::ResourceKind RK = HLSLResAttr->getResourceKind(); - bool IsROV = HLSLResAttr->getIsROV(); + bool IsROV = FD->hasAttr(); llvm::hlsl::ElementType ET = calculateElementType(CGM.getContext(), Ty); BufferResBinding Binding(D->getAttr()); diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 19a7feeb69d820..57e0b7f91e9bf8 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -3305,10 +3305,6 @@ class 
CodeGenFunction : public CodeGenTypeCache { const FieldDecl *FAMDecl, uint64_t &Offset); - /// Find the FieldDecl specified in a FAM's "counted_by" attribute. Returns - /// \p nullptr if either the attribute or the field doesn't exist. - const FieldDecl *FindCountedByField(const FieldDecl *FD); - /// Build an expression accessing the "counted_by" field. llvm::Value *EmitLoadOfCountedByField(const Expr *Base, const FieldDecl *FAMDecl, diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp index 1dec3cd40ebd21..97381f673c2849 100644 --- a/clang/lib/CodeGen/Targets/AArch64.cpp +++ b/clang/lib/CodeGen/Targets/AArch64.cpp @@ -840,12 +840,13 @@ static bool isStreamingCompatible(const FunctionDecl *F) { static void diagnoseIfNeedsFPReg(DiagnosticsEngine &Diags, const StringRef ABIName, const AArch64ABIInfo &ABIInfo, - const QualType &Ty, const NamedDecl *D) { + const QualType &Ty, const NamedDecl *D, + SourceLocation loc) { const Type *HABase = nullptr; uint64_t HAMembers = 0; if (Ty->isFloatingType() || Ty->isVectorType() || ABIInfo.isHomogeneousAggregate(Ty, HABase, HAMembers)) { - Diags.Report(D->getLocation(), diag::err_target_unsupported_type_for_abi) + Diags.Report(loc, diag::err_target_unsupported_type_for_abi) << D->getDeclName() << Ty << ABIName; } } @@ -860,10 +861,11 @@ void AArch64TargetCodeGenInfo::checkFunctionABI( if (!TI.hasFeature("fp") && !ABIInfo.isSoftFloat()) { diagnoseIfNeedsFPReg(CGM.getDiags(), TI.getABI(), ABIInfo, - FuncDecl->getReturnType(), FuncDecl); + FuncDecl->getReturnType(), FuncDecl, + FuncDecl->getLocation()); for (ParmVarDecl *PVD : FuncDecl->parameters()) { diagnoseIfNeedsFPReg(CGM.getDiags(), TI.getABI(), ABIInfo, PVD->getType(), - PVD); + PVD, FuncDecl->getLocation()); } } } @@ -908,11 +910,11 @@ void AArch64TargetCodeGenInfo::checkFunctionCallABISoftFloat( return; diagnoseIfNeedsFPReg(CGM.getDiags(), TI.getABI(), ABIInfo, ReturnType, - Caller); + Callee ? 
Callee : Caller, CallLoc); for (const CallArg &Arg : Args) diagnoseIfNeedsFPReg(CGM.getDiags(), TI.getABI(), ABIInfo, Arg.getType(), - Caller); + Callee ? Callee : Caller, CallLoc); } void AArch64TargetCodeGenInfo::checkFunctionCallABI(CodeGenModule &CGM, diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index e12416e51f8d24..5b95019c25cab6 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -2271,8 +2271,7 @@ bool Driver::HandleImmediateArgs(Compilation &C) { return false; } - if (C.getArgs().hasArg(options::OPT_print_libgcc_file_name)) { - ToolChain::RuntimeLibType RLT = TC.GetRuntimeLibType(C.getArgs()); + auto initializeTargets = [&]() { const llvm::Triple Triple(TC.ComputeEffectiveClangTriple(C.getArgs())); // The 'Darwin' toolchain is initialized only when its arguments are // computed. Get the default arguments for OFK_None to ensure that @@ -2282,6 +2281,12 @@ bool Driver::HandleImmediateArgs(Compilation &C) { // FIXME: For some more esoteric targets the default toolchain is not the // correct one. 
C.getArgsForToolChain(&TC, Triple.getArchName(), Action::OFK_None); + return Triple; + }; + + if (C.getArgs().hasArg(options::OPT_print_libgcc_file_name)) { + ToolChain::RuntimeLibType RLT = TC.GetRuntimeLibType(C.getArgs()); + const llvm::Triple Triple = initializeTargets(); RegisterEffectiveTriple TripleRAII(TC, Triple); switch (RLT) { case ToolChain::RLT_CompilerRT: @@ -2325,7 +2330,9 @@ bool Driver::HandleImmediateArgs(Compilation &C) { } if (C.getArgs().hasArg(options::OPT_print_target_triple)) { - llvm::outs() << TC.getTripleString() << "\n"; + initializeTargets(); + llvm::Triple Triple(TC.ComputeEffectiveClangTriple(C.getArgs())); + llvm::outs() << Triple.getTriple() << "\n"; return false; } diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp index 1fd870b72286e5..a63ee944fd1bb4 100644 --- a/clang/lib/Driver/SanitizerArgs.cpp +++ b/clang/lib/Driver/SanitizerArgs.cpp @@ -119,6 +119,10 @@ static SanitizerMask parseArgValues(const Driver &D, const llvm::opt::Arg *A, static int parseCoverageFeatures(const Driver &D, const llvm::opt::Arg *A, bool DiagnoseErrors); +static int parseOverflowPatternExclusionValues(const Driver &D, + const llvm::opt::Arg *A, + bool DiagnoseErrors); + /// Parse -f(no-)?sanitize-metadata= flag values, diagnosing any invalid /// components. Returns OR of members of \c BinaryMetadataFeature enumeration. static int parseBinaryMetadataFeatures(const Driver &D, const llvm::opt::Arg *A, @@ -788,6 +792,13 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC, << "fsanitize-trap=cfi"; } + for (const auto *Arg : + Args.filtered(options::OPT_fsanitize_overflow_pattern_exclusion_EQ)) { + Arg->claim(); + OverflowPatternExclusions |= + parseOverflowPatternExclusionValues(D, Arg, DiagnoseErrors); + } + // Parse -f(no-)?sanitize-coverage flags if coverage is supported by the // enabled sanitizers. 
for (const auto *Arg : Args) { @@ -1241,6 +1252,10 @@ void SanitizerArgs::addArgs(const ToolChain &TC, const llvm::opt::ArgList &Args, addSpecialCaseListOpt(Args, CmdArgs, "-fsanitize-system-ignorelist=", SystemIgnorelistFiles); + if (OverflowPatternExclusions) + Args.AddAllArgs(CmdArgs, + options::OPT_fsanitize_overflow_pattern_exclusion_EQ); + if (MsanTrackOrigins) CmdArgs.push_back(Args.MakeArgString("-fsanitize-memory-track-origins=" + Twine(MsanTrackOrigins))); @@ -1426,6 +1441,28 @@ SanitizerMask parseArgValues(const Driver &D, const llvm::opt::Arg *A, return Kinds; } +static int parseOverflowPatternExclusionValues(const Driver &D, + const llvm::opt::Arg *A, + bool DiagnoseErrors) { + int Exclusions = 0; + for (int i = 0, n = A->getNumValues(); i != n; ++i) { + const char *Value = A->getValue(i); + int E = + llvm::StringSwitch(Value) + .Case("none", LangOptionsBase::None) + .Case("all", LangOptionsBase::All) + .Case("add-overflow-test", LangOptionsBase::AddOverflowTest) + .Case("negated-unsigned-const", LangOptionsBase::NegUnsignedConst) + .Case("post-decr-while", LangOptionsBase::PostDecrInWhile) + .Default(0); + if (E == 0) + D.Diag(clang::diag::err_drv_unsupported_option_argument) + << A->getSpelling() << Value; + Exclusions |= E; + } + return Exclusions; +} + int parseCoverageFeatures(const Driver &D, const llvm::opt::Arg *A, bool DiagnoseErrors) { assert(A->getOption().matches(options::OPT_fsanitize_coverage) || diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 96aa930ea28612..f2bc11839edd4d 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -7769,6 +7769,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.AddLastArg(CmdArgs, options::OPT_fgpu_default_stream_EQ); } + Args.AddAllArgs(CmdArgs, + options::OPT_fsanitize_overflow_pattern_exclusion_EQ); + Args.AddLastArg(CmdArgs, options::OPT_foffload_uniform_block, 
options::OPT_fno_offload_uniform_block); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index e3911c281985b7..5a5f5cb79a12f2 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -4267,6 +4267,19 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << Val; } + if (auto *A = Args.getLastArg(OPT_fsanitize_overflow_pattern_exclusion_EQ)) { + for (int i = 0, n = A->getNumValues(); i != n; ++i) { + Opts.OverflowPatternExclusionMask |= + llvm::StringSwitch(A->getValue(i)) + .Case("none", LangOptionsBase::None) + .Case("all", LangOptionsBase::All) + .Case("add-overflow-test", LangOptionsBase::AddOverflowTest) + .Case("negated-unsigned-const", LangOptionsBase::NegUnsignedConst) + .Case("post-decr-while", LangOptionsBase::PostDecrInWhile) + .Default(0); + } + } + // Parse -fsanitize= arguments. parseSanitizerKinds("-fsanitize=", Args.getAllArgValues(OPT_fsanitize_EQ), Diags, Opts.Sanitize); diff --git a/clang/lib/Headers/__clang_cuda_math.h b/clang/lib/Headers/__clang_cuda_math.h index 04019165068668..44c6e9a4e48d1b 100644 --- a/clang/lib/Headers/__clang_cuda_math.h +++ b/clang/lib/Headers/__clang_cuda_math.h @@ -12,6 +12,10 @@ #error "This file is for CUDA compilation only." #endif +// The __CLANG_GPU_DISABLE_MATH_WRAPPERS macro provides a way to let standard +// libcalls reach the link step instead of being eagerly replaced. +#ifndef __CLANG_GPU_DISABLE_MATH_WRAPPERS + #ifndef __OPENMP_NVPTX__ #if CUDA_VERSION < 9000 #error This file is intended to be used with CUDA-9+ only. 
@@ -345,4 +349,5 @@ __DEVICE__ float ynf(int __a, float __b) { return __nv_ynf(__a, __b); } #pragma pop_macro("__DEVICE_VOID__") #pragma pop_macro("__FAST_OR_SLOW") +#endif // __CLANG_GPU_DISABLE_MATH_WRAPPERS #endif // __CLANG_CUDA_MATH_H__ diff --git a/clang/lib/Headers/__clang_hip_math.h b/clang/lib/Headers/__clang_hip_math.h index 11e1e7d032586f..8468751d9de260 100644 --- a/clang/lib/Headers/__clang_hip_math.h +++ b/clang/lib/Headers/__clang_hip_math.h @@ -13,6 +13,10 @@ #error "This file is for HIP and OpenMP AMDGCN device compilation only." #endif +// The __CLANG_GPU_DISABLE_MATH_WRAPPERS macro provides a way to let standard +// libcalls reach the link step instead of being eagerly replaced. +#ifndef __CLANG_GPU_DISABLE_MATH_WRAPPERS + #if !defined(__HIPCC_RTC__) #include #include @@ -1321,4 +1325,5 @@ __host__ inline static int max(int __arg1, int __arg2) { #pragma pop_macro("__RETURN_TYPE") #pragma pop_macro("__FAST_OR_SLOW") +#endif // __CLANG_GPU_DISABLE_MATH_WRAPPERS #endif // __CLANG_HIP_MATH_H__ diff --git a/clang/lib/Parse/ParseTemplate.cpp b/clang/lib/Parse/ParseTemplate.cpp index a5130f56600e54..6ecfc15757f3d4 100644 --- a/clang/lib/Parse/ParseTemplate.cpp +++ b/clang/lib/Parse/ParseTemplate.cpp @@ -320,6 +320,11 @@ Parser::ParseConceptDefinition(const ParsedTemplateInfo &TemplateInfo, const IdentifierInfo *Id = Result.Identifier; SourceLocation IdLoc = Result.getBeginLoc(); + // [C++26][basic.scope.pdecl]/p13 + // The locus of a concept-definition is immediately after its concept-name. 
+ ConceptDecl *D = Actions.ActOnStartConceptDefinition( + getCurScope(), *TemplateInfo.TemplateParams, Id, IdLoc); + ParsedAttributes Attrs(AttrFactory); MaybeParseAttributes(PAKM_GNU | PAKM_CXX11, Attrs); @@ -339,9 +344,12 @@ Parser::ParseConceptDefinition(const ParsedTemplateInfo &TemplateInfo, DeclEnd = Tok.getLocation(); ExpectAndConsumeSemi(diag::err_expected_semi_declaration); Expr *ConstraintExpr = ConstraintExprResult.get(); - return Actions.ActOnConceptDefinition(getCurScope(), - *TemplateInfo.TemplateParams, Id, IdLoc, - ConstraintExpr, Attrs); + + if (!D) + return nullptr; + + return Actions.ActOnFinishConceptDefinition(getCurScope(), D, ConstraintExpr, + Attrs); } /// ParseTemplateParameters - Parses a template-parameter-list enclosed in diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index 0f604c61fa3af9..e6ce89dc7ec406 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -2231,6 +2231,7 @@ class UnsafeBufferUsageReporter : public UnsafeBufferUsageHandler { SourceLocation Loc; SourceRange Range; unsigned MsgParam = 0; + NamedDecl *D = nullptr; if (const auto *ASE = dyn_cast(Operation)) { Loc = ASE->getBase()->getExprLoc(); Range = ASE->getBase()->getSourceRange(); @@ -2261,6 +2262,12 @@ class UnsafeBufferUsageReporter : public UnsafeBufferUsageHandler { // note_unsafe_buffer_operation doesn't have this mode yet. assert(!IsRelatedToDecl && "Not implemented yet!"); MsgParam = 3; + } else if (isa(Operation)) { + // note_unsafe_buffer_operation doesn't have this mode yet. 
+ assert(!IsRelatedToDecl && "Not implemented yet!"); + auto ME = dyn_cast(Operation); + D = ME->getMemberDecl(); + MsgParam = 5; } else if (const auto *ECE = dyn_cast(Operation)) { QualType destType = ECE->getType(); if (!isa(destType)) @@ -2285,7 +2292,12 @@ class UnsafeBufferUsageReporter : public UnsafeBufferUsageHandler { "Variables blamed for unsafe buffer usage without suggestions!"); S.Diag(Loc, diag::note_unsafe_buffer_operation) << MsgParam << Range; } else { - S.Diag(Loc, diag::warn_unsafe_buffer_operation) << MsgParam << Range; + if (D) { + S.Diag(Loc, diag::warn_unsafe_buffer_operation) + << MsgParam << D << Range; + } else { + S.Diag(Loc, diag::warn_unsafe_buffer_operation) << MsgParam << Range; + } if (SuggestSuggestions) { S.Diag(Loc, diag::note_safe_buffer_usage_suggestions_disabled); } diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp index 6ee90d15d7a6d1..89a0e391920cc6 100644 --- a/clang/lib/Sema/HLSLExternalSemaSource.cpp +++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp @@ -96,8 +96,11 @@ struct BuiltinTypeDeclBuilder { nullptr, false, InClassInitStyle::ICIS_NoInit); Field->setAccess(Access); Field->setImplicit(true); - for (Attr *A : Attrs) - Field->addAttr(A); + for (Attr *A : Attrs) { + if (A) + Field->addAttr(A); + } + Record->addDecl(Field); Fields[Name] = Field; return *this; @@ -116,12 +119,15 @@ struct BuiltinTypeDeclBuilder { QualType(TTD->getTypeForDecl(), 0)); } // add handle member - llvm::SmallVector Attrs; Attr *ResourceClassAttr = HLSLResourceClassAttr::CreateImplicit(Record->getASTContext(), RC); Attr *ResourceAttr = - HLSLResourceAttr::CreateImplicit(Record->getASTContext(), RK, IsROV); - addMemberVariable("h", Ty, {ResourceClassAttr, ResourceAttr}, Access); + HLSLResourceAttr::CreateImplicit(Record->getASTContext(), RK); + Attr *ROVAttr = + IsROV ? 
HLSLROVAttr::CreateImplicit(Record->getASTContext()) : nullptr; + addMemberVariable("h", Ty, {ResourceClassAttr, ResourceAttr, ROVAttr}, + Access); + return *this; } diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index bcb1424825df00..3b5e984f4ee773 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -6901,6 +6901,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, case ParsedAttr::AT_HLSLResourceBinding: S.HLSL().handleResourceBindingAttr(D, AL); break; + case ParsedAttr::AT_HLSLROV: + handleSimpleAttribute(S, D, AL); + break; case ParsedAttr::AT_HLSLResourceClass: S.HLSL().handleResourceClassAttr(D, AL); break; diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 8defc8e1c185c0..c4aa02ff0c217d 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -306,6 +306,10 @@ bool Sema::DiagnoseUseOfDecl(NamedDecl *D, ArrayRef Locs, } + if (auto *Concept = dyn_cast(D); + Concept && CheckConceptUseInDefinition(Concept, Loc)) + return true; + if (auto *MD = dyn_cast(D)) { // Lambdas are only default-constructible or assignable in C++2a onwards. if (MD->getParent()->isLambda() && @@ -6589,7 +6593,6 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc, "should only occur in error-recovery path."); return CallExpr::Create(Context, Fn, ArgExprs, Context.DependentTy, VK_PRValue, RParenLoc, CurFPFeatureOverrides()); - } return BuildResolvedCallExpr(Fn, NDecl, LParenLoc, ArgExprs, RParenLoc, ExecConfig, IsExecConfig); } diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 124435330ca104..5356bcf172f752 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -4948,6 +4948,20 @@ static bool DiagnoseVLAInCXXTypeTrait(Sema &S, const TypeSourceInfo *T, return true; } +/// Checks that type T is not an atomic type (_Atomic). 
+/// +/// @returns @c true if @p T is VLA and a diagnostic was emitted, +/// @c false otherwise. +static bool DiagnoseAtomicInCXXTypeTrait(Sema &S, const TypeSourceInfo *T, + clang::tok::TokenKind TypeTraitID) { + if (!T->getType()->isAtomicType()) + return false; + + S.Diag(T->getTypeLoc().getBeginLoc(), diag::err_atomic_unsupported) + << TypeTraitID; + return true; +} + /// Check the completeness of a type in a unary type trait. /// /// If the particular type trait requires a complete type, tries to complete @@ -5038,6 +5052,7 @@ static bool CheckUnaryTypeTraitTypeCompleteness(Sema &S, TypeTrait UTT, // LWG3823: T shall be an array type, a complete type, or cv void. case UTT_IsAggregate: + case UTT_IsImplicitLifetime: if (ArgTy->isArrayType() || ArgTy->isVoidType()) return true; @@ -5634,6 +5649,40 @@ static bool EvaluateUnaryTypeTrait(Sema &Self, TypeTrait UTT, return false; case UTT_IsTriviallyEqualityComparable: return isTriviallyEqualityComparableType(Self, T, KeyLoc); + case UTT_IsImplicitLifetime: { + DiagnoseVLAInCXXTypeTrait(Self, TInfo, + tok::kw___builtin_is_implicit_lifetime); + DiagnoseAtomicInCXXTypeTrait(Self, TInfo, + tok::kw___builtin_is_implicit_lifetime); + + // [basic.types.general] p9 + // Scalar types, implicit-lifetime class types ([class.prop]), + // array types, and cv-qualified versions of these types + // are collectively called implicit-lifetime types. + QualType UnqualT = T->getCanonicalTypeUnqualified(); + if (UnqualT->isScalarType()) + return true; + if (UnqualT->isArrayType() || UnqualT->isVectorType()) + return true; + const CXXRecordDecl *RD = UnqualT->getAsCXXRecordDecl(); + if (!RD) + return false; + + // [class.prop] p9 + // A class S is an implicit-lifetime class if + // - it is an aggregate whose destructor is not user-provided or + // - it has at least one trivial eligible constructor and a trivial, + // non-deleted destructor. 
+ const CXXDestructorDecl *Dtor = RD->getDestructor(); + if (UnqualT->isAggregateType()) + if (Dtor && !Dtor->isUserProvided()) + return true; + if (RD->hasTrivialDestructor() && (!Dtor || !Dtor->isDeleted())) + if (RD->hasTrivialDefaultConstructor() || + RD->hasTrivialCopyConstructor() || RD->hasTrivialMoveConstructor()) + return true; + return false; + } } } diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 876921a6b311d4..25585f683752ac 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -1079,6 +1079,9 @@ bool Sema::CheckTypeConstraint(TemplateIdAnnotation *TypeConstr) { return true; } + if (CheckConceptUseInDefinition(CD, TypeConstr->TemplateNameLoc)) + return true; + bool WereArgsSpecified = TypeConstr->LAngleLoc.isValid(); if (!WereArgsSpecified && @@ -8447,10 +8450,9 @@ Decl *Sema::ActOnTemplateDeclarator(Scope *S, return NewDecl; } -Decl *Sema::ActOnConceptDefinition( +ConceptDecl *Sema::ActOnStartConceptDefinition( Scope *S, MultiTemplateParamsArg TemplateParameterLists, - const IdentifierInfo *Name, SourceLocation NameLoc, Expr *ConstraintExpr, - const ParsedAttributesView &Attrs) { + const IdentifierInfo *Name, SourceLocation NameLoc) { DeclContext *DC = CurContext; if (!DC->getRedeclContext()->isFileContext()) { @@ -8486,11 +8488,8 @@ Decl *Sema::ActOnConceptDefinition( } } - if (DiagnoseUnexpandedParameterPack(ConstraintExpr)) - return nullptr; - ConceptDecl *NewDecl = - ConceptDecl::Create(Context, DC, NameLoc, Name, Params, ConstraintExpr); + ConceptDecl::Create(Context, DC, NameLoc, Name, Params); if (NewDecl->hasAssociatedConstraints()) { // C++2a [temp.concept]p4: @@ -8499,23 +8498,63 @@ Decl *Sema::ActOnConceptDefinition( NewDecl->setInvalidDecl(); } + DeclarationNameInfo NameInfo(NewDecl->getDeclName(), NewDecl->getBeginLoc()); + LookupResult Previous(*this, NameInfo, LookupOrdinaryName, + forRedeclarationInCurContext()); + LookupName(Previous, S); + 
FilterLookupForScope(Previous, CurContext, S, /*ConsiderLinkage=*/false, + /*AllowInlineNamespace*/ false); + + // We cannot properly handle redeclarations until we parse the constraint + // expression, so only inject the name if we are sure we are not redeclaring a + // symbol + if (Previous.empty()) + PushOnScopeChains(NewDecl, S, true); + + return NewDecl; +} + +static bool RemoveLookupResult(LookupResult &R, NamedDecl *C) { + bool Found = false; + LookupResult::Filter F = R.makeFilter(); + while (F.hasNext()) { + NamedDecl *D = F.next(); + if (D == C) { + F.erase(); + Found = true; + break; + } + } + F.done(); + return Found; +} + +ConceptDecl * +Sema::ActOnFinishConceptDefinition(Scope *S, ConceptDecl *C, + Expr *ConstraintExpr, + const ParsedAttributesView &Attrs) { + assert(!C->hasDefinition() && "Concept already defined"); + if (DiagnoseUnexpandedParameterPack(ConstraintExpr)) + return nullptr; + C->setDefinition(ConstraintExpr); + ProcessDeclAttributeList(S, C, Attrs); + // Check for conflicting previous declaration. 
- DeclarationNameInfo NameInfo(NewDecl->getDeclName(), NameLoc); + DeclarationNameInfo NameInfo(C->getDeclName(), C->getBeginLoc()); LookupResult Previous(*this, NameInfo, LookupOrdinaryName, forRedeclarationInCurContext()); LookupName(Previous, S); - FilterLookupForScope(Previous, DC, S, /*ConsiderLinkage=*/false, - /*AllowInlineNamespace*/false); + FilterLookupForScope(Previous, CurContext, S, /*ConsiderLinkage=*/false, + /*AllowInlineNamespace*/ false); + bool WasAlreadyAdded = RemoveLookupResult(Previous, C); bool AddToScope = true; - CheckConceptRedefinition(NewDecl, Previous, AddToScope); + CheckConceptRedefinition(C, Previous, AddToScope); - ActOnDocumentableDecl(NewDecl); - if (AddToScope) - PushOnScopeChains(NewDecl, S); - - ProcessDeclAttributeList(S, NewDecl, Attrs); + ActOnDocumentableDecl(C); + if (!WasAlreadyAdded && AddToScope) + PushOnScopeChains(C, S); - return NewDecl; + return C; } void Sema::CheckConceptRedefinition(ConceptDecl *NewDecl, @@ -8560,6 +8599,16 @@ void Sema::CheckConceptRedefinition(ConceptDecl *NewDecl, Context.setPrimaryMergedDecl(NewDecl, OldConcept->getCanonicalDecl()); } +bool Sema::CheckConceptUseInDefinition(ConceptDecl *Concept, + SourceLocation Loc) { + if (!Concept->isInvalidDecl() && !Concept->hasDefinition()) { + Diag(Loc, diag::err_recursive_concept) << Concept; + Diag(Concept->getLocation(), diag::note_declared_at); + return true; + } + return false; +} + /// \brief Strips various properties off an implicit instantiation /// that has just been explicitly specialized. 
static void StripImplicitInstantiation(NamedDecl *D, bool MinGW) { diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 511e2df7ad3230..fa9b815239dbb6 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -1887,6 +1887,7 @@ Token ASTReader::ReadToken(ModuleFile &M, const RecordDataImpl &Record, case tok::annot_pragma_unused: case tok::annot_pragma_openacc: case tok::annot_pragma_openacc_end: + case tok::annot_repl_input_end: break; default: llvm_unreachable("missing deserialization code for annotation token"); diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index a33f2a41a65497..8ae07907a04aba 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -1128,6 +1128,7 @@ void ASTStmtReader::VisitBinaryOperator(BinaryOperator *E) { (BinaryOperator::Opcode)CurrentUnpackingBits->getNextBits(/*Width=*/6)); bool hasFP_Features = CurrentUnpackingBits->getNextBit(); E->setHasStoredFPFeatures(hasFP_Features); + E->setExcludedOverflowPattern(CurrentUnpackingBits->getNextBit()); E->setLHS(Record.readSubExpr()); E->setRHS(Record.readSubExpr()); E->setOperatorLoc(readSourceLocation()); diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 1455f8e4145cb8..5cfb98c2a1060a 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -4734,6 +4734,7 @@ void ASTWriter::AddToken(const Token &Tok, RecordDataImpl &Record) { case tok::annot_pragma_unused: case tok::annot_pragma_openacc: case tok::annot_pragma_openacc_end: + case tok::annot_repl_input_end: break; default: llvm_unreachable("missing serialization code for annotation token"); diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index 038616a675b727..c292d0a789c7cd 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp 
+++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -1063,6 +1063,7 @@ void ASTStmtWriter::VisitBinaryOperator(BinaryOperator *E) { CurrentPackingBits.addBits(E->getOpcode(), /*Width=*/6); bool HasFPFeatures = E->hasStoredFPFeatures(); CurrentPackingBits.addBit(HasFPFeatures); + CurrentPackingBits.addBit(E->hasExcludedOverflowPattern()); Record.AddStmt(E->getLHS()); Record.AddStmt(E->getRHS()); Record.AddSourceLocation(E->getOperatorLoc()); diff --git a/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt b/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt index 682cfa01bec963..414282d58f779f 100644 --- a/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt +++ b/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt @@ -63,7 +63,6 @@ add_clang_library(clangStaticAnalyzerCheckers MacOSKeychainAPIChecker.cpp MacOSXAPIChecker.cpp MallocChecker.cpp - MallocOverflowSecurityChecker.cpp MallocSizeofChecker.cpp MismatchedIteratorChecker.cpp MmapWriteExecChecker.cpp diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocOverflowSecurityChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocOverflowSecurityChecker.cpp deleted file mode 100644 index 3c8b38973c6b8c..00000000000000 --- a/clang/lib/StaticAnalyzer/Checkers/MallocOverflowSecurityChecker.cpp +++ /dev/null @@ -1,341 +0,0 @@ -// MallocOverflowSecurityChecker.cpp - Check for malloc overflows -*- C++ -*-=// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This checker detects a common memory allocation security flaw. -// Suppose 'unsigned int n' comes from an untrusted source. 
If the -// code looks like 'malloc (n * 4)', and an attacker can make 'n' be -// say MAX_UINT/4+2, then instead of allocating the correct 'n' 4-byte -// elements, this will actually allocate only two because of overflow. -// Then when the rest of the program attempts to store values past the -// second element, these values will actually overwrite other items in -// the heap, probably allowing the attacker to execute arbitrary code. -// -//===----------------------------------------------------------------------===// - -#include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h" -#include "clang/AST/EvaluatedExprVisitor.h" -#include "clang/StaticAnalyzer/Core/BugReporter/BugReporter.h" -#include "clang/StaticAnalyzer/Core/Checker.h" -#include "clang/StaticAnalyzer/Core/PathSensitive/AnalysisManager.h" -#include "llvm/ADT/APSInt.h" -#include "llvm/ADT/SmallVector.h" -#include -#include - -using namespace clang; -using namespace ento; -using llvm::APSInt; - -namespace { -struct MallocOverflowCheck { - const CallExpr *call; - const BinaryOperator *mulop; - const Expr *variable; - APSInt maxVal; - - MallocOverflowCheck(const CallExpr *call, const BinaryOperator *m, - const Expr *v, APSInt val) - : call(call), mulop(m), variable(v), maxVal(std::move(val)) {} -}; - -class MallocOverflowSecurityChecker : public Checker { -public: - void checkASTCodeBody(const Decl *D, AnalysisManager &mgr, - BugReporter &BR) const; - - void CheckMallocArgument( - SmallVectorImpl &PossibleMallocOverflows, - const CallExpr *TheCall, ASTContext &Context) const; - - void OutputPossibleOverflows( - SmallVectorImpl &PossibleMallocOverflows, - const Decl *D, BugReporter &BR, AnalysisManager &mgr) const; - -}; -} // end anonymous namespace - -// Return true for computations which evaluate to zero: e.g., mult by 0. 
-static inline bool EvaluatesToZero(APSInt &Val, BinaryOperatorKind op) { - return (op == BO_Mul) && (Val == 0); -} - -void MallocOverflowSecurityChecker::CheckMallocArgument( - SmallVectorImpl &PossibleMallocOverflows, - const CallExpr *TheCall, ASTContext &Context) const { - - /* Look for a linear combination with a single variable, and at least - one multiplication. - Reject anything that applies to the variable: an explicit cast, - conditional expression, an operation that could reduce the range - of the result, or anything too complicated :-). */ - const Expr *e = TheCall->getArg(0); - const BinaryOperator * mulop = nullptr; - APSInt maxVal; - - for (;;) { - maxVal = 0; - e = e->IgnoreParenImpCasts(); - if (const BinaryOperator *binop = dyn_cast(e)) { - BinaryOperatorKind opc = binop->getOpcode(); - // TODO: ignore multiplications by 1, reject if multiplied by 0. - if (mulop == nullptr && opc == BO_Mul) - mulop = binop; - if (opc != BO_Mul && opc != BO_Add && opc != BO_Sub && opc != BO_Shl) - return; - - const Expr *lhs = binop->getLHS(); - const Expr *rhs = binop->getRHS(); - if (rhs->isEvaluatable(Context)) { - e = lhs; - maxVal = rhs->EvaluateKnownConstInt(Context); - if (EvaluatesToZero(maxVal, opc)) - return; - } else if ((opc == BO_Add || opc == BO_Mul) && - lhs->isEvaluatable(Context)) { - maxVal = lhs->EvaluateKnownConstInt(Context); - if (EvaluatesToZero(maxVal, opc)) - return; - e = rhs; - } else - return; - } else if (isa(e)) - break; - else - return; - } - - if (mulop == nullptr) - return; - - // We've found the right structure of malloc argument, now save - // the data so when the body of the function is completely available - // we can check for comparisons. - - PossibleMallocOverflows.push_back( - MallocOverflowCheck(TheCall, mulop, e, maxVal)); -} - -namespace { -// A worker class for OutputPossibleOverflows. 
-class CheckOverflowOps : - public EvaluatedExprVisitor { -public: - typedef SmallVectorImpl theVecType; - -private: - theVecType &toScanFor; - ASTContext &Context; - - bool isIntZeroExpr(const Expr *E) const { - if (!E->getType()->isIntegralOrEnumerationType()) - return false; - Expr::EvalResult Result; - if (E->EvaluateAsInt(Result, Context)) - return Result.Val.getInt() == 0; - return false; - } - - static const Decl *getDecl(const DeclRefExpr *DR) { return DR->getDecl(); } - static const Decl *getDecl(const MemberExpr *ME) { - return ME->getMemberDecl(); - } - - template - void Erase(const T1 *DR, - llvm::function_ref Pred) { - auto P = [DR, Pred](const MallocOverflowCheck &Check) { - if (const auto *CheckDR = dyn_cast(Check.variable)) - return getDecl(CheckDR) == getDecl(DR) && Pred(Check); - return false; - }; - llvm::erase_if(toScanFor, P); - } - - void CheckExpr(const Expr *E_p) { - const Expr *E = E_p->IgnoreParenImpCasts(); - const auto PrecedesMalloc = [E, this](const MallocOverflowCheck &c) { - return Context.getSourceManager().isBeforeInTranslationUnit( - E->getExprLoc(), c.call->getExprLoc()); - }; - if (const DeclRefExpr *DR = dyn_cast(E)) - Erase(DR, PrecedesMalloc); - else if (const auto *ME = dyn_cast(E)) { - Erase(ME, PrecedesMalloc); - } - } - - // Check if the argument to malloc is assigned a value - // which cannot cause an overflow. - // e.g., malloc (mul * x) and, - // case 1: mul = - // case 2: mul = a/b, where b > x - void CheckAssignmentExpr(BinaryOperator *AssignEx) { - bool assignKnown = false; - bool numeratorKnown = false, denomKnown = false; - APSInt denomVal; - denomVal = 0; - - // Erase if the multiplicand was assigned a constant value. - const Expr *rhs = AssignEx->getRHS(); - if (rhs->isEvaluatable(Context)) - assignKnown = true; - - // Discard the report if the multiplicand was assigned a value, - // that can never overflow after multiplication. 
e.g., the assignment - // is a division operator and the denominator is > other multiplicand. - const Expr *rhse = rhs->IgnoreParenImpCasts(); - if (const BinaryOperator *BOp = dyn_cast(rhse)) { - if (BOp->getOpcode() == BO_Div) { - const Expr *denom = BOp->getRHS()->IgnoreParenImpCasts(); - Expr::EvalResult Result; - if (denom->EvaluateAsInt(Result, Context)) { - denomVal = Result.Val.getInt(); - denomKnown = true; - } - const Expr *numerator = BOp->getLHS()->IgnoreParenImpCasts(); - if (numerator->isEvaluatable(Context)) - numeratorKnown = true; - } - } - if (!assignKnown && !denomKnown) - return; - auto denomExtVal = denomVal.getExtValue(); - - // Ignore negative denominator. - if (denomExtVal < 0) - return; - - const Expr *lhs = AssignEx->getLHS(); - const Expr *E = lhs->IgnoreParenImpCasts(); - - auto pred = [assignKnown, numeratorKnown, - denomExtVal](const MallocOverflowCheck &Check) { - return assignKnown || - (numeratorKnown && (denomExtVal >= Check.maxVal.getExtValue())); - }; - - if (const DeclRefExpr *DR = dyn_cast(E)) - Erase(DR, pred); - else if (const auto *ME = dyn_cast(E)) - Erase(ME, pred); - } - - public: - void VisitBinaryOperator(BinaryOperator *E) { - if (E->isComparisonOp()) { - const Expr * lhs = E->getLHS(); - const Expr * rhs = E->getRHS(); - // Ignore comparisons against zero, since they generally don't - // protect against an overflow. - if (!isIntZeroExpr(lhs) && !isIntZeroExpr(rhs)) { - CheckExpr(lhs); - CheckExpr(rhs); - } - } - if (E->isAssignmentOp()) - CheckAssignmentExpr(E); - EvaluatedExprVisitor::VisitBinaryOperator(E); - } - - /* We specifically ignore loop conditions, because they're typically - not error checks. 
*/ - void VisitWhileStmt(WhileStmt *S) { - return this->Visit(S->getBody()); - } - void VisitForStmt(ForStmt *S) { - return this->Visit(S->getBody()); - } - void VisitDoStmt(DoStmt *S) { - return this->Visit(S->getBody()); - } - - CheckOverflowOps(theVecType &v, ASTContext &ctx) - : EvaluatedExprVisitor(ctx), - toScanFor(v), Context(ctx) - { } - }; -} - -// OutputPossibleOverflows - We've found a possible overflow earlier, -// now check whether Body might contain a comparison which might be -// preventing the overflow. -// This doesn't do flow analysis, range analysis, or points-to analysis; it's -// just a dumb "is there a comparison" scan. The aim here is to -// detect the most blatent cases of overflow and educate the -// programmer. -void MallocOverflowSecurityChecker::OutputPossibleOverflows( - SmallVectorImpl &PossibleMallocOverflows, - const Decl *D, BugReporter &BR, AnalysisManager &mgr) const { - // By far the most common case: nothing to check. - if (PossibleMallocOverflows.empty()) - return; - - // Delete any possible overflows which have a comparison. - CheckOverflowOps c(PossibleMallocOverflows, BR.getContext()); - c.Visit(mgr.getAnalysisDeclContext(D)->getBody()); - - // Output warnings for all overflows that are left. - for (const MallocOverflowCheck &Check : PossibleMallocOverflows) { - BR.EmitBasicReport( - D, this, "malloc() size overflow", categories::UnixAPI, - "the computation of the size of the memory allocation may overflow", - PathDiagnosticLocation::createOperatorLoc(Check.mulop, - BR.getSourceManager()), - Check.mulop->getSourceRange()); - } -} - -void MallocOverflowSecurityChecker::checkASTCodeBody(const Decl *D, - AnalysisManager &mgr, - BugReporter &BR) const { - - CFG *cfg = mgr.getCFG(D); - if (!cfg) - return; - - // A list of variables referenced in possibly overflowing malloc operands. 
- SmallVector PossibleMallocOverflows; - - for (CFG::iterator it = cfg->begin(), ei = cfg->end(); it != ei; ++it) { - CFGBlock *block = *it; - for (CFGBlock::iterator bi = block->begin(), be = block->end(); - bi != be; ++bi) { - if (std::optional CS = bi->getAs()) { - if (const CallExpr *TheCall = dyn_cast(CS->getStmt())) { - // Get the callee. - const FunctionDecl *FD = TheCall->getDirectCallee(); - - if (!FD) - continue; - - // Get the name of the callee. If it's a builtin, strip off the - // prefix. - IdentifierInfo *FnInfo = FD->getIdentifier(); - if (!FnInfo) - continue; - - if (FnInfo->isStr("malloc") || FnInfo->isStr("_MALLOC")) { - if (TheCall->getNumArgs() == 1) - CheckMallocArgument(PossibleMallocOverflows, TheCall, - mgr.getASTContext()); - } - } - } - } - } - - OutputPossibleOverflows(PossibleMallocOverflows, D, BR, mgr); -} - -void ento::registerMallocOverflowSecurityChecker(CheckerManager &mgr) { - mgr.registerChecker(); -} - -bool ento::shouldRegisterMallocOverflowSecurityChecker(const CheckerManager &mgr) { - return true; -} diff --git a/clang/test/AST/Interp/cxx11.cpp b/clang/test/AST/Interp/cxx11.cpp index cf2dfba079ef7e..481e3da9289efa 100644 --- a/clang/test/AST/Interp/cxx11.cpp +++ b/clang/test/AST/Interp/cxx11.cpp @@ -93,49 +93,58 @@ E2 testDefaultArgForParam(E2 e2Param = (E2)-1) { // ok, not a constant expressio void testValueInRangeOfEnumerationValues() { constexpr E1 x1 = static_cast(-8); constexpr E1 x2 = static_cast(8); - // both-error@-1 {{integer value 8 is outside the valid range of values [-8, 7] for the enumeration type 'E1'}} + // both-error@-1 {{constexpr variable 'x2' must be initialized by a constant expression}} + // both-note@-2 {{integer value 8 is outside the valid range of values [-8, 7] for the enumeration type 'E1'}} E1 x2b = static_cast(8); // ok, not a constant expression context constexpr E2 x3 = static_cast(-8); - // both-error@-1 {{integer value -8 is outside the valid range of values [0, 7] for the enumeration type 
'E2'}} + // both-error@-1 {{constexpr variable 'x3' must be initialized by a constant expression}} + // both-note@-2 {{integer value -8 is outside the valid range of values [0, 7] for the enumeration type 'E2'}} constexpr E2 x4 = static_cast(0); constexpr E2 x5 = static_cast(8); - // both-error@-1 {{integer value 8 is outside the valid range of values [0, 7] for the enumeration type 'E2'}} + // both-error@-1 {{constexpr variable 'x5' must be initialized by a constant expression}} + // both-note@-2 {{integer value 8 is outside the valid range of values [0, 7] for the enumeration type 'E2'}} constexpr E3 x6 = static_cast(-2048); constexpr E3 x7 = static_cast(-8); constexpr E3 x8 = static_cast(0); constexpr E3 x9 = static_cast(8); constexpr E3 x10 = static_cast(2048); - // both-error@-1 {{integer value 2048 is outside the valid range of values [-2048, 2047] for the enumeration type 'E3'}} + // both-error@-1 {{constexpr variable 'x10' must be initialized by a constant expression}} + // both-note@-2 {{integer value 2048 is outside the valid range of values [-2048, 2047] for the enumeration type 'E3'}} constexpr E4 x11 = static_cast(0); constexpr E4 x12 = static_cast(1); constexpr E4 x13 = static_cast(2); - // both-error@-1 {{integer value 2 is outside the valid range of values [0, 1] for the enumeration type 'E4'}} + // both-error@-1 {{constexpr variable 'x13' must be initialized by a constant expression}} + // both-note@-2 {{integer value 2 is outside the valid range of values [0, 1] for the enumeration type 'E4'}} constexpr EEmpty x14 = static_cast(0); constexpr EEmpty x15 = static_cast(1); constexpr EEmpty x16 = static_cast(2); - // both-error@-1 {{integer value 2 is outside the valid range of values [0, 1] for the enumeration type 'EEmpty'}} + // both-error@-1 {{constexpr variable 'x16' must be initialized by a constant expression}} + // both-note@-2 {{integer value 2 is outside the valid range of values [0, 1] for the enumeration type 'EEmpty'}} constexpr EFixed 
x17 = static_cast(100); constexpr EScoped x18 = static_cast(100); constexpr EMaxInt x19 = static_cast(__INT_MAX__-1); constexpr EMaxInt x20 = static_cast((long)__INT_MAX__+1); - // both-error@-1 {{integer value 2147483648 is outside the valid range of values [-2147483648, 2147483647] for the enumeration type 'EMaxInt'}} + // both-error@-1 {{constexpr variable 'x20' must be initialized by a constant expression}} + // both-note@-2 {{integer value 2147483648 is outside the valid range of values [-2147483648, 2147483647] for the enumeration type 'EMaxInt'}} const NumberType neg_one = (NumberType) ((NumberType) 0 - (NumberType) 1); // ok, not a constant expression context } template struct Bitfield { - static constexpr T max = static_cast((1 << size) - 1); // #enum + static constexpr T max = static_cast((1 << size) - 1); + // both-error@-1 {{constexpr variable 'max' must be initialized by a constant expression}} + // both-note@-2 {{integer value 15 is outside the valid range of values [0, 7] for the enumeration type 'E2'}} }; void testValueInRangeOfEnumerationValuesViaTemplate() { Bitfield good; - Bitfield bad; // both-error@#enum {{integer value 15 is outside the valid range of values [0, 7] for the enumeration type 'E2'}} + Bitfield bad; // both-note {{in instantiation}} } enum SortOrder { diff --git a/clang/test/Analysis/malloc-overflow.c b/clang/test/Analysis/malloc-overflow.c deleted file mode 100644 index 03fe15bccb62ee..00000000000000 --- a/clang/test/Analysis/malloc-overflow.c +++ /dev/null @@ -1,150 +0,0 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.security.MallocOverflow -verify %s - -#define NULL ((void *) 0) -typedef __typeof__(sizeof(int)) size_t; -extern void * malloc(size_t); - -void * f1(int n) -{ - return malloc(n * sizeof(int)); // expected-warning {{the computation of the size of the memory allocation may overflow}} -} - -void * f2(int n) -{ - return malloc(sizeof(int) * n); // // expected-warning {{the computation of the size of the memory 
allocation may overflow}} -} - -void * f3(void) -{ - return malloc(4 * sizeof(int)); // no-warning -} - -struct s4 -{ - int n; -}; - -void * f4(struct s4 *s) -{ - return malloc(s->n * sizeof(int)); // expected-warning {{the computation of the size of the memory allocation may overflow}} -} - -void * f5(struct s4 *s) -{ - struct s4 s2 = *s; - return malloc(s2.n * sizeof(int)); // expected-warning {{the computation of the size of the memory allocation may overflow}} -} - -void * f6(int n) -{ - return malloc((n + 1) * sizeof(int)); // expected-warning {{the computation of the size of the memory allocation may overflow}} -} - -extern void * malloc (size_t); - -void * f7(int n) -{ - if (n > 10) - return NULL; - return malloc(n * sizeof(int)); // no-warning -} - -void * f8(int n) -{ - if (n < 10) - return malloc(n * sizeof(int)); // no-warning - else - return NULL; -} - -void * f9(int n) -{ - int * x = malloc(n * sizeof(int)); // expected-warning {{the computation of the size of the memory allocation may overflow}} - for (int i = 0; i < n; i++) - x[i] = i; - return x; -} - -void * f10(int n) -{ - int * x = malloc(n * sizeof(int)); // expected-warning {{the computation of the size of the memory allocation may overflow}} - int i = 0; - while (i < n) - x[i++] = 0; - return x; -} - -void * f11(int n) -{ - int * x = malloc(n * sizeof(int)); // expected-warning {{the computation of the size of the memory allocation may overflow}} - int i = 0; - do { - x[i++] = 0; - } while (i < n); - return x; -} - -void * f12(int n) -{ - n = (n > 10 ? 
10 : n); - int * x = malloc(n * sizeof(int)); // no-warning - for (int i = 0; i < n; i++) - x[i] = i; - return x; -} - -struct s13 -{ - int n; -}; - -void * f13(struct s13 *s) -{ - if (s->n > 10) - return NULL; - return malloc(s->n * sizeof(int)); // no-warning -} - -void * f14(int n) -{ - if (n < 0) - return NULL; - return malloc(n * sizeof(int)); // expected-warning {{the computation of the size of the memory allocation may overflow}} -} - -void *check_before_malloc(int n, int x) { - int *p = NULL; - if (n > 10) - return NULL; - if (x == 42) - p = malloc(n * sizeof(int)); // no-warning, the check precedes the allocation - - // Do some other stuff, e.g. initialize the memory. - return p; -} - -void *check_after_malloc(int n, int x) { - int *p = NULL; - if (x == 42) - p = malloc(n * sizeof(int)); // expected-warning {{the computation of the size of the memory allocation may overflow}} - - // The check is after the allocation! - if (n > 10) { - // Do something conditionally. - } - return p; -} - -#define GREATER_THAN(lhs, rhs) (lhs > rhs) -void *check_after_malloc_using_macros(int n, int x) { - int *p = NULL; - if (x == 42) - p = malloc(n * sizeof(int)); // expected-warning {{the computation of the size of the memory allocation may overflow}} - - if (GREATER_THAN(n, 10)) - return NULL; - - // Do some other stuff, e.g. initialize the memory. 
- return p; -} -#undef GREATER_THAN diff --git a/clang/test/Analysis/malloc-overflow.cpp b/clang/test/Analysis/malloc-overflow.cpp deleted file mode 100644 index e070217cf7d8e2..00000000000000 --- a/clang/test/Analysis/malloc-overflow.cpp +++ /dev/null @@ -1,12 +0,0 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.security.MallocOverflow -verify %s -// expected-no-diagnostics - -class A { -public: - A& operator<<(const A &a); -}; - -void f() { - A a = A(), b = A(); - a << b; -} diff --git a/clang/test/Analysis/malloc-overflow2.c b/clang/test/Analysis/malloc-overflow2.c deleted file mode 100644 index 7c580602e682ab..00000000000000 --- a/clang/test/Analysis/malloc-overflow2.c +++ /dev/null @@ -1,40 +0,0 @@ -// RUN: %clang_analyze_cc1 -triple x86_64-unknown-unknown -analyzer-checker=alpha.security.MallocOverflow,unix -verify %s -// RUN: %clang_analyze_cc1 -triple x86_64-unknown-unknown -analyzer-checker=alpha.security.MallocOverflow,unix,optin.portability -DPORTABILITY -verify %s - -typedef __typeof__(sizeof(int)) size_t; -extern void *malloc(size_t); -extern void free(void *ptr); - -void *malloc(unsigned long s); - -struct table { - int nentry; - unsigned *table; - unsigned offset_max; -}; - -static int table_build(struct table *t) { - - t->nentry = ((t->offset_max >> 2) + 31) / 32; - t->table = (unsigned *)malloc(sizeof(unsigned) * t->nentry); // expected-warning {{the computation of the size of the memory allocation may overflow}} - - int n; - n = 10000; - int *p = malloc(sizeof(int) * n); // no-warning - - free(p); - return t->nentry; -} - -static int table_build_1(struct table *t) { - t->nentry = (sizeof(struct table) * 2 + 31) / 32; - t->table = (unsigned *)malloc(sizeof(unsigned) * t->nentry); // no-warning - return t->nentry; -} - -void *f(int n) { - return malloc(n * 0 * sizeof(int)); -#ifdef PORTABILITY - // expected-warning@-2{{Call to 'malloc' has an allocation size of 0 bytes}} -#endif -} diff --git a/clang/test/CXX/drs/cwg25xx.cpp 
b/clang/test/CXX/drs/cwg25xx.cpp index 1c0d32fe3fdfce..0d9f5eac23866a 100644 --- a/clang/test/CXX/drs/cwg25xx.cpp +++ b/clang/test/CXX/drs/cwg25xx.cpp @@ -201,7 +201,9 @@ namespace cwg2565 { // cwg2565: 16 open 2023-06-07 template concept ErrorRequires = requires (ErrorRequires auto x) { - // since-cxx20-error@-1 {{unknown type name 'ErrorRequires'}} + // since-cxx20-error@-1 {{a concept definition cannot refer to itself}} \ + // since-cxx20-error@-1 {{'auto' not allowed in requires expression parameter}} \ + // since-cxx20-note@-1 {{declared here}} x; }; static_assert(ErrorRequires); @@ -209,9 +211,11 @@ namespace cwg2565 { // cwg2565: 16 open 2023-06-07 // since-cxx20-note@-2 {{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} template - concept NestedErrorInRequires = requires (T x) { + concept NestedErrorInRequires = requires (T x) { // + // since-cxx20-note@-1 {{declared here}} requires requires (NestedErrorInRequires auto y) { - // since-cxx20-error@-1 {{unknown type name 'NestedErrorInRequires'}} + // since-cxx20-error@-1 {{a concept definition cannot refer to itself}} \ + // since-cxx20-error@-1 {{'auto' not allowed in requires expression parameter}} y; }; }; diff --git a/clang/test/CodeGen/X86/cmpccxadd-builtins.c b/clang/test/CodeGen/X86/cmpccxadd-builtins.c index 6daed3a1b17b67..f058dc9b2baa46 100644 --- a/clang/test/CodeGen/X86/cmpccxadd-builtins.c +++ b/clang/test/CodeGen/X86/cmpccxadd-builtins.c @@ -52,50 +52,50 @@ long long test_cmplxadd64(void *__A, long long __B, long long __C) { return _cmpccxadd_epi64(__A, __B, __C, _CMPCCX_NB); } -int test_cmpnbexadd32(void *__A, int __B, int __C) { - // CHECK-LABEL: @test_cmpnbexadd32( +int test_cmpaxadd32(void *__A, int __B, int __C) { + // CHECK-LABEL: @test_cmpaxadd32( // CHECK: call i32 @llvm.x86.cmpccxadd32(ptr %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 4) return _cmpccxadd_epi32(__A, __B, __C, _CMPCCX_Z); } -long long test_cmpnbexadd64(void 
*__A, long long __B, long long __C) { - // CHECK-LABEL: @test_cmpnbexadd64( +long long test_cmpaxadd64(void *__A, long long __B, long long __C) { + // CHECK-LABEL: @test_cmpaxadd64( // CHECK: call i64 @llvm.x86.cmpccxadd64(ptr %{{.*}}, i64 %{{.*}}, i64 %{{.*}}, i32 4) return _cmpccxadd_epi64(__A, __B, __C, _CMPCCX_Z); } -int test_cmpnbxadd32(void *__A, int __B, int __C) { - // CHECK-LABEL: @test_cmpnbxadd32( +int test_cmpaexadd32(void *__A, int __B, int __C) { + // CHECK-LABEL: @test_cmpaexadd32( // CHECK: call i32 @llvm.x86.cmpccxadd32(ptr %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 5) return _cmpccxadd_epi32(__A, __B, __C, _CMPCCX_NZ); } -long long test_cmpnbxadd64(void *__A, long long __B, long long __C) { - // CHECK-LABEL: @test_cmpnbxadd64( +long long test_cmpaexadd64(void *__A, long long __B, long long __C) { + // CHECK-LABEL: @test_cmpaexadd64( // CHECK: call i64 @llvm.x86.cmpccxadd64(ptr %{{.*}}, i64 %{{.*}}, i64 %{{.*}}, i32 5) return _cmpccxadd_epi64(__A, __B, __C, _CMPCCX_NZ); } -int test_cmpnlexadd32(void *__A, int __B, int __C) { - // CHECK-LABEL: @test_cmpnlexadd32( +int test_cmpgxadd32(void *__A, int __B, int __C) { + // CHECK-LABEL: @test_cmpgxadd32( // CHECK: call i32 @llvm.x86.cmpccxadd32(ptr %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 6) return _cmpccxadd_epi32(__A, __B, __C, _CMPCCX_BE); } -long long test_cmpnlexadd64(void *__A, long long __B, long long __C) { - // CHECK-LABEL: @test_cmpnlexadd64( +long long test_cmpgxadd64(void *__A, long long __B, long long __C) { + // CHECK-LABEL: @test_cmpgxadd64( // CHECK: call i64 @llvm.x86.cmpccxadd64(ptr %{{.*}}, i64 %{{.*}}, i64 %{{.*}}, i32 6) return _cmpccxadd_epi64(__A, __B, __C, _CMPCCX_BE); } -int test_cmpnlxadd32(void *__A, int __B, int __C) { - // CHECK-LABEL: @test_cmpnlxadd32( +int test_cmpgexadd32(void *__A, int __B, int __C) { + // CHECK-LABEL: @test_cmpgexadd32( // CHECK: call i32 @llvm.x86.cmpccxadd32(ptr %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 7) return _cmpccxadd_epi32(__A, __B, __C, _CMPCCX_NBE); } 
-long long test_cmpnlxadd64(void *__A, long long __B, long long __C) { - // CHECK-LABEL: @test_cmpnlxadd64( +long long test_cmpgexadd64(void *__A, long long __B, long long __C) { + // CHECK-LABEL: @test_cmpgexadd64( // CHECK: call i64 @llvm.x86.cmpccxadd64(ptr %{{.*}}, i64 %{{.*}}, i64 %{{.*}}, i32 7) return _cmpccxadd_epi64(__A, __B, __C, _CMPCCX_NBE); } @@ -136,14 +136,14 @@ long long test_cmpnsxadd64(void *__A, long long __B, long long __C) { return _cmpccxadd_epi64(__A, __B, __C, _CMPCCX_P); } -int test_cmpnzxadd32(void *__A, int __B, int __C) { - // CHECK-LABEL: @test_cmpnzxadd32( +int test_cmpnexadd32(void *__A, int __B, int __C) { + // CHECK-LABEL: @test_cmpnexadd32( // CHECK: call i32 @llvm.x86.cmpccxadd32(ptr %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 11) return _cmpccxadd_epi32(__A, __B, __C, _CMPCCX_NP); } -long long test_cmpnzxadd64(void *__A, long long __B, long long __C) { - // CHECK-LABEL: @test_cmpnzxadd64( +long long test_cmpnexadd64(void *__A, long long __B, long long __C) { + // CHECK-LABEL: @test_cmpnexadd64( // CHECK: call i64 @llvm.x86.cmpccxadd64(ptr %{{.*}}, i64 %{{.*}}, i64 %{{.*}}, i32 11) return _cmpccxadd_epi64(__A, __B, __C, _CMPCCX_NP); } @@ -184,14 +184,14 @@ long long test_cmpsxadd64(void *__A, long long __B, long long __C) { return _cmpccxadd_epi64(__A, __B, __C, _CMPCCX_LE); } -int test_cmpzxadd32(void *__A, int __B, int __C) { - // CHECK-LABEL: @test_cmpzxadd32( +int test_cmpexadd32(void *__A, int __B, int __C) { + // CHECK-LABEL: @test_cmpexadd32( // CHECK: call i32 @llvm.x86.cmpccxadd32(ptr %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 15) return _cmpccxadd_epi32(__A, __B, __C, _CMPCCX_NLE); } -long long test_cmpzxadd64(void *__A, long long __B, long long __C) { - // CHECK-LABEL: @test_cmpzxadd64( +long long test_cmpexadd64(void *__A, long long __B, long long __C) { + // CHECK-LABEL: @test_cmpexadd64( // CHECK: call i64 @llvm.x86.cmpccxadd64(ptr %{{.*}}, i64 %{{.*}}, i64 %{{.*}}, i32 15) return _cmpccxadd_epi64(__A, __B, __C, _CMPCCX_NLE); 
} diff --git a/clang/test/CodeGen/aarch64-soft-float-abi-errors.c b/clang/test/CodeGen/aarch64-soft-float-abi-errors.c index 95b7668aca1b0e..6961ee4b88886f 100644 --- a/clang/test/CodeGen/aarch64-soft-float-abi-errors.c +++ b/clang/test/CodeGen/aarch64-soft-float-abi-errors.c @@ -69,7 +69,7 @@ inline void test_float_arg_inline(float a) {} inline void test_float_arg_inline_used(float a) {} // nofp-hard-opt-error@-1 {{'a' requires 'float' type support, but ABI 'aapcs' does not support it}} void use_inline() { test_float_arg_inline_used(1.0f); } -// nofp-hard-error@-1 {{'use_inline' requires 'float' type support, but ABI 'aapcs' does not support it}} +// nofp-hard-error@-1 {{'test_float_arg_inline_used' requires 'float' type support, but ABI 'aapcs' does not support it}} // The always_inline attribute causes an inline function to always be // code-genned, even at -O0, so we always emit the error. @@ -77,7 +77,7 @@ __attribute((always_inline)) inline void test_float_arg_always_inline_used(float a) {} // nofp-hard-error@-1 {{'a' requires 'float' type support, but ABI 'aapcs' does not support it}} void use_always_inline() { test_float_arg_always_inline_used(1.0f); } -// nofp-hard-error@-1 {{'use_always_inline' requires 'float' type support, but ABI 'aapcs' does not support it}} +// nofp-hard-error@-1 {{'test_float_arg_always_inline_used' requires 'float' type support, but ABI 'aapcs' does not support it}} // Floating-point expressions, global variables and local variables do not // affect the ABI, so are allowed. 
GCC does reject some uses of floating point @@ -103,9 +103,9 @@ int test_var_double(int a) { extern void extern_float_arg(float); extern float extern_float_ret(void); void call_extern_float_arg() { extern_float_arg(1.0f); } -// nofp-hard-error@-1 {{'call_extern_float_arg' requires 'float' type support, but ABI 'aapcs' does not support it}} +// nofp-hard-error@-1 {{'extern_float_arg' requires 'float' type support, but ABI 'aapcs' does not support it}} void call_extern_float_ret() { extern_float_ret(); } -// nofp-hard-error@-1 {{'call_extern_float_ret' requires 'float' type support, but ABI 'aapcs' does not support it}} +// nofp-hard-error@-1 {{'extern_float_ret' requires 'float' type support, but ABI 'aapcs' does not support it}} // Definitions of variadic functions, and calls to them which only use integer // argument registers, are both fine. @@ -115,7 +115,7 @@ void call_variadic_int() { variadic(0, 1); } // Calls to variadic functions with floating-point arguments are an error, // since this would require floating-point registers. void call_variadic_double() { variadic(0, 1.0); } -// nofp-hard-error@-1 {{'call_variadic_double' requires 'double' type support, but ABI 'aapcs' does not support it}} +// nofp-hard-error@-1 {{'variadic' requires 'double' type support, but ABI 'aapcs' does not support it}} // Calls through function pointers are also diagnosed. 
void (*fptr)(float); diff --git a/clang/test/CodeGen/overflow-idiom-exclusion-fp.c b/clang/test/CodeGen/overflow-idiom-exclusion-fp.c new file mode 100644 index 00000000000000..511a88cc7a2836 --- /dev/null +++ b/clang/test/CodeGen/overflow-idiom-exclusion-fp.c @@ -0,0 +1,83 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsanitize=signed-integer-overflow,unsigned-integer-overflow -fsanitize-overflow-pattern-exclusion=all -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsanitize=signed-integer-overflow,unsigned-integer-overflow -fsanitize-overflow-pattern-exclusion=all -fwrapv -emit-llvm -o - %s | FileCheck %s + +// Check for potential false positives from patterns that _almost_ match classic overflow-dependent or overflow-prone code patterns +extern unsigned a, b, c; +extern int u, v, w; + +extern unsigned some(void); + +// Make sure all these still have handler paths, we shouldn't be excluding +// instrumentation of any "near" patterns. +// CHECK-LABEL: close_but_not_quite +void close_but_not_quite(void) { + // CHECK: br i1{{.*}}handler. + if (a + b > a) + c = 9; + + // CHECK: br i1{{.*}}handler. + if (a - b < a) + c = 9; + + // CHECK: br i1{{.*}}handler. + if (a + b < a) + c = 9; + + // CHECK: br i1{{.*}}handler. + if (a + b + 1 < a) + c = 9; + + // CHECK: br i1{{.*}}handler. + // CHECK: br i1{{.*}}handler. + if (a + b < a + 1) + c = 9; + + // CHECK: br i1{{.*}}handler. + if (b >= a + b) + c = 9; + + // CHECK: br i1{{.*}}handler. + if (a + a < a) + c = 9; + + // CHECK: br i1{{.*}}handler. + if (a + b == a) + c = 9; + + // CHECK: br i1{{.*}}handler + // Although this can never actually overflow we are still checking that the + // sanitizer instruments it. 
+ while (--a) + some(); +} + +// cvise'd kernel code that caused problems during development +typedef unsigned _size_t; +typedef enum { FSE_repeat_none } FSE_repeat; +typedef enum { ZSTD_defaultAllowed } ZSTD_defaultPolicy_e; +FSE_repeat ZSTD_selectEncodingType_repeatMode; +ZSTD_defaultPolicy_e ZSTD_selectEncodingType_isDefaultAllowed; +_size_t ZSTD_NCountCost(void); + +// CHECK-LABEL: ZSTD_selectEncodingType +// CHECK: br i1{{.*}}handler +void ZSTD_selectEncodingType(void) { + _size_t basicCost = + ZSTD_selectEncodingType_isDefaultAllowed ? ZSTD_NCountCost() : 0, + compressedCost = 3 + ZSTD_NCountCost(); + if (basicCost <= compressedCost) + ZSTD_selectEncodingType_repeatMode = FSE_repeat_none; +} + +// CHECK-LABEL: function_calls +void function_calls(void) { + // CHECK: br i1{{.*}}handler + if (some() + b < some()) + c = 9; +} + +// CHECK-LABEL: not_quite_a_negated_unsigned_const +void not_quite_a_negated_unsigned_const(void) { + // CHECK: br i1{{.*}}handler + a = -b; +} diff --git a/clang/test/CodeGenCXX/mangle-ms-auto-return.cpp b/clang/test/CodeGenCXX/mangle-ms-auto-return.cpp new file mode 100644 index 00000000000000..737c9c407f4703 --- /dev/null +++ b/clang/test/CodeGenCXX/mangle-ms-auto-return.cpp @@ -0,0 +1,369 @@ +// RUN: %clang_cc1 -std=c++17 -fms-compatibility-version=19.20 -emit-llvm %s -o - -fms-extensions -fdelayed-template-parsing -triple=x86_64-pc-windows-msvc | FileCheck %s + +struct StructA {}; + +template +auto AutoT() { return T(); } + +template +const auto AutoConstT() { return T(); } + +template +volatile auto AutoVolatileT() { return T(); } + +template +const volatile auto AutoConstVolatileT() { return T(); } + +// The qualifiers of the return type should always be emitted even for void types. +// Void types usually have their qualifers stripped in the mangled name for MSVC ABI. 
+void test_template_auto_void() { + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@X@@YA?A_PXZ" + + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@$$CBX@@YA?A_PXZ" + + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@$$CCX@@YA?A_PXZ" + + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@$$CDX@@YA?A_PXZ" + + AutoConstT(); + // CHECK: call {{.*}} @"??$AutoConstT@X@@YA?B_PXZ" + + AutoVolatileT(); + // CHECK: call {{.*}} @"??$AutoVolatileT@X@@YA?C_PXZ" + + AutoConstVolatileT(); + // CHECK: call {{.*}} @"??$AutoConstVolatileT@X@@YA?D_PXZ" +} + +void test_template_auto_int() { + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@H@@YA?A_PXZ" + + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@$$CBH@@YA?A_PXZ" + + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@$$CCH@@YA?A_PXZ" + + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@$$CDH@@YA?A_PXZ" + + AutoConstT(); + // CHECK: call {{.*}} @"??$AutoConstT@H@@YA?B_PXZ" + + AutoVolatileT(); + // CHECK: call {{.*}} @"??$AutoVolatileT@H@@YA?C_PXZ" + + AutoConstVolatileT(); + // CHECK: call {{.*}} @"??$AutoConstVolatileT@H@@YA?D_PXZ" +} + +void test_template_auto_struct() { + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@UStructA@@@@YA?A_PXZ" + + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@$$CBUStructA@@@@YA?A_PXZ" + + AutoConstT(); + // CHECK: call {{.*}} @"??$AutoConstT@UStructA@@@@YA?B_PXZ" + + AutoVolatileT(); + // CHECK: call {{.*}} @"??$AutoVolatileT@UStructA@@@@YA?C_PXZ" + + AutoConstVolatileT(); + // CHECK: call {{.*}} @"??$AutoConstVolatileT@UStructA@@@@YA?D_PXZ" +} + +void test_template_auto_ptr() { + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@PEAH@@YA?A_PXZ" + + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@PEBH@@YA?A_PXZ" + + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@QEBH@@YA?A_PXZ" + + AutoConstT(); + // CHECK: call {{.*}} @"??$AutoConstT@PEAH@@YA?B_PXZ" + + AutoVolatileT(); + // CHECK: call {{.*}} @"??$AutoVolatileT@PEAH@@YA?C_PXZ" + + AutoConstVolatileT(); + // CHECK: call {{.*}} @"??$AutoConstVolatileT@PEAH@@YA?D_PXZ" +} + +template +auto* PtrAutoT() 
{ return T(); } + +template +const auto* PtrAutoConstT() { return T(); } + +template +volatile auto* PtrAutoVolatileT() { return T(); } + +template +const volatile auto* PtrAutoConstVolatileT() { return T(); } + +void test_template_ptr_auto() { + PtrAutoT(); + // CHECK: call {{.*}} @"??$PtrAutoT@PEAH@@YAPEA_PXZ" + + PtrAutoT(); + // CHECK: call {{.*}} @"??$PtrAutoT@PEBH@@YAPEA_PXZ" + + PtrAutoT(); + // CHECK: call {{.*}} @"??$PtrAutoT@QEBH@@YAPEA_PXZ" + + PtrAutoConstT(); + // CHECK: call {{.*}} @"??$PtrAutoConstT@PEAH@@YAPEB_PXZ" + + PtrAutoVolatileT(); + // CHECK: call {{.*}} @"??$PtrAutoVolatileT@PEAH@@YAPEC_PXZ" + + PtrAutoConstVolatileT(); + // CHECK: call {{.*}} @"??$PtrAutoConstVolatileT@PEAH@@YAPED_PXZ" +} + +int func_int(); +const int func_constint(); +void func_void(); +int* func_intptr(); + +template +auto (*FuncPtrAutoT())() { return v; } + +void test_template_func_ptr_auto() { + FuncPtrAutoT(); + // CHECK: call {{.*}} @"??$FuncPtrAutoT@P6AHXZ$1?func_int@@YAHXZ@@YAP6A?A_PXZXZ" + + FuncPtrAutoT(); + // CHECK: call {{.*}} @"??$FuncPtrAutoT@P6A?BHXZ$1?func_constint@@YA?BHXZ@@YAP6A?A_PXZXZ" + + FuncPtrAutoT(); + // CHECK: call {{.*}} @"??$FuncPtrAutoT@P6AXXZ$1?func_void@@YAXXZ@@YAP6A?A_PXZXZ" + + FuncPtrAutoT(); + // CHECK: call {{.*}} @"??$FuncPtrAutoT@P6APEAHXZ$1?func_intptr@@YAPEAHXZ@@YAP6A?A_PXZXZ" +} + +template +auto& RefAutoT(T& x) { return x; } + +template +const auto& ConstRefAutoT(T& x) { return x; } + +template +auto&& RRefAutoT(T& x) { return static_cast(x); } + +void test_template_ref_auto() { + int x; + + RefAutoT(x); + // CHECK: call {{.*}} @"??$RefAutoT@H@@YAAEA_PAEAH@Z" + + ConstRefAutoT(x); + // CHECK: call {{.*}} @"??$ConstRefAutoT@H@@YAAEB_PAEAH@Z" + + RRefAutoT(x); + // CHECK: call {{.*}} @"??$RRefAutoT@H@@YA$$QEA_PAEAH@Z" +} + +template +decltype(auto) DecltypeAutoT() { return T(); } + +template +decltype(auto) DecltypeAutoT2(T& x) { return static_cast(x); } + +void test_template_decltypeauto() { + DecltypeAutoT(); + // CHECK: call 
{{.*}} @"??$DecltypeAutoT@X@@YA?A_TXZ" + + DecltypeAutoT(); + // CHECK: call {{.*}} @"??$DecltypeAutoT@$$CBX@@YA?A_TXZ" + + DecltypeAutoT(); + // CHECK: call {{.*}} @"??$DecltypeAutoT@$$CCX@@YA?A_TXZ" + + DecltypeAutoT(); + // CHECK: call {{.*}} @"??$DecltypeAutoT@$$CDX@@YA?A_TXZ" + + DecltypeAutoT(); + // CHECK: call {{.*}} @"??$DecltypeAutoT@H@@YA?A_TXZ" + + DecltypeAutoT(); + // CHECK: call {{.*}} @"??$DecltypeAutoT@$$CBH@@YA?A_TXZ" + + DecltypeAutoT(); + // CHECK: call {{.*}} @"??$DecltypeAutoT@$$CCH@@YA?A_TXZ" + + DecltypeAutoT(); + // CHECK: call {{.*}} @"??$DecltypeAutoT@$$CDH@@YA?A_TXZ" + + int x; + + DecltypeAutoT2(x); + // CHECK: call {{.*}} @"??$DecltypeAutoT2@H@@YA?A_TAEAH@Z" +} + +// Still want to use clang's custom mangling for lambdas to keep backwards compatibility until +// MSVC lambda name mangling has been deciphered. +void test_lambda() { + auto lambdaIntRetAuto = []() { return 0; }; + lambdaIntRetAuto(); + // CHECK: call {{.*}} @"??R@?0??test_lambda@@YAXXZ@QEBA?A?@@XZ" + + auto lambdaIntRet = []() -> int { return 0; }; + lambdaIntRet(); + // CHECK: call {{.*}} @"??R@?0??test_lambda@@YAXXZ@QEBA@XZ" + + auto lambdaGenericIntIntRetAuto = [](auto a) { return a; }; + lambdaGenericIntIntRetAuto(0); + // CHECK: call {{.*}} @"??$?RH@@?0??test_lambda@@YAXXZ@QEBA?A?@@H@Z" +} + +auto TestTrailingInt() -> int { + return 0; +} + +auto TestTrailingConstVolatileVoid() -> const volatile void { +} + +auto TestTrailingStructA() -> StructA { + return StructA{}; +} + +void test_trailing_return() { + TestTrailingInt(); + // CHECK: call {{.*}} @"?TestTrailingInt@@YAHXZ" + + TestTrailingConstVolatileVoid(); + // CHECK: call {{.*}} @"?TestTrailingConstVolatileVoid@@YAXXZ" + + TestTrailingStructA(); + // CHECK: call {{.*}} @"?TestTrailingStructA@@YA?AUStructA@@XZ" +} + +auto TestNonTemplateAutoInt() { + return 0; +} + +auto TestNonTemplateAutoVoid() { + return; +} + +auto TestNonTemplateAutoStructA() { + return StructA{}; +} + +const auto TestNonTemplateConstAutoInt() 
{ + return 0; +} + +const auto TestNonTemplateConstAutoVoid() { + return; +} + +const auto TestNonTemplateConstAutoStructA() { + return StructA{}; +} + +void test_nontemplate_auto() { + TestNonTemplateAutoInt(); + // CHECK: call {{.*}} @"?TestNonTemplateAutoInt@@YA@XZ" + + TestNonTemplateAutoVoid(); + // CHECK: call {{.*}} @"?TestNonTemplateAutoVoid@@YA@XZ" + + TestNonTemplateAutoStructA(); + // CHECK: call {{.*}} @"?TestNonTemplateAutoStructA@@YA@XZ" + + TestNonTemplateConstAutoInt(); + // CHECK: call {{.*}} @"?TestNonTemplateConstAutoInt@@YA@XZ" + + TestNonTemplateConstAutoVoid(); + // CHECK: call {{.*}} @"?TestNonTemplateConstAutoVoid@@YA@XZ" + + TestNonTemplateConstAutoStructA(); + // CHECK: call {{.*}} @"?TestNonTemplateConstAutoStructA@@YA@XZ" +} + +decltype(auto) TestNonTemplateDecltypeAutoInt() { + return 0; +} + +decltype(auto) TestNonTemplateDecltypeAutoVoid() { + return; +} + +decltype(auto) TestNonTemplateDecltypeAutoStructA() { + return StructA{}; +} + +void test_nontemplate_decltypeauto() { + TestNonTemplateDecltypeAutoInt(); + // CHECK: call {{.*}} @"?TestNonTemplateDecltypeAutoInt@@YA@XZ" + + TestNonTemplateDecltypeAutoVoid(); + // CHECK: call {{.*}} @"?TestNonTemplateDecltypeAutoVoid@@YA@XZ" + + TestNonTemplateDecltypeAutoStructA(); + // CHECK: call {{.*}} @"?TestNonTemplateDecltypeAutoStructA@@YA@XZ" +} + +struct StructB { + int x; +}; + +template +auto StructB::* AutoMemberDataPtrT(T x) { return x; } + +template +const auto StructB::* AutoConstMemberDataPtrT(T x) { return x; } + +void test_template_auto_member_data_ptr() { + AutoMemberDataPtrT(&StructB::x); + // CHECK: call {{.*}} @"??$AutoMemberDataPtrT@PEQStructB@@H@@YAPEQStructB@@_PPEQ0@H@Z" + + AutoConstMemberDataPtrT(&StructB::x); + // CHECK: call {{.*}} @"??$AutoConstMemberDataPtrT@PEQStructB@@H@@YAPERStructB@@_PPEQ0@H@Z" +} + +struct StructC { + void test() {} +}; + +struct StructD { + const int test() { return 0; } +}; + +template +auto (StructC::*AutoMemberFuncPtrT(T x))() { return x; } 
+ +template +const auto (StructD::*AutoConstMemberFuncPtrT(T x))() { return x; } + +void test_template_auto_member_func_ptr() { + AutoMemberFuncPtrT(&StructC::test); + // CHECK: call {{.*}} @"??$AutoMemberFuncPtrT@P8StructC@@EAAXXZ@@YAP8StructC@@EAA?A_PXZP80@EAAXXZ@Z" + + AutoConstMemberFuncPtrT(&StructD::test); + // CHECK: call {{.*}} @"??$AutoConstMemberFuncPtrT@P8StructD@@EAA?BHXZ@@YAP8StructD@@EAA?B_PXZP80@EAA?BHXZ@Z" +} + +template +auto * __attribute__((address_space(1))) * AutoPtrAddressSpaceT() { + T * __attribute__((address_space(1))) * p = nullptr; + return p; +} + +void test_template_auto_address_space_ptr() { + AutoPtrAddressSpaceT(); + // CHECK: call {{.*}} @"??$AutoPtrAddressSpaceT@H@@YA?A?@@XZ" +} diff --git a/clang/test/CodeGenCXX/mangle-ms-auto-templates-memptrs.cpp b/clang/test/CodeGenCXX/mangle-ms-auto-templates-memptrs.cpp index 360ebdecc5562b..b7bc3953f0b438 100644 --- a/clang/test/CodeGenCXX/mangle-ms-auto-templates-memptrs.cpp +++ b/clang/test/CodeGenCXX/mangle-ms-auto-templates-memptrs.cpp @@ -34,15 +34,15 @@ void template_mangling() { // BEFORE: call {{.*}} @"??0?$AutoParmTemplate@$I?f@V@@QEAAXXZA@A@@@QEAA@XZ" AutoFunc<&S::f>(); - // AFTER: call {{.*}} @"??$AutoFunc@$MP8S@@EAAXXZ1?f@1@QEAAXXZ@@YA?A?@@XZ" + // AFTER: call {{.*}} @"??$AutoFunc@$MP8S@@EAAXXZ1?f@1@QEAAXXZ@@YA?A_PXZ" // BEFORE: call {{.*}} @"??$AutoFunc@$1?f@S@@QEAAXXZ@@YA?A?@@XZ" AutoFunc<&M::f>(); - // AFTER: call {{.*}} @"??$AutoFunc@$MP8M@@EAAXXZH?f@1@QEAAXXZA@@@YA?A?@@XZ" + // AFTER: call {{.*}} @"??$AutoFunc@$MP8M@@EAAXXZH?f@1@QEAAXXZA@@@YA?A_PXZ" // BEFORE: call {{.*}} @"??$AutoFunc@$H?f@M@@QEAAXXZA@@@YA?A?@@XZ" AutoFunc<&V::f>(); - // AFTER: call {{.*}} @"??$AutoFunc@$MP8V@@EAAXXZI?f@1@QEAAXXZA@A@@@YA?A?@@XZ" + // AFTER: call {{.*}} @"??$AutoFunc@$MP8V@@EAAXXZI?f@1@QEAAXXZA@A@@@YA?A_PXZ" // BEFORE: call {{.*}} @"??$AutoFunc@$I?f@V@@QEAAXXZA@A@@@YA?A?@@XZ" AutoParmTemplate<&S::a> auto_data_single_inheritance; @@ -58,14 +58,14 @@ void template_mangling() { // BEFORE: call 
{{.*}} @"??0?$AutoParmTemplate@$FBA@A@@@QEAA@XZ" AutoFunc<&S::a>(); - // AFTER: call {{.*}} @"??$AutoFunc@$MPEQS@@H07@@YA?A?@@XZ" + // AFTER: call {{.*}} @"??$AutoFunc@$MPEQS@@H07@@YA?A_PXZ" // BEFORE: call {{.*}} @"??$AutoFunc@$07@@YA?A?@@XZ" AutoFunc<&M::a>(); - // AFTER: call {{.*}} @"??$AutoFunc@$MPEQM@@H0M@@@YA?A?@@XZ" + // AFTER: call {{.*}} @"??$AutoFunc@$MPEQM@@H0M@@@YA?A_PXZ" // BEFORE: call {{.*}} @"??$AutoFunc@$0M@@@YA?A?@@XZ" AutoFunc<&V::a>(); - // AFTER: call {{.*}} @"??$AutoFunc@$MPEQV@@HFBA@A@@@YA?A?@@XZ" + // AFTER: call {{.*}} @"??$AutoFunc@$MPEQV@@HFBA@A@@@YA?A_PXZ" // BEFORE: call {{.*}} @"??$AutoFunc@$FBA@A@@@YA?A?@@XZ" } diff --git a/clang/test/CodeGenCXX/mangle-ms-auto-templates-nullptr.cpp b/clang/test/CodeGenCXX/mangle-ms-auto-templates-nullptr.cpp index 8f98c1e59f73d7..251d9219c01ce2 100644 --- a/clang/test/CodeGenCXX/mangle-ms-auto-templates-nullptr.cpp +++ b/clang/test/CodeGenCXX/mangle-ms-auto-templates-nullptr.cpp @@ -19,6 +19,6 @@ void template_mangling() { // BEFORE: call {{.*}} @"??0?$AutoParmTemplate@$0A@@@QEAA@XZ" AutoFunc(); - // AFTER: call {{.*}} @"??$AutoFunc@$M$$T0A@@@YA?A?@@XZ" + // AFTER: call {{.*}} @"??$AutoFunc@$M$$T0A@@@YA?A_PXZ" // BEFORE: call {{.*}} @"??$AutoFunc@$0A@@@YA?A?@@XZ" } diff --git a/clang/test/CodeGenCXX/mangle-ms-auto-templates.cpp b/clang/test/CodeGenCXX/mangle-ms-auto-templates.cpp index ff5395cea75eb7..effcc31ee31103 100644 --- a/clang/test/CodeGenCXX/mangle-ms-auto-templates.cpp +++ b/clang/test/CodeGenCXX/mangle-ms-auto-templates.cpp @@ -26,7 +26,7 @@ int j; void template_mangling() { AutoFunc<1>(); - // AFTER: call {{.*}} @"??$AutoFunc@$MH00@@YA?A?@@XZ" + // AFTER: call {{.*}} @"??$AutoFunc@$MH00@@YA?A_PXZ" // BEFORE: call {{.*}} @"??$AutoFunc@$00@@YA?A?@@XZ" AutoParmTemplate<0> auto_int; // AFTER: call {{.*}} @"??0?$AutoParmTemplate@$MH0A@@@QEAA@XZ" @@ -52,7 +52,7 @@ void template_mangling() { // BEFORE: call {{.*}} @"??0?$AutoParmsTemplate@$00$0HPPPPPPPPPPPPPPP@@@QEAA@XZ" AutoFunc<&i>(); - // 
AFTER: call {{.*}} @"??$AutoFunc@$MPEAH1?i@@3HA@@YA?A?@@XZ" + // AFTER: call {{.*}} @"??$AutoFunc@$MPEAH1?i@@3HA@@YA?A_PXZ" // BEFORE: call {{.*}} @"??$AutoFunc@$1?i@@3HA@@YA?A?@@XZ" AutoParmTemplate<&i> auto_int_ptr; @@ -64,7 +64,7 @@ void template_mangling() { // BEFORE: call {{.*}} @"??0?$AutoParmsTemplate@$1?i@@3HA$1?j@@3HA@@QEAA@XZ" AutoFunc<&Func>(); - // AFTER: call {{.*}} @"??$AutoFunc@$MP6AHXZ1?Func@@YAHXZ@@YA?A?@@XZ" + // AFTER: call {{.*}} @"??$AutoFunc@$MP6AHXZ1?Func@@YAHXZ@@YA?A_PXZ" // BEFORE: call {{.*}} @"??$AutoFunc@$1?Func@@YAHXZ@@YA?A?@@XZ" AutoParmTemplate<&Func> auto_func_ptr; diff --git a/clang/test/Driver/darwin-print-target-triple.c b/clang/test/Driver/darwin-print-target-triple.c new file mode 100644 index 00000000000000..4f5fdfe9d0db34 --- /dev/null +++ b/clang/test/Driver/darwin-print-target-triple.c @@ -0,0 +1,42 @@ +// Test the output of -print-target-triple on Darwin. +// See https://github.com/llvm/llvm-project/issues/61762 + +// +// All platforms +// + +// RUN: %clang -print-target-triple \ +// RUN: --target=x86_64-apple-macos -mmacos-version-min=15 \ +// RUN: -resource-dir=%S/Inputs/resource_dir 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-CLANGRT-MACOS %s +// CHECK-CLANGRT-MACOS: x86_64-apple-macosx15.0.0 + +// RUN: %clang -print-target-triple \ +// RUN: --target=arm64-apple-ios -mios-version-min=9 \ +// RUN: -resource-dir=%S/Inputs/resource_dir 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-CLANGRT-IOS %s +// CHECK-CLANGRT-IOS: arm64-apple-ios9.0.0 + +// RUN: %clang -print-target-triple \ +// RUN: --target=arm64-apple-watchos -mwatchos-version-min=3 \ +// RUN: -resource-dir=%S/Inputs/resource_dir 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-CLANGRT-WATCHOS %s +// CHECK-CLANGRT-WATCHOS: arm64-apple-watchos3.0.0 + +// RUN: %clang -print-target-triple \ +// RUN: --target=armv7k-apple-watchos -mwatchos-version-min=3 \ +// RUN: -resource-dir=%S/Inputs/resource_dir 2>&1 \ +// RUN: | FileCheck 
--check-prefix=CHECK-CLANGRT-WATCHOS-ARMV7K %s +// CHECK-CLANGRT-WATCHOS-ARMV7K: thumbv7-apple-watchos3.0.0 + +// RUN: %clang -print-target-triple \ +// RUN: --target=arm64-apple-tvos -mtvos-version-min=1\ +// RUN: -resource-dir=%S/Inputs/resource_dir 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-CLANGRT-TVOS %s +// CHECK-CLANGRT-TVOS: arm64-apple-tvos1.0.0 + +// RUN: %clang -print-target-triple \ +// RUN: --target=arm64-apple-driverkit \ +// RUN: -resource-dir=%S/Inputs/resource_dir 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-CLANGRT-DRIVERKIT %s +// CHECK-CLANGRT-DRIVERKIT: arm64-apple-driverkit19.0.0 diff --git a/clang/test/Driver/nvlink-wrapper.c b/clang/test/Driver/nvlink-wrapper.c index 5d835d8d6cb2a2..2ef09b699eccb8 100644 --- a/clang/test/Driver/nvlink-wrapper.c +++ b/clang/test/Driver/nvlink-wrapper.c @@ -78,3 +78,10 @@ int baz() { return y + x; } // RUN: --lto-debug-pass-manager --lto-newpm-passes=forceattrs \ // RUN: -arch sm_52 -o a.out 2>&1 | FileCheck %s --check-prefix=PASSES // PASSES: Running pass: ForceFunctionAttrsPass + +// +// Check that '-plugin` is ingored like in `ld.lld` +// +// RUN: clang-nvlink-wrapper --dry-run %t.o -plugin -arch sm_52 -o a.out \ +// RUN: 2>&1 | FileCheck %s --check-prefix=PLUGIN +// PLUGIN-NOT: -plugin diff --git a/clang/test/Driver/print-enabled-extensions/riscv-rocket-rv64.c b/clang/test/Driver/print-enabled-extensions/riscv-rocket-rv64.c new file mode 100644 index 00000000000000..f8dd58cd74d6db --- /dev/null +++ b/clang/test/Driver/print-enabled-extensions/riscv-rocket-rv64.c @@ -0,0 +1,13 @@ +// REQUIRES: riscv-registered-target +// RUN: %clang --target=riscv64 --print-enabled-extensions -mcpu=rocket-rv64 | FileCheck --strict-whitespace %s + +// Simple litmus test to check the frontend handling of this option is +// enabled. 
+ +// CHECK: Extensions enabled for the given RISC-V target +// CHECK-EMPTY: +// CHECK-NEXT: Name Version Description +// CHECK-NEXT: i 2.1 'I' (Base Integer Instruction Set) +// CHECK-NEXT: zicsr 2.0 'zicsr' (CSRs) +// CHECK-NEXT: zifencei 2.0 'Zifencei' (fence.i) +// CHECK-EMPTY: diff --git a/clang/test/Headers/gpu_disabled_math.cpp b/clang/test/Headers/gpu_disabled_math.cpp new file mode 100644 index 00000000000000..6e697f52120aeb --- /dev/null +++ b/clang/test/Headers/gpu_disabled_math.cpp @@ -0,0 +1,41 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -include __clang_hip_runtime_wrapper.h \ +// RUN: -internal-isystem %S/../../lib/Headers/cuda_wrappers \ +// RUN: -internal-isystem %S/Inputs/include \ +// RUN: -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-unknown \ +// RUN: -target-cpu gfx906 -emit-llvm %s -fcuda-is-device -o - \ +// RUN: -D __CLANG_GPU_DISABLE_MATH_WRAPPERS | FileCheck -check-prefix=AMDGPU %s + +// RUN: %clang_cc1 -include __clang_cuda_runtime_wrapper.h \ +// RUN: -internal-isystem %S/../../lib/Headers/cuda_wrappers \ +// RUN: -internal-isystem %S/Inputs/include \ +// RUN: -triple nvptx64-nvidia-cuda -aux-triple x86_64-unknown-unknown \ +// RUN: -target-cpu sm_90 -emit-llvm %s -fcuda-is-device -o - \ +// RUN: -D __CLANG_GPU_DISABLE_MATH_WRAPPERS | FileCheck -check-prefix=NVPTX %s + +extern "C" double sin(double x); + +// AMDGPU-LABEL: define dso_local noundef double @_Z3food( +// AMDGPU-SAME: double noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] { +// AMDGPU-NEXT: [[ENTRY:.*:]] +// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca double, align 8, addrspace(5) +// AMDGPU-NEXT: [[X_ADDR:%.*]] = alloca double, align 8, addrspace(5) +// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGPU-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr +// AMDGPU-NEXT: store double [[X]], ptr [[X_ADDR_ASCAST]], align 8 +// 
AMDGPU-NEXT: [[TMP0:%.*]] = load double, ptr [[X_ADDR_ASCAST]], align 8 +// AMDGPU-NEXT: [[TMP1:%.*]] = call double @llvm.sin.f64(double [[TMP0]]) +// AMDGPU-NEXT: ret double [[TMP1]] +// +// NVPTX-LABEL: define dso_local noundef double @_Z3food( +// NVPTX-SAME: double noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[X_ADDR:%.*]] = alloca double, align 8 +// NVPTX-NEXT: store double [[X]], ptr [[X_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load double, ptr [[X_ADDR]], align 8 +// NVPTX-NEXT: [[TMP1:%.*]] = call double @llvm.sin.f64(double [[TMP0]]) +// NVPTX-NEXT: ret double [[TMP1]] +// +double foo(double x) { + return sin(x); +} diff --git a/clang/test/Interpreter/delayed-template-parsing-pch.cpp b/clang/test/Interpreter/delayed-template-parsing-pch.cpp new file mode 100644 index 00000000000000..f3bd4649ed0345 --- /dev/null +++ b/clang/test/Interpreter/delayed-template-parsing-pch.cpp @@ -0,0 +1,25 @@ +// Test the setup without incremental extensions first +// RUN: %clang_cc1 -std=c++17 -fdelayed-template-parsing -fpch-instantiate-templates %s -emit-pch -o %t.pch -verify +// RUN: %clang_cc1 -std=c++17 -fdelayed-template-parsing -include-pch %t.pch %s -verify + +// RUN: %clang_cc1 -std=c++17 -fdelayed-template-parsing -fincremental-extensions -fpch-instantiate-templates %s -emit-pch -o %t.incremental.pch -verify +// RUN: %clang_cc1 -std=c++17 -fdelayed-template-parsing -fincremental-extensions -include-pch %t.incremental.pch %s -verify + +// expected-no-diagnostics + +#ifndef PCH +#define PCH + +// Have one template that is instantiated in the PCH (via the passed option +// -fpch-instantiate-templates) and then serialized +template T ft1() { return 0; } +inline int f1() { return ft1(); } + +// Have a second late-parsed template that needs to be deserialized +template T ft2() { return 0; } + +#else + +int f2() { return ft2(); } + +#endif diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test 
b/clang/test/Misc/pragma-attribute-supported-attributes-list.test index 0f7dcab7c4248d..1a71556213bb16 100644 --- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test +++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test @@ -81,6 +81,7 @@ // CHECK-NEXT: FunctionReturnThunks (SubjectMatchRule_function) // CHECK-NEXT: GNUInline (SubjectMatchRule_function) // CHECK-NEXT: HIPManaged (SubjectMatchRule_variable) +// CHECK-NEXT: HLSLROV (SubjectMatchRule_record_not_is_union) // CHECK-NEXT: HLSLResourceClass (SubjectMatchRule_record_not_is_union) // CHECK-NEXT: Hot (SubjectMatchRule_function) // CHECK-NEXT: HybridPatchable (SubjectMatchRule_function) @@ -202,7 +203,7 @@ // CHECK-NEXT: TestTypestate (SubjectMatchRule_function_is_member) // CHECK-NEXT: TrivialABI (SubjectMatchRule_record) // CHECK-NEXT: Uninitialized (SubjectMatchRule_variable_is_local) -// CHECK-NEXT: UnsafeBufferUsage (SubjectMatchRule_function) +// CHECK-NEXT: UnsafeBufferUsage (SubjectMatchRule_function, SubjectMatchRule_field) // CHECK-NEXT: UseHandle (SubjectMatchRule_variable_is_parameter) // CHECK-NEXT: VTablePointerAuthentication (SubjectMatchRule_record) // CHECK-NEXT: VecReturn (SubjectMatchRule_record) diff --git a/clang/test/Misc/warning-flags.c b/clang/test/Misc/warning-flags.c index cdbe1e95cba965..35543e6a49ffda 100644 --- a/clang/test/Misc/warning-flags.c +++ b/clang/test/Misc/warning-flags.c @@ -18,14 +18,13 @@ This test serves two purposes: The list of warnings below should NEVER grow. It should gradually shrink to 0. 
-CHECK: Warnings without flags (65): +CHECK: Warnings without flags (64): CHECK-NEXT: ext_expected_semi_decl_list CHECK-NEXT: ext_missing_whitespace_after_macro_name CHECK-NEXT: ext_new_paren_array_nonconst CHECK-NEXT: ext_plain_complex CHECK-NEXT: ext_template_arg_extra_parens -CHECK-NEXT: ext_template_spec_extra_headers CHECK-NEXT: ext_typecheck_cond_incompatible_operands CHECK-NEXT: ext_typecheck_ordered_comparison_of_pointer_integer CHECK-NEXT: ext_using_undefined_std diff --git a/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl b/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl new file mode 100644 index 00000000000000..29850828ad3bc2 --- /dev/null +++ b/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s + + +// CHECK: -HLSLROVAttr 0x{{[0-9a-f]+}} +struct [[hlsl::is_rov]] Eg1 { + int i; +}; + +Eg1 e1; diff --git a/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl new file mode 100644 index 00000000000000..a21fed22220b6d --- /dev/null +++ b/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl @@ -0,0 +1,15 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s -verify + +// expected-error@+1{{'is_rov' attribute takes no arguments}} +struct [[hlsl::is_rov(3)]] Eg1 { + int i; +}; + +Eg1 e1; + +// expected-error@+1{{use of undeclared identifier 'gibberish'}} +struct [[hlsl::is_rov(gibberish)]] Eg2 { + int i; +}; + +Eg2 e2; diff --git a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl index 6b7bcbc35b8f89..320d1160e761dd 100644 --- a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl @@ -1,14 +1,15 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -ast-dump -o - %s | FileCheck %s -// CHECK: -ClassTemplateDecl 0x{{[0-9a-f]+}} <> implicit RWBuffer 
-// CHECK: -CXXRecordDecl 0x{{[0-9a-f]+}} <> implicit class RWBuffer definition -// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit h 'element_type *' -// CHECK: -HLSLResourceClassAttr 0x{{[0-9a-f]+}} <> Implicit UAV -// CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <> Implicit TypedBuffer -RasterizerOrderedBuffer > BufferArray3[4] : register(u4, space1); - // CHECK: -ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} <> class RWBuffer definition implicit_instantiation // CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit referenced h 'float *' // CHECK: -HLSLResourceClassAttr 0x{{[0-9a-f]+}} <> Implicit UAV // CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <> Implicit TypedBuffer RWBuffer Buffer1; + +// CHECK: -ClassTemplateDecl 0x{{[0-9a-f]+}} <> implicit RasterizerOrderedBuffer +// CHECK: -CXXRecordDecl 0x{{[0-9a-f]+}} <> implicit class RasterizerOrderedBuffer definition +// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit h 'element_type *' +// CHECK: -HLSLResourceClassAttr 0x{{[0-9a-f]+}} <> Implicit UAV +// CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <> Implicit TypedBuffer +// CHECK: -HLSLROVAttr 0x{{[0-9a-f]+}} <> Implicit +RasterizerOrderedBuffer > BufferArray3[4] : register(u4, space1); diff --git a/clang/test/SemaCXX/Inputs/enum-constexpr-conversion-system-header.h b/clang/test/SemaCXX/Inputs/enum-constexpr-conversion-system-header.h index 0850f3405eed3a..7d35d5b71d1dfe 100644 --- a/clang/test/SemaCXX/Inputs/enum-constexpr-conversion-system-header.h +++ b/clang/test/SemaCXX/Inputs/enum-constexpr-conversion-system-header.h @@ -10,7 +10,8 @@ enum SystemEnum void testValueInRangeOfEnumerationValuesInSystemHeader() { constexpr SystemEnum x1 = static_cast(123); - // expected-error@-1 {{integer value 123 is outside the valid range of values [0, 1] for the enumeration type 'SystemEnum'}} + // expected-error@-1 {{constexpr variable 'x1' must be initialized by a constant expression}} + // expected-note@-2 {{integer value 123 is outside the valid range of values [0, 1] for the enumeration 
type 'SystemEnum'}} const SystemEnum x2 = static_cast(123); // ok, not a constant expression context } diff --git a/clang/test/SemaCXX/constant-expression-cxx11.cpp b/clang/test/SemaCXX/constant-expression-cxx11.cpp index 6df8a4740d6ccc..d888887bd8c6f3 100644 --- a/clang/test/SemaCXX/constant-expression-cxx11.cpp +++ b/clang/test/SemaCXX/constant-expression-cxx11.cpp @@ -2460,52 +2460,62 @@ E2 testDefaultArgForParam(E2 e2Param = (E2)-1) { // ok, not a constant expressio void testValueInRangeOfEnumerationValues() { constexpr E1 x1 = static_cast(-8); constexpr E1 x2 = static_cast(8); - // expected-error@-1 {{integer value 8 is outside the valid range of values [-8, 7] for the enumeration type 'E1'}} + // expected-error@-1 {{constexpr variable 'x2' must be initialized by a constant expression}} + // expected-note@-2 {{integer value 8 is outside the valid range of values [-8, 7] for the enumeration type 'E1'}} E1 x2b = static_cast(8); // ok, not a constant expression context constexpr E2 x3 = static_cast(-8); - // expected-error@-1 {{integer value -8 is outside the valid range of values [0, 7] for the enumeration type 'E2'}} + // expected-error@-1 {{constexpr variable 'x3' must be initialized by a constant expression}} + // expected-note@-2 {{integer value -8 is outside the valid range of values [0, 7] for the enumeration type 'E2'}} constexpr E2 x4 = static_cast(0); constexpr E2 x5 = static_cast(8); - // expected-error@-1 {{integer value 8 is outside the valid range of values [0, 7] for the enumeration type 'E2'}} + // expected-error@-1 {{constexpr variable 'x5' must be initialized by a constant expression}} + // expected-note@-2 {{integer value 8 is outside the valid range of values [0, 7] for the enumeration type 'E2'}} constexpr E3 x6 = static_cast(-2048); constexpr E3 x7 = static_cast(-8); constexpr E3 x8 = static_cast(0); constexpr E3 x9 = static_cast(8); constexpr E3 x10 = static_cast(2048); - // expected-error@-1 {{integer value 2048 is outside the valid range 
of values [-2048, 2047] for the enumeration type 'E3'}} + // expected-error@-1 {{constexpr variable 'x10' must be initialized by a constant expression}} + // expected-note@-2 {{integer value 2048 is outside the valid range of values [-2048, 2047] for the enumeration type 'E3'}} constexpr E4 x11 = static_cast(0); constexpr E4 x12 = static_cast(1); constexpr E4 x13 = static_cast(2); - // expected-error@-1 {{integer value 2 is outside the valid range of values [0, 1] for the enumeration type 'E4'}} + // expected-error@-1 {{constexpr variable 'x13' must be initialized by a constant expression}} + // expected-note@-2 {{integer value 2 is outside the valid range of values [0, 1] for the enumeration type 'E4'}} constexpr EEmpty x14 = static_cast(0); constexpr EEmpty x15 = static_cast(1); constexpr EEmpty x16 = static_cast(2); - // expected-error@-1 {{integer value 2 is outside the valid range of values [0, 1] for the enumeration type 'EEmpty'}} + // expected-error@-1 {{constexpr variable 'x16' must be initialized by a constant expression}} + // expected-note@-2 {{integer value 2 is outside the valid range of values [0, 1] for the enumeration type 'EEmpty'}} constexpr EFixed x17 = static_cast(100); constexpr EScoped x18 = static_cast(100); constexpr EMaxInt x19 = static_cast(__INT_MAX__-1); constexpr EMaxInt x20 = static_cast((long)__INT_MAX__+1); - // expected-error@-1 {{integer value 2147483648 is outside the valid range of values [-2147483648, 2147483647] for the enumeration type 'EMaxInt'}} + // expected-error@-1 {{constexpr variable 'x20' must be initialized by a constant expression}} + // expected-note@-2 {{integer value 2147483648 is outside the valid range of values [-2147483648, 2147483647] for the enumeration type 'EMaxInt'}} const NumberType neg_one = (NumberType) ((NumberType) 0 - (NumberType) 1); // ok, not a constant expression context CONSTEXPR_CAST_TO_SYSTEM_ENUM_OUTSIDE_OF_RANGE; - // expected-error@-1 {{integer value 123 is outside the valid range of 
values [0, 1] for the enumeration type 'SystemEnum'}} + // expected-error@-1 {{constexpr variable 'system_enum' must be initialized by a constant expression}} + // expected-note@-2 {{integer value 123 is outside the valid range of values [0, 1] for the enumeration type 'SystemEnum'}} } template struct Bitfield { - static constexpr T max = static_cast((1 << size) - 1); // #enum + static constexpr T max = static_cast((1 << size) - 1); + // cxx11-error@-1 {{constexpr variable 'max' must be initialized by a constant expression}} + // cxx11-note@-2 {{integer value 15 is outside the valid range of values [0, 7] for the enumeration type 'E2'}} }; void testValueInRangeOfEnumerationValuesViaTemplate() { Bitfield good; - Bitfield bad; // cxx11-error@#enum {{integer value 15 is outside the valid range of values [0, 7] for the enumeration type 'E2'}} + Bitfield bad; // cxx11-note {{in instantiation}} } enum SortOrder { @@ -2526,4 +2536,5 @@ void A::f(SortOrder order) { GH50055::E2 GlobalInitNotCE1 = (GH50055::E2)-1; // ok, not a constant expression context GH50055::E2 GlobalInitNotCE2 = GH50055::testDefaultArgForParam(); // ok, not a constant expression context constexpr GH50055::E2 GlobalInitCE = (GH50055::E2)-1; -// expected-error@-1 {{integer value -1 is outside the valid range of values [0, 7] for the enumeration type 'E2'}} +// expected-error@-1 {{constexpr variable 'GlobalInitCE' must be initialized by a constant expression}} +// expected-note@-2 {{integer value -1 is outside the valid range of values [0, 7] for the enumeration type 'E2'}} diff --git a/clang/test/SemaCXX/cxx2a-consteval.cpp b/clang/test/SemaCXX/cxx2a-consteval.cpp index ba80e57f814244..6b0609a26c5882 100644 --- a/clang/test/SemaCXX/cxx2a-consteval.cpp +++ b/clang/test/SemaCXX/cxx2a-consteval.cpp @@ -920,12 +920,13 @@ consteval int aConstevalFunction() { // expected-error {{consteval function neve namespace GH50055 { enum E {e1=0, e2=1}; consteval int testDefaultArgForParam(E eParam = (E)-1) { -// 
expected-error@-1 {{integer value -1 is outside the valid range of values [0, 1] for the enumeration type 'E'}} +// expected-note@-1 {{integer value -1 is outside the valid range of values [0, 1] for the enumeration type 'E'}} return (int)eParam; } int test() { return testDefaultArgForParam() + testDefaultArgForParam((E)1); + // expected-error@-1 {{call to consteval function 'GH50055::testDefaultArgForParam' is not a constant expression}} } } diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp index e5d2ced3caaed5..bf069d9bc082c3 100644 --- a/clang/test/SemaCXX/type-traits.cpp +++ b/clang/test/SemaCXX/type-traits.cpp @@ -18,7 +18,7 @@ enum class SignedEnumClass : signed int {}; enum class UnsignedEnumClass : unsigned int {}; struct POD { Enum e; int i; float f; NonPOD* p; }; struct Empty {}; -struct IncompleteStruct; +struct IncompleteStruct; // expected-note {{forward declaration of 'IncompleteStruct'}} typedef Empty EmptyAr[10]; typedef Empty EmptyArNB[]; typedef Empty EmptyArMB[1][2]; @@ -1908,6 +1908,162 @@ void is_pointer_interconvertible_base_of(int n) } } +struct NoEligibleTrivialContructor { + NoEligibleTrivialContructor() {}; + NoEligibleTrivialContructor(const NoEligibleTrivialContructor&) {} + NoEligibleTrivialContructor(NoEligibleTrivialContructor&&) {} +}; + +struct OnlyDefaultConstructorIsTrivial { + OnlyDefaultConstructorIsTrivial() = default; + OnlyDefaultConstructorIsTrivial(const OnlyDefaultConstructorIsTrivial&) {} + OnlyDefaultConstructorIsTrivial(OnlyDefaultConstructorIsTrivial&&) {} +}; + +struct AllContstructorsAreTrivial { + AllContstructorsAreTrivial() = default; + AllContstructorsAreTrivial(const AllContstructorsAreTrivial&) = default; + AllContstructorsAreTrivial(AllContstructorsAreTrivial&&) = default; +}; + +struct InheritedNoEligibleTrivialConstructor : NoEligibleTrivialContructor { + using NoEligibleTrivialContructor::NoEligibleTrivialContructor; +}; + +struct InheritedOnlyDefaultConstructorIsTrivial : 
OnlyDefaultConstructorIsTrivial { + using OnlyDefaultConstructorIsTrivial::OnlyDefaultConstructorIsTrivial; +}; + +struct InheritedAllContstructorsAreTrivial : AllContstructorsAreTrivial { + using AllContstructorsAreTrivial::AllContstructorsAreTrivial; +}; + +struct UserDeclaredDestructor { + ~UserDeclaredDestructor() = default; +}; + +struct UserProvidedDestructor { + ~UserProvidedDestructor() {} +}; + +struct UserDeletedDestructorInAggregate { + ~UserDeletedDestructorInAggregate() = delete; +}; + +struct UserDeletedDestructorInNonAggregate { + virtual void NonAggregate(); + ~UserDeletedDestructorInNonAggregate() = delete; +}; + +struct DeletedDestructorViaBaseInAggregate : UserDeletedDestructorInAggregate {}; +struct DeletedDestructorViaBaseInNonAggregate : UserDeletedDestructorInNonAggregate {}; + +#if __cplusplus >= 202002L +template +struct ConstrainedUserDeclaredDefaultConstructor{ + ConstrainedUserDeclaredDefaultConstructor() requires B = default; + ConstrainedUserDeclaredDefaultConstructor(const ConstrainedUserDeclaredDefaultConstructor&) {} +}; + +template +struct ConstrainedUserProvidedDestructor { + ~ConstrainedUserProvidedDestructor() = default; + ~ConstrainedUserProvidedDestructor() requires B {} +}; +#endif + +struct StructWithFAM { + int a[]; +}; + +struct StructWithZeroSizedArray { + int a[0]; +}; + +typedef float float4 __attribute__((ext_vector_type(4))); +typedef int *align_value_int __attribute__((align_value(16))); + +struct [[clang::enforce_read_only_placement]] EnforceReadOnlyPlacement {}; +struct [[clang::type_visibility("hidden")]] TypeVisibility {}; + +void is_implicit_lifetime(int n) { + static_assert(__builtin_is_implicit_lifetime(decltype(nullptr))); + static_assert(!__builtin_is_implicit_lifetime(void)); + static_assert(!__builtin_is_implicit_lifetime(const void)); + static_assert(!__builtin_is_implicit_lifetime(volatile void)); + static_assert(__builtin_is_implicit_lifetime(int)); + 
static_assert(!__builtin_is_implicit_lifetime(int&)); + static_assert(!__builtin_is_implicit_lifetime(int&&)); + static_assert(__builtin_is_implicit_lifetime(float)); + static_assert(__builtin_is_implicit_lifetime(double)); + static_assert(__builtin_is_implicit_lifetime(long double)); + static_assert(__builtin_is_implicit_lifetime(int*)); + static_assert(__builtin_is_implicit_lifetime(int[])); + static_assert(__builtin_is_implicit_lifetime(int[5])); + static_assert(__builtin_is_implicit_lifetime(int[n])); + // expected-error@-1 {{variable length arrays are not supported in '__builtin_is_implicit_lifetime'}} + static_assert(__builtin_is_implicit_lifetime(Enum)); + static_assert(__builtin_is_implicit_lifetime(EnumClass)); + static_assert(!__builtin_is_implicit_lifetime(void())); + static_assert(!__builtin_is_implicit_lifetime(void() &)); + static_assert(!__builtin_is_implicit_lifetime(void() const)); + static_assert(!__builtin_is_implicit_lifetime(void(&)())); + static_assert(__builtin_is_implicit_lifetime(void(*)())); + static_assert(__builtin_is_implicit_lifetime(decltype(nullptr))); + static_assert(__builtin_is_implicit_lifetime(int UserDeclaredDestructor::*)); + static_assert(__builtin_is_implicit_lifetime(int (UserDeclaredDestructor::*)())); + static_assert(__builtin_is_implicit_lifetime(int (UserDeclaredDestructor::*)() const)); + static_assert(__builtin_is_implicit_lifetime(int (UserDeclaredDestructor::*)() &)); + static_assert(__builtin_is_implicit_lifetime(int (UserDeclaredDestructor::*)() &&)); + static_assert(!__builtin_is_implicit_lifetime(IncompleteStruct)); + // expected-error@-1 {{incomplete type 'IncompleteStruct' used in type trait expression}} + static_assert(__builtin_is_implicit_lifetime(IncompleteStruct[])); + static_assert(__builtin_is_implicit_lifetime(IncompleteStruct[5])); + static_assert(__builtin_is_implicit_lifetime(UserDeclaredDestructor)); + static_assert(__builtin_is_implicit_lifetime(const UserDeclaredDestructor)); + 
static_assert(__builtin_is_implicit_lifetime(volatile UserDeclaredDestructor)); + static_assert(!__builtin_is_implicit_lifetime(UserProvidedDestructor)); + static_assert(!__builtin_is_implicit_lifetime(NoEligibleTrivialContructor)); + static_assert(__builtin_is_implicit_lifetime(OnlyDefaultConstructorIsTrivial)); + static_assert(__builtin_is_implicit_lifetime(AllContstructorsAreTrivial)); + static_assert(!__builtin_is_implicit_lifetime(InheritedNoEligibleTrivialConstructor)); + static_assert(__builtin_is_implicit_lifetime(InheritedOnlyDefaultConstructorIsTrivial)); + static_assert(__builtin_is_implicit_lifetime(InheritedAllContstructorsAreTrivial)); + static_assert(__builtin_is_implicit_lifetime(UserDeletedDestructorInAggregate)); + static_assert(!__builtin_is_implicit_lifetime(UserDeletedDestructorInNonAggregate)); + static_assert(__builtin_is_implicit_lifetime(DeletedDestructorViaBaseInAggregate) == __cplusplus >= 201703L); + static_assert(!__builtin_is_implicit_lifetime(DeletedDestructorViaBaseInNonAggregate)); +#if __cplusplus >= 202002L + static_assert(__builtin_is_implicit_lifetime(ConstrainedUserDeclaredDefaultConstructor)); + static_assert(!__builtin_is_implicit_lifetime(ConstrainedUserDeclaredDefaultConstructor)); + static_assert(!__builtin_is_implicit_lifetime(ConstrainedUserProvidedDestructor)); + static_assert(__builtin_is_implicit_lifetime(ConstrainedUserProvidedDestructor)); +#endif + + static_assert(__builtin_is_implicit_lifetime(__int128)); + static_assert(__builtin_is_implicit_lifetime(_BitInt(8))); + static_assert(__builtin_is_implicit_lifetime(_BitInt(128))); + static_assert(__builtin_is_implicit_lifetime(int[0])); + static_assert(__builtin_is_implicit_lifetime(StructWithFAM)); + static_assert(__builtin_is_implicit_lifetime(StructWithZeroSizedArray)); + static_assert(__builtin_is_implicit_lifetime(__fp16)); + static_assert(__builtin_is_implicit_lifetime(__bf16)); + static_assert(__builtin_is_implicit_lifetime(_Complex double)); + 
static_assert(__builtin_is_implicit_lifetime(float4)); + static_assert(__builtin_is_implicit_lifetime(align_value_int)); + static_assert(__builtin_is_implicit_lifetime(int[[clang::annotate_type("category2")]] *)); + static_assert(__builtin_is_implicit_lifetime(int __attribute__((btf_type_tag("user"))) *)); + static_assert(__builtin_is_implicit_lifetime(EnforceReadOnlyPlacement)); + static_assert(__builtin_is_implicit_lifetime(int __attribute__((noderef)) *)); + static_assert(__builtin_is_implicit_lifetime(TypeVisibility)); + static_assert(__builtin_is_implicit_lifetime(int * _Nonnull)); + static_assert(__builtin_is_implicit_lifetime(int * _Null_unspecified)); + static_assert(__builtin_is_implicit_lifetime(int * _Nullable)); + static_assert(!__builtin_is_implicit_lifetime(_Atomic int)); + // expected-error@-1 {{atomic types are not supported in '__builtin_is_implicit_lifetime'}} + static_assert(__builtin_is_implicit_lifetime(int * __restrict)); +} + void is_signed() { //static_assert(__is_signed(char)); diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-field-attr.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-field-attr.cpp new file mode 100644 index 00000000000000..0ba605475925b9 --- /dev/null +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-field-attr.cpp @@ -0,0 +1,190 @@ +// RUN: %clang_cc1 -std=c++20 -Wunsafe-buffer-usage \ +// RUN: -fsafe-buffer-usage-suggestions -verify %s + +using size_t = __typeof(sizeof(int)); + +namespace std { + class type_info; + class bad_cast; + class bad_typeid; + + template class span { + + private: + T *elements; + size_t size_; + + public: + span(T *, size_t){} + + constexpr T* data() const noexcept { + return elements; + } + + constexpr size_t size() const noexcept { + return size_; + } + + }; +} + +struct A { + [[clang::unsafe_buffer_usage]] + int *ptr; + + size_t sz; +}; + +struct B { + A a; + + [[clang::unsafe_buffer_usage]] + int buf[]; +}; + +struct D { + [[clang::unsafe_buffer_usage]] + int *ptr, *ptr2; + + 
[[clang::unsafe_buffer_usage]] + int buf[10]; + + size_t sz; + +}; + +void foo(int *ptr); + +void foo_safe(std::span sp); + +int* test_atribute_struct(A a) { + int b = *(a.ptr); //expected-warning{{field 'ptr' prone to unsafe buffer manipulation}} + a.sz++; + // expected-warning@+1{{unsafe pointer arithmetic}} + return a.ptr++; //expected-warning{{field 'ptr' prone to unsafe buffer manipulation}} +} + +void test_attribute_field_deref_chain(B b) { + int *ptr = b.a.ptr;//expected-warning{{field 'ptr' prone to unsafe buffer manipulation}} + foo(b.buf); //expected-warning{{field 'buf' prone to unsafe buffer manipulation}} +} + +void test_writes_from_span(std::span sp) { + A a; + a.ptr = sp.data(); //expected-warning{{field 'ptr' prone to unsafe buffer manipulation}} + a.sz = sp.size(); + + a.ptr = nullptr; // expected-warning{{field 'ptr' prone to unsafe buffer manipulation}} +} + +void test_reads_to_span(A a, A b) { + //expected-warning@+1{{the two-parameter std::span construction is unsafe as it can introduce mismatch between buffer size and the bound information}} + std::span sp {a.ptr, a.sz}; //expected-warning{{field 'ptr' prone to unsafe buffer manipulation}} + + // expected-warning@+1 3{{field 'ptr' prone to unsafe buffer manipulation}} + if(a.ptr != nullptr && a.ptr != b.ptr) { + foo_safe(sp); + } + +} + +void test_attribute_multiple_fields (D d) { + int *p =d.ptr; //expected-warning{{field 'ptr' prone to unsafe buffer manipulation}} + p = d.ptr2; //expected-warning{{field 'ptr2' prone to unsafe buffer manipulation}} + + p = d.buf; //expected-warning{{field 'buf' prone to unsafe buffer manipulation}} + + int v = d.buf[0]; //expected-warning{{field 'buf' prone to unsafe buffer manipulation}} + + //expected-warning@+1{{unsafe buffer access}} + v = d.buf[5]; //expected-warning{{field 'buf' prone to unsafe buffer manipulation}} +} + +template +struct TemplateArray { + [[clang::unsafe_buffer_usage]] + T *buf; + + [[clang::unsafe_buffer_usage]] + size_t sz; +}; + + 
+void test_struct_template (TemplateArray t) { + int *p = t.buf; //expected-warning{{field 'buf' prone to unsafe buffer manipulation}} + size_t s = t.sz; //expected-warning{{field 'sz' prone to unsafe buffer manipulation}} +} + +class R { + [[clang::unsafe_buffer_usage]] + int *array; + + public: + int* getArray() { + return array; //expected-warning{{field 'array' prone to unsafe buffer manipulation}} + } + + void setArray(int *arr) { + array = arr; //expected-warning{{field 'array' prone to unsafe buffer manipulation}} + } +}; + +template +class Q { + [[clang::unsafe_buffer_usage]] + P *array; + + public: + P* getArray() { + return array; //expected-warning{{field 'array' prone to unsafe buffer manipulation}} + } + + void setArray(P *arr) { + array = arr; //expected-warning{{field 'array' prone to unsafe buffer manipulation}} + } +}; + +void test_class_template(Q q) { + q.getArray(); + q.setArray(nullptr); +} + +struct AnonSFields { + struct { + [[clang::unsafe_buffer_usage]] + int a; + }; +}; + +void test_anon_struct_fields(AnonSFields anon) { + int val = anon.a; //expected-warning{{field 'a' prone to unsafe buffer manipulation}} +} + +union Union { + [[clang::unsafe_buffer_usage]] + int *ptr1; + + int ptr2; +}; + +struct C { + Union ptr; +}; + +void test_attribute_union(C c) { + int *p = c.ptr.ptr1; //expected-warning{{field 'ptr1' prone to unsafe buffer manipulation}} + + int address = c.ptr.ptr2; +} + +struct AnonFields2 { + [[clang::unsafe_buffer_usage]] + struct { + int a; + }; +}; + +void test_anon_struct(AnonFields2 af) { + int val = af.a; // No warning here, as the attribute is not explicitly attached to field 'a' + val++; +} diff --git a/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl index c72c8b3c222b6b..fc48c9b2589f7e 100644 --- a/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 
-finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify -verify-ignore-unexpected void test_too_few_arg() { diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp index a4b42cad79abd4..a98ca3939222bd 100644 --- a/clang/test/SemaTemplate/concepts.cpp +++ b/clang/test/SemaTemplate/concepts.cpp @@ -1006,7 +1006,14 @@ template concept Irrelevant = false; template -concept ErrorRequires = requires(ErrorRequires auto x) { x; }; // expected-error {{unknown type name 'ErrorRequires'}} +concept ErrorRequires = requires(ErrorRequires auto x) { x; }; +// expected-error@-1 {{a concept definition cannot refer to itself}} \ +// expected-error@-1 {{'auto' not allowed in requires expression parameter}} \ +// expected-note@-1 {{declared here}} + +template concept C1 = C1 && [](C1 auto) -> C1 auto {}; +//expected-error@-1 4{{a concept definition cannot refer to itself}} \ +//expected-note@-1 4{{declared here}} template void aaa(T t) // expected-note {{candidate template ignored: constraints not satisfied}} requires (False || False) || False {} // expected-note 3 {{'int' does not satisfy 'False'}} diff --git a/clang/test/SemaTemplate/temp_explicit.cpp b/clang/test/SemaTemplate/temp_explicit.cpp index 0bb0cfad61fdb0..4612e4a57e90e0 100644 --- a/clang/test/SemaTemplate/temp_explicit.cpp +++ b/clang/test/SemaTemplate/temp_explicit.cpp @@ -1,6 +1,7 @@ // RUN: %clang_cc1 -fsyntax-only -verify -pedantic -Wc++11-compat %s // RUN: %clang_cc1 -fsyntax-only -verify -pedantic -Wc++11-compat -std=c++98 %s // RUN: %clang_cc1 -fsyntax-only -verify -pedantic -std=c++11 %s +// RUN: %clang_cc1 -fsyntax-only -verify -pedantic -std=c++20 %s // // Tests explicit instantiation of templates. 
template class X0 { }; @@ -128,11 +129,15 @@ struct Foo // expected-note{{header not required for explicitly-specialized {}; }; -template <> // expected-warning{{extraneous template parameter list}} +template <> // expected-error{{extraneous template parameter list}} template <> struct Foo::Bar {}; +#if __cplusplus >= 202002L +template<> void f(auto); // expected-error{{extraneous template parameter list}} +#endif + namespace N1 { template struct X7 { }; // expected-note{{here}} diff --git a/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td b/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td index a97190b7a614cc..ef1a7542e49502 100644 --- a/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td +++ b/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td @@ -39,6 +39,9 @@ def library_S : Separate<["--", "-"], "library">, Flags<[HelpHidden]>, def library_EQ : Joined<["--", "-"], "library=">, Flags<[HelpHidden]>, Alias; +def plugin : Joined<["--", "-"], "plugin">, + Flags<[HelpHidden, WrapperOnlyOption]>; + def arch : Separate<["--", "-"], "arch">, HelpText<"Specify the 'sm_' name of the target architecture.">; def : Joined<["--", "-"], "plugin-opt=mcpu=">, diff --git a/clang/www/analyzer/potential_checkers.html b/clang/www/analyzer/potential_checkers.html index ee9ba164387f34..ad789b83e71b71 100644 --- a/clang/www/analyzer/potential_checkers.html +++ b/clang/www/analyzer/potential_checkers.html @@ -90,8 +90,6 @@

memory

memory.NegativeArraySize (C, C++)
'n' is used to specify the buffer size may be negative. -
Note: possibly an enhancement to -alpha.security.MallocOverflow.

Source: CWE-20, Example 2.

diff --git a/compiler-rt/lib/asan/asan_allocator.cpp b/compiler-rt/lib/asan/asan_allocator.cpp index 9e66f77217ec6b..e041861edaf0b7 100644 --- a/compiler-rt/lib/asan/asan_allocator.cpp +++ b/compiler-rt/lib/asan/asan_allocator.cpp @@ -576,15 +576,8 @@ struct Allocator { } AsanThread *t = GetCurrentThread(); - void *allocated; - if (t) { - AllocatorCache *cache = GetAllocatorCache(&t->malloc_storage()); - allocated = allocator.Allocate(cache, needed_size, 8); - } else { - SpinMutexLock l(&fallback_mutex); - AllocatorCache *cache = &fallback_allocator_cache; - allocated = allocator.Allocate(cache, needed_size, 8); - } + void *allocated = allocator.Allocate( + GetAllocatorCache(&t->malloc_storage()), needed_size, 8); if (UNLIKELY(!allocated)) { SetAllocatorOutOfMemory(); if (AllocatorMayReturnNull()) diff --git a/compiler-rt/lib/msan/msan_allocator.cpp b/compiler-rt/lib/msan/msan_allocator.cpp index d7d4967c949859..f478b9979f2daa 100644 --- a/compiler-rt/lib/msan/msan_allocator.cpp +++ b/compiler-rt/lib/msan/msan_allocator.cpp @@ -199,15 +199,8 @@ static void *MsanAllocate(BufferedStackTrace *stack, uptr size, uptr alignment, ReportRssLimitExceeded(stack); } MsanThread *t = GetCurrentThread(); - void *allocated; - if (t) { - AllocatorCache *cache = GetAllocatorCache(&t->malloc_storage()); - allocated = allocator.Allocate(cache, size, alignment); - } else { - SpinMutexLock l(&fallback_mutex); - AllocatorCache *cache = &fallback_allocator_cache; - allocated = allocator.Allocate(cache, size, alignment); - } + void *allocated = allocator.Allocate(GetAllocatorCache(&t->malloc_storage()), + size, alignment); if (UNLIKELY(!allocated)) { SetAllocatorOutOfMemory(); if (AllocatorMayReturnNull()) diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h index e3dc8d4ef8247c..27f8697db7838f 100644 --- a/compiler-rt/lib/scudo/standalone/secondary.h +++ b/compiler-rt/lib/scudo/standalone/secondary.h @@ -184,14 +184,6 @@ template class 
NonZeroLengthArray { template class MapAllocatorCache { public: - typedef enum { COMMITTED = 0, DECOMMITTED = 1, NONE } EntryListT; - - // TODO: Refactor the intrusive list to support non-pointer link type - typedef struct { - u16 Head; - u16 Tail; - } ListInfo; - void getStats(ScopedString *Str) { ScopedLock L(Mutex); uptr Integral; @@ -209,18 +201,13 @@ class MapAllocatorCache { SuccessfulRetrieves, CallsToRetrieve, Integral, Fractional); Str->append("Cache Entry Info (Most Recent -> Least Recent):\n"); - auto printList = [&](EntryListT ListType) REQUIRES(Mutex) { - for (u32 I = EntryLists[ListType].Head; I != CachedBlock::InvalidEntry; - I = Entries[I].Next) { - CachedBlock &Entry = Entries[I]; - Str->append(" StartBlockAddress: 0x%zx, EndBlockAddress: 0x%zx, " - "BlockSize: %zu %s\n", - Entry.CommitBase, Entry.CommitBase + Entry.CommitSize, - Entry.CommitSize, Entry.Time == 0 ? "[R]" : ""); - } - }; - printList(COMMITTED); - printList(DECOMMITTED); + for (u32 I = LRUHead; I != CachedBlock::InvalidEntry; I = Entries[I].Next) { + CachedBlock &Entry = Entries[I]; + Str->append(" StartBlockAddress: 0x%zx, EndBlockAddress: 0x%zx, " + "BlockSize: %zu %s\n", + Entry.CommitBase, Entry.CommitBase + Entry.CommitSize, + Entry.CommitSize, Entry.Time == 0 ? "[R]" : ""); + } } // Ensure the default maximum specified fits the array. 
@@ -244,10 +231,8 @@ class MapAllocatorCache { setOption(Option::ReleaseInterval, static_cast(ReleaseToOsInterval)); // The cache is initially empty - EntryLists[COMMITTED].Head = CachedBlock::InvalidEntry; - EntryLists[COMMITTED].Tail = CachedBlock::InvalidEntry; - EntryLists[DECOMMITTED].Head = CachedBlock::InvalidEntry; - EntryLists[DECOMMITTED].Tail = CachedBlock::InvalidEntry; + LRUHead = CachedBlock::InvalidEntry; + LRUTail = CachedBlock::InvalidEntry; // Available entries will be retrieved starting from the beginning of the // Entries array @@ -265,6 +250,7 @@ class MapAllocatorCache { const s32 Interval = atomic_load_relaxed(&ReleaseToOsIntervalMs); u64 Time; CachedBlock Entry; + Entry.CommitBase = CommitBase; Entry.CommitSize = CommitSize; Entry.BlockBegin = BlockBegin; @@ -326,27 +312,18 @@ class MapAllocatorCache { Entry = PrevEntry; } - // All excess entries are evicted from the cache. - // DECOMMITTED entries, being older than the COMMITTED - // entries, are evicted first in least recently used (LRU) - // fashioned followed by the COMMITTED entries + // All excess entries are evicted from the cache while (needToEvict()) { - EntryListT EvictionListType; - if (EntryLists[DECOMMITTED].Tail == CachedBlock::InvalidEntry) - EvictionListType = COMMITTED; - else - EvictionListType = DECOMMITTED; // Save MemMaps of evicted entries to perform unmap outside of lock - EvictionMemMaps.push_back( - Entries[EntryLists[EvictionListType].Tail].MemMap); - remove(EntryLists[EvictionListType].Tail, EvictionListType); + EvictionMemMaps.push_back(Entries[LRUTail].MemMap); + remove(LRUTail); } - insert(Entry, (Entry.Time == 0) ? 
DECOMMITTED : COMMITTED); + insert(Entry); if (OldestTime == 0) OldestTime = Entry.Time; - } while (0); // ScopedLock L(Mutex); + } while (0); for (MemMapT &EvictMemMap : EvictionMemMaps) unmapCallBack(EvictMemMap); @@ -363,14 +340,17 @@ class MapAllocatorCache { // 10% of the requested size proved to be the optimal choice for // retrieving cached blocks after testing several options. constexpr u32 FragmentedBytesDivisor = 10; + bool Found = false; CachedBlock Entry; - uptr OptimalFitIndex = CachedBlock::InvalidEntry; - uptr MinDiff = UINTPTR_MAX; - EntryListT OptimalFitListType = NONE; EntryHeaderPos = 0; - - auto FindAvailableEntry = [&](EntryListT ListType) REQUIRES(Mutex) { - for (uptr I = EntryLists[ListType].Head; I != CachedBlock::InvalidEntry; + { + ScopedLock L(Mutex); + CallsToRetrieve++; + if (EntriesCount == 0) + return {}; + u32 OptimalFitIndex = 0; + uptr MinDiff = UINTPTR_MAX; + for (u32 I = LRUHead; I != CachedBlock::InvalidEntry; I = Entries[I].Next) { const uptr CommitBase = Entries[I].CommitBase; const uptr CommitSize = Entries[I].CommitSize; @@ -380,48 +360,34 @@ class MapAllocatorCache { if (HeaderPos > CommitBase + CommitSize) continue; if (HeaderPos < CommitBase || - AllocPos > CommitBase + PageSize * MaxUnusedCachePages) + AllocPos > CommitBase + PageSize * MaxUnusedCachePages) { continue; - + } + Found = true; const uptr Diff = HeaderPos - CommitBase; - // immediately use a cached block if it's size is close enough to - // the requested size. + // immediately use a cached block if it's size is close enough to the + // requested size. 
const uptr MaxAllowedFragmentedBytes = (CommitBase + CommitSize - HeaderPos) / FragmentedBytesDivisor; if (Diff <= MaxAllowedFragmentedBytes) { OptimalFitIndex = I; EntryHeaderPos = HeaderPos; - OptimalFitListType = ListType; - return true; + break; } - // keep track of the smallest cached block // that is greater than (AllocSize + HeaderSize) if (Diff > MinDiff) continue; OptimalFitIndex = I; MinDiff = Diff; - OptimalFitListType = ListType; EntryHeaderPos = HeaderPos; } - return (OptimalFitIndex != CachedBlock::InvalidEntry); - }; - - { - ScopedLock L(Mutex); - CallsToRetrieve++; - if (EntriesCount == 0) - return {}; - - // Prioritize valid fit from COMMITTED entries over - // optimal fit from DECOMMITTED entries - if (!FindAvailableEntry(COMMITTED) && !FindAvailableEntry(DECOMMITTED)) - return {}; - - Entry = Entries[OptimalFitIndex]; - remove(OptimalFitIndex, OptimalFitListType); - SuccessfulRetrieves++; - } // ScopedLock L(Mutex); + if (Found) { + Entry = Entries[OptimalFitIndex]; + remove(OptimalFitIndex); + SuccessfulRetrieves++; + } + } return Entry; } @@ -466,15 +432,10 @@ class MapAllocatorCache { Quarantine[I].invalidate(); } } - auto disableLists = [&](EntryListT EntryList) REQUIRES(Mutex) { - for (u32 I = EntryLists[EntryList].Head; I != CachedBlock::InvalidEntry; - I = Entries[I].Next) { - Entries[I].MemMap.setMemoryPermission(Entries[I].CommitBase, - Entries[I].CommitSize, 0); - } - }; - disableLists(COMMITTED); - disableLists(DECOMMITTED); + for (u32 I = LRUHead; I != CachedBlock::InvalidEntry; I = Entries[I].Next) { + Entries[I].MemMap.setMemoryPermission(Entries[I].CommitBase, + Entries[I].CommitSize, 0); + } QuarantinePos = -1U; } @@ -489,7 +450,7 @@ class MapAllocatorCache { return (EntriesCount >= atomic_load_relaxed(&MaxEntriesCount)); } - void insert(const CachedBlock &Entry, EntryListT ListType) REQUIRES(Mutex) { + void insert(const CachedBlock &Entry) REQUIRES(Mutex) { DCHECK_LT(EntriesCount, atomic_load_relaxed(&MaxEntriesCount)); // Cache 
should be populated with valid entries when not empty @@ -498,86 +459,66 @@ class MapAllocatorCache { u32 FreeIndex = AvailableHead; AvailableHead = Entries[AvailableHead].Next; + if (EntriesCount == 0) { + LRUTail = static_cast(FreeIndex); + } else { + // Check list order + if (EntriesCount > 1) + DCHECK_GE(Entries[LRUHead].Time, Entries[Entries[LRUHead].Next].Time); + Entries[LRUHead].Prev = static_cast(FreeIndex); + } + Entries[FreeIndex] = Entry; - pushFront(FreeIndex, ListType); + Entries[FreeIndex].Next = LRUHead; + Entries[FreeIndex].Prev = CachedBlock::InvalidEntry; + LRUHead = static_cast(FreeIndex); EntriesCount++; - if (Entries[EntryLists[ListType].Head].Next != CachedBlock::InvalidEntry) { - DCHECK_GE(Entries[EntryLists[ListType].Head].Time, - Entries[Entries[EntryLists[ListType].Head].Next].Time); - } // Availability stack should not have available entries when all entries // are in use if (EntriesCount == Config::getEntriesArraySize()) DCHECK_EQ(AvailableHead, CachedBlock::InvalidEntry); } - // Joins the entries adjacent to Entries[I], effectively - // unlinking Entries[I] from the list - void unlink(uptr I, EntryListT ListType) REQUIRES(Mutex) { - if (I == EntryLists[ListType].Head) - EntryLists[ListType].Head = Entries[I].Next; + void remove(uptr I) REQUIRES(Mutex) { + DCHECK(Entries[I].isValid()); + + Entries[I].invalidate(); + + if (I == LRUHead) + LRUHead = Entries[I].Next; else Entries[Entries[I].Prev].Next = Entries[I].Next; - if (I == EntryLists[ListType].Tail) - EntryLists[ListType].Tail = Entries[I].Prev; + if (I == LRUTail) + LRUTail = Entries[I].Prev; else Entries[Entries[I].Next].Prev = Entries[I].Prev; - } - - // Invalidates Entries[I], removes Entries[I] from list, and pushes - // Entries[I] onto the stack of available entries - void remove(uptr I, EntryListT ListType) REQUIRES(Mutex) { - DCHECK(Entries[I].isValid()); - - Entries[I].invalidate(); - unlink(I, ListType); Entries[I].Next = AvailableHead; AvailableHead = static_cast(I); 
EntriesCount--; // Cache should not have valid entries when not empty if (EntriesCount == 0) { - DCHECK_EQ(EntryLists[COMMITTED].Head, CachedBlock::InvalidEntry); - DCHECK_EQ(EntryLists[COMMITTED].Tail, CachedBlock::InvalidEntry); - DCHECK_EQ(EntryLists[DECOMMITTED].Head, CachedBlock::InvalidEntry); - DCHECK_EQ(EntryLists[DECOMMITTED].Tail, CachedBlock::InvalidEntry); + DCHECK_EQ(LRUHead, CachedBlock::InvalidEntry); + DCHECK_EQ(LRUTail, CachedBlock::InvalidEntry); } } - inline void pushFront(uptr I, EntryListT ListType) REQUIRES(Mutex) { - if (EntryLists[ListType].Tail == CachedBlock::InvalidEntry) - EntryLists[ListType].Tail = static_cast(I); - else - Entries[EntryLists[ListType].Head].Prev = static_cast(I); - - Entries[I].Next = EntryLists[ListType].Head; - Entries[I].Prev = CachedBlock::InvalidEntry; - EntryLists[ListType].Head = static_cast(I); - } - void empty() { MemMapT MapInfo[Config::getEntriesArraySize()]; uptr N = 0; { ScopedLock L(Mutex); - auto emptyList = [&](EntryListT ListType) REQUIRES(Mutex) { - for (uptr I = EntryLists[ListType].Head; - I != CachedBlock::InvalidEntry;) { - uptr ToRemove = I; - I = Entries[I].Next; - MapInfo[N] = Entries[ToRemove].MemMap; - remove(ToRemove, ListType); - N++; - } - }; - emptyList(COMMITTED); - emptyList(DECOMMITTED); + for (uptr I = 0; I < Config::getEntriesArraySize(); I++) { + if (!Entries[I].isValid()) + continue; + MapInfo[N] = Entries[I].MemMap; + remove(I); + N++; + } EntriesCount = 0; - for (uptr I = 0; I < Config::getEntriesArraySize(); I++) - DCHECK(!Entries[I].isValid()); } for (uptr I = 0; I < N; I++) { MemMapT &MemMap = MapInfo[I]; @@ -604,14 +545,8 @@ class MapAllocatorCache { OldestTime = 0; for (uptr I = 0; I < Config::getQuarantineSize(); I++) releaseIfOlderThan(Quarantine[I], Time); - for (u16 I = EntryLists[COMMITTED].Head; I != CachedBlock::InvalidEntry; - I = Entries[I].Next) { - if (Entries[I].Time && Entries[I].Time <= Time) { - unlink(I, COMMITTED); - pushFront(I, DECOMMITTED); - } + for 
(uptr I = 0; I < Config::getEntriesArraySize(); I++) releaseIfOlderThan(Entries[I], Time); - } } HybridMutex Mutex; @@ -628,12 +563,10 @@ class MapAllocatorCache { NonZeroLengthArray Quarantine GUARDED_BY(Mutex) = {}; - // EntryLists stores the head and tail indices of all - // lists being used to store valid cache entries. - // Currently there are lists storing COMMITTED and DECOMMITTED entries. - // COMMITTED entries have memory chunks that have not been released to the OS - // DECOMMITTED entries have memory chunks that have been released to the OS - ListInfo EntryLists[2] GUARDED_BY(Mutex) = {}; + // The LRUHead of the cache is the most recently used cache entry + u16 LRUHead GUARDED_BY(Mutex) = 0; + // The LRUTail of the cache is the least recently used cache entry + u16 LRUTail GUARDED_BY(Mutex) = 0; // The AvailableHead is the top of the stack of available entries u16 AvailableHead GUARDED_BY(Mutex) = 0; }; @@ -773,7 +706,6 @@ MapAllocator::tryAllocateFromCache(const Options &Options, uptr Size, } return Ptr; } - // As with the Primary, the size passed to this function includes any desired // alignment, so that the frontend can align the user allocation. 
The hint // parameter allows us to unmap spurious memory when dealing with larger diff --git a/compiler-rt/test/hwasan/TestCases/hwasan_symbolize_stack_uas.cpp b/compiler-rt/test/hwasan/TestCases/hwasan_symbolize_stack_uas.cpp index 62caf1bd25fb03..6a7d2e9228e639 100644 --- a/compiler-rt/test/hwasan/TestCases/hwasan_symbolize_stack_uas.cpp +++ b/compiler-rt/test/hwasan/TestCases/hwasan_symbolize_stack_uas.cpp @@ -1,5 +1,6 @@ -// RUN: %clang_hwasan -Wl,--build-id -g %s -o %t -// RUN: %env_hwasan_opts=symbolize=0 not %run %t 2>&1 | hwasan_symbolize --symbols $(dirname %t) --index | FileCheck %s +// RUN: rm -rf %t; mkdir %t +// RUN: %clang_hwasan -Wl,--build-id -g %s -o %t/hwasan_uas +// RUN: %env_hwasan_opts=symbolize=0 not %run %t/hwasan_uas 2>&1 | hwasan_symbolize --symbols %t --index | FileCheck %s // This doesn't work on X86, because that uses instrument-with-calls which // disables frame records. diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp index d5d81280e0b44c..d0be7f4fa87899 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp @@ -1,5 +1,7 @@ // RUN: %clangxx %s -o %t && %run %t %p +// UNSUPPORTED: android + #include #include #include @@ -25,20 +27,18 @@ #endif int main() { - int res; res = prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, 0, 0, 0); if (res < 0) { assert(errno == EINVAL || errno == ENODEV); - return 0; - } - - uint64_t cookie = 0; - res = prctl(PR_SCHED_CORE, PR_SCHED_CORE_GET, 0, 0, &cookie); - if (res < 0) { - assert(errno == EINVAL); } else { - assert(cookie != 0); + uint64_t cookie = 0; + res = prctl(PR_SCHED_CORE, PR_SCHED_CORE_GET, 0, 0, &cookie); + if (res < 0) { + assert(errno == EINVAL); + } else { + assert(cookie != 0); + } } char invname[81], vlname[] = "prctl"; diff --git a/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h 
b/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h index 91b2025176770d..c820b83834de68 100644 --- a/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h +++ b/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h @@ -107,6 +107,10 @@ class ConvertFIRToLLVMPattern : public mlir::ConvertToLLVMPattern { mlir::Value box, mlir::ConversionPatternRewriter &rewriter) const; + mlir::Value getExtraFromBox(mlir::Location loc, TypePair boxTy, + mlir::Value box, + mlir::ConversionPatternRewriter &rewriter) const; + // Get the element type given an LLVM type that is of the form // (array|struct|vector)+ and the provided indexes. mlir::Type getBoxEleTy(mlir::Type type, diff --git a/flang/include/flang/Optimizer/Support/InitFIR.h b/flang/include/flang/Optimizer/Support/InitFIR.h index 48cc1cbc645684..04a5dd323e5508 100644 --- a/flang/include/flang/Optimizer/Support/InitFIR.h +++ b/flang/include/flang/Optimizer/Support/InitFIR.h @@ -46,6 +46,7 @@ namespace fir::support { inline void registerNonCodegenDialects(mlir::DialectRegistry ®istry) { registry.insert(); mlir::func::registerInlinerExtension(registry); + mlir::LLVM::registerInlinerInterface(registry); } /// Register all the dialects used by flang. 
diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h index f0bfc1548e6458..4527c9f18fa054 100644 --- a/flang/include/flang/Runtime/CUDA/allocator.h +++ b/flang/include/flang/Runtime/CUDA/allocator.h @@ -13,11 +13,10 @@ #include "flang/Runtime/entry-names.h" #define CUDA_REPORT_IF_ERROR(expr) \ - [](CUresult result) { \ - if (!result) \ + [](cudaError_t err) { \ + if (err == cudaSuccess) \ return; \ - const char *name = nullptr; \ - cuGetErrorName(result, &name); \ + const char *name = cudaGetErrorName(err); \ if (!name) \ name = ""; \ Terminator terminator{__FILE__, __LINE__}; \ diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 95c55805dcabb3..64b581e8910d07 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1371,6 +1371,9 @@ genLoopNestOp(lower::AbstractConverter &converter, lower::SymMap &symTable, llvm::ArrayRef wrapperSyms, llvm::ArrayRef wrapperArgs, llvm::omp::Directive directive, DataSharingProcessor &dsp) { + assert(wrapperSyms.size() == wrapperArgs.size() && + "Number of symbols and wrapper block arguments must match"); + auto ivCallback = [&](mlir::Operation *op) { genLoopVars(op, converter, loc, iv, wrapperSyms, wrapperArgs); return llvm::SmallVector(iv); @@ -2083,8 +2086,6 @@ static void genCompositeDistributeSimd( llvm::concat(distributeOp.getRegion().getArguments(), simdOp.getRegion().getArguments())); - assert(wrapperArgs.empty() && - "Block args for omp.simd and omp.distribute currently not expected"); genLoopNestOp(converter, symTable, semaCtx, eval, loc, queue, item, loopNestClauseOps, iv, /*wrapperSyms=*/{}, wrapperArgs, llvm::omp::Directive::OMPD_distribute_simd, dsp); @@ -2132,8 +2133,6 @@ static void genCompositeDoSimd(lower::AbstractConverter &converter, auto wrapperArgs = llvm::to_vector(llvm::concat( wsloopOp.getRegion().getArguments(), simdOp.getRegion().getArguments())); - assert(wsloopReductionSyms.size() == 
wrapperArgs.size() && - "Number of symbols and wrapper block arguments must match"); genLoopNestOp(converter, symTable, semaCtx, eval, loc, queue, item, loopNestClauseOps, iv, wsloopReductionSyms, wrapperArgs, llvm::omp::Directive::OMPD_do_simd, dsp); diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 7934f1fdad2a0d..1713cf98a8b961 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -1227,7 +1227,8 @@ struct EmboxCommonConversion : public fir::FIROpConversion { mlir::ConversionPatternRewriter &rewriter, unsigned rank, mlir::Value eleSize, mlir::Value cfiTy, mlir::Value typeDesc, - int allocatorIdx = kDefaultAllocator) const { + int allocatorIdx = kDefaultAllocator, + mlir::Value extraField = {}) const { auto llvmBoxTy = this->lowerTy().convertBoxTypeAsStruct(boxTy, rank); bool isUnlimitedPolymorphic = fir::isUnlimitedPolymorphicType(boxTy); bool useInputType = fir::isPolymorphicType(boxTy) || isUnlimitedPolymorphic; @@ -1246,16 +1247,40 @@ struct EmboxCommonConversion : public fir::FIROpConversion { const bool hasAddendum = fir::boxHasAddendum(boxTy); - // Descriptor used to set the correct value of the extra field. - Fortran::runtime::StaticDescriptor<0> staticDescriptor; - Fortran::runtime::Descriptor &desc{staticDescriptor.descriptor()}; - desc.raw().extra = 0; - desc.SetAllocIdx(allocatorIdx); - if (hasAddendum) - desc.SetHasAddendum(); - descriptor = - insertField(rewriter, loc, descriptor, {kExtraPosInBox}, - this->genI32Constant(loc, rewriter, desc.raw().extra)); + if (extraField) { + // Make sure to set the addendum presence flag according to the + // destination box. 
+ if (hasAddendum) { + auto maskAttr = mlir::IntegerAttr::get( + rewriter.getIntegerType(8, /*isSigned=*/false), + llvm::APInt(8, (uint64_t)_CFI_ADDENDUM_FLAG, /*isSigned=*/false)); + mlir::LLVM::ConstantOp mask = rewriter.create( + loc, rewriter.getI8Type(), maskAttr); + extraField = rewriter.create(loc, extraField, mask); + } else { + auto maskAttr = mlir::IntegerAttr::get( + rewriter.getIntegerType(8, /*isSigned=*/false), + llvm::APInt(8, (uint64_t)~_CFI_ADDENDUM_FLAG, /*isSigned=*/false)); + mlir::LLVM::ConstantOp mask = rewriter.create( + loc, rewriter.getI8Type(), maskAttr); + extraField = rewriter.create(loc, extraField, mask); + } + // Extra field value is provided so just use it. + descriptor = + insertField(rewriter, loc, descriptor, {kExtraPosInBox}, extraField); + } else { + // Compute the value of the extra field based on allocator_idx and + // addendum present using a Descriptor object. + Fortran::runtime::StaticDescriptor<0> staticDescriptor; + Fortran::runtime::Descriptor &desc{staticDescriptor.descriptor()}; + desc.raw().extra = 0; + desc.SetAllocIdx(allocatorIdx); + if (hasAddendum) + desc.SetHasAddendum(); + descriptor = + insertField(rewriter, loc, descriptor, {kExtraPosInBox}, + this->genI32Constant(loc, rewriter, desc.raw().extra)); + } if (hasAddendum) { unsigned typeDescFieldId = getTypeDescFieldId(boxTy); @@ -1325,6 +1350,7 @@ struct EmboxCommonConversion : public fir::FIROpConversion { loc, rewriter, useInputType ? inputType : boxTy.getEleTy(), typeparams); mlir::Value typeDesc; + mlir::Value extraField; // When emboxing to a polymorphic box, get the type descriptor, type code // and element size from the source box if any. 
if (fir::isPolymorphicType(boxTy) && sourceBox) { @@ -1336,11 +1362,13 @@ struct EmboxCommonConversion : public fir::FIROpConversion { sourceBox, rewriter); cfiTy = this->getValueFromBox(loc, sourceBoxTyPair, sourceBox, cfiTy.getType(), rewriter, kTypePosInBox); + extraField = + this->getExtraFromBox(loc, sourceBoxTyPair, sourceBox, rewriter); } auto mod = box->template getParentOfType(); mlir::Value descriptor = populateDescriptor(loc, mod, boxTy, inputType, rewriter, rank, eleSize, - cfiTy, typeDesc, allocatorIdx); + cfiTy, typeDesc, allocatorIdx, extraField); return {boxTy, descriptor, eleSize}; } @@ -1380,10 +1408,14 @@ struct EmboxCommonConversion : public fir::FIROpConversion { rewriter); } + mlir::Value extraField = + this->getExtraFromBox(loc, inputBoxTyPair, loweredBox, rewriter); + auto mod = box->template getParentOfType(); mlir::Value descriptor = populateDescriptor(loc, mod, boxTy, box.getBox().getType(), rewriter, - rank, eleSize, cfiTy, typeDesc); + rank, eleSize, cfiTy, typeDesc, + /*allocatorIdx=*/kDefaultAllocator, extraField); return {boxTy, descriptor, eleSize}; } diff --git a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp index 5db9d8ac528ebf..12021deb4bd97a 100644 --- a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp +++ b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp @@ -195,6 +195,14 @@ mlir::Value ConvertFIRToLLVMPattern::getRankFromBox( return getValueFromBox(loc, boxTy, box, resultTy, rewriter, kRankPosInBox); } +/// Read the extra field from a fir.box. +mlir::Value ConvertFIRToLLVMPattern::getExtraFromBox( + mlir::Location loc, TypePair boxTy, mlir::Value box, + mlir::ConversionPatternRewriter &rewriter) const { + mlir::Type resultTy = getBoxEleTy(boxTy.llvm, {kExtraPosInBox}); + return getValueFromBox(loc, boxTy, box, resultTy, rewriter, kExtraPosInBox); +} + // Get the element type given an LLVM type that is of the form // (array|struct|vector)+ and the provided indexes. 
mlir::Type ConvertFIRToLLVMPattern::getBoxEleTy( diff --git a/flang/runtime/CUDA/CMakeLists.txt b/flang/runtime/CUDA/CMakeLists.txt index 88243536139e46..c9a20ebcc82e07 100644 --- a/flang/runtime/CUDA/CMakeLists.txt +++ b/flang/runtime/CUDA/CMakeLists.txt @@ -7,14 +7,20 @@ #===------------------------------------------------------------------------===# include_directories(${CUDAToolkit_INCLUDE_DIRS}) -find_library(CUDA_RUNTIME_LIBRARY cuda HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} REQUIRED) add_flang_library(CufRuntime allocator.cpp descriptor.cpp ) + +if (BUILD_SHARED_LIBS) + set(CUDA_RT_TARGET CUDA::cudart) +else() + set(CUDA_RT_TARGET CUDA::cudart_static) +endif() + target_link_libraries(CufRuntime PRIVATE FortranRuntime - ${CUDA_RUNTIME_LIBRARY} + ${CUDA_RT_TARGET} ) diff --git a/flang/runtime/CUDA/allocator.cpp b/flang/runtime/CUDA/allocator.cpp index bd657b800c61e8..d4a473d58e86cd 100644 --- a/flang/runtime/CUDA/allocator.cpp +++ b/flang/runtime/CUDA/allocator.cpp @@ -15,7 +15,7 @@ #include "flang/ISO_Fortran_binding_wrapper.h" #include "flang/Runtime/allocator-registry.h" -#include "cuda.h" +#include "cuda_runtime.h" namespace Fortran::runtime::cuda { extern "C" { @@ -34,32 +34,28 @@ void RTDEF(CUFRegisterAllocator)() { void *CUFAllocPinned(std::size_t sizeInBytes) { void *p; - CUDA_REPORT_IF_ERROR(cuMemAllocHost(&p, sizeInBytes)); + CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes)); return p; } -void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cuMemFreeHost(p)); } +void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); } void *CUFAllocDevice(std::size_t sizeInBytes) { - CUdeviceptr p = 0; - CUDA_REPORT_IF_ERROR(cuMemAlloc(&p, sizeInBytes)); - return reinterpret_cast(p); + void *p; + CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes)); + return p; } -void CUFFreeDevice(void *p) { - CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast(p))); -} +void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); } void 
*CUFAllocManaged(std::size_t sizeInBytes) { - CUdeviceptr p = 0; + void *p; CUDA_REPORT_IF_ERROR( - cuMemAllocManaged(&p, sizeInBytes, CU_MEM_ATTACH_GLOBAL)); + cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal)); return reinterpret_cast(p); } -void CUFFreeManaged(void *p) { - CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast(p))); -} +void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); } void *CUFAllocUnified(std::size_t sizeInBytes) { // Call alloc managed for the time being. diff --git a/flang/test/Driver/fveclib-codegen.f90 b/flang/test/Driver/fveclib-codegen.f90 index 8d7d3af1e8f9ba..3a96c29ac70854 100644 --- a/flang/test/Driver/fveclib-codegen.f90 +++ b/flang/test/Driver/fveclib-codegen.f90 @@ -1,6 +1,7 @@ ! test that -fveclib= is passed to the backend ! -target aarch64 so that ArmPL is available -! RUN: %flang -S -Ofast -fveclib=LIBMVEC -o - %s | FileCheck %s +! RUN: %if aarch64-registered-target %{ %flang -S -Ofast -target aarch64-unknown-linux-gnu -fveclib=LIBMVEC -o - %s | FileCheck %s %} +! RUN: %if x86-registered-target %{ %flang -S -Ofast -target x86_64-unknown-linux-gnu -fveclib=LIBMVEC -o - %s | FileCheck %s %} ! 
RUN: %flang -S -Ofast -fveclib=NoLibrary -o - %s | FileCheck %s --check-prefix=NOLIB subroutine sb(a, b) diff --git a/flang/test/Fir/convert-to-llvm.fir b/flang/test/Fir/convert-to-llvm.fir index c5a62cabe1f49d..194a11456f2569 100644 --- a/flang/test/Fir/convert-to-llvm.fir +++ b/flang/test/Fir/convert-to-llvm.fir @@ -2368,6 +2368,8 @@ func.func @test_rebox_1(%arg0: !fir.box>) { //CHECK: %[[NULL:.*]] = llvm.mlir.zero : !llvm.ptr //CHECK: %[[GEP:.*]] = llvm.getelementptr %[[NULL]][1] //CHECK: %[[ELEM_SIZE_I64:.*]] = llvm.ptrtoint %[[GEP]] : !llvm.ptr to i64 +//CHECK: %[[EXTRA_GEP:.*]] = llvm.getelementptr %[[ARG0]][0, 6] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +//CHECK: %[[EXTRA:.*]] = llvm.load %[[EXTRA_GEP]] : !llvm.ptr -> i8 //CHECK: %[[RBOX:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> //CHECK: %[[RBOX_TMP1:.*]] = llvm.insertvalue %[[ELEM_SIZE_I64]], %[[RBOX]][1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> //CHECK: %[[CFI_VERSION:.*]] = llvm.mlir.constant(20240719 : i32) : i32 @@ -2380,9 +2382,9 @@ func.func @test_rebox_1(%arg0: !fir.box>) { //CHECK: %[[OTHER_ATTR:.*]] = llvm.mlir.constant(0 : i32) : i32 //CHECK: %[[OTHER_ATTR_I8:.*]] = llvm.trunc %[[OTHER_ATTR]] : i32 to i8 //CHECK: %[[RBOX_TMP5:.*]] = llvm.insertvalue %[[OTHER_ATTR_I8]], %[[RBOX_TMP4]][5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> -//CHECK: %[[ADDENDUM:.*]] = llvm.mlir.constant(0 : i32) : i32 -//CHECK: %[[ADDENDUM_I8:.*]] = llvm.trunc %[[ADDENDUM]] : i32 to i8 -//CHECK: %[[RBOX_TMP6:.*]] = llvm.insertvalue %[[ADDENDUM_I8]], %[[RBOX_TMP5]][6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +//CHECK: %[[MASK:.*]] = llvm.mlir.constant(254 : ui8) : i8 +//CHECK: %[[EXTRA_WITH_ADDENDUM_CORRECTION:.*]] = llvm.and %[[EXTRA]], %[[MASK]] : i8 +//CHECK: %[[RBOX_TMP6:.*]] = llvm.insertvalue 
%[[EXTRA_WITH_ADDENDUM_CORRECTION]], %[[RBOX_TMP5]][6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> //CHECK: %[[DIM1_STRIDE_REF:.*]] = llvm.getelementptr %[[ARG0]][0, 7, 0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> //CHECK: %[[DIM1_STRIDE:.*]] = llvm.load %[[DIM1_STRIDE_REF]] : !llvm.ptr -> i64 //CHECK: %[[DIM2_STRIDE_REF:.*]] = llvm.getelementptr %[[ARG0]][0, 7, 1, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> @@ -2442,6 +2444,8 @@ func.func @foo(%arg0: !fir.box} //CHECK: %[[GEP:.*]] = llvm.getelementptr %[[NULL]][1] //CHECK: %[[CHAR_SIZE:.*]] = llvm.ptrtoint %[[GEP]] : !llvm.ptr to i64 //CHECK: %[[ELEM_SIZE:.*]] = llvm.mul %[[CHAR_SIZE]], %[[ELEM_COUNT]] +//CHECK: %[[EXTRA_GEP:.*]] = llvm.getelementptr %[[ARG0]][0, 6] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)> +//CHECK: %[[EXTRA:.*]] = llvm.load %[[EXTRA_GEP]] : !llvm.ptr -> i8 //CHECK: %[[RBOX_TMP1:.*]] = llvm.insertvalue %[[ELEM_SIZE]], %{{.*}}[1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> //CHECK: %[[RBOX_TMP2:.*]] = llvm.insertvalue %{{.*}}, %[[RBOX_TMP1]][2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> //CHECK: %[[RANK:.*]] = llvm.mlir.constant(1 : i32) : i32 @@ -2450,9 +2454,9 @@ func.func @foo(%arg0: !fir.box} //CHECK: %[[TYPE_CHAR_I8:.*]] = llvm.trunc %[[TYPE_CHAR]] : i32 to i8 //CHECK: %[[RBOX_TMP4:.*]] = llvm.insertvalue %[[TYPE_CHAR_I8]], %[[RBOX_TMP3]][4] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> //CHECK: %[[RBOX_TMP5:.*]] = llvm.insertvalue %{{.*}}, %[[RBOX_TMP4]][5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> -//CHECK: %[[ADDENDUM:.*]] = llvm.mlir.constant(0 : i32) : i32 -//CHECK: %[[ADDENDUM_I8:.*]] = llvm.trunc %[[ADDENDUM]] : i32 to i8 -//CHECK: 
%[[RBOX_TMP6:.*]] = llvm.insertvalue %[[ADDENDUM_I8]], %[[RBOX_TMP5]][6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +//CHECK: %[[MASK:.*]] = llvm.mlir.constant(254 : ui8) : i8 +//CHECK: %[[EXTRA_WITH_ADDENDUM_CORRECTION:.*]] = llvm.and %[[EXTRA]], %[[MASK]] : i8 +//CHECK: %[[RBOX_TMP6:.*]] = llvm.insertvalue %[[EXTRA_WITH_ADDENDUM_CORRECTION]], %[[RBOX_TMP5]][6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> //CHECK: %[[SRC_STRIDE_PTR:.*]] = llvm.getelementptr %[[ARG0]][0, 7, 0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)> //CHECK: %[[SRC_STRIDE:.*]] = llvm.load %[[SRC_STRIDE_PTR]] : !llvm.ptr -> i64 //CHECK: %[[SRC_ARRAY_PTR:.*]] = llvm.getelementptr %[[ARG0]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)> diff --git a/flang/test/Fir/rebox.fir b/flang/test/Fir/rebox.fir index d0393eadef58bd..d6aa84353b761b 100644 --- a/flang/test/Fir/rebox.fir +++ b/flang/test/Fir/rebox.fir @@ -22,8 +22,12 @@ func.func @test_rebox_1(%arg0: !fir.box>) { %0 = fir.slice %c5, %undef, %undef, %c6, %c80, %c3 : (index, index, index, index, index, index) -> !fir.slice<2> %1 = fir.shift %c3, %c4 : (index, index) -> !fir.shift<2> + // CHECK: %[[EXTRA_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 6 + // CHECK: %[[EXTRA:.*]] = load i8, ptr %[[EXTRA_GEP]] + // CHECK: %[[EXTRA_WITH_ADDENDUM_CORRECTION:.*]] = and i8 %[[EXTRA]] + // CHECK: %[[OUTBOX0:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } { ptr undef, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 27, i8 0, i8 undef, [1 x [3 x i64]] undef }, i8 %[[EXTRA_WITH_ADDENDUM_CORRECTION]], 6 // CHECK: %[[INSTRIDE_0_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] }, ptr %[[INBOX]], i32 
0, i32 7, i32 0, i32 2 - // CHECK: %[[INSTRIDE_0:.]] = load i64, ptr %[[INSTRIDE_0_GEP]] + // CHECK: %[[INSTRIDE_0:.*]] = load i64, ptr %[[INSTRIDE_0_GEP]] // CHECK: %[[INSTRIDE_1_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 7, i32 1, i32 2 // CHECK: %[[INSTRIDE_1:.*]] = load i64, ptr %[[INSTRIDE_1_GEP]] // CHECK: %[[INBASE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 0 @@ -33,9 +37,9 @@ func.func @test_rebox_1(%arg0: !fir.box>) { // CHECK: %[[OFFSET_1:.*]] = mul i64 2, %[[INSTRIDE_1]] // CHECK: %[[VOIDBASE1:.*]] = getelementptr i8, ptr %[[VOIDBASE0]], i64 %[[OFFSET_1]] // CHECK: %[[OUTSTRIDE0:.*]] = mul i64 3, %[[INSTRIDE_1]] - // CHECK: %[[OUTBOX0:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } { ptr undef, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i32 {{.*}}, i8 1, i8 27, i8 0, i8 0, [1 x [3 x i64]] [{{.*}} [i64 1, i64 25, i64 undef]] }, i64 %[[OUTSTRIDE0]], 7, 0, 2 - // CHECK: %[[OUTBOX1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[OUTBOX0]], ptr %[[VOIDBASE1]], 0 - // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[OUTBOX1]], ptr %[[OUTBOX_ALLOC]], align 8 + // CHECK: %[[OUTBOX1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %{{.*}}, i64 %[[OUTSTRIDE0]], 7, 0, 2 + // CHECK: %[[OUTBOX2:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[OUTBOX1]], ptr %[[VOIDBASE1]], 0 + // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[OUTBOX2]], ptr %[[OUTBOX_ALLOC]], align 8 %2 = fir.rebox %arg0(%1) [%0] : (!fir.box>, !fir.shift<2>, !fir.slice<2>) -> !fir.box> // CHECK: call void @bar1(ptr %[[OUTBOX_ALLOC]]) fir.call @bar1(%2) : (!fir.box>) -> () @@ -92,7 +96,7 @@ func.func @test_rebox_3(%arg0: !fir.box>) { // CHECK: %[[INBASE:.*]] = load ptr, ptr %[[INBASE_GEP]] // CHECK: %[[OUTSTRIDE1:.*]] = mul i64 3, 
%[[INSTRIDE]] // CHECK: %[[OUTSTRIDE2:.*]] = mul i64 4, %[[OUTSTRIDE1]] - // CHECK: %[[OUTBOX0:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [3 x [3 x i64]] } { ptr undef, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i32 {{.*}}, i8 3, i8 27, i8 0, i8 0, [3 x [3 x i64]] [{{.*}} [i64 2, i64 3, i64 undef], [3 x i64] undef, [3 x i64] undef] }, i64 %[[INSTRIDE]], 7, 0, 2 + // CHECK: %[[OUTBOX0:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [3 x [3 x i64]] } %{{.*}}, i64 %[[INSTRIDE]], 7, 0, 2 // CHECK: %[[OUTBOX1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [3 x [3 x i64]] } %[[OUTBOX0]], i64 3, 7, 1, 0 // CHECK: %[[OUTBOX2:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [3 x [3 x i64]] } %[[OUTBOX1]], i64 4, 7, 1, 1 // CHECK: %[[OUTBOX3:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [3 x [3 x i64]] } %[[OUTBOX2]], i64 %[[OUTSTRIDE1]], 7, 1, 2 @@ -121,7 +125,7 @@ func.func @test_rebox_4(%arg0: !fir.box>>) { // CHECK: %[[STRIDE:.*]] = load i64, ptr %[[STRIDE_GEP]] // CHECK: %[[BASE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[INPUT]], i32 0, i32 0 // CHECK: %[[BASE:.*]] = load ptr, ptr %[[BASE_GEP]] - // CHECK: %[[NEWBOX1:.*]] = insertvalue {{{.*}}} { ptr undef, i64 ptrtoint (ptr getelementptr ([10 x i8], ptr null, i32 1) to i64), i32 20240719, i8 1, i8 40, i8 1, i8 0, [1 x [3 x i64]] [{{.*}} [i64 1, i64 undef, i64 undef]] }, i64 %[[EXTENT]], 7, 0, 1 + // CHECK: %[[NEWBOX1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %{{.*}}, i64 %[[EXTENT]], 7, 0, 1 // CHECK: %[[NEWBOX2:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[NEWBOX1]], i64 %[[STRIDE]], 7, 0, 2 // CHECK: %[[NEWBOX3:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[NEWBOX2]], ptr %[[BASE]], 0 // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[NEWBOX3]], ptr %[[NEWBOX_STORAGE]] @@ -164,7 +168,7 @@ func.func @test_cmplx_1(%arg0: 
!fir.box>>) { // CHECK: %[[CHECK_NONZERO:.*]] = icmp sgt i64 %[[DIV_1]], 0 // CHECK: %[[CHECKED_BOUND:.*]] = select i1 %[[CHECK_NONZERO]], i64 %[[DIV_1]], i64 0 // CHECK: %[[STRIDE:.*]] = mul i64 1, %[[INSTRIDE_1]] - // CHECK: %[[VAL_BUILD_1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } { ptr undef, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i32 {{.*}}, i8 1, i8 27, i8 0, i8 0, [1 x [3 x i64]] [{{\[}}3 x i64] [i64 1, i64 undef, i64 undef]] }, i64 %[[CHECKED_BOUND]], 7, 0, 1 + // CHECK: %[[VAL_BUILD_1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %{{.*}}, i64 %[[CHECKED_BOUND]], 7, 0, 1 // CHECK: %[[VAL_BUILD_2:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[VAL_BUILD_1]], i64 %[[STRIDE]], 7, 0, 2 // CHECK: %[[VAL_BUILD_3:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[VAL_BUILD_2]], ptr %[[OFFSET_GEP]], 0 // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[VAL_BUILD_3]], ptr %[[OUTBOX_ALLOC]] @@ -198,7 +202,7 @@ func.func @test_cmplx_2(%arg0: !fir.box>>) { // CHECK: %[[FRONT_OFFSET:.*]] = mul i64 6, %[[INSTRIDE_0]] // CHECK: %[[OFFSET_GEP:.*]] = getelementptr i8, ptr %[[FIELD_OFFSET_GEP]], i64 %[[FRONT_OFFSET]] // CHECK: %[[STRIDE:.*]] = mul i64 5, %[[INSTRIDE_0]] - // CHECK: %[[VAL_BUILD_1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } { ptr undef, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i32 {{.*}}, i8 1, i8 27, i8 0, i8 0, [1 x [3 x i64]] [{{\[}}3 x i64] [i64 1, i64 11, i64 undef]] }, i64 %[[STRIDE]], 7, 0, 2 + // CHECK: %[[VAL_BUILD_1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %{{.*}}, i64 %[[STRIDE]], 7, 0, 2 // CHECK: %[[VAL_BUILD_2:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[VAL_BUILD_1]], ptr %[[OFFSET_GEP]], 0 // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[VAL_BUILD_2]], ptr %[[OUTBOX_ALLOC]] 
fir.call @bar1(%2) fastmath : (!fir.box>) -> () diff --git a/flang/test/Fir/tbaa-codegen2.fir b/flang/test/Fir/tbaa-codegen2.fir index e649c06731c6ba..12232a29aae4aa 100644 --- a/flang/test/Fir/tbaa-codegen2.fir +++ b/flang/test/Fir/tbaa-codegen2.fir @@ -74,7 +74,7 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.targ // CHECK: %[[VAL11:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ARG0]], i32 0, i32 0 // box access: // CHECK: %[[VAL12:.*]] = load ptr, ptr %[[VAL11]], align 8, !tbaa ![[BOX_ACCESS_TAG]] -// CHECK: %[[VAL15:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %14, ptr %[[VAL12]], 0 +// CHECK: %[[VAL15:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %{{.*}}, ptr %[[VAL12]], 0 // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[VAL15]], ptr %{{.*}}, align 8, !tbaa ![[BOX_ACCESS_TAG]] // CHECK: %[[VAL16:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %{{.*}}, i32 0, i32 7, i64 0, i32 0 // box access: diff --git a/flang/test/Fir/tbaa.fir b/flang/test/Fir/tbaa.fir index 14ee3b723bafb4..809ab3a922a0fe 100644 --- a/flang/test/Fir/tbaa.fir +++ b/flang/test/Fir/tbaa.fir @@ -53,6 +53,8 @@ module { // CHECK: %[[VAL_22:.*]] = llvm.getelementptr %[[VAL_0]][0, 4] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)> // CHECK: %[[VAL_23:.*]] = llvm.load %[[VAL_22]] {tbaa = [#[[$BOXT]]]} : !llvm.ptr -> i8 // CHECK-NEXT: %[[VAL_23_EXT:.*]] = llvm.sext %[[VAL_23]] : i8 to i32 +// CHECK: %[[EXTRA_GEP:.*]] = llvm.getelementptr %[[VAL_0]][0, 6] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)> +// CHECK: %[[EXTRA:.*]] = llvm.load %[[EXTRA_GEP]] {tbaa = [#tbaa_tag]} : !llvm.ptr -> i8 // CHECK: %[[VAL_24:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, ptr, array<1 x 
i64>)> // CHECK: %[[VAL_25:.*]] = llvm.insertvalue %[[VAL_21]], %[[VAL_24]][1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, ptr, array<1 x i64>)> // CHECK: %[[VAL_26:.*]] = llvm.mlir.constant(20240719 : i32) : i32 @@ -65,8 +67,8 @@ module { // CHECK: %[[VAL_33:.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: %[[VAL_34:.*]] = llvm.trunc %[[VAL_33]] : i32 to i8 // CHECK: %[[VAL_35:.*]] = llvm.insertvalue %[[VAL_34]], %[[VAL_32]][5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, ptr, array<1 x i64>)> -// CHECK: %[[VAL_36:.*]] = llvm.mlir.constant(1 : i32) : i32 -// CHECK: %[[VAL_37:.*]] = llvm.trunc %[[VAL_36]] : i32 to i8 +// CHECK: %[[VAL_36:.*]] = llvm.mlir.constant(1 : ui8) : i8 +// CHECK: %[[VAL_37:.*]] = llvm.or %[[EXTRA]], %[[VAL_36]] : i8 // CHECK: %[[VAL_38:.*]] = llvm.insertvalue %[[VAL_37]], %[[VAL_35]][6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, ptr, array<1 x i64>)> // CHECK: %[[VAL_40B:.*]] = llvm.insertvalue %[[VAL_19]], %[[VAL_38]][7] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, ptr, array<1 x i64>)> // CHECK: %[[VAL_40:.*]] = llvm.insertvalue %{{.*}}, %[[VAL_40B]][8, 0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, ptr, array<1 x i64>)> @@ -166,9 +168,7 @@ module { // CHECK: %[[VAL_37:.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: %[[VAL_38:.*]] = llvm.trunc %[[VAL_37]] : i32 to i8 // CHECK: %[[VAL_39:.*]] = llvm.insertvalue %[[VAL_38]], %[[VAL_36]][5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)> -// CHECK: %[[VAL_40:.*]] = llvm.mlir.constant(1 : i32) : i32 -// CHECK: %[[VAL_41:.*]] = llvm.trunc %[[VAL_40]] : i32 to i8 -// CHECK: %[[VAL_42:.*]] = llvm.insertvalue %[[VAL_41]], %[[VAL_39]][6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)> +// CHECK: %[[VAL_42:.*]] = llvm.insertvalue %{{.*}}, %[[VAL_39]][6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)> // CHECK: %[[VAL_44B:.*]] = 
llvm.insertvalue %[[VAL_27]], %[[VAL_42]][8] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)> // CHECK: %[[VAL_44:.*]] = llvm.insertvalue %{{.*}}, %[[VAL_44B]][9, 0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)> // CHECK: %[[VAL_45:.*]] = llvm.getelementptr %[[VAL_3]][0, 7, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)> diff --git a/flang/test/Lower/allocatable-polymorphic.f90 b/flang/test/Lower/allocatable-polymorphic.f90 index e7d2dcd0c20b79..8fe06450d6119e 100644 --- a/flang/test/Lower/allocatable-polymorphic.f90 +++ b/flang/test/Lower/allocatable-polymorphic.f90 @@ -651,7 +651,7 @@ program test_alloc ! LLVM: %[[TYPE_CODE_TRUNC:.*]] = trunc i32 %[[TYPE_CODE_EXT]] to i8 ! LLVM: %[[BOX3:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX2]], i8 %[[TYPE_CODE_TRUNC]], 4 ! LLVM: %[[BOX4:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX3]], i8 0, 5 -! LLVM: %[[BOX5:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX4]], i8 1, 6 +! LLVM: %[[BOX5:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX4]], i8 %{{.*}}, 6 ! LLVM: %[[BOX6A:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX5]], ptr %[[TDESC_C3]], 7 ! LLVM: %[[BOX6:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX6A]], i64 0, 8, 0 ! LLVM: %[[BOX7:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX6]], ptr %{{.*}}, 0 @@ -673,7 +673,7 @@ program test_alloc ! LLVM: %[[TYPE_CODE_TRUNC:.*]] = trunc i32 %[[TYPE_CODE_EXT]] to i8 ! LLVM: %[[BOX3:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX2]], i8 %[[TYPE_CODE_TRUNC]], 4 ! LLVM: %[[BOX4:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX3]], i8 0, 5 -! 
LLVM: %[[BOX5:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX4]], i8 1, 6 +! LLVM: %[[BOX5:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX4]], i8 %{{.*}}, 6 ! LLVM: %[[BOX6A:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX5]], ptr %[[TDESC_C4]], 7 ! LLVM: %[[BOX6:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX6A]], i64 0, 8, 0 ! LLVM: %[[BOX7:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX6]], ptr %{{.*}}, 0 diff --git a/flang/test/Lower/default-initialization-globals.f90 b/flang/test/Lower/default-initialization-globals.f90 index 384d1cb763ad67..e9611dab467cba 100644 --- a/flang/test/Lower/default-initialization-globals.f90 +++ b/flang/test/Lower/default-initialization-globals.f90 @@ -1,5 +1,5 @@ ! Test default initialization of global variables (static init) -! RUN: bbc -hlfir=false %s -o - | FileCheck %s --check-prefixes=%if system-aix %{"CHECK","CHECK-BE"%} \ +! RUN: bbc -hlfir=false %s -o - | FileCheck %s --check-prefixes=%if target={{.*-aix.*|sparc.*}} %{"CHECK","CHECK-BE"%} \ ! 
RUN: %else %{"CHECK","CHECK-LE"%} module tinit diff --git a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp index 9f5ec289ee8f74..b51ff0ac006cc6 100644 --- a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp +++ b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp @@ -14,7 +14,7 @@ #include "flang/Runtime/allocatable.h" #include "flang/Runtime/allocator-registry.h" -#include "cuda.h" +#include "cuda_runtime.h" using namespace Fortran::runtime; using namespace Fortran::runtime::cuda; @@ -25,38 +25,9 @@ static OwningPtr createAllocatable( CFI_attribute_allocatable); } -thread_local static int32_t defaultDevice = 0; - -CUdevice getDefaultCuDevice() { - CUdevice device; - CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice)); - return device; -} - -class ScopedContext { -public: - ScopedContext() { - // Static reference to CUDA primary context for device ordinal - // defaultDevice. - static CUcontext context = [] { - CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0)); - CUcontext ctx; - // Note: this does not affect the current context. 
- CUDA_REPORT_IF_ERROR( - cuDevicePrimaryCtxRetain(&ctx, getDefaultCuDevice())); - return ctx; - }(); - - CUDA_REPORT_IF_ERROR(cuCtxPushCurrent(context)); - } - - ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); } -}; - TEST(AllocatableCUFTest, SimpleDeviceAllocate) { using Fortran::common::TypeCategory; RTNAME(CUFRegisterAllocator)(); - ScopedContext ctx; // REAL(4), DEVICE, ALLOCATABLE :: a(:) auto a{createAllocatable(TypeCategory::Real, 4)}; a->SetAllocIdx(kDeviceAllocatorPos); @@ -74,7 +45,6 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocate) { TEST(AllocatableCUFTest, SimplePinnedAllocate) { using Fortran::common::TypeCategory; RTNAME(CUFRegisterAllocator)(); - ScopedContext ctx; // INTEGER(4), PINNED, ALLOCATABLE :: a(:) auto a{createAllocatable(TypeCategory::Integer, 4)}; EXPECT_FALSE(a->HasAddendum()); @@ -93,7 +63,6 @@ TEST(AllocatableCUFTest, SimplePinnedAllocate) { TEST(AllocatableCUFTest, DescriptorAllocationTest) { using Fortran::common::TypeCategory; RTNAME(CUFRegisterAllocator)(); - ScopedContext ctx; // REAL(4), DEVICE, ALLOCATABLE :: a(:) auto a{createAllocatable(TypeCategory::Real, 4)}; Descriptor *desc = nullptr; diff --git a/libc/newhdrgen/yaml/math.yaml b/libc/newhdrgen/yaml/math.yaml index f8395522589123..f8b105514271c7 100644 --- a/libc/newhdrgen/yaml/math.yaml +++ b/libc/newhdrgen/yaml/math.yaml @@ -214,6 +214,20 @@ functions: arguments: - type: long double - type: long double + - name: dsqrtl + standards: + - stdc + return_type: double + arguments: + - type: long double + - name: dsqrtf128 + standards: + - llvm_libc_ext + return_type: double + arguments: + - type: float128 + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 - name: erff standards: - stdc @@ -1195,6 +1209,25 @@ functions: - type: long double - type: int - type: unsigned int + - name: fsqrt + standards: + - stdc + return_type: float + arguments: + - type: double + - name: fsqrtl + standards: + - stdc + return_type: float + arguments: + - type: long double + - 
name: fsqrtf128 + standards: + - llvm_libc_ext + return_type: float + arguments: + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 - name: fsub standards: - stdc @@ -2323,13 +2356,42 @@ functions: arguments: - type: const long double * - type: const long double * + - name: totalordermag + standards: + - stdc + return_type: int + arguments: + - type: const double * + - type: const double * + - name: totalordermagf + standards: + - stdc + return_type: int + arguments: + - type: const float * + - type: const float * + - name: totalordermagl + standards: + - stdc + return_type: int + arguments: + - type: const long double * + - type: const long double * + - name: totalordermagf128 + standards: + - stdc + return_type: int + arguments: + - type: const float128 * + - type: const float128 * + guard: LIBC_TYPES_HAS_FLOAT128 - name: totalordermagf16 standards: - stdc return_type: int arguments: - - type: _Float16 * - - type: _Float16 * + - type: const _Float16 * + - type: const _Float16 * guard: LIBC_TYPES_HAS_FLOAT16 - name: trunc standards: diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 74afb60725b7f1..8497acbfd62f0d 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -752,11 +752,11 @@ def StdC : StandardSpec<"stdc"> { GuardedFunctionSpec<"totalorderf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, GuardedFunctionSpec<"totalorderf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - FunctionSpec<"totalordermag", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"totalordermagf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"totalordermagl", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"totalordermagf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"totalordermagf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, + FunctionSpec<"totalordermag", RetValSpec, [ArgSpec, ArgSpec]>, + FunctionSpec<"totalordermagf", RetValSpec, [ArgSpec, ArgSpec]>, + 
FunctionSpec<"totalordermagl", RetValSpec, [ArgSpec, ArgSpec]>, + GuardedFunctionSpec<"totalordermagf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + GuardedFunctionSpec<"totalordermagf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, FunctionSpec<"getpayload", RetValSpec, [ArgSpec]>, FunctionSpec<"getpayloadf", RetValSpec, [ArgSpec]>, diff --git a/libcxx/docs/Status/Cxx14Papers.csv b/libcxx/docs/Status/Cxx14Papers.csv index 7961d994bf35ee..3dc670ca0a5dc4 100644 --- a/libcxx/docs/Status/Cxx14Papers.csv +++ b/libcxx/docs/Status/Cxx14Papers.csv @@ -1,32 +1,32 @@ -"Paper #","Group","Paper Name","Meeting","Status","First released version" -"`N3346 `__","LWG","Terminology for Container Element Requirements - Rev 1","2012-02 (Kona)","|Complete|","3.4" +"Paper #","Paper Name","Meeting","Status","First released version","Labels" +"`N3346 `__","Terminology for Container Element Requirements - Rev 1","2012-02 (Kona)","|Complete|","3.4","" "","","","","","" -"`N3421 `__","LWG","Making Operator Functors greater<>","2012-10 (Portland)","|Complete|","3.4" -"`N3462 `__","LWG","std::result_of and SFINAE","2012-10 (Portland)","|Complete|","3.4" -"`N3469 `__","LWG","Constexpr Library Additions: chrono, v3","2012-10 (Portland)","|Complete|","3.4" -"`N3470 `__","LWG","Constexpr Library Additions: containers, v2","2012-10 (Portland)","|Complete|","3.4" -"`N3471 `__","LWG","Constexpr Library Additions: utilities, v3","2012-10 (Portland)","|Complete|","3.4" -"`N3302 `__","LWG","Constexpr Library Additions: complex, v2","2012-10 (Portland)","|Complete|","3.4" +"`N3421 `__","Making Operator Functors greater<>","2012-10 (Portland)","|Complete|","3.4","" +"`N3462 `__","std::result_of and SFINAE","2012-10 (Portland)","|Complete|","3.4","" +"`N3469 `__","Constexpr Library Additions: chrono, v3","2012-10 (Portland)","|Complete|","3.4","" +"`N3470 `__","Constexpr Library Additions: containers, v2","2012-10 (Portland)","|Complete|","3.4","" +"`N3471 
`__","Constexpr Library Additions: utilities, v3","2012-10 (Portland)","|Complete|","3.4","" +"`N3302 `__","Constexpr Library Additions: complex, v2","2012-10 (Portland)","|Complete|","3.4","" "","","","","","" -"`N3545 `__","LWG","An Incremental Improvement to integral_constant","2013-04 (Bristol)","|Complete|","3.4" -"`N3644 `__","LWG","Null Forward Iterators","2013-04 (Bristol)","|Complete|","3.4" -"`N3668 `__","LWG","std::exchange()","2013-04 (Bristol)","|Complete|","3.4" -"`N3658 `__","LWG","Compile-time integer sequences","2013-04 (Bristol)","|Complete|","3.4" -"`N3670 `__","LWG","Addressing Tuples by Type","2013-04 (Bristol)","|Complete|","3.4" -"`N3671 `__","LWG","Making non-modifying sequence operations more robust","2013-04 (Bristol)","|Complete|","3.4" -"`N3656 `__","LWG","make_unique","2013-04 (Bristol)","|Complete|","3.4" -"`N3654 `__","LWG","Quoted Strings","2013-04 (Bristol)","|Complete|","3.4" -"`N3642 `__","LWG","User-defined Literals","2013-04 (Bristol)","|Complete|","3.4" -"`N3655 `__","LWG","TransformationTraits Redux (excluding part 4)","2013-04 (Bristol)","|Complete|","3.4" -"`N3657 `__","LWG","Adding heterogeneous comparison lookup to associative containers","2013-04 (Bristol)","|Complete|","3.4" -"`N3672 `__","LWG","A proposal to add a utility class to represent optional objects","2013-04 (Bristol)","*Removed from Draft Standard*","n/a" -"`N3669 `__","LWG","Fixing constexpr member functions without const","2013-04 (Bristol)","|Complete|","3.4" -"`N3662 `__","LWG","C++ Dynamic Arrays (dynarray)","2013-04 (Bristol)","*Removed from Draft Standard*","n/a" -"`N3659 `__","SG1","Shared Locking in C++","2013-04 (Bristol)","|Complete|","3.4" +"`N3545 `__","An Incremental Improvement to integral_constant","2013-04 (Bristol)","|Complete|","3.4","" +"`N3644 `__","Null Forward Iterators","2013-04 (Bristol)","|Complete|","3.4","" +"`N3668 `__","std::exchange()","2013-04 (Bristol)","|Complete|","3.4","" +"`N3658 `__","Compile-time integer 
sequences","2013-04 (Bristol)","|Complete|","3.4","" +"`N3670 `__","Addressing Tuples by Type","2013-04 (Bristol)","|Complete|","3.4","" +"`N3671 `__","Making non-modifying sequence operations more robust","2013-04 (Bristol)","|Complete|","3.4","" +"`N3656 `__","make_unique","2013-04 (Bristol)","|Complete|","3.4","" +"`N3654 `__","Quoted Strings","2013-04 (Bristol)","|Complete|","3.4","" +"`N3642 `__","User-defined Literals","2013-04 (Bristol)","|Complete|","3.4","" +"`N3655 `__","TransformationTraits Redux (excluding part 4)","2013-04 (Bristol)","|Complete|","3.4","" +"`N3657 `__","Adding heterogeneous comparison lookup to associative containers","2013-04 (Bristol)","|Complete|","3.4","" +"`N3672 `__","A proposal to add a utility class to represent optional objects","2013-04 (Bristol)","*Removed from Draft Standard*","n/a","" +"`N3669 `__","Fixing constexpr member functions without const","2013-04 (Bristol)","|Complete|","3.4","" +"`N3662 `__","C++ Dynamic Arrays (dynarray)","2013-04 (Bristol)","*Removed from Draft Standard*","n/a","" +"`N3659 `__","Shared Locking in C++","2013-04 (Bristol)","|Complete|","3.4","" "","","","","","" -"`N3779 `__","LWG","User-defined Literals for std::complex","2013-09 (Chicago)","|Complete|","3.4" -"`N3789 `__","LWG","Constexpr Library Additions: functional","2013-09 (Chicago)","|Complete|","3.4" +"`N3779 `__","User-defined Literals for std::complex","2013-09 (Chicago)","|Complete|","3.4","" +"`N3789 `__","Constexpr Library Additions: functional","2013-09 (Chicago)","|Complete|","3.4","" "","","","","","" -"`N3924 `__","LWG","Discouraging rand() in C++14","2014-02 (Issaquah)","|Complete|","3.5" -"`N3887 `__","LWG","Consistent Metafunction Aliases","2014-02 (Issaquah)","|Complete|","3.5" -"`N3891 `__","SG1","A proposal to rename shared_mutex to shared_timed_mutex","2014-02 (Issaquah)","|Complete|","3.5" +"`N3924 `__","Discouraging rand() in C++14","2014-02 (Issaquah)","|Complete|","3.5","" +"`N3887 `__","Consistent Metafunction 
Aliases","2014-02 (Issaquah)","|Complete|","3.5","" +"`N3891 `__","A proposal to rename shared_mutex to shared_timed_mutex","2014-02 (Issaquah)","|Complete|","3.5","" diff --git a/libcxx/docs/Status/Cxx17Issues.csv b/libcxx/docs/Status/Cxx17Issues.csv index a072868580696d..35e42e5ec2d7ba 100644 --- a/libcxx/docs/Status/Cxx17Issues.csv +++ b/libcxx/docs/Status/Cxx17Issues.csv @@ -2,7 +2,6 @@ "`LWG2016 `__","Allocators must be no-throw swappable","2014-11 (Urbana)","|Complete|","","" "`LWG2118 `__","``unique_ptr``\ for array does not support cv qualification conversion of actual argument","2014-11 (Urbana)","|Complete|","","" "`LWG2170 `__","Aggregates cannot be ``DefaultConstructible``\ ","2014-11 (Urbana)","|Complete|","","" -"`LWG2308 `__","Clarify container destructor requirements w.r.t. ``std::array``\ ","2014-11 (Urbana)","|Complete|","","" "`LWG2340 `__","Replacement allocation functions declared as inline","2014-11 (Urbana)","|Complete|","","" "`LWG2354 `__","Unnecessary copying when inserting into maps with braced-init syntax","2014-11 (Urbana)","|Complete|","","" "`LWG2377 `__","``std::align``\ requirements overly strict","2014-11 (Urbana)","|Complete|","","" @@ -224,7 +223,6 @@ "`LWG2679 `__","Inconsistent Use of Effects and Equivalent To","2016-11 (Issaquah)","|Complete|","","" "`LWG2680 `__","Add ""Equivalent to"" to filesystem","2016-11 (Issaquah)","|Complete|","","" "`LWG2681 `__","filesystem::copy() cannot copy symlinks","2016-11 (Issaquah)","|Complete|","","" -"`LWG2682 `__","filesystem::copy() won't create a symlink to a directory","2016-11 (Issaquah)","|Complete|","","" "`LWG2686 `__","Why is std::hash specialized for error_code, but not error_condition?","2016-11 (Issaquah)","|Complete|","","" "`LWG2694 `__","Application of LWG 436 accidentally deleted definition of ""facet""","2016-11 (Issaquah)","|Complete|","","" "`LWG2696 `__","Interaction between make_shared and enable_shared_from_this is underspecified","2016-11 (Issaquah)","|Nothing To 
Do|","","" @@ -254,7 +252,6 @@ "`LWG2760 `__","non-const basic_string::data should not invalidate iterators","2016-11 (Issaquah)","|Complete|","","" "`LWG2765 `__","Did LWG 1123 go too far?","2016-11 (Issaquah)","|Complete|","","" "`LWG2767 `__","not_fn call_wrapper can form invalid types","2016-11 (Issaquah)","|Complete|","","" -"`LWG2769 `__","Redundant const in the return type of any_cast(const any&)","2016-11 (Issaquah)","|Complete|","","" "`LWG2771 `__","Broken Effects of some basic_string::compare functions in terms of basic_string_view","2016-11 (Issaquah)","|Complete|","","" "`LWG2773 `__","Making std::ignore constexpr","2016-11 (Issaquah)","|Complete|","","" "`LWG2777 `__","basic_string_view::copy should use char_traits::copy","2016-11 (Issaquah)","|Complete|","","" diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv index 2318804b31878c..60c6dc532dc719 100644 --- a/libcxx/docs/Status/Cxx2cIssues.csv +++ b/libcxx/docs/Status/Cxx2cIssues.csv @@ -76,6 +76,5 @@ "`LWG4105 `__","``ranges::ends_with``\`s Returns misses difference casting","2024-06 (St. Louis)","","","|ranges|" "`LWG4106 `__","``basic_format_args`` should not be default-constructible","2024-06 (St. 
Louis)","|Complete|","19.0","|format|" "","","","","","" -"`LWG3343 `__","Ordering of calls to ``unlock()`` and ``notify_all()`` in Effects element of ``notify_all_at_thread_exit()`` should be reversed","Not Yet Adopted","|Complete|","16.0","" -"XXXX","The sys_info range should be affected by save","Not Yet Adopted","|Complete|","19.0","" +"`LWG3343 `__","Ordering of calls to ``unlock()`` and ``notify_all()`` in Effects element of ``notify_all_at_thread_exit()`` should be reversed","Not Adopted Yet","|Complete|","16.0","" "","","","","","" diff --git a/libcxx/include/complex b/libcxx/include/complex index 22271acaf7358d..e6534025de57e5 100644 --- a/libcxx/include/complex +++ b/libcxx/include/complex @@ -421,7 +421,8 @@ public: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR complex(float __re = 0.0f, float __im = 0.0f) : __re_(__re), __im_(__im) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR complex(__from_builtin_tag, _Complex float __v) + template ::value, int> = 0> + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit complex(_Tag, _Complex float __v) : __re_(__real__ __v), __im_(__imag__ __v) {} _LIBCPP_HIDE_FROM_ABI explicit _LIBCPP_CONSTEXPR complex(const complex& __c); @@ -517,7 +518,8 @@ public: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR complex(double __re = 0.0, double __im = 0.0) : __re_(__re), __im_(__im) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR complex(__from_builtin_tag, _Complex double __v) + template ::value, int> = 0> + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit complex(_Tag, _Complex double __v) : __re_(__real__ __v), __im_(__imag__ __v) {} _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR complex(const complex& __c); @@ -617,7 +619,8 @@ public: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR complex(long double __re = 0.0L, long double __im = 0.0L) : __re_(__re), __im_(__im) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR complex(__from_builtin_tag, _Complex long double __v) + template ::value, int> = 0> + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit complex(_Tag, 
_Complex long double __v) : __re_(__real__ __v), __im_(__imag__ __v) {} _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR complex(const complex& __c); diff --git a/libcxx/include/optional b/libcxx/include/optional index f9cbcbfa595d1a..41d7515a2b6892 100644 --- a/libcxx/include/optional +++ b/libcxx/include/optional @@ -301,7 +301,7 @@ struct __optional_destruct_base<_Tp, false> { # if _LIBCPP_STD_VER >= 23 template - _LIBCPP_HIDE_FROM_ABI constexpr __optional_destruct_base( + _LIBCPP_HIDE_FROM_ABI constexpr explicit __optional_destruct_base( __optional_construct_from_invoke_tag, _Fp&& __f, _Args&&... __args) : __val_(std::invoke(std::forward<_Fp>(__f), std::forward<_Args>(__args)...)), __engaged_(true) {} # endif @@ -707,8 +707,11 @@ public: } # if _LIBCPP_STD_VER >= 23 - template - _LIBCPP_HIDE_FROM_ABI constexpr explicit optional(__optional_construct_from_invoke_tag, _Fp&& __f, _Args&&... __args) + template ::value, int> = 0> + _LIBCPP_HIDE_FROM_ABI constexpr explicit optional(_Tag, _Fp&& __f, _Args&&... __args) : __base(__optional_construct_from_invoke_tag{}, std::forward<_Fp>(__f), std::forward<_Args>(__args)...) {} # endif diff --git a/libcxx/src/chrono.cpp b/libcxx/src/chrono.cpp index 83e8a64504ae0b..986360d0368a09 100644 --- a/libcxx/src/chrono.cpp +++ b/libcxx/src/chrono.cpp @@ -31,9 +31,10 @@ # include // for gettimeofday and timeval #endif -// OpenBSD does not have a fully conformant suite of POSIX timers, but +// OpenBSD and GPU do not have a fully conformant suite of POSIX timers, but // it does have clock_gettime and CLOCK_MONOTONIC which is all we need. 
-#if defined(__APPLE__) || defined(__gnu_hurd__) || defined(__OpenBSD__) || (defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0) +#if defined(__APPLE__) || defined(__gnu_hurd__) || defined(__OpenBSD__) || defined(__AMDGPU__) || \ + defined(__NVPTX__) || (defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0) # define _LIBCPP_HAS_CLOCK_GETTIME #endif diff --git a/libcxx/src/filesystem/filesystem_clock.cpp b/libcxx/src/filesystem/filesystem_clock.cpp index e13b2853e367c1..473a54a00f013a 100644 --- a/libcxx/src/filesystem/filesystem_clock.cpp +++ b/libcxx/src/filesystem/filesystem_clock.cpp @@ -29,7 +29,8 @@ # include // for gettimeofday and timeval #endif -#if defined(__APPLE__) || defined(__gnu_hurd__) || (defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0) +#if defined(__APPLE__) || defined(__gnu_hurd__) || defined(__AMDGPU__) || defined(__NVPTX__) || \ + (defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0) # define _LIBCPP_HAS_CLOCK_GETTIME #endif diff --git a/libcxx/src/include/overridable_function.h b/libcxx/src/include/overridable_function.h index c7639f56eee26f..e83ca7be7befaf 100644 --- a/libcxx/src/include/overridable_function.h +++ b/libcxx/src/include/overridable_function.h @@ -96,7 +96,8 @@ _LIBCPP_HIDE_FROM_ABI bool __is_function_overridden(_Ret (*__fptr)(_Args...)) no } _LIBCPP_END_NAMESPACE_STD -#elif defined(_LIBCPP_OBJECT_FORMAT_ELF) +// The NVPTX linker cannot create '__start/__stop' sections. 
+#elif defined(_LIBCPP_OBJECT_FORMAT_ELF) && !defined(__NVPTX__) # define _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION 1 # define _LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE __attribute__((__section__("__lcxx_override"))) diff --git a/libcxx/test/std/numerics/complex.number/complex.special/gh_101960_ambiguous_ctor.pass.cpp b/libcxx/test/std/numerics/complex.number/complex.special/gh_101960_ambiguous_ctor.pass.cpp new file mode 100644 index 00000000000000..bffe8764386a75 --- /dev/null +++ b/libcxx/test/std/numerics/complex.number/complex.special/gh_101960_ambiguous_ctor.pass.cpp @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// + +// Regression test for https://github.com/llvm/llvm-project/issues/101960 where we used to +// trigger an ambiguous constructor. 
+ +#include +#include + +struct NastyConvertible { + template + operator T() const { + return T(0); + } +}; + +template +void test() { + NastyConvertible nasty; + std::complex x(nasty, nasty); + assert(x.real() == T(0)); + assert(x.imag() == T(0)); +} + +int main(int, char**) { + test(); + test(); + test(); + + return 0; +} diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/sys_info.zdump.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/sys_info.zdump.pass.cpp index 207f8e4df45413..2b97d9a5bc745b 100644 --- a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/sys_info.zdump.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/sys_info.zdump.pass.cpp @@ -14,7 +14,7 @@ // XFAIL: availability-tzdb-missing // TODO TZDB Investigate -// XFAIL: target={{armv(7|8)l-linux-gnueabihf}} +// UNSUPPORTED: target={{armv(7|8)l-linux-gnueabihf}} #include #include diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/gh_101960_internal_ctor.compile.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/gh_101960_internal_ctor.compile.pass.cpp new file mode 100644 index 00000000000000..1a1d6f52a5fec9 --- /dev/null +++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/gh_101960_internal_ctor.compile.pass.cpp @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14 + +// + +// Regression test for https://github.com/llvm/llvm-project/issues/101960 where a constructor +// of std::optional that should have been private was instead publicly available. + +#include +#include + +struct NastyConvertible { + template + operator T() { + return 0; + } +}; + +using F = int(int); + +static_assert(!std::is_constructible, NastyConvertible, int(int), int>::value); diff --git a/libcxx/test/support/platform_support.h b/libcxx/test/support/platform_support.h index ba14b32e3e94d9..0d4fa63b03f107 100644 --- a/libcxx/test/support/platform_support.h +++ b/libcxx/test/support/platform_support.h @@ -40,8 +40,8 @@ # include // _mktemp_s # include // _O_EXCL, ... # include // _S_IREAD, ... -#else -# include // close +#elif __has_include() +# include // close #endif #if defined(_CS_GNU_LIBC_VERSION) @@ -55,31 +55,44 @@ extern "C" { } #endif -inline -std::string get_temp_file_name() -{ +inline std::string get_temp_file_name() { #if defined(_WIN32) - while (true) { - char Name[] = "libcxx.XXXXXX"; - if (_mktemp_s(Name, sizeof(Name)) != 0) abort(); - int fd = _open(Name, _O_RDWR | _O_CREAT | _O_EXCL, _S_IREAD | _S_IWRITE); - if (fd != -1) { - _close(fd); - return Name; - } - if (errno == EEXIST) - continue; - abort(); + while (true) { + char Name[] = "libcxx.XXXXXX"; + if (_mktemp_s(Name, sizeof(Name)) != 0) + abort(); + int fd = _open(Name, _O_RDWR | _O_CREAT | _O_EXCL, _S_IREAD | _S_IWRITE); + if (fd != -1) { + _close(fd); + return Name; } + if (errno == EEXIST) + continue; + abort(); + } +#elif !__has_include() + // Without `unistd.h` we cannot guarantee that the file is unused, however we + // can simply generate a good guess in the temporary folder and create it. 
+ constexpr char chars[] = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; + char Name[] = "/tmp/libcxx.XXXXXX"; + for (std::size_t i = 0; i < sizeof(Name); ++i) + if (Name[i] == 'X') + Name[i] = chars[rand() % strlen(chars)]; + FILE* file = fopen(filename, "w"); + if (!file) + abort(); + if (fclose(file) == EOF) + abort(); + return std::string(Name); #else - std::string Name = "libcxx.XXXXXX"; - int FD = mkstemp(&Name[0]); - if (FD == -1) { - perror("mkstemp"); - abort(); - } - close(FD); - return Name; + std::string Name = "libcxx.XXXXXX"; + int FD = mkstemp(&Name[0]); + if (FD == -1) { + perror("mkstemp"); + abort(); + } + close(FD); + return Name; #endif } diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h index 15fc5b69b52076..e96208c85d1d24 100644 --- a/libcxx/test/support/test_macros.h +++ b/libcxx/test/support/test_macros.h @@ -291,17 +291,27 @@ struct is_same { enum {value = 1}; }; // when optimizations are enabled. template inline Tp const& DoNotOptimize(Tp const& value) { - asm volatile("" : : "r,m"(value) : "memory"); - return value; + // The `m` constraint is invalid in the AMDGPU backend. +# if defined(__AMDGPU__) || defined(__NVPTX__) + asm volatile("" : : "r"(value) : "memory"); +# else + asm volatile("" : : "r,m"(value) : "memory"); +# endif + return value; } template inline Tp& DoNotOptimize(Tp& value) { -#if defined(__clang__) + // The `m` and `r` output constraint is invalid in the AMDGPU backend as well + // as i8 / i1 arguments, so we just capture the pointer instead. 
+# if defined(__AMDGPU__) + Tp* tmp = &value; + asm volatile("" : "+v"(tmp) : : "memory"); +# elif defined(__clang__) asm volatile("" : "+r,m"(value) : : "memory"); -#else +# else asm volatile("" : "+m,r"(value) : : "memory"); -#endif +# endif return value; } #else diff --git a/libcxx/utils/synchronize_csv_status_files.py b/libcxx/utils/synchronize_csv_status_files.py index b44b02f5304c0a..9228fc6ed20198 100755 --- a/libcxx/utils/synchronize_csv_status_files.py +++ b/libcxx/utils/synchronize_csv_status_files.py @@ -205,6 +205,7 @@ def sync_csv(rows: List[Tuple], from_github: List[PaperInfo]) -> List[Tuple]: CSV_FILES_TO_SYNC = [ 'Cxx14Issues.csv', + 'Cxx14Papers.csv', 'Cxx17Issues.csv', 'Cxx17Papers.csv', 'Cxx20Issues.csv', diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index c27ab2b67dc2b2..7d26fa9aea74ab 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -1639,14 +1639,14 @@ void RelocationBaseSection::addSymbolReloc( } void RelocationBaseSection::addAddendOnlyRelocIfNonPreemptible( - RelType dynType, GotSection &sec, uint64_t offsetInSec, Symbol &sym, + RelType dynType, InputSectionBase &isec, uint64_t offsetInSec, Symbol &sym, RelType addendRelType) { // No need to write an addend to the section for preemptible symbols. 
if (sym.isPreemptible) - addReloc({dynType, &sec, offsetInSec, DynamicReloc::AgainstSymbol, sym, 0, + addReloc({dynType, &isec, offsetInSec, DynamicReloc::AgainstSymbol, sym, 0, R_ABS}); else - addReloc(DynamicReloc::AddendOnlyWithTargetVA, dynType, sec, offsetInSec, + addReloc(DynamicReloc::AddendOnlyWithTargetVA, dynType, isec, offsetInSec, sym, 0, R_ABS, addendRelType); } diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h index d4169e1e1acaf6..43eb82cbb3e28b 100644 --- a/lld/ELF/SyntheticSections.h +++ b/lld/ELF/SyntheticSections.h @@ -523,7 +523,8 @@ class RelocationBaseSection : public SyntheticSection { } /// Add a dynamic relocation using the target address of \p sym as the addend /// if \p sym is non-preemptible. Otherwise add a relocation against \p sym. - void addAddendOnlyRelocIfNonPreemptible(RelType dynType, GotSection &sec, + void addAddendOnlyRelocIfNonPreemptible(RelType dynType, + InputSectionBase &isec, uint64_t offsetInSec, Symbol &sym, RelType addendRelType); template diff --git a/lld/MachO/ObjC.cpp b/lld/MachO/ObjC.cpp index 9c056f40aa943f..b9f7592fa9c663 100644 --- a/lld/MachO/ObjC.cpp +++ b/lld/MachO/ObjC.cpp @@ -186,28 +186,6 @@ ObjcCategoryChecker::ObjcCategoryChecker() roClassLayout(target->wordSize), listHeaderLayout(target->wordSize), methodLayout(target->wordSize) {} -// \p r must point to an offset within a CStringInputSection or a -// ConcatInputSection -static StringRef getReferentString(const Reloc &r) { - if (auto *isec = r.referent.dyn_cast()) - return cast(isec)->getStringRefAtOffset(r.addend); - - auto *sym = cast(r.referent.get()); - auto *symIsec = sym->isec(); - auto symOffset = sym->value + r.addend; - - if (auto *s = dyn_cast_or_null(symIsec)) - return s->getStringRefAtOffset(symOffset); - - if (isa(symIsec)) { - auto strData = symIsec->data.slice(symOffset); - const char *pszData = reinterpret_cast(strData.data()); - return StringRef(pszData, strnlen(pszData, strData.size())); - } - - 
llvm_unreachable("unknown reference section in getReferentString"); -} - void ObjcCategoryChecker::parseMethods(const ConcatInputSection *methodsIsec, const Symbol *methodContainerSym, const ConcatInputSection *containerIsec, @@ -219,7 +197,7 @@ void ObjcCategoryChecker::parseMethods(const ConcatInputSection *methodsIsec, methodLayout.nameOffset) continue; - CachedHashStringRef methodName(getReferentString(r)); + CachedHashStringRef methodName(r.getReferentString()); // +load methods are special: all implementations are called by the runtime // even if they are part of the same class. Thus there is no need to check // for duplicates. @@ -251,14 +229,14 @@ void ObjcCategoryChecker::parseMethods(const ConcatInputSection *methodsIsec, ->getReferentInputSection(); nameReloc = roIsec->getRelocAt(roClassLayout.nameOffset); } - StringRef containerName = getReferentString(*nameReloc); + StringRef containerName = nameReloc->getReferentString(); StringRef methPrefix = mKind == MK_Instance ? "-" : "+"; // We should only ever encounter collisions when parsing category methods // (since the Class struct is parsed before any of its categories). 
assert(mcKind == MCK_Category); StringRef newCatName = - getReferentString(*containerIsec->getRelocAt(catLayout.nameOffset)); + containerIsec->getRelocAt(catLayout.nameOffset)->getReferentString(); auto formatObjAndSrcFileName = [](const InputSection *section) { lld::macho::InputFile *inputFile = section->getFile(); @@ -809,7 +787,7 @@ void ObjcCategoryMerger::parseCatInfoToExtInfo(const InfoInputCategory &catInfo, assert(extInfo.objFileForMergeData && "Expected to already have valid objextInfo.objFileForMergeData"); - StringRef catName = getReferentString(*catNameReloc); + StringRef catName = catNameReloc->getReferentString(); extInfo.mergedContainerName += catName.str(); // Parse base class @@ -873,7 +851,6 @@ Defined *ObjcCategoryMerger::emitAndLinkProtocolList( infoCategoryWriter.catPtrListInfo.align); listSec->parent = infoCategoryWriter.catPtrListInfo.outputSection; listSec->live = true; - addInputSection(listSec); listSec->parent = infoCategoryWriter.catPtrListInfo.outputSection; @@ -889,6 +866,7 @@ Defined *ObjcCategoryMerger::emitAndLinkProtocolList( ptrListSym->used = true; parentSym->getObjectFile()->symbols.push_back(ptrListSym); + addInputSection(listSec); createSymbolReference(parentSym, ptrListSym, linkAtOffset, infoCategoryWriter.catBodyInfo.relocTemplate); @@ -933,7 +911,6 @@ void ObjcCategoryMerger::emitAndLinkPointerList( infoCategoryWriter.catPtrListInfo.align); listSec->parent = infoCategoryWriter.catPtrListInfo.outputSection; listSec->live = true; - addInputSection(listSec); listSec->parent = infoCategoryWriter.catPtrListInfo.outputSection; @@ -949,6 +926,7 @@ void ObjcCategoryMerger::emitAndLinkPointerList( ptrListSym->used = true; parentSym->getObjectFile()->symbols.push_back(ptrListSym); + addInputSection(listSec); createSymbolReference(parentSym, ptrListSym, linkAtOffset, infoCategoryWriter.catBodyInfo.relocTemplate); @@ -974,7 +952,6 @@ ObjcCategoryMerger::emitCatListEntrySec(const std::string &forCategoryName, bodyData, 
infoCategoryWriter.catListInfo.align); newCatList->parent = infoCategoryWriter.catListInfo.outputSection; newCatList->live = true; - addInputSection(newCatList); newCatList->parent = infoCategoryWriter.catListInfo.outputSection; @@ -990,6 +967,7 @@ ObjcCategoryMerger::emitCatListEntrySec(const std::string &forCategoryName, catListSym->used = true; objFile->symbols.push_back(catListSym); + addInputSection(newCatList); return catListSym; } @@ -1012,7 +990,6 @@ Defined *ObjcCategoryMerger::emitCategoryBody(const std::string &name, bodyData, infoCategoryWriter.catBodyInfo.align); newBodySec->parent = infoCategoryWriter.catBodyInfo.outputSection; newBodySec->live = true; - addInputSection(newBodySec); std::string symName = objc::symbol_names::category + baseClassName + "(" + name + ")"; @@ -1025,6 +1002,7 @@ Defined *ObjcCategoryMerger::emitCategoryBody(const std::string &name, catBodySym->used = true; objFile->symbols.push_back(catBodySym); + addInputSection(newBodySec); createSymbolReference(catBodySym, nameSym, catLayout.nameOffset, infoCategoryWriter.catBodyInfo.relocTemplate); @@ -1245,7 +1223,6 @@ void ObjcCategoryMerger::generateCatListForNonErasedCategories( infoCategoryWriter.catListInfo.align); listSec->parent = infoCategoryWriter.catListInfo.outputSection; listSec->live = true; - addInputSection(listSec); std::string slotSymName = "<__objc_catlist slot for category "; slotSymName += nonErasedCatBody->getName(); @@ -1260,6 +1237,7 @@ void ObjcCategoryMerger::generateCatListForNonErasedCategories( catListSlotSym->used = true; objFile->symbols.push_back(catListSlotSym); + addInputSection(listSec); // Now link the category body into the newly created slot createSymbolReference(catListSlotSym, nonErasedCatBody, 0, diff --git a/lld/MachO/Relocations.cpp b/lld/MachO/Relocations.cpp index afe7f454e6a230..e8ede19d1fda87 100644 --- a/lld/MachO/Relocations.cpp +++ b/lld/MachO/Relocations.cpp @@ -31,6 +31,31 @@ InputSection *Reloc::getReferentInputSection() const { } } 
+StringRef Reloc::getReferentString() const {
+  if (auto *isec = referent.dyn_cast<InputSection *>()) {
+    const auto *cisec = dyn_cast<CStringInputSection>(isec);
+    assert(cisec && "referent must be a CStringInputSection");
+    return cisec->getStringRefAtOffset(addend);
+  }
+
+  auto *sym = dyn_cast<Defined>(referent.get<Symbol *>());
+  assert(sym && "referent must be a Defined symbol");
+
+  auto *symIsec = sym->isec();
+  auto symOffset = sym->value + addend;
+
+  if (auto *s = dyn_cast_or_null<CStringInputSection>(symIsec))
+    return s->getStringRefAtOffset(symOffset);
+
+  if (isa<ConcatInputSection>(symIsec)) {
+    auto strData = symIsec->data.slice(symOffset);
+    const char *pszData = reinterpret_cast<const char *>(strData.data());
+    return StringRef(pszData, strnlen(pszData, strData.size()));
+  }
+
+  llvm_unreachable("unknown reference section in getReferentString");
+}
+
 bool macho::validateSymbolRelocation(const Symbol *sym,
                                      const InputSection *isec, const Reloc &r) {
   const RelocAttrs &relocAttrs = target->getRelocAttrs(r.type);
diff --git a/lld/MachO/Relocations.h b/lld/MachO/Relocations.h
index 5f161c8fcbfde3..b2f621451349ee 100644
--- a/lld/MachO/Relocations.h
+++ b/lld/MachO/Relocations.h
@@ -69,6 +69,10 @@ struct Reloc {
         addend(addend), referent(referent) {}
 
   InputSection *getReferentInputSection() const;
+
+  // Must point to an offset within a CStringInputSection or a
+  // ConcatInputSection.
+ llvm::StringRef getReferentString() const; }; bool validateSymbolRelocation(const Symbol *, const InputSection *, diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp index 3d77835d117efe..6b4ec4989ca4a1 100644 --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -2010,11 +2010,8 @@ void ObjCMethListSection::setUp() { while (methodNameOff < isec->data.size()) { const Reloc *reloc = isec->getRelocAt(methodNameOff); assert(reloc && "Relocation expected at method list name slot"); - auto *def = dyn_cast_or_null(reloc->referent.get()); - assert(def && "Expected valid Defined at method list name slot"); - auto *cisec = cast(def->isec()); - assert(cisec && "Expected method name to be in a CStringInputSection"); - auto methname = cisec->getStringRefAtOffset(def->value); + + StringRef methname = reloc->getReferentString(); if (!ObjCSelRefsHelper::getSelRef(methname)) ObjCSelRefsHelper::makeSelRef(methname); @@ -2114,19 +2111,23 @@ void ObjCMethListSection::writeRelativeOffsetForIsec( uint32_t &outSecOff, bool useSelRef) const { const Reloc *reloc = isec->getRelocAt(inSecOff); assert(reloc && "Relocation expected at __objc_methlist Offset"); - auto *def = dyn_cast_or_null(reloc->referent.get()); - assert(def && "Expected all syms in __objc_methlist to be defined"); - uint32_t symVA = def->getVA(); + uint32_t symVA = 0; if (useSelRef) { - auto *cisec = cast(def->isec()); - auto methname = cisec->getStringRefAtOffset(def->value); + StringRef methname = reloc->getReferentString(); ConcatInputSection *selRef = ObjCSelRefsHelper::getSelRef(methname); assert(selRef && "Expected all selector names to already be already be " "present in __objc_selrefs"); symVA = selRef->getVA(); assert(selRef->data.size() == sizeof(target->wordSize) && "Expected one selref per ConcatInputSection"); + } else if (reloc->referent.is()) { + auto *def = dyn_cast_or_null(reloc->referent.get()); + assert(def && "Expected all syms in __objc_methlist to 
be defined"); + symVA = def->getVA(); + } else { + auto *isec = reloc->referent.get(); + symVA = isec->getVA(reloc->addend); } uint32_t currentVA = isec->getVA() + outSecOff; diff --git a/lld/test/MachO/objc-category-merging-minimal.s b/lld/test/MachO/objc-category-merging-minimal.s index 527493303c583e..b94799a57a4d85 100644 --- a/lld/test/MachO/objc-category-merging-minimal.s +++ b/lld/test/MachO/objc-category-merging-minimal.s @@ -9,7 +9,7 @@ ## Create our main testing dylib - linking against the fake dylib above # RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o merge_cat_minimal.o merge_cat_minimal.s # RUN: %lld -arch arm64 -dylib -o merge_cat_minimal_no_merge.dylib a64_fakedylib.dylib merge_cat_minimal.o -# RUN: %lld -arch arm64 -dylib -o merge_cat_minimal_merge.dylib -objc_category_merging a64_fakedylib.dylib merge_cat_minimal.o +# RUN: %lld -objc_relative_method_lists -arch arm64 -dylib -o merge_cat_minimal_merge.dylib -objc_category_merging a64_fakedylib.dylib merge_cat_minimal.o ## Now verify that the flag caused category merging to happen appropriatelly # RUN: llvm-objdump --objc-meta-data --macho merge_cat_minimal_no_merge.dylib | FileCheck %s --check-prefixes=NO_MERGE_CATS @@ -17,7 +17,7 @@ ############ Test merging multiple categories into the base class ############ # RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o merge_base_class_minimal.o merge_base_class_minimal.s -# RUN: %lld -arch arm64 -dylib -o merge_base_class_minimal_yes_merge.dylib -objc_category_merging merge_base_class_minimal.o merge_cat_minimal.o +# RUN: %lld -arch arm64 -dylib -objc_relative_method_lists -o merge_base_class_minimal_yes_merge.dylib -objc_category_merging merge_base_class_minimal.o merge_cat_minimal.o # RUN: %lld -arch arm64 -dylib -o merge_base_class_minimal_no_merge.dylib merge_base_class_minimal.o merge_cat_minimal.o # RUN: llvm-objdump --objc-meta-data --macho merge_base_class_minimal_no_merge.dylib | FileCheck %s --check-prefixes=NO_MERGE_INTO_BASE 
@@ -37,14 +37,14 @@ MERGE_CATS-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_Category02 MERGE_CATS: __OBJC_$_CATEGORY_MyBaseClass(Category01|Category02) MERGE_CATS-NEXT: name {{.*}} Category01|Category02 MERGE_CATS: instanceMethods -MERGE_CATS-NEXT: 24 -MERGE_CATS-NEXT: 2 +MERGE_CATS-NEXT: entsize 12 (relative) +MERGE_CATS-NEXT: count 2 MERGE_CATS-NEXT: name {{.*}} cat01_InstanceMethod MERGE_CATS-NEXT: types {{.*}} v16@0:8 -MERGE_CATS-NEXT: imp -[MyBaseClass(Category01) cat01_InstanceMethod] +MERGE_CATS-NEXT: imp {{.*}} -[MyBaseClass(Category01) cat01_InstanceMethod] MERGE_CATS-NEXT: name {{.*}} cat02_InstanceMethod MERGE_CATS-NEXT: types {{.*}} v16@0:8 -MERGE_CATS-NEXT: imp -[MyBaseClass(Category02) cat02_InstanceMethod] +MERGE_CATS-NEXT: imp {{.*}} -[MyBaseClass(Category02) cat02_InstanceMethod] MERGE_CATS-NEXT: classMethods 0x0 MERGE_CATS-NEXT: protocols 0x0 MERGE_CATS-NEXT: instanceProperties 0x0 @@ -69,17 +69,17 @@ YES_MERGE_INTO_BASE-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_Category02 YES_MERGE_INTO_BASE: _OBJC_CLASS_$_MyBaseClass YES_MERGE_INTO_BASE-NEXT: _OBJC_METACLASS_$_MyBaseClass YES_MERGE_INTO_BASE: baseMethods -YES_MERGE_INTO_BASE-NEXT: entsize 24 +YES_MERGE_INTO_BASE-NEXT: entsize 12 (relative) YES_MERGE_INTO_BASE-NEXT: count 3 YES_MERGE_INTO_BASE-NEXT: name {{.*}} cat01_InstanceMethod YES_MERGE_INTO_BASE-NEXT: types {{.*}} v16@0:8 -YES_MERGE_INTO_BASE-NEXT: imp -[MyBaseClass(Category01) cat01_InstanceMethod] +YES_MERGE_INTO_BASE-NEXT: imp {{.*}} -[MyBaseClass(Category01) cat01_InstanceMethod] YES_MERGE_INTO_BASE-NEXT: name {{.*}} cat02_InstanceMethod YES_MERGE_INTO_BASE-NEXT: types {{.*}} v16@0:8 -YES_MERGE_INTO_BASE-NEXT: imp -[MyBaseClass(Category02) cat02_InstanceMethod] +YES_MERGE_INTO_BASE-NEXT: imp {{.*}} -[MyBaseClass(Category02) cat02_InstanceMethod] YES_MERGE_INTO_BASE-NEXT: name {{.*}} baseInstanceMethod YES_MERGE_INTO_BASE-NEXT: types {{.*}} v16@0:8 -YES_MERGE_INTO_BASE-NEXT: imp -[MyBaseClass baseInstanceMethod] +YES_MERGE_INTO_BASE-NEXT: imp {{.*}} 
-[MyBaseClass baseInstanceMethod] #### Check merge swift category into base class ### diff --git a/lld/test/MachO/objc-relative-method-lists-simple-x86.s b/lld/test/MachO/objc-relative-method-lists-simple-x86.s new file mode 100644 index 00000000000000..8ad9c0f5f60f82 --- /dev/null +++ b/lld/test/MachO/objc-relative-method-lists-simple-x86.s @@ -0,0 +1,255 @@ +# REQUIRES: x86 +# UNSUPPORTED: target=arm{{.*}}-unknown-linux-gnueabihf +# RUN: rm -rf %t; split-file %s %t && cd %t + +## Compile rel_dylib.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-macos -o rel_dylib.o simple_class.s + +## Test relative method lists +# RUN: %no-lsystem-lld rel_dylib.o -o rel_dylib.dylib -map rel_dylib.map -dylib -objc_relative_method_lists +# RUN: llvm-objdump --macho --objc-meta-data rel_dylib.dylib | FileCheck %s --check-prefix=CHK_REL + +## Test relative method lists + dead-strip +# RUN: %no-lsystem-lld rel_dylib.o -o rel_dylib.dylib -map rel_dylib.map -dylib -objc_relative_method_lists -dead_strip +# RUN: llvm-objdump --macho --objc-meta-data rel_dylib.dylib | FileCheck %s --check-prefix=CHK_REL + +## Test traditional method lists (no relative offsets) +# RUN: %no-lsystem-lld rel_dylib.o -o rel_dylib.dylib -map rel_dylib.map -dylib -no_objc_relative_method_lists +# RUN: llvm-objdump --macho --objc-meta-data rel_dylib.dylib | FileCheck %s --check-prefix=CHK_NO_REL + + +CHK_REL: Contents of (__DATA_CONST,__objc_classlist) section +CHK_REL-NEXT: _OBJC_CLASS_$_MyClass +CHK_REL: baseMethods +CHK_REL-NEXT: entsize 12 (relative) +CHK_REL-NEXT: count 3 +CHK_REL-NEXT: name 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) instance_method_00 +CHK_REL-NEXT: types 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) v16@0:8 +CHK_REL-NEXT: imp 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) -[MyClass instance_method_00] +CHK_REL-NEXT: name 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) instance_method_01 +CHK_REL-NEXT: types 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) v16@0:8 +CHK_REL-NEXT: imp 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) -[MyClass instance_method_01] 
+CHK_REL-NEXT: name 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) instance_method_02 +CHK_REL-NEXT: types 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) v16@0:8 +CHK_REL-NEXT: imp 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) -[MyClass instance_method_02] + +CHK_REL: Meta Class +CHK_REL-NEXT: isa 0x{{[0-9a-f]*}} _OBJC_METACLASS_$_MyClass +CHK_REL: baseMethods 0x{{[0-9a-f]*}} (struct method_list_t *) +CHK_REL-NEXT: entsize 12 (relative) +CHK_REL-NEXT: count 3 +CHK_REL-NEXT: name 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) class_method_00 +CHK_REL-NEXT: types 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) v16@0:8 +CHK_REL-NEXT: imp 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) +[MyClass class_method_00] +CHK_REL-NEXT: name 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) class_method_01 +CHK_REL-NEXT: types 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) v16@0:8 +CHK_REL-NEXT: imp 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) +[MyClass class_method_01] +CHK_REL-NEXT: name 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) class_method_02 +CHK_REL-NEXT: types 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) v16@0:8 +CHK_REL-NEXT: imp 0x{{[0-9a-f]*}} (0x{{[0-9a-f]*}}) +[MyClass class_method_02] + + +CHK_NO_REL-NOT: (relative) + +CHK_NO_REL: Contents of (__DATA_CONST,__objc_classlist) section +CHK_NO_REL-NEXT: _OBJC_CLASS_$_MyClass + +CHK_NO_REL: baseMethods 0x{{[0-9a-f]*}} (struct method_list_t *) +CHK_NO_REL-NEXT: entsize 24 +CHK_NO_REL-NEXT: count 3 +CHK_NO_REL-NEXT: name 0x{{[0-9a-f]*}} instance_method_00 +CHK_NO_REL-NEXT: types 0x{{[0-9a-f]*}} v16@0:8 +CHK_NO_REL-NEXT: imp -[MyClass instance_method_00] +CHK_NO_REL-NEXT: name 0x{{[0-9a-f]*}} instance_method_01 +CHK_NO_REL-NEXT: types 0x{{[0-9a-f]*}} v16@0:8 +CHK_NO_REL-NEXT: imp -[MyClass instance_method_01] +CHK_NO_REL-NEXT: name 0x{{[0-9a-f]*}} instance_method_02 +CHK_NO_REL-NEXT: types 0x{{[0-9a-f]*}} v16@0:8 +CHK_NO_REL-NEXT: imp -[MyClass instance_method_02] + + +CHK_NO_REL: Meta Class +CHK_NO_REL-NEXT: _OBJC_METACLASS_$_MyClass + +CHK_NO_REL: baseMethods 0x{{[0-9a-f]*}} (struct method_list_t *) +CHK_NO_REL-NEXT: entsize 24 +CHK_NO_REL-NEXT: count 3 
+CHK_NO_REL-NEXT: name 0x{{[0-9a-f]*}} class_method_00 +CHK_NO_REL-NEXT: types 0x{{[0-9a-f]*}} v16@0:8 +CHK_NO_REL-NEXT: imp +[MyClass class_method_00] +CHK_NO_REL-NEXT: name 0x{{[0-9a-f]*}} class_method_01 +CHK_NO_REL-NEXT: types 0x{{[0-9a-f]*}} v16@0:8 +CHK_NO_REL-NEXT: imp +[MyClass class_method_01] +CHK_NO_REL-NEXT: name 0x{{[0-9a-f]*}} class_method_02 +CHK_NO_REL-NEXT: types 0x{{[0-9a-f]*}} v16@0:8 +CHK_NO_REL-NEXT: imp +[MyClass class_method_02] + + +######################## Generate simple_class.s ######################### +# clang -c simple_class.mm -s -o simple_class.s -target x86_64-apple-macos11 -Oz + +######################## simple_class.mm ######################## +# __attribute__((objc_root_class)) +# @interface MyClass +# - (void)instance_method_00; +# - (void)instance_method_01; +# - (void)instance_method_02; +# + (void)class_method_00; +# + (void)class_method_01; +# + (void)class_method_02; +# @end +# +# @implementation MyClass +# - (void)instance_method_00 {} +# - (void)instance_method_01 {} +# - (void)instance_method_02 {} +# + (void)class_method_00 {} +# + (void)class_method_01 {} +# + (void)class_method_02 {} +# @end +# +# void *_objc_empty_cache; +# + +#--- objc-macros.s +.macro .objc_selector_def name + .p2align 2 +"\name": + .cfi_startproc + ret + .cfi_endproc +.endm + +#--- simple_class.s +.include "objc-macros.s" + +.section __TEXT,__text,regular,pure_instructions +.build_version macos, 11, 0 + +.objc_selector_def "-[MyClass instance_method_00]" +.objc_selector_def "-[MyClass instance_method_01]" +.objc_selector_def "-[MyClass instance_method_02]" + +.objc_selector_def "+[MyClass class_method_00]" +.objc_selector_def "+[MyClass class_method_01]" +.objc_selector_def "+[MyClass class_method_02]" + +.section __DATA,__objc_data +.globl _OBJC_CLASS_$_MyClass +.p2align 3, 0x0 +_OBJC_CLASS_$_MyClass: + .quad _OBJC_METACLASS_$_MyClass + .quad 0 + .quad __objc_empty_cache + .quad 0 + .quad __OBJC_CLASS_RO_$_MyClass + + .globl 
_OBJC_METACLASS_$_MyClass + .p2align 3, 0x0 +_OBJC_METACLASS_$_MyClass: + .quad _OBJC_METACLASS_$_MyClass + .quad _OBJC_CLASS_$_MyClass + .quad __objc_empty_cache + .quad 0 + .quad __OBJC_METACLASS_RO_$_MyClass + + .section __TEXT,__objc_classname,cstring_literals +L_OBJC_CLASS_NAME_: + .asciz "MyClass" + + .section __TEXT,__objc_methname,cstring_literals +L_OBJC_METH_VAR_NAME_: + .asciz "class_method_00" + + .section __TEXT,__objc_methtype,cstring_literals +L_OBJC_METH_VAR_TYPE_: + .asciz "v16@0:8" + + .section __TEXT,__objc_methname,cstring_literals +L_OBJC_METH_VAR_NAME_.1: + .asciz "class_method_01" + +L_OBJC_METH_VAR_NAME_.2: + .asciz "class_method_02" + + .section __DATA,__objc_const + .p2align 3, 0x0 +__OBJC_$_CLASS_METHODS_MyClass: + .long 24 + .long 3 + .quad L_OBJC_METH_VAR_NAME_ + .quad L_OBJC_METH_VAR_TYPE_ + .quad "+[MyClass class_method_00]" + .quad L_OBJC_METH_VAR_NAME_.1 + .quad L_OBJC_METH_VAR_TYPE_ + .quad "+[MyClass class_method_01]" + .quad L_OBJC_METH_VAR_NAME_.2 + .quad L_OBJC_METH_VAR_TYPE_ + .quad "+[MyClass class_method_02]" + + .p2align 3, 0x0 +__OBJC_METACLASS_RO_$_MyClass: + .long 3 + .long 40 + .long 40 + .space 4 + .quad 0 + .quad L_OBJC_CLASS_NAME_ + .quad __OBJC_$_CLASS_METHODS_MyClass + .quad 0 + .quad 0 + .quad 0 + .quad 0 + + .section __TEXT,__objc_methname,cstring_literals +L_OBJC_METH_VAR_NAME_.3: + .asciz "instance_method_00" + +L_OBJC_METH_VAR_NAME_.4: + .asciz "instance_method_01" + +L_OBJC_METH_VAR_NAME_.5: + .asciz "instance_method_02" + + .section __DATA,__objc_const + .p2align 3, 0x0 +__OBJC_$_INSTANCE_METHODS_MyClass: + .long 24 + .long 3 + .quad L_OBJC_METH_VAR_NAME_.3 + .quad L_OBJC_METH_VAR_TYPE_ + .quad "-[MyClass instance_method_00]" + .quad L_OBJC_METH_VAR_NAME_.4 + .quad L_OBJC_METH_VAR_TYPE_ + .quad "-[MyClass instance_method_01]" + .quad L_OBJC_METH_VAR_NAME_.5 + .quad L_OBJC_METH_VAR_TYPE_ + .quad "-[MyClass instance_method_02]" + + .p2align 3, 0x0 +__OBJC_CLASS_RO_$_MyClass: + .long 2 + .long 0 + .long 0 + 
.space 4 + .quad 0 + .quad L_OBJC_CLASS_NAME_ + .quad __OBJC_$_INSTANCE_METHODS_MyClass + .quad 0 + .quad 0 + .quad 0 + .quad 0 + + .globl __objc_empty_cache +.zerofill __DATA,__common,__objc_empty_cache,8,3 + .section __DATA,__objc_classlist,regular,no_dead_strip + .p2align 3, 0x0 +l_OBJC_LABEL_CLASS_$: + .quad _OBJC_CLASS_$_MyClass + + .section __DATA,__objc_imageinfo,regular,no_dead_strip +L_OBJC_IMAGE_INFO: + .long 0 + .long 64 + +.subsections_via_symbols diff --git a/lldb/CodeOwners.rst b/lldb/CodeOwners.rst index 52e3e550523e5b..3c10c2a28da9e7 100644 --- a/lldb/CodeOwners.rst +++ b/lldb/CodeOwners.rst @@ -17,7 +17,7 @@ assistance. All parts of LLDB not covered by someone else ---------------------------------------------- | Jonas Devlieghere -| jonas\@devlieghere.com (email), jdevlieghere (Phabricator), jdevlieghere (GitHub), jdevlieghere (Discourse), jdevlieghere (Discord) +| jonas\@devlieghere.com (email), jdevlieghere (GitHub), jdevlieghere (Discourse), jdevlieghere (Discord) Components ---------- @@ -27,100 +27,100 @@ LLDB. 
ABI ~~~ | Jason Molenda -| jmolenda\@apple.com (email), jasonmolenda (Phabricator), jasonmolenda (GitHub), jasonmolenda (Discourse), jasonmolenda (Discord) +| jmolenda\@apple.com (email), jasonmolenda (GitHub), jasonmolenda (Discourse), jasonmolenda (Discord) | David Spickett -| david.spickett\@linaro.org (email), DavidSpickett (Phabricator), DavidSpickett (GitHub), DavidSpickett (Discourse), davidspickett (Discord) +| david.spickett\@linaro.org (email), DavidSpickett (GitHub), DavidSpickett (Discourse), davidspickett (Discord) Breakpoint ~~~~~~~~~~ | Jim Ingham -| jingham\@apple.com (email), jingham (Phabricator), jimingham (GitHub), jingham (Discourse) +| jingham\@apple.com (email), jimingham (GitHub), jingham (Discourse) CMake & Build System ~~~~~~~~~~~~~~~~~~~~ | Jonas Devlieghere -| jonas\@devlieghere.com (email), jdevlieghere (Phabricator), jdevlieghere (GitHub), jdevlieghere (Discourse), jdevlieghere (Discord) +| jonas\@devlieghere.com (email), jdevlieghere (GitHub), jdevlieghere (Discourse), jdevlieghere (Discord) | Alex Langford -| alangford\@apple.com (email), bulbazord (Phabricator), bulbazord (GitHub), bulbazord (Discourse), bulba_zord (Discord) +| alangford\@apple.com (email), bulbazord (GitHub), bulbazord (Discourse), bulba_zord (Discord) Commands ~~~~~~~~ | Jim Ingham -| jingham\@apple.com (email), jingham (Phabricator), jimingham (GitHub), jingham (Discourse) +| jingham\@apple.com (email), jimingham (GitHub), jingham (Discourse) Expression Parser ~~~~~~~~~~~~~~~~~ | Michael Buch -| michaelbuch12\@gmail.com (email), Michael137 (Phabricator), Michael137 (GitHub), Michael137 (Discourse) +| michaelbuch12\@gmail.com (email), Michael137 (GitHub), Michael137 (Discourse) | Jim Ingham -| jingham\@apple.com (email), jingham (Phabricator), jimingham (GitHub), jingham (Discourse) +| jingham\@apple.com (email), jimingham (GitHub), jingham (Discourse) Interpreter ~~~~~~~~~~~ | Jim Ingham -| jingham\@apple.com (email), jingham (Phabricator), jimingham (GitHub), 
jingham (Discourse) +| jingham\@apple.com (email), jimingham (GitHub), jingham (Discourse) | Greg Clayton -| gclayton\@fb.com (email), clayborg (Phabricator), clayborg (GitHub), clayborg (Discourse) +| gclayton\@fb.com (email), clayborg (GitHub), clayborg (Discourse) Lua ~~~ | Jonas Delvieghere -| jonas\@devlieghere.com (email), jdevlieghere (Phabricator), jdevlieghere (GitHub), jdevlieghere (Discourse), jdevlieghere (Discord) +| jonas\@devlieghere.com (email), jdevlieghere (GitHub), jdevlieghere (Discourse), jdevlieghere (Discord) Python ~~~~~~ | Med Ismail Bennani -| ismail\@bennani.ma (email), mib (Phabricator), medismailben (GitHub), mib (Discourse), mib#8727 (Discord) +| ismail\@bennani.ma (email), medismailben (GitHub), mib (Discourse), mib#8727 (Discord) Target/Process Control ~~~~~~~~~~~~~~~~~~~~~~ | Med Ismail Bennani -| ismail\@bennani.ma (email), mib (Phabricator), medismailben (GitHub), mib (Discourse), mib#8727 (Discord) +| ismail\@bennani.ma (email), medismailben (GitHub), mib (Discourse), mib#8727 (Discord) | Jim Ingham -| jingham\@apple.com (email), jingham (Phabricator), jimingham (GitHub), jingham (Discourse) +| jingham\@apple.com (email), jimingham (GitHub), jingham (Discourse) Test Suite ~~~~~~~~~~ | Jonas Devlieghere -| jonas\@devlieghere.com (email), jdevlieghere (Phabricator), jdevlieghere (GitHub), jdevlieghere (Discourse), jdevlieghere (Discord) +| jonas\@devlieghere.com (email), jdevlieghere (GitHub), jdevlieghere (Discourse), jdevlieghere (Discord) | Pavel Labath -| pavel\@labath.sk (email), labath (Phabricator), labath (GitHub), labath (Discourse) +| pavel\@labath.sk (email), labath (GitHub), labath (Discourse) Trace ~~~~~ | Walter Erquinigo -| a20012251\@gmail.com (email), wallace (Phabricator), walter-erquinigo (GitHub), wallace (Discourse), werquinigo (Discord) +| a20012251\@gmail.com (email), walter-erquinigo (GitHub), wallace (Discourse), werquinigo (Discord) Unwinding ~~~~~~~~~ | Jason Molenda -| jmolenda\@apple.com (email), 
jasonmolenda (Phabricator), jasonmolenda (GitHub), jasonmolenda (Discourse), jasonmolenda (Discord) +| jmolenda\@apple.com (email), jasonmolenda (GitHub), jasonmolenda (Discourse), jasonmolenda (Discord) Utility ~~~~~~~ | Jonas Devlieghere -| jonas\@devlieghere.com (email), jdevlieghere (Phabricator), jdevlieghere (GitHub), jdevlieghere (Discourse), jdevlieghere (Discord) +| jonas\@devlieghere.com (email), jdevlieghere (GitHub), jdevlieghere (Discourse), jdevlieghere (Discord) | Pavel Labath -| pavel\@labath.sk (email), labath (Phabricator), labath (GitHub), labath (Discourse) +| pavel\@labath.sk (email), labath (GitHub), labath (Discourse) ValueObject ~~~~~~~~~~~ | Jim Ingham -| jingham\@apple.com (email), jingham (Phabricator), jimingham (GitHub), jingham (Discourse) +| jingham\@apple.com (email), jimingham (GitHub), jingham (Discourse) Watchpoints ~~~~~~~~~~~ | Jason Molenda -| jmolenda\@apple.com (email), jasonmolenda (Phabricator), jasonmolenda (GitHub), jasonmolenda (Discourse), jasonmolenda (Discord) +| jmolenda\@apple.com (email), jasonmolenda (GitHub), jasonmolenda (Discourse), jasonmolenda (Discord) File Formats ------------ @@ -130,54 +130,54 @@ info formats. 
(PE)COFF ~~~~~~~~ | Saleem Abdulrasool -| compnerd\@compnerd.org (email), compnerd (Phabricator), compnerd (GitHub), compnerd (Discourse), compnerd (Discord) +| compnerd\@compnerd.org (email), compnerd (GitHub), compnerd (Discourse), compnerd (Discord) Breakpad ~~~~~~~~ | Zequan Wu -| zequanwu\@google.com (email), zequanwu (Phabricator), ZequanWu (GitHub), ZequanWu (Discourse) +| zequanwu\@google.com (email), ZequanWu (GitHub), ZequanWu (Discourse) | Pavel Labath -| pavel\@labath.sk (email), labath (Phabricator), labath (GitHub), labath (Discourse) +| pavel\@labath.sk (email), labath (GitHub), labath (Discourse) CTF ~~~ | Jonas Devlieghere -| jonas\@devlieghere.com (email), jdevlieghere (Phabricator), jdevlieghere (GitHub), jdevlieghere (Discourse), jdevlieghere (Discord) +| jonas\@devlieghere.com (email), jdevlieghere (GitHub), jdevlieghere (Discourse), jdevlieghere (Discord) DWARF ~~~~~ | Adrian Prantl -| aprantl\@apple.com (email), aprantl (Phabricator), adrian-prantl (GitHub), adrian.prantl (Discourse), adrian.prantl (Discord), Adrian Prantl#4366 (Discourse) +| aprantl\@apple.com (email), adrian-prantl (GitHub), adrian.prantl (Discourse), adrian.prantl (Discord), Adrian Prantl#4366 (Discourse) | Greg Clayton -| gclayton\@fb.com (email), clayborg (Phabricator), clayborg (GitHub), clayborg (Discourse) +| gclayton\@fb.com (email), clayborg (GitHub), clayborg (Discourse) ELF ~~~ | David Spickett -| david.spickett\@linaro.org (email), DavidSpickett (Phabricator), DavidSpickett (GitHub), DavidSpickett (Discourse), davidspickett (Discord) +| david.spickett\@linaro.org (email), DavidSpickett (GitHub), DavidSpickett (Discourse), davidspickett (Discord) | Pavel Labath -| pavel\@labath.sk (email), labath (Phabricator), labath (GitHub), labath (Discourse) +| pavel\@labath.sk (email), labath (GitHub), labath (Discourse) JSON ~~~~ | Jonas Devlieghere -| jonas\@devlieghere.com (email), jdevlieghere (Phabricator), jdevlieghere (GitHub), jdevlieghere (Discourse), jdevlieghere 
(Discord) +| jonas\@devlieghere.com (email), jdevlieghere (GitHub), jdevlieghere (Discourse), jdevlieghere (Discord) MachO ~~~~~ | Greg Clayton -| gclayton\@fb.com (email), clayborg (Phabricator), clayborg (GitHub), clayborg (Discourse) +| gclayton\@fb.com (email), clayborg (GitHub), clayborg (Discourse) | Jason Molenda -| jmolenda\@apple.com (email), jasonmolenda (Phabricator), jasonmolenda (GitHub), jasonmolenda (Discourse), jasonmolenda (Discord) +| jmolenda\@apple.com (email), jasonmolenda (GitHub), jasonmolenda (Discourse), jasonmolenda (Discord) PDB ~~~ | Zequan Wu -| zequanwu\@google.com (email), zequanwu (Phabricator), ZequanWu (GitHub), ZequanWu (Discourse) +| zequanwu\@google.com (email), ZequanWu (GitHub), ZequanWu (Discourse) Platforms --------- @@ -186,36 +186,36 @@ The following people are responsible for decisions involving platforms. Android ~~~~~~~ | Pavel Labath -| pavel\@labath.sk (email), labath (Phabricator), labath (GitHub), labath (Discourse) +| pavel\@labath.sk (email), labath (GitHub), labath (Discourse) Darwin ~~~~~~ | Jim Ingham -| jingham\@apple.com (email), jingham (Phabricator), jimingham (GitHub), jingham (Discourse) +| jingham\@apple.com (email), jimingham (GitHub), jingham (Discourse) | Jason Molenda -| jmolenda\@apple.com (email), jasonmolenda (Phabricator), jasonmolenda (GitHub), jasonmolenda (Discourse), jasonmolenda (Discord) +| jmolenda\@apple.com (email), jasonmolenda (GitHub), jasonmolenda (Discourse), jasonmolenda (Discord) | Jonas Devlieghere -| jonas\@devlieghere.com (email), jdevlieghere (Phabricator), jdevlieghere (GitHub), jdevlieghere (Discourse), jdevlieghere (Discord) +| jonas\@devlieghere.com (email), jdevlieghere (GitHub), jdevlieghere (Discourse), jdevlieghere (Discord) FreeBSD ~~~~~~~ | Ed Maste -| emaste\@freebsd.org (email), emaste (Phabricator), emaste (GitHub), emaste (Discourse), emaste (Discord) +| emaste\@freebsd.org (email), emaste (GitHub), emaste (Discourse), emaste (Discord) Linux ~~~~~ | Pavel Labath 
-| pavel\@labath.sk (email), labath (Phabricator), labath (GitHub), labath (Discourse) +| pavel\@labath.sk (email), labath (GitHub), labath (Discourse) | David Spickett -| david.spickett\@linaro.org (email), DavidSpickett (Phabricator), DavidSpickett (GitHub), DavidSpickett (Discourse), davidspickett (Discord) +| david.spickett\@linaro.org (email), DavidSpickett (GitHub), DavidSpickett (Discourse), davidspickett (Discord) Windows ~~~~~~~ | Omair Javaid -| omair.javaid\@linaro.org (email), omjavaid (Phabricator), omjavaid (GitHub), omjavaid (Discourse), omjavaid#9902 (Discord) +| omair.javaid\@linaro.org (email), omjavaid (GitHub), omjavaid (Discourse), omjavaid#9902 (Discord) Tools ----- @@ -224,23 +224,23 @@ The following people are responsible for decisions involving specific tools. debugserver ~~~~~~~~~~~ | Jason Molenda -| jmolenda\@apple.com (email), jasonmolenda (Phabricator), jasonmolenda (GitHub), jasonmolenda (Discourse), jasonmolenda (Discord) +| jmolenda\@apple.com (email), jasonmolenda (GitHub), jasonmolenda (Discourse), jasonmolenda (Discord) lldb-server ~~~~~~~~~~~ | David Spickett -| david.spickett\@linaro.org (email), DavidSpickett (Phabricator), DavidSpickett (GitHub), DavidSpickett (Discourse), davidspickett (Discord) +| david.spickett\@linaro.org (email), DavidSpickett (GitHub), DavidSpickett (Discourse), davidspickett (Discord) | Pavel Labath -| pavel\@labath.sk (email), labath (Phabricator), labath (GitHub), labath (Discourse) +| pavel\@labath.sk (email), labath (GitHub), labath (Discourse) lldb-dap ~~~~~~~~ | Greg Clayton -| gclayton\@fb.com (email), clayborg (Phabricator), clayborg (GitHub), clayborg (Discourse) +| gclayton\@fb.com (email), clayborg (GitHub), clayborg (Discourse) | Walter Erquinigo -| a20012251\@gmail.com (email), wallace (Phabricator), walter-erquinigo (GitHub), wallace (Discourse), werquinigo (Discord) +| a20012251\@gmail.com (email), walter-erquinigo (GitHub), wallace (Discourse), werquinigo (Discord) Former Code Owners 
================== diff --git a/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm b/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm index f96e2cf80c5fac..d27bd1b7426e6c 100644 --- a/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm +++ b/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm @@ -124,6 +124,12 @@ static void ParseOSVersion(llvm::VersionTuple &version, NSString *Key) { return g_program_filespec; } +/// Resolve the given candidate support dir and return true if it's valid. +static bool ResolveAndVerifyCandidateSupportDir(FileSpec &path) { + FileSystem::Instance().Resolve(path); + return FileSystem::Instance().IsDirectory(path); +} + bool HostInfoMacOSX::ComputeSupportExeDirectory(FileSpec &file_spec) { FileSpec lldb_file_spec = GetShlibDir(); if (!lldb_file_spec) @@ -144,16 +150,24 @@ static void ParseOSVersion(llvm::VersionTuple &version, NSString *Key) { #endif } else { // Find the bin path relative to the lib path where the cmake-based - // OS X .dylib lives. This is not going to work if the bin and lib - // dir are not both in the same dir. + // OS X .dylib lives. We try looking first at a possible sibling `bin` + // directory, and then at the `lib` directory itself. This last case is + // useful for supporting build systems like Bazel which in many cases prefer + // to place support binaries right next to dylibs. // - // It is not going to work to do it by the executable path either, + // It is not going to work to do it by the executable path, // as in the case of a python script, the executable is python, not // the lldb driver. 
- raw_path.append("/../bin"); - FileSpec support_dir_spec(raw_path); - FileSystem::Instance().Resolve(support_dir_spec); - if (!FileSystem::Instance().IsDirectory(support_dir_spec)) { + FileSpec support_dir_spec_lib(raw_path); + FileSpec support_dir_spec_bin = + support_dir_spec_lib.CopyByAppendingPathComponent("/../bin"); + FileSpec support_dir_spec; + + if (ResolveAndVerifyCandidateSupportDir(support_dir_spec_bin)) { + support_dir_spec = support_dir_spec_bin; + } else if (ResolveAndVerifyCandidateSupportDir(support_dir_spec_lib)) { + support_dir_spec = support_dir_spec_lib; + } else { Log *log = GetLog(LLDBLog::Host); LLDB_LOG(log, "failed to find support directory"); return false; diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 0ee4d7b444cfcf..5e5e9b9e8a93b1 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -16131,6 +16131,96 @@ The returned value is completely identical to the input except for the sign bit; in particular, if the input is a NaN, then the quiet/signaling bit and payload are perfectly preserved. +.. _i_fminmax_family: + +'``llvm.min.*``' Intrinsics Comparation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Standard: +""""""""" + +IEEE754 and ISO C define some min/max operations, and they have some differences +on working with qNaN/sNaN and +0.0/-0.0. Here is the list: + +.. 
list-table:: + :header-rows: 2 + + * - ``ISO C`` + - fmin/fmax + - fminimum/fmaximum + - fminimum_num/fmaximum_num + + * - ``IEEE754`` + - minNum/maxNum (2008) + - minimum/maximum (2019) + - minimumNumber/maximumNumber (2019) + + * - ``+0.0 vs -0.0`` + - either one + - +0.0 > -0.0 + - +0.0 > -0.0 + + * - ``NUM vs sNaN`` + - qNaN, invalid exception + - qNaN, invalid exception + - NUM, invalid exception + + * - ``qNaN vs sNaN`` + - qNaN, invalid exception + - qNaN, invalid exception + - qNaN, invalid exception + + * - ``NUM vs qNaN`` + - NUM, no exception + - qNaN, no exception + - NUM, no exception + +LLVM Implementation: +"""""""""""""""""""" + +LLVM implements all ISO C flavors as listed in this table, except in the +default floating-point environment exceptions are ignored. The constrained +versions of the intrinsics respect the exception behavior. + +.. list-table:: + :header-rows: 1 + :widths: 16 28 28 28 + + * - Operation + - minnum/maxnum + - minimum/maximum + - minimumnum/maximumnum + + * - ``NUM vs qNaN`` + - NUM, no exception + - qNaN, no exception + - NUM, no exception + + * - ``NUM vs sNaN`` + - qNaN, invalid exception + - qNaN, invalid exception + - NUM, invalid exception + + * - ``qNaN vs sNaN`` + - qNaN, invalid exception + - qNaN, invalid exception + - qNaN, invalid exception + + * - ``sNaN vs sNaN`` + - qNaN, invalid exception + - qNaN, invalid exception + - qNaN, invalid exception + + * - ``+0.0 vs -0.0`` + - either one + - +0.0(max)/-0.0(min) + - +0.0(max)/-0.0(min) + + * - ``NUM vs NUM`` + - larger(max)/smaller(min) + - larger(max)/smaller(min) + - larger(max)/smaller(min) + .. _i_minnum: '``llvm.minnum.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16312,6 +16402,98 @@ of the two arguments. -0.0 is considered to be less than +0.0 for this intrinsic. Note that these are the semantics specified in the draft of IEEE 754-2019. +.. _i_minimumnum: + +'``llvm.minimumnum.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +This is an overloaded intrinsic.
You can use ``llvm.minimumnum`` on any +floating-point or vector of floating-point type. Not all targets support +all types however. + +:: + + declare float @llvm.minimumnum.f32(float %Val0, float %Val1) + declare double @llvm.minimumnum.f64(double %Val0, double %Val1) + declare x86_fp80 @llvm.minimumnum.f80(x86_fp80 %Val0, x86_fp80 %Val1) + declare fp128 @llvm.minimumnum.f128(fp128 %Val0, fp128 %Val1) + declare ppc_fp128 @llvm.minimumnum.ppcf128(ppc_fp128 %Val0, ppc_fp128 %Val1) + +Overview: +""""""""" + +The '``llvm.minimumnum.*``' intrinsics return the minimum of the two +arguments, not propagating NaNs and treating -0.0 as less than +0.0. + + +Arguments: +"""""""""" + +The arguments and return value are floating-point numbers of the same +type. + +Semantics: +"""""""""" +If both operands are NaNs (including sNaN), returns qNaN. If one operand +is NaN (including sNaN) and another operand is a number, return the number. +Otherwise returns the lesser of the two arguments. -0.0 is considered to +be less than +0.0 for this intrinsic. + +Note that these are the semantics of minimumNumber specified in IEEE 754-2019. + +It has some differences with '``llvm.minnum.*``': +1)'``llvm.minnum.*``' will return qNaN if either operand is sNaN. +2)'``llvm.minnum*``' may return either one if we compare +0.0 vs -0.0. + +.. _i_maximumnum: + +'``llvm.maximumnum.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +This is an overloaded intrinsic. You can use ``llvm.maximumnum`` on any +floating-point or vector of floating-point type. Not all targets support +all types however. 
+ +:: + + declare float @llvm.maximumnum.f32(float %Val0, float %Val1) + declare double @llvm.maximumnum.f64(double %Val0, double %Val1) + declare x86_fp80 @llvm.maximumnum.f80(x86_fp80 %Val0, x86_fp80 %Val1) + declare fp128 @llvm.maximumnum.f128(fp128 %Val0, fp128 %Val1) + declare ppc_fp128 @llvm.maximumnum.ppcf128(ppc_fp128 %Val0, ppc_fp128 %Val1) + +Overview: +""""""""" + +The '``llvm.maximumnum.*``' intrinsics return the maximum of the two +arguments, not propagating NaNs and treating -0.0 as less than +0.0. + + +Arguments: +"""""""""" + +The arguments and return value are floating-point numbers of the same +type. + +Semantics: +"""""""""" +If both operands are NaNs (including sNaN), returns qNaN. If one operand +is NaN (including sNaN) and another operand is a number, return the number. +Otherwise returns the greater of the two arguments. -0.0 is considered to +be less than +0.0 for this intrinsic. + +Note that these are the semantics of maximumNumber specified in IEEE 754-2019. + +It has some differences with '``llvm.maxnum.*``': +1)'``llvm.maxnum.*``' will return qNaN if either operand is sNaN. +2)'``llvm.maxnum*``' may return either one if we compare +0.0 vs -0.0. + .. _int_copysign: '``llvm.copysign.*``' Intrinsic diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index b2839b4348336a..872dedf8a82def 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -287,6 +287,77 @@ The ``@llvm.nvvm.fence.proxy.tensormap_generic.*`` is a uni-directional fence us The address operand ``addr`` and the operand ``size`` together specify the memory range ``[addr, addr+size)`` on which the ordering guarantees on the memory accesses across the proxies is to be provided. The only supported value for the ``size`` operand is ``128`` and must be an immediate. Generic Addressing is used unconditionally, and the address specified by the operand addr must fall within the ``.global`` state space. Otherwise, the behavior is undefined. 
For more information, see `PTX ISA `_. +Arithmetic Intrinsics +--------------------- + +'``llvm.nvvm.idp2a.[us].[us]``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare i32 @llvm.nvvm.idp2a.s.s(i32 %a, i32 %b, i1 immarg %is.hi, i32 %c) + declare i32 @llvm.nvvm.idp2a.s.u(i32 %a, i32 %b, i1 immarg %is.hi, i32 %c) + declare i32 @llvm.nvvm.idp2a.u.s(i32 %a, i32 %b, i1 immarg %is.hi, i32 %c) + declare i32 @llvm.nvvm.idp2a.u.u(i32 %a, i32 %b, i1 immarg %is.hi, i32 %c) + + +Overview: +""""""""" + +The '``llvm.nvvm.idp2a.[us].[us]``' intrinsics perform a 2-element vector dot +product followed by addition. They correspond directly to the ``dp2a`` PTX +instruction. + +Semantics: +"""""""""" + +The 32-bit value in ``%a`` is broken into 2 16-bit values which are extended to +32 bits. For the '``llvm.nvvm.idp2a.u.[us]``' variants zero-extension is used, +while for the '``llvm.nvvm.idp2a.s.[us]``' sign-extension is used. Two bytes are +selected from ``%b``: if ``%is.hi`` is true, the most significant bytes are +selected; otherwise the least significant bytes are selected. These bytes are +then extended to 32-bits. For the '``llvm.nvvm.idp2a.[us].u``' variants +zero-extension is used, while for the '``llvm.nvvm.idp2a.[us].s``' +sign-extension is used. The dot product of these 2-element vectors is added to +``%c`` to produce the return. + + +'``llvm.nvvm.idp4a.[us].[us]``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare i32 @llvm.nvvm.idp4a.s.s(i32 %a, i32 %b, i32 %c) + declare i32 @llvm.nvvm.idp4a.s.u(i32 %a, i32 %b, i32 %c) + declare i32 @llvm.nvvm.idp4a.u.s(i32 %a, i32 %b, i32 %c) + declare i32 @llvm.nvvm.idp4a.u.u(i32 %a, i32 %b, i32 %c) + +Overview: +""""""""" + +The '``llvm.nvvm.idp4a.[us].[us]``' intrinsics perform a 4-element vector dot +product followed by addition. They correspond directly to the ``dp4a`` PTX +instruction.
+ +Semantics: +"""""""""" + +Each of the 4 bytes in both ``%a`` and ``%b`` are extended to 32-bit integers +forming 2 ``<4 x i32>``. For ``%a``, zero-extension is used in the +'``llvm.nvvm.idp4a.u.[us]``' variants, while sign-extension is used with +'``llvm.nvvm.idp4a.s.[us]``' variants. Similarly, for ``%b``, zero-extension is +used in the '``llvm.nvvm.idp4a.[us].u``' variants, while sign-extension is used +with '``llvm.nvvm.idp4a.[us].s``' variants. The dot product of these 4-element +vectors is added to ``%c`` to produce the return. + + + Other Intrinsics ---------------- diff --git a/llvm/docs/ProgrammersManual.rst b/llvm/docs/ProgrammersManual.rst index 231de56ef4cfee..41d1388e5bf7e9 100644 --- a/llvm/docs/ProgrammersManual.rst +++ b/llvm/docs/ProgrammersManual.rst @@ -164,20 +164,20 @@ rarely have to include this file directly). efficient to use the ``InstVisitor`` class to dispatch over the instruction type directly. -``isa_and_nonnull<>``: - The ``isa_and_nonnull<>`` operator works just like the ``isa<>`` operator, +``isa_and_present<>``: + The ``isa_and_present<>`` operator works just like the ``isa<>`` operator, except that it allows for a null pointer as an argument (which it then returns false). This can sometimes be useful, allowing you to combine several null checks into one. -``cast_or_null<>``: - The ``cast_or_null<>`` operator works just like the ``cast<>`` operator, +``cast_if_present<>``: + The ``cast_if_present<>`` operator works just like the ``cast<>`` operator, except that it allows for a null pointer as an argument (which it then propagates). This can sometimes be useful, allowing you to combine several null checks into one. -``dyn_cast_or_null<>``: - The ``dyn_cast_or_null<>`` operator works just like the ``dyn_cast<>`` +``dyn_cast_if_present<>``: + The ``dyn_cast_if_present<>`` operator works just like the ``dyn_cast<>`` operator, except that it allows for a null pointer as an argument (which it then propagates). 
This can sometimes be useful, allowing you to combine several null checks into one. diff --git a/llvm/include/llvm/Analysis/CtxProfAnalysis.h b/llvm/include/llvm/Analysis/CtxProfAnalysis.h index d77c81d03582e1..f0e2aeb0f92f74 100644 --- a/llvm/include/llvm/Analysis/CtxProfAnalysis.h +++ b/llvm/include/llvm/Analysis/CtxProfAnalysis.h @@ -9,10 +9,10 @@ #ifndef LLVM_ANALYSIS_CTXPROFANALYSIS_H #define LLVM_ANALYSIS_CTXPROFANALYSIS_H +#include "llvm/ADT/DenseMap.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/PassManager.h" #include "llvm/ProfileData/PGOCtxProfReader.h" -#include namespace llvm { @@ -20,12 +20,28 @@ class CtxProfAnalysis; /// The instrumented contextual profile, produced by the CtxProfAnalysis. class PGOContextualProfile { + friend class CtxProfAnalysis; + friend class CtxProfAnalysisPrinterPass; + struct FunctionInfo { + uint32_t NextCounterIndex = 0; + uint32_t NextCallsiteIndex = 0; + const std::string Name; + + FunctionInfo(StringRef Name) : Name(Name) {} + }; std::optional Profiles; + // For the GUIDs in this module, associate metadata about each function which + // we'll need when we maintain the profiles during IPO transformations. + DenseMap FuncInfo; -public: - explicit PGOContextualProfile(PGOCtxProfContext::CallTargetMapTy &&Profiles) - : Profiles(std::move(Profiles)) {} + /// Get the GUID of this Function if it's defined in this module. + GlobalValue::GUID getDefinedFunctionGUID(const Function &F) const; + + // This is meant to be constructed from CtxProfAnalysis, which will also set + // its state piecemeal. 
PGOContextualProfile() = default; + +public: PGOContextualProfile(const PGOContextualProfile &) = delete; PGOContextualProfile(PGOContextualProfile &&) = default; @@ -35,6 +51,20 @@ class PGOContextualProfile { return *Profiles; } + bool isFunctionKnown(const Function &F) const { + return getDefinedFunctionGUID(F) != 0; + } + + uint32_t allocateNextCounterIndex(const Function &F) { + assert(isFunctionKnown(F)); + return FuncInfo.find(getDefinedFunctionGUID(F))->second.NextCounterIndex++; + } + + uint32_t allocateNextCallsiteIndex(const Function &F) { + assert(isFunctionKnown(F)); + return FuncInfo.find(getDefinedFunctionGUID(F))->second.NextCallsiteIndex++; + } + bool invalidate(Module &, const PreservedAnalyses &PA, ModuleAnalysisManager::Invalidator &) { // Check whether the analysis has been explicitly invalidated. Otherwise, @@ -66,5 +96,27 @@ class CtxProfAnalysisPrinterPass PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); static bool isRequired() { return true; } }; + +/// Assign a GUID to functions as metadata. GUID calculation takes linkage into +/// account, which may change especially through and after thinlto. By +/// pre-computing and assigning as metadata, this mechanism is resilient to such +/// changes (as well as name changes e.g. suffix ".llvm." additions). + +// FIXME(mtrofin): we can generalize this mechanism to calculate a GUID early in +// the pass pipeline, associate it with any Global Value, and then use it for +// PGO and ThinLTO. +// At that point, this should be moved elsewhere. +class AssignGUIDPass : public PassInfoMixin { +public: + explicit AssignGUIDPass() = default; + + /// Assign a GUID *if* one is not already assign, as a function metadata named + /// `GUIDMetadataName`. 
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); + static const char *GUIDMetadataName; + // This should become GlobalValue::getGUID + static uint64_t getGUID(const Function &F); +}; + } // namespace llvm #endif // LLVM_ANALYSIS_CTXPROFANALYSIS_H diff --git a/llvm/include/llvm/Analysis/DXILResource.h b/llvm/include/llvm/Analysis/DXILResource.h index ed9fade3f14f31..3ba0ae5de61d51 100644 --- a/llvm/include/llvm/Analysis/DXILResource.h +++ b/llvm/include/llvm/Analysis/DXILResource.h @@ -9,24 +9,29 @@ #ifndef LLVM_ANALYSIS_DXILRESOURCE_H #define LLVM_ANALYSIS_DXILRESOURCE_H +#include "llvm/ADT/MapVector.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/Value.h" +#include "llvm/Pass.h" #include "llvm/Support/DXILABI.h" namespace llvm { +class CallInst; class MDTuple; +class TargetExtType; namespace dxil { class ResourceInfo { struct ResourceBinding { - uint32_t UniqueID; + uint32_t RecordID; uint32_t Space; uint32_t LowerBound; uint32_t Size; bool operator==(const ResourceBinding &RHS) const { - return std::tie(UniqueID, Space, LowerBound, Size) == - std::tie(RHS.UniqueID, RHS.Space, RHS.LowerBound, RHS.Size); + return std::tie(RecordID, Space, LowerBound, Size) == + std::tie(RHS.RecordID, RHS.Space, RHS.LowerBound, RHS.Size); } bool operator!=(const ResourceBinding &RHS) const { return !(*this == RHS); @@ -124,9 +129,9 @@ class ResourceInfo { bool isFeedback() const; bool isMultiSample() const; - void bind(uint32_t UniqueID, uint32_t Space, uint32_t LowerBound, + void bind(uint32_t RecordID, uint32_t Space, uint32_t LowerBound, uint32_t Size) { - Binding.UniqueID = UniqueID; + Binding.RecordID = RecordID; Binding.Space = Space; Binding.LowerBound = LowerBound; Binding.Size = Size; @@ -211,9 +216,60 @@ class ResourceInfo { ResourceBinding getBinding() const { return Binding; } std::pair getAnnotateProps() const; + + void print(raw_ostream &OS) const; }; } // namespace dxil + +using DXILResourceMap = MapVector; + +class DXILResourceAnalysis : public 
AnalysisInfoMixin { + friend AnalysisInfoMixin; + + static AnalysisKey Key; + +public: + using Result = DXILResourceMap; + + /// Gather resource info for the module \c M. + DXILResourceMap run(Module &M, ModuleAnalysisManager &AM); +}; + +/// Printer pass for the \c DXILResourceAnalysis results. +class DXILResourcePrinterPass : public PassInfoMixin { + raw_ostream &OS; + +public: + explicit DXILResourcePrinterPass(raw_ostream &OS) : OS(OS) {} + + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + + static bool isRequired() { return true; } +}; + +class DXILResourceWrapperPass : public ModulePass { + std::unique_ptr ResourceMap; + +public: + static char ID; // Class identification, replacement for typeinfo + + DXILResourceWrapperPass(); + ~DXILResourceWrapperPass() override; + + const DXILResourceMap &getResourceMap() const { return *ResourceMap; } + DXILResourceMap &getResourceMap() { return *ResourceMap; } + + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnModule(Module &M) override; + void releaseMemory() override; + + void print(raw_ostream &OS, const Module *M) const override; + void dump() const; +}; + +ModulePass *createDXILResourceWrapperPassPass(); + } // namespace llvm #endif // LLVM_ANALYSIS_DXILRESOURCE_H diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.def b/llvm/include/llvm/Analysis/TargetLibraryInfo.def index 7be5bb04549c61..e1cb1e5c557eae 100644 --- a/llvm/include/llvm/Analysis/TargetLibraryInfo.def +++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.def @@ -1388,6 +1388,39 @@ TLI_DEFINE_ENUM_INTERNAL(fminl) TLI_DEFINE_STRING_INTERNAL("fminl") TLI_DEFINE_SIG_INTERNAL(Floating, Same, Same) +// Calls to fmaximum_num and fminimum_num library functions expand to the llvm.maximumnum and +// llvm.minimumnum intrinsics with the correct parameter types for the arguments +// (all types must match). 
+/// double fmaximum_num(double x, double y); +TLI_DEFINE_ENUM_INTERNAL(fmaximum_num) +TLI_DEFINE_STRING_INTERNAL("fmaximum_num") +TLI_DEFINE_SIG_INTERNAL(Floating, Same, Same) + +/// float fmaximum_numf(float x, float y); +TLI_DEFINE_ENUM_INTERNAL(fmaximum_numf) +TLI_DEFINE_STRING_INTERNAL("fmaximum_numf") +TLI_DEFINE_SIG_INTERNAL(Floating, Same, Same) + +/// long double fmaximum_numl(long double x, long double y); +TLI_DEFINE_ENUM_INTERNAL(fmaximum_numl) +TLI_DEFINE_STRING_INTERNAL("fmaximum_numl") +TLI_DEFINE_SIG_INTERNAL(Floating, Same, Same) + +/// double fminimum_num(double x, double y); +TLI_DEFINE_ENUM_INTERNAL(fminimum_num) +TLI_DEFINE_STRING_INTERNAL("fminimum_num") +TLI_DEFINE_SIG_INTERNAL(Floating, Same, Same) + +/// float fminimum_numf(float x, float y); +TLI_DEFINE_ENUM_INTERNAL(fminimum_numf) +TLI_DEFINE_STRING_INTERNAL("fminimum_numf") +TLI_DEFINE_SIG_INTERNAL(Floating, Same, Same) + +/// long double fminimum_numl(long double x, long double y); +TLI_DEFINE_ENUM_INTERNAL(fminimum_numl) +TLI_DEFINE_STRING_INTERNAL("fminimum_numl") +TLI_DEFINE_SIG_INTERNAL(Floating, Same, Same) + /// double fmod(double x, double y); TLI_DEFINE_ENUM_INTERNAL(fmod) TLI_DEFINE_STRING_INTERNAL("fmod") diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 279cfb5aa47d6f..77ddc10e8a0e76 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -2037,6 +2037,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { case Intrinsic::maximum: ISD = ISD::FMAXIMUM; break; + case Intrinsic::minimumnum: + ISD = ISD::FMINIMUMNUM; + break; + case Intrinsic::maximumnum: + ISD = ISD::FMAXIMUMNUM; + break; case Intrinsic::copysign: ISD = ISD::FCOPYSIGN; break; diff --git a/llvm/include/llvm/CodeGen/GlobalMerge.h b/llvm/include/llvm/CodeGen/GlobalMerge.h index 13ad67d4544bc7..1577bcf8903f52 100644 --- a/llvm/include/llvm/CodeGen/GlobalMerge.h +++ 
b/llvm/include/llvm/CodeGen/GlobalMerge.h @@ -28,6 +28,8 @@ struct GlobalMergeOptions { bool MergeConst = false; /// Whether we should merge global variables that have external linkage. bool MergeExternal = true; + /// Whether we should merge constant global variables. + bool MergeConstantGlobals = false; /// Whether we should try to optimize for size only. /// Currently, this applies a dead simple heuristic: only consider globals /// used in minsize functions for merging. diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 7305e3086fcd65..b8f8818a749528 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1047,6 +1047,11 @@ enum NodeType { FMINIMUM, FMAXIMUM, + /// FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with + /// FMINNUM_IEEE and FMAXNUM_IEEE besides if either operand is sNaN. + FMINIMUMNUM, + FMAXIMUMNUM, + /// FSINCOS - Compute both fsin and fcos as a single operation. FSINCOS, diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index 20273d069bf053..c7c2178571215b 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -476,7 +476,8 @@ namespace llvm { /// Pass *createGlobalMergePass(const TargetMachine *TM, unsigned MaximalOffset, bool OnlyOptimizeForSize = false, - bool MergeExternalByDefault = false); + bool MergeExternalByDefault = false, + bool MergeConstantByDefault = false); /// This pass splits the stack into a safe stack and an unsafe stack to /// protect against stack-based overflow vulnerabilities. 
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h index 96ece1559bc437..88ddd43a2a8913 100644 --- a/llvm/include/llvm/CodeGen/SDPatternMatch.h +++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h @@ -737,6 +737,14 @@ template inline UnaryOpc_match m_VScale(const Opnd &Op) { return UnaryOpc_match(ISD::VSCALE, Op); } +template inline UnaryOpc_match m_FPToUI(const Opnd &Op) { + return UnaryOpc_match(ISD::FP_TO_UINT, Op); +} + +template inline UnaryOpc_match m_FPToSI(const Opnd &Op) { + return UnaryOpc_match(ISD::FP_TO_SINT, Op); +} + // === Constants === struct ConstantInt_match { APInt *BindVal; diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index deb1d04df3400c..eda38cd8a564d6 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2908,6 +2908,8 @@ class TargetLoweringBase { case ISD::FMAXNUM_IEEE: case ISD::FMINIMUM: case ISD::FMAXIMUM: + case ISD::FMINIMUMNUM: + case ISD::FMAXIMUMNUM: case ISD::AVGFLOORS: case ISD::AVGFLOORU: case ISD::AVGCEILS: @@ -5283,6 +5285,9 @@ class TargetLowering : public TargetLoweringBase { /// Expand fminimum/fmaximum into multiple comparison with selects. SDValue expandFMINIMUM_FMAXIMUM(SDNode *N, SelectionDAG &DAG) const; + /// Expand fminimumnum/fmaximumnum into multiple comparison with selects. + SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const; + /// Expand FP_TO_[US]INT_SAT into FP_TO_[US]INT and selects or min/max. 
/// \param N Node to expand /// \returns The expansion result diff --git a/llvm/include/llvm/IR/DataLayout.h b/llvm/include/llvm/IR/DataLayout.h index 795cd05ea5b5e2..1185939cd9c75b 100644 --- a/llvm/include/llvm/IR/DataLayout.h +++ b/llvm/include/llvm/IR/DataLayout.h @@ -49,51 +49,11 @@ class StructLayout; class Triple; class Value; -/// Enum used to categorize the alignment types stored by LayoutAlignElem -enum AlignTypeEnum { - INTEGER_ALIGN = 'i', - VECTOR_ALIGN = 'v', - FLOAT_ALIGN = 'f', - AGGREGATE_ALIGN = 'a' -}; - // FIXME: Currently the DataLayout string carries a "preferred alignment" // for types. As the DataLayout is module/global, this should likely be // sunk down to an FTTI element that is queried rather than a global // preference. -/// Layout alignment element. -/// -/// Stores the alignment data associated with a given type bit width. -struct LayoutAlignElem { - uint32_t TypeBitWidth; - Align ABIAlign; - Align PrefAlign; - - static LayoutAlignElem get(Align ABIAlign, Align PrefAlign, - uint32_t BitWidth); - - bool operator==(const LayoutAlignElem &rhs) const; -}; - -/// Layout pointer alignment element. -/// -/// Stores the alignment data associated with a given pointer and address space. -struct PointerAlignElem { - uint32_t AddressSpace; - uint32_t TypeBitWidth; - Align ABIAlign; - Align PrefAlign; - uint32_t IndexBitWidth; - - /// Initializer - static PointerAlignElem getInBits(uint32_t AddressSpace, Align ABIAlign, - Align PrefAlign, uint32_t TypeBitWidth, - uint32_t IndexBitWidth); - - bool operator==(const PointerAlignElem &rhs) const; -}; - /// A parsed version of the target data layout string in and methods for /// querying it. /// @@ -102,6 +62,26 @@ struct PointerAlignElem { /// target being codegen'd to. class DataLayout { public: + /// Primitive type specification. 
+ struct PrimitiveSpec { + uint32_t BitWidth; + Align ABIAlign; + Align PrefAlign; + + bool operator==(const PrimitiveSpec &Other) const; + }; + + /// Pointer type specification. + struct PointerSpec { + uint32_t AddrSpace; + uint32_t BitWidth; + Align ABIAlign; + Align PrefAlign; + uint32_t IndexBitWidth; + + bool operator==(const PointerSpec &Other) const; + }; + enum class FunctionPtrAlignType { /// The function pointer alignment is independent of the function alignment. Independent, @@ -135,20 +115,26 @@ class DataLayout { // FIXME: `unsigned char` truncates the value parsed by `parseSpecifier`. SmallVector LegalIntWidths; - // Primitive type specifications. Sorted and uniqued by type bit width. - SmallVector IntAlignments; - SmallVector FloatAlignments; - SmallVector VectorAlignments; + /// Type specifier used by some internal functions. + enum class TypeSpecifier { + Integer = 'i', + Float = 'f', + Vector = 'v', + Aggregate = 'a' + }; - // Pointer type specifications. Sorted and uniqued by address space number. - SmallVector Pointers; + /// Primitive type specifications. Sorted and uniqued by type bit width. + SmallVector IntSpecs; + SmallVector FloatSpecs; + SmallVector VectorSpecs; + + /// Pointer type specifications. Sorted and uniqued by address space number. + SmallVector PointerSpecs; /// The string representation used to create this DataLayout std::string StringRepresentation; - const PointerAlignElem &getPointerAlignElem(uint32_t AddressSpace) const; - - // Struct type ABI and preferred alignments. The default spec is "a:8:64". + /// Struct type ABI and preferred alignments. The default spec is "a:8:64". Align StructABIAlignment = Align::Constant<1>(); Align StructPrefAlignment = Align::Constant<8>(); @@ -159,16 +145,19 @@ class DataLayout { /// well-defined bitwise representation. SmallVector NonIntegralAddressSpaces; - /// Attempts to set the alignment of the given type. Returns an error - /// description on failure. 
- Error setAlignment(AlignTypeEnum AlignType, Align ABIAlign, Align PrefAlign, - uint32_t BitWidth); + /// Attempts to set the specification for the given type. + /// Returns an error description on failure. + Error setPrimitiveSpec(TypeSpecifier Specifier, uint32_t BitWidth, + Align ABIAlign, Align PrefAlign); + + /// Searches for a pointer specification that matches the given address space. + /// Returns the default address space specification if not found. + const PointerSpec &getPointerSpec(uint32_t AddrSpace) const; - /// Attempts to set the alignment of a pointer in the given address space. + /// Attempts to set the specification for pointer in the given address space. /// Returns an error description on failure. - Error setPointerAlignmentInBits(uint32_t AddrSpace, Align ABIAlign, - Align PrefAlign, uint32_t TypeBitWidth, - uint32_t IndexBitWidth); + Error setPointerSpec(uint32_t AddrSpace, uint32_t BitWidth, Align ABIAlign, + Align PrefAlign, uint32_t IndexBitWidth); /// Internal helper to get alignment for integer of given bitwidth. Align getIntegerAlignment(uint32_t BitWidth, bool abi_or_pref) const; @@ -375,7 +364,7 @@ class DataLayout { /// FIXME: The defaults need to be removed once all of /// the backends/clients are updated. unsigned getPointerSizeInBits(unsigned AS = 0) const { - return getPointerAlignElem(AS).TypeBitWidth; + return getPointerSpec(AS).BitWidth; } /// Returns the maximum index size over all address spaces. @@ -385,7 +374,7 @@ class DataLayout { /// Size in bits of index used for address calculation in getelementptr. unsigned getIndexSizeInBits(unsigned AS) const { - return getPointerAlignElem(AS).IndexBitWidth; + return getPointerSpec(AS).IndexBitWidth; } /// Layout pointer size, in bits, based on the type. 
If this function is diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index 3f3d75012c6945..0dbcbc0b2cb76f 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -1015,6 +1015,18 @@ class IRBuilderBase { return CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS, nullptr, Name); } + /// Create call to the minimumnum intrinsic. + Value *CreateMinimumNum(Value *LHS, Value *RHS, const Twine &Name = "") { + return CreateBinaryIntrinsic(Intrinsic::minimumnum, LHS, RHS, nullptr, + Name); + } + + /// Create call to the maximumnum intrinsic. + Value *CreateMaximumNum(Value *LHS, Value *RHS, const Twine &Name = "") { + return CreateBinaryIntrinsic(Intrinsic::maximumnum, LHS, RHS, nullptr, + Name); + } + /// Create call to the copysign intrinsic. Value *CreateCopySign(Value *LHS, Value *RHS, Instruction *FMFSource = nullptr, diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index 94c8fa092f45e6..2f1e2c08c3ecec 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -76,6 +76,8 @@ class IntrinsicInst : public CallInst { case Intrinsic::minnum: case Intrinsic::maximum: case Intrinsic::minimum: + case Intrinsic::maximumnum: + case Intrinsic::minimumnum: case Intrinsic::smax: case Intrinsic::smin: case Intrinsic::umax: diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index b4e758136b39fb..0841273fd2e1e5 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1085,6 +1085,14 @@ def int_maximum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative] >; +def int_minimumnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], + [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative] +>; +def int_maximumnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], +
[LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative] +>; // Internal interface for object size checking def int_objectsize : DefaultAttrsIntrinsic<[llvm_anyint_ty], diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index 904801e6e9e95f..c9102aa3dd972b 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -20,6 +20,16 @@ def int_dx_flattened_thread_id_in_group : Intrinsic<[llvm_i32_ty], [], [IntrNoMe def int_dx_create_handle : ClangBuiltin<"__builtin_hlsl_create_handle">, Intrinsic<[ llvm_ptr_ty ], [llvm_i8_ty], [IntrWillReturn]>; +// Create resource handle given binding information. Returns a `target("dx.")` +// type appropriate for the kind of resource given a register space ID, lower +// bound and range size of the binding, as well as an index and an indicator +// whether that index may be non-uniform. +def int_dx_handle_fromBinding + : DefaultAttrsIntrinsic< + [llvm_any_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty], + [IntrNoMem]>; + def int_dx_all : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty]>; def int_dx_any : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty]>; def int_dx_clamp : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 7caada24dad564..65a3d2d0f943a7 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1052,6 +1052,22 @@ let TargetPrefix = "nvvm" in { DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; +// +// Dot Product +// + foreach a_type = ["s", "u"] in { + foreach b_type = ["s", "u"] in { + def int_nvvm_idp4a_ # a_type # _ # b_type : + DefaultAttrsIntrinsic<[llvm_i32_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + 
[IntrNoMem, IntrSpeculatable]>; + def int_nvvm_idp2a_ # a_type # _ # b_type : + DefaultAttrsIntrinsic<[llvm_i32_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable, ImmArg>]>; + } + } + // // Convert // diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def index 89aaf6d1ad83f8..c3d5ef9f4e4f82 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.def +++ b/llvm/include/llvm/IR/RuntimeLibcalls.def @@ -299,6 +299,16 @@ HANDLE_LIBCALL(FMAX_F64, "fmax") HANDLE_LIBCALL(FMAX_F80, "fmaxl") HANDLE_LIBCALL(FMAX_F128, "fmaxl") HANDLE_LIBCALL(FMAX_PPCF128, "fmaxl") +HANDLE_LIBCALL(FMINIMUMNUM_F32, "fminimum_numf") +HANDLE_LIBCALL(FMINIMUMNUM_F64, "fminimum_num") +HANDLE_LIBCALL(FMINIMUMNUM_F80, "fminimum_numl") +HANDLE_LIBCALL(FMINIMUMNUM_F128, "fminmum_numl") +HANDLE_LIBCALL(FMINIMUMNUM_PPCF128, "fminimum_numl") +HANDLE_LIBCALL(FMAXIMUMNUM_F32, "fmaximum_numf") +HANDLE_LIBCALL(FMAXIMUMNUM_F64, "fmaximum_num") +HANDLE_LIBCALL(FMAXIMUMNUM_F80, "fmaximum_numl") +HANDLE_LIBCALL(FMAXIMUMNUM_F128, "fmaxmum_numl") +HANDLE_LIBCALL(FMAXIMUMNUM_PPCF128, "fmaximum_numl") HANDLE_LIBCALL(LROUND_F32, "lroundf") HANDLE_LIBCALL(LROUND_F64, "lround") HANDLE_LIBCALL(LROUND_F80, "lroundl") diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 261912aab3076c..0cd0d2edc23668 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -84,6 +84,7 @@ void initializeDAHPass(PassRegistry&); void initializeDCELegacyPassPass(PassRegistry&); void initializeDXILMetadataAnalysisWrapperPassPass(PassRegistry &); void initializeDXILMetadataAnalysisWrapperPrinterPass(PassRegistry &); +void initializeDXILResourceWrapperPassPass(PassRegistry &); void initializeDeadMachineInstructionElimPass(PassRegistry&); void initializeDebugifyMachineModulePass(PassRegistry &); void initializeDependenceAnalysisWrapperPassPass(PassRegistry&); diff --git 
a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h index c00e425b131987..967238e8d03b6e 100644 --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -19,6 +19,7 @@ #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CallPrinter.h" +#include "llvm/Analysis/DXILResource.h" #include "llvm/Analysis/DomPrinter.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Passes.h" @@ -69,6 +70,7 @@ namespace { (void) llvm::createCallGraphViewerPass(); (void) llvm::createCFGSimplificationPass(); (void) llvm::createStructurizeCFGPass(); + (void) llvm::createDXILResourceWrapperPassPass(); (void) llvm::createDeadArgEliminationPass(); (void) llvm::createDeadCodeEliminationPass(); (void) llvm::createDependenceAnalysisWrapperPass(); diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index 3cc39b54ba758d..eb15beb835b535 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -27,6 +27,8 @@ #include "llvm/CodeGen/CodeGenPrepare.h" #include "llvm/CodeGen/DeadMachineInstructionElim.h" #include "llvm/CodeGen/DwarfEHPrepare.h" +#include "llvm/CodeGen/ExpandLargeDivRem.h" +#include "llvm/CodeGen/ExpandLargeFpConvert.h" #include "llvm/CodeGen/ExpandMemCmp.h" #include "llvm/CodeGen/ExpandReductions.h" #include "llvm/CodeGen/FinalizeISel.h" @@ -627,6 +629,8 @@ void CodeGenPassBuilder::addISelPasses( addPass(LowerEmuTLSPass()); addPass(PreISelIntrinsicLoweringPass(&TM)); + addPass(ExpandLargeDivRemPass(&TM)); + addPass(ExpandLargeFpConvertPass(&TM)); derived().addIRPasses(addPass); derived().addCodeGenPrepare(addPass); diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index a6adb448ff0b19..a2e2a32e9c01eb 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -682,6 
+682,98 @@ class Instruction : public sandboxir::User { /// For isa/dyn_cast. static bool classof(const sandboxir::Value *From); + /// Determine whether the no signed wrap flag is set. + bool hasNoUnsignedWrap() const { + return cast(Val)->hasNoUnsignedWrap(); + } + /// Set or clear the nuw flag on this instruction, which must be an operator + /// which supports this flag. See LangRef.html for the meaning of this flag. + void setHasNoUnsignedWrap(bool B = true); + /// Determine whether the no signed wrap flag is set. + bool hasNoSignedWrap() const { + return cast(Val)->hasNoSignedWrap(); + } + /// Set or clear the nsw flag on this instruction, which must be an operator + /// which supports this flag. See LangRef.html for the meaning of this flag. + void setHasNoSignedWrap(bool B = true); + /// Determine whether all fast-math-flags are set. + bool isFast() const { return cast(Val)->isFast(); } + /// Set or clear all fast-math-flags on this instruction, which must be an + /// operator which supports this flag. See LangRef.html for the meaning of + /// this flag. + void setFast(bool B); + /// Determine whether the allow-reassociation flag is set. + bool hasAllowReassoc() const { + return cast(Val)->hasAllowReassoc(); + } + /// Set or clear the reassociation flag on this instruction, which must be + /// an operator which supports this flag. See LangRef.html for the meaning of + /// this flag. + void setHasAllowReassoc(bool B); + /// Determine whether the exact flag is set. + bool isExact() const { return cast(Val)->isExact(); } + /// Set or clear the exact flag on this instruction, which must be an operator + /// which supports this flag. See LangRef.html for the meaning of this flag. + void setIsExact(bool B = true); + /// Determine whether the no-NaNs flag is set. + bool hasNoNaNs() const { return cast(Val)->hasNoNaNs(); } + /// Set or clear the no-nans flag on this instruction, which must be an + /// operator which supports this flag. 
See LangRef.html for the meaning of + /// this flag. + void setHasNoNaNs(bool B); + /// Determine whether the no-infs flag is set. + bool hasNoInfs() const { return cast(Val)->hasNoInfs(); } + /// Set or clear the no-infs flag on this instruction, which must be an + /// operator which supports this flag. See LangRef.html for the meaning of + /// this flag. + void setHasNoInfs(bool B); + /// Determine whether the no-signed-zeros flag is set. + bool hasNoSignedZeros() const { + return cast(Val)->hasNoSignedZeros(); + } + /// Set or clear the no-signed-zeros flag on this instruction, which must be + /// an operator which supports this flag. See LangRef.html for the meaning of + /// this flag. + void setHasNoSignedZeros(bool B); + /// Determine whether the allow-reciprocal flag is set. + bool hasAllowReciprocal() const { + return cast(Val)->hasAllowReciprocal(); + } + /// Set or clear the allow-reciprocal flag on this instruction, which must be + /// an operator which supports this flag. See LangRef.html for the meaning of + /// this flag. + void setHasAllowReciprocal(bool B); + /// Determine whether the allow-contract flag is set. + bool hasAllowContract() const { + return cast(Val)->hasAllowContract(); + } + /// Set or clear the allow-contract flag on this instruction, which must be + /// an operator which supports this flag. See LangRef.html for the meaning of + /// this flag. + void setHasAllowContract(bool B); + /// Determine whether the approximate-math-functions flag is set. + bool hasApproxFunc() const { + return cast(Val)->hasApproxFunc(); + } + /// Set or clear the approximate-math-functions flag on this instruction, + /// which must be an operator which supports this flag. See LangRef.html for + /// the meaning of this flag. + void setHasApproxFunc(bool B); + /// Convenience function for getting all the fast-math flags, which must be an + /// operator which supports these flags. See LangRef.html for the meaning of + /// these flags. 
+ FastMathFlags getFastMathFlags() const { + return cast(Val)->getFastMathFlags(); + } + /// Convenience function for setting multiple fast-math flags on this + /// instruction, which must be an operator which supports these flags. See + /// LangRef.html for the meaning of these flags. + void setFastMathFlags(FastMathFlags FMF); + /// Convenience function for transferring all fast-math flag values to this + /// instruction, which must be an operator which supports these flags. See + /// LangRef.html for the meaning of these flags. + void copyFastMathFlags(FastMathFlags FMF); + #ifndef NDEBUG void dumpOS(raw_ostream &OS) const override; #endif diff --git a/llvm/include/llvm/SandboxIR/Tracker.h b/llvm/include/llvm/SandboxIR/Tracker.h index 9c9ce89ba87252..80efbf9e8f6e08 100644 --- a/llvm/include/llvm/SandboxIR/Tracker.h +++ b/llvm/include/llvm/SandboxIR/Tracker.h @@ -226,13 +226,13 @@ class RemoveFromParent : public IRChangeBase { /// template class GenericSetter final : public IRChangeBase { - /// Helper for getting the class type from the getter - template - static ClassT getClassTypeFromGetter(RetT (ClassT::*Fn)() const); - template - static ClassT getClassTypeFromGetter(RetT (ClassT::*Fn)()); - - using InstrT = decltype(getClassTypeFromGetter(GetterFn)); + /// Traits for getting the class type from GetterFn type. 
+ template struct GetClassTypeFromGetter; + template + struct GetClassTypeFromGetter { + using ClassType = ClassT; + }; + using InstrT = typename GetClassTypeFromGetter::ClassType; using SavedValT = std::invoke_result_t; InstrT *I; SavedValT OrigVal; diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index e40ad2062166ea..172deffbd31771 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -517,6 +517,10 @@ def fminimum : SDNode<"ISD::FMINIMUM" , SDTFPBinOp, [SDNPCommutative, SDNPAssociative]>; def fmaximum : SDNode<"ISD::FMAXIMUM" , SDTFPBinOp, [SDNPCommutative, SDNPAssociative]>; +def fminimumnum : SDNode<"ISD::FMINIMUMNUM" , SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def fmaximumnum : SDNode<"ISD::FMAXIMUMNUM" , SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; def fgetsign : SDNode<"ISD::FGETSIGN" , SDTFPToIntOp>; def fcanonicalize : SDNode<"ISD::FCANONICALIZE", SDTFPUnaryOp>; def fneg : SDNode<"ISD::FNEG" , SDTFPUnaryOp>; diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index ad3c6426efd2fe..718cf704cbdf1a 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -1448,7 +1448,7 @@ struct AttributorConfig { /// Callback function to determine if an indirect call targets should be made /// direct call targets (with an if-cascade). std::function + Function &AssumedCallee, unsigned NumAssumedCallees)> IndirectCalleeSpecializationCallback = nullptr; /// Helper to update an underlying call graph and to delete functions. @@ -1718,10 +1718,11 @@ struct Attributor { /// Return true if we should specialize the call site \b CB for the potential /// callee \p Fn. 
bool shouldSpecializeCallSiteForCallee(const AbstractAttribute &AA, - CallBase &CB, Function &Callee) { + CallBase &CB, Function &Callee, + unsigned NumAssumedCallees) { return Configuration.IndirectCalleeSpecializationCallback - ? Configuration.IndirectCalleeSpecializationCallback(*this, AA, - CB, Callee) + ? Configuration.IndirectCalleeSpecializationCallback( + *this, AA, CB, Callee, NumAssumedCallees) : true; } diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h index 9d564a3279ce77..24b6354662955e 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h @@ -188,13 +188,7 @@ struct LoopVectorizePass : public PassInfoMixin { function_ref MapClassName2PassName); // Shim for old PM. - LoopVectorizeResult runImpl(Function &F, ScalarEvolution &SE_, LoopInfo &LI_, - TargetTransformInfo &TTI_, DominatorTree &DT_, - BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_, - DemandedBits &DB_, AssumptionCache &AC_, - LoopAccessInfoManager &LAIs_, - OptimizationRemarkEmitter &ORE_, - ProfileSummaryInfo *PSI_); + LoopVectorizeResult runImpl(Function &F); bool processLoop(Loop *L); }; diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h index 95531544a1c817..809beadb5f7df3 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -56,9 +56,9 @@ class BoUpSLP; } // end namespace slpvectorizer struct SLPVectorizerPass : public PassInfoMixin { - using StoreList = SmallVector; + using StoreList = SmallVector; using StoreListMap = MapVector; - using GEPList = SmallVector; + using GEPList = SmallVector; using GEPListMap = MapVector; using InstSetVector = SmallSetVector; diff --git a/llvm/lib/Analysis/Analysis.cpp b/llvm/lib/Analysis/Analysis.cpp index 11cc6cfccea6af..58723469f21ca8 100644 --- 
a/llvm/lib/Analysis/Analysis.cpp +++ b/llvm/lib/Analysis/Analysis.cpp @@ -25,6 +25,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) { initializeCallGraphDOTPrinterPass(Registry); initializeCallGraphViewerPass(Registry); initializeCycleInfoWrapperPassPass(Registry); + initializeDXILResourceWrapperPassPass(Registry); initializeDependenceAnalysisWrapperPassPass(Registry); initializeDominanceFrontierWrapperPassPass(Registry); initializeDomViewerWrapperPassPass(Registry); diff --git a/llvm/lib/Analysis/CtxProfAnalysis.cpp b/llvm/lib/Analysis/CtxProfAnalysis.cpp index fbae705127538a..7b4666b29a1936 100644 --- a/llvm/lib/Analysis/CtxProfAnalysis.cpp +++ b/llvm/lib/Analysis/CtxProfAnalysis.cpp @@ -14,6 +14,7 @@ #include "llvm/Analysis/CtxProfAnalysis.h" #include "llvm/ADT/STLExtras.h" #include "llvm/IR/Analysis.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/ProfileData/PGOCtxProfReader.h" @@ -64,10 +65,39 @@ Value toJSON(const PGOCtxProfContext::CallTargetMapTy &P) { } // namespace json } // namespace llvm +const char *AssignGUIDPass::GUIDMetadataName = "guid"; + +PreservedAnalyses AssignGUIDPass::run(Module &M, ModuleAnalysisManager &MAM) { + for (auto &F : M.functions()) { + if (F.isDeclaration()) + continue; + if (F.getMetadata(GUIDMetadataName)) + continue; + const GlobalValue::GUID GUID = F.getGUID(); + F.setMetadata(GUIDMetadataName, + MDNode::get(M.getContext(), + {ConstantAsMetadata::get(ConstantInt::get( + Type::getInt64Ty(M.getContext()), GUID))})); + } + return PreservedAnalyses::none(); +} + +GlobalValue::GUID AssignGUIDPass::getGUID(const Function &F) { + if (F.isDeclaration()) { + assert(GlobalValue::isExternalLinkage(F.getLinkage())); + return GlobalValue::getGUID(F.getGlobalIdentifier()); + } + auto *MD = F.getMetadata(GUIDMetadataName); + assert(MD && "guid not found for defined function"); + return cast(cast(MD->getOperand(0)) + ->getValue() + ->stripPointerCasts()) + 
->getZExtValue(); +} AnalysisKey CtxProfAnalysis::Key; -CtxProfAnalysis::Result CtxProfAnalysis::run(Module &M, - ModuleAnalysisManager &MAM) { +PGOContextualProfile CtxProfAnalysis::run(Module &M, + ModuleAnalysisManager &MAM) { ErrorOr> MB = MemoryBuffer::getFile(Profile); if (auto EC = MB.getError()) { M.getContext().emitError("could not open contextual profile file: " + @@ -81,7 +111,55 @@ CtxProfAnalysis::Result CtxProfAnalysis::run(Module &M, toString(MaybeCtx.takeError())); return {}; } - return Result(std::move(*MaybeCtx)); + + PGOContextualProfile Result; + + for (const auto &F : M) { + if (F.isDeclaration()) + continue; + auto GUID = AssignGUIDPass::getGUID(F); + assert(GUID && "guid not found for defined function"); + const auto &Entry = F.begin(); + uint32_t MaxCounters = 0; // we expect at least a counter. + for (const auto &I : *Entry) + if (auto *C = dyn_cast(&I)) { + MaxCounters = + static_cast(C->getNumCounters()->getZExtValue()); + break; + } + if (!MaxCounters) + continue; + uint32_t MaxCallsites = 0; + for (const auto &BB : F) + for (const auto &I : BB) + if (auto *C = dyn_cast(&I)) { + MaxCallsites = + static_cast(C->getNumCounters()->getZExtValue()); + break; + } + auto [It, Ins] = Result.FuncInfo.insert( + {GUID, PGOContextualProfile::FunctionInfo(F.getName())}); + (void)Ins; + assert(Ins); + It->second.NextCallsiteIndex = MaxCallsites; + It->second.NextCounterIndex = MaxCounters; + } + // If we made it this far, the Result is valid - which we mark by setting + // .Profiles. + // Trim first the roots that aren't in this module. 
+ DenseSet ProfiledGUIDs; + for (auto &[RootGuid, _] : llvm::make_early_inc_range(*MaybeCtx)) + if (!Result.FuncInfo.contains(RootGuid)) + MaybeCtx->erase(RootGuid); + Result.Profiles = std::move(*MaybeCtx); + return Result; +} + +GlobalValue::GUID +PGOContextualProfile::getDefinedFunctionGUID(const Function &F) const { + if (auto It = FuncInfo.find(AssignGUIDPass::getGUID(F)); It != FuncInfo.end()) + return It->first; + return 0; } PreservedAnalyses CtxProfAnalysisPrinterPass::run(Module &M, @@ -91,8 +169,16 @@ PreservedAnalyses CtxProfAnalysisPrinterPass::run(Module &M, M.getContext().emitError("Invalid CtxProfAnalysis"); return PreservedAnalyses::all(); } + + OS << "Function Info:\n"; + for (const auto &[Guid, FuncInfo] : C.FuncInfo) + OS << Guid << " : " << FuncInfo.Name + << ". MaxCounterID: " << FuncInfo.NextCounterIndex + << ". MaxCallsiteID: " << FuncInfo.NextCallsiteIndex << "\n"; + const auto JSONed = ::llvm::json::toJSON(C.profiles()); + OS << "\nCurrent Profile:\n"; OS << formatv("{0:2}", JSONed); OS << "\n"; return PreservedAnalyses::all(); diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp index 5e8350fd2d2516..d7d10ec4d8d0d6 100644 --- a/llvm/lib/Analysis/DXILResource.cpp +++ b/llvm/lib/Analysis/DXILResource.cpp @@ -8,12 +8,146 @@ #include "llvm/Analysis/DXILResource.h" #include "llvm/ADT/APInt.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsDirectX.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" + +#define DEBUG_TYPE "dxil-resource" using namespace llvm; using namespace dxil; +static constexpr StringRef getResourceClassName(ResourceClass RC) { + switch (RC) { + case ResourceClass::SRV: + return "SRV"; + case ResourceClass::UAV: + return "UAV"; + case ResourceClass::CBuffer: + return "CBuffer"; + case 
ResourceClass::Sampler: + return "Sampler"; + } + llvm_unreachable("Unhandled ResourceClass"); +} + +static constexpr StringRef getResourceKindName(ResourceKind RK) { + switch (RK) { + case ResourceKind::Texture1D: + return "Texture1D"; + case ResourceKind::Texture2D: + return "Texture2D"; + case ResourceKind::Texture2DMS: + return "Texture2DMS"; + case ResourceKind::Texture3D: + return "Texture3D"; + case ResourceKind::TextureCube: + return "TextureCube"; + case ResourceKind::Texture1DArray: + return "Texture1DArray"; + case ResourceKind::Texture2DArray: + return "Texture2DArray"; + case ResourceKind::Texture2DMSArray: + return "Texture2DMSArray"; + case ResourceKind::TextureCubeArray: + return "TextureCubeArray"; + case ResourceKind::TypedBuffer: + return "TypedBuffer"; + case ResourceKind::RawBuffer: + return "RawBuffer"; + case ResourceKind::StructuredBuffer: + return "StructuredBuffer"; + case ResourceKind::CBuffer: + return "CBuffer"; + case ResourceKind::Sampler: + return "Sampler"; + case ResourceKind::TBuffer: + return "TBuffer"; + case ResourceKind::RTAccelerationStructure: + return "RTAccelerationStructure"; + case ResourceKind::FeedbackTexture2D: + return "FeedbackTexture2D"; + case ResourceKind::FeedbackTexture2DArray: + return "FeedbackTexture2DArray"; + case ResourceKind::NumEntries: + case ResourceKind::Invalid: + return ""; + } + llvm_unreachable("Unhandled ResourceKind"); +} + +static constexpr StringRef getElementTypeName(ElementType ET) { + switch (ET) { + case ElementType::I1: + return "i1"; + case ElementType::I16: + return "i16"; + case ElementType::U16: + return "u16"; + case ElementType::I32: + return "i32"; + case ElementType::U32: + return "u32"; + case ElementType::I64: + return "i64"; + case ElementType::U64: + return "u64"; + case ElementType::F16: + return "f16"; + case ElementType::F32: + return "f32"; + case ElementType::F64: + return "f64"; + case ElementType::SNormF16: + return "snorm_f16"; + case ElementType::UNormF16: + return 
"unorm_f16"; + case ElementType::SNormF32: + return "snorm_f32"; + case ElementType::UNormF32: + return "unorm_f32"; + case ElementType::SNormF64: + return "snorm_f64"; + case ElementType::UNormF64: + return "unorm_f64"; + case ElementType::PackedS8x32: + return "p32i8"; + case ElementType::PackedU8x32: + return "p32u8"; + case ElementType::Invalid: + return ""; + } + llvm_unreachable("Unhandled ElementType"); +} + +static constexpr StringRef getSamplerTypeName(SamplerType ST) { + switch (ST) { + case SamplerType::Default: + return "Default"; + case SamplerType::Comparison: + return "Comparison"; + case SamplerType::Mono: + return "Mono"; + } + llvm_unreachable("Unhandled SamplerType"); +} + +static constexpr StringRef getSamplerFeedbackTypeName(SamplerFeedbackType SFT) { + switch (SFT) { + case SamplerFeedbackType::MinMip: + return "MinMip"; + case SamplerFeedbackType::MipRegionUsed: + return "MipRegionUsed"; + } + llvm_unreachable("Unhandled SamplerFeedbackType"); +} + bool ResourceInfo::isUAV() const { return RC == ResourceClass::UAV; } bool ResourceInfo::isCBuffer() const { return RC == ResourceClass::CBuffer; } @@ -236,7 +370,7 @@ MDTuple *ResourceInfo::getAsMetadata(LLVMContext &Ctx) const { Constant::getIntegerValue(I1Ty, APInt(1, V))); }; - MDVals.push_back(getIntMD(Binding.UniqueID)); + MDVals.push_back(getIntMD(Binding.RecordID)); MDVals.push_back(ValueAsMetadata::get(Symbol)); MDVals.push_back(MDString::get(Ctx, Name)); MDVals.push_back(getIntMD(Binding.Space)); @@ -326,4 +460,321 @@ std::pair ResourceInfo::getAnnotateProps() const { return {Word0, Word1}; } -#define DEBUG_TYPE "dxil-resource" +void ResourceInfo::print(raw_ostream &OS) const { + OS << " Symbol: "; + Symbol->printAsOperand(OS); + OS << "\n"; + + OS << " Name: \"" << Name << "\"\n" + << " Binding:\n" + << " Record ID: " << Binding.RecordID << "\n" + << " Space: " << Binding.Space << "\n" + << " Lower Bound: " << Binding.LowerBound << "\n" + << " Size: " << Binding.Size << "\n" + << " 
Class: " << getResourceClassName(RC) << "\n" + << " Kind: " << getResourceKindName(Kind) << "\n"; + + if (isCBuffer()) { + OS << " CBuffer size: " << CBufferSize << "\n"; + } else if (isSampler()) { + OS << " Sampler Type: " << getSamplerTypeName(SamplerTy) << "\n"; + } else { + if (isUAV()) { + OS << " Globally Coherent: " << UAVFlags.GloballyCoherent << "\n" + << " HasCounter: " << UAVFlags.HasCounter << "\n" + << " IsROV: " << UAVFlags.IsROV << "\n"; + } + if (isMultiSample()) + OS << " Sample Count: " << MultiSample.Count << "\n"; + + if (isStruct()) { + OS << " Buffer Stride: " << Struct.Stride << "\n"; + OS << " Alignment: " << Struct.AlignLog2 << "\n"; + } else if (isTyped()) { + OS << " Element Type: " << getElementTypeName(Typed.ElementTy) << "\n" + << " Element Count: " << Typed.ElementCount << "\n"; + } else if (isFeedback()) + OS << " Feedback Type: " << getSamplerFeedbackTypeName(Feedback.Type) + << "\n"; + } +} + +//===----------------------------------------------------------------------===// +// ResourceMapper + +static dxil::ElementType toDXILElementType(Type *Ty, bool IsSigned) { + // TODO: Handle unorm, snorm, and packed. + Ty = Ty->getScalarType(); + + if (Ty->isIntegerTy()) { + switch (Ty->getIntegerBitWidth()) { + case 16: + return IsSigned ? ElementType::I16 : ElementType::U16; + case 32: + return IsSigned ? ElementType::I32 : ElementType::U32; + case 64: + return IsSigned ? ElementType::I64 : ElementType::U64; + case 1: + default: + return ElementType::Invalid; + } + } else if (Ty->isFloatTy()) { + return ElementType::F32; + } else if (Ty->isDoubleTy()) { + return ElementType::F64; + } else if (Ty->isHalfTy()) { + return ElementType::F16; + } + + return ElementType::Invalid; +} + +namespace { + +class ResourceMapper { + Module &M; + LLVMContext &Context; + DXILResourceMap &Resources; + + // In DXC, Record ID is unique per resource type. Match that. 
+ uint32_t NextUAV = 0; + uint32_t NextSRV = 0; + uint32_t NextCBuf = 0; + uint32_t NextSmp = 0; + +public: + ResourceMapper(Module &M, + MapVector &Resources) + : M(M), Context(M.getContext()), Resources(Resources) {} + + void diagnoseHandle(CallInst *CI, const Twine &Msg, + DiagnosticSeverity Severity = DS_Error) { + std::string S; + raw_string_ostream SS(S); + CI->printAsOperand(SS); + DiagnosticInfoUnsupported Diag(*CI->getFunction(), Msg + ": " + SS.str(), + CI->getDebugLoc(), Severity); + Context.diagnose(Diag); + } + + ResourceInfo *mapBufferType(CallInst *CI, TargetExtType *HandleTy, + bool IsTyped) { + if (HandleTy->getNumTypeParameters() != 1 || + HandleTy->getNumIntParameters() != (IsTyped ? 3 : 2)) { + diagnoseHandle(CI, Twine("Invalid buffer target type")); + return nullptr; + } + + Type *ElTy = HandleTy->getTypeParameter(0); + unsigned IsWriteable = HandleTy->getIntParameter(0); + unsigned IsROV = HandleTy->getIntParameter(1); + bool IsSigned = IsTyped && HandleTy->getIntParameter(2); + + ResourceClass RC = IsWriteable ? ResourceClass::UAV : ResourceClass::SRV; + ResourceKind Kind; + if (IsTyped) + Kind = ResourceKind::TypedBuffer; + else if (ElTy->isIntegerTy(8)) + Kind = ResourceKind::RawBuffer; + else + Kind = ResourceKind::StructuredBuffer; + + // TODO: We need to lower to a typed pointer, can we smuggle the type + // through? + Value *Symbol = UndefValue::get(PointerType::getUnqual(Context)); + // TODO: We don't actually keep track of the name right now... + StringRef Name = ""; + + auto [It, Success] = Resources.try_emplace(CI, RC, Kind, Symbol, Name); + assert(Success && "Mapping the same CallInst again?"); + (void)Success; + // We grab a pointer into the map's storage, which isn't generally safe. + // Since we're just using this to fill in the info the map won't mutate and + // the pointer stays valid for as long as we need it to. 
+ ResourceInfo *RI = &(It->second); + + if (RI->isUAV()) + // TODO: We need analysis for GloballyCoherent and HasCounter + RI->setUAV(false, false, IsROV); + + if (RI->isTyped()) { + dxil::ElementType ET = toDXILElementType(ElTy, IsSigned); + uint32_t Count = 1; + if (auto *VTy = dyn_cast(ElTy)) + Count = VTy->getNumElements(); + RI->setTyped(ET, Count); + } else if (RI->isStruct()) { + const DataLayout &DL = M.getDataLayout(); + + // This mimics what DXC does. Notably, we only ever set the alignment if + // the type is actually a struct type. + uint32_t Stride = DL.getTypeAllocSize(ElTy); + MaybeAlign Alignment; + if (auto *STy = dyn_cast(ElTy)) + Alignment = DL.getStructLayout(STy)->getAlignment(); + RI->setStruct(Stride, Alignment); + } + + return RI; + } + + ResourceInfo *mapHandleIntrin(CallInst *CI) { + FunctionType *FTy = CI->getFunctionType(); + Type *RetTy = FTy->getReturnType(); + auto *HandleTy = dyn_cast(RetTy); + if (!HandleTy) { + diagnoseHandle(CI, "dx.handle.fromBinding requires target type"); + return nullptr; + } + + StringRef TypeName = HandleTy->getName(); + if (TypeName == "dx.TypedBuffer") { + return mapBufferType(CI, HandleTy, /*IsTyped=*/true); + } else if (TypeName == "dx.RawBuffer") { + return mapBufferType(CI, HandleTy, /*IsTyped=*/false); + } else if (TypeName == "dx.CBuffer") { + // TODO: implement + diagnoseHandle(CI, "dx.CBuffer handles are not implemented yet"); + return nullptr; + } else if (TypeName == "dx.Sampler") { + // TODO: implement + diagnoseHandle(CI, "dx.Sampler handles are not implemented yet"); + return nullptr; + } else if (TypeName == "dx.Texture") { + // TODO: implement + diagnoseHandle(CI, "dx.Texture handles are not implemented yet"); + return nullptr; + } + + diagnoseHandle(CI, "Invalid target(dx) type"); + return nullptr; + } + + ResourceInfo *mapHandleFromBinding(CallInst *CI) { + assert(CI->getIntrinsicID() == Intrinsic::dx_handle_fromBinding && + "Must be dx.handle.fromBinding intrinsic"); + + ResourceInfo *RI 
= mapHandleIntrin(CI); + if (!RI) + return nullptr; + + uint32_t NextID; + if (RI->isCBuffer()) + NextID = NextCBuf++; + else if (RI->isSampler()) + NextID = NextSmp++; + else if (RI->isUAV()) + NextID = NextUAV++; + else + NextID = NextSRV++; + + uint32_t Space = cast(CI->getArgOperand(0))->getZExtValue(); + uint32_t LowerBound = + cast(CI->getArgOperand(1))->getZExtValue(); + uint32_t Size = cast(CI->getArgOperand(2))->getZExtValue(); + + RI->bind(NextID, Space, LowerBound, Size); + + return RI; + } + + void mapResources() { + for (Function &F : M.functions()) { + if (!F.isDeclaration()) + continue; + LLVM_DEBUG(dbgs() << "Function: " << F.getName() << "\n"); + Intrinsic::ID ID = F.getIntrinsicID(); + switch (ID) { + default: + // TODO: handle `dx.op` functions. + continue; + case Intrinsic::dx_handle_fromBinding: + for (User *U : F.users()) { + LLVM_DEBUG(dbgs() << " Visiting: " << *U << "\n"); + if (CallInst *CI = dyn_cast(U)) + mapHandleFromBinding(CI); + } + break; + } + } + } +}; + +} // namespace + +//===----------------------------------------------------------------------===// +// DXILResourceAnalysis and DXILResourcePrinterPass + +// Provide an explicit template instantiation for the static ID. 
+AnalysisKey DXILResourceAnalysis::Key; + +DXILResourceMap DXILResourceAnalysis::run(Module &M, + ModuleAnalysisManager &AM) { + DXILResourceMap Data; + ResourceMapper(M, Data).mapResources(); + return Data; +} + +PreservedAnalyses DXILResourcePrinterPass::run(Module &M, + ModuleAnalysisManager &AM) { + DXILResourceMap &Data = + AM.getResult(M); + + for (const auto &[Handle, Info] : Data) { + OS << "Binding for "; + Handle->print(OS); + OS << "\n"; + Info.print(OS); + OS << "\n"; + } + + return PreservedAnalyses::all(); +} + +//===----------------------------------------------------------------------===// +// DXILResourceWrapperPass + +DXILResourceWrapperPass::DXILResourceWrapperPass() : ModulePass(ID) { + initializeDXILResourceWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +DXILResourceWrapperPass::~DXILResourceWrapperPass() = default; + +void DXILResourceWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); +} + +bool DXILResourceWrapperPass::runOnModule(Module &M) { + ResourceMap.reset(new DXILResourceMap()); + ResourceMapper(M, *ResourceMap).mapResources(); + return false; +} + +void DXILResourceWrapperPass::releaseMemory() { ResourceMap.reset(); } + +void DXILResourceWrapperPass::print(raw_ostream &OS, const Module *) const { + if (!ResourceMap) { + OS << "No resource map has been built!\n"; + return; + } + for (const auto &[Handle, Info] : *ResourceMap) { + OS << "Binding for "; + Handle->print(OS); + OS << "\n"; + Info.print(OS); + OS << "\n"; + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD +void DXILResourceWrapperPass::dump() const { print(dbgs(), nullptr); } +#endif + +INITIALIZE_PASS(DXILResourceWrapperPass, DEBUG_TYPE, "DXIL Resource analysis", + false, true) +char DXILResourceWrapperPass::ID = 0; + +ModulePass *llvm::createDXILResourceWrapperPassPass() { + return new DXILResourceWrapperPass(); +} diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp 
b/llvm/lib/Analysis/DependenceAnalysis.cpp index 8e44d548cb56f2..a4a98ea0bae146 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -2450,8 +2450,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst, const SCEVConstant *Constant = dyn_cast(Delta); if (const SCEVAddExpr *Sum = dyn_cast(Delta)) { // If Delta is a sum of products, we may be able to make further progress. - for (unsigned Op = 0, Ops = Sum->getNumOperands(); Op < Ops; Op++) { - const SCEV *Operand = Sum->getOperand(Op); + for (const SCEV *Operand : Sum->operands()) { if (isa(Operand)) { assert(!Constant && "Surprised to find multiple constants"); Constant = cast(Operand); diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 310a7eafc81500..47d3dac73083ee 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -17,6 +17,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/Passes.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueLattice.h" #include "llvm/Analysis/ValueTracking.h" diff --git a/llvm/lib/Analysis/RegionInfo.cpp b/llvm/lib/Analysis/RegionInfo.cpp index 9be23a374eca5a..15257b4a9a926f 100644 --- a/llvm/lib/Analysis/RegionInfo.cpp +++ b/llvm/lib/Analysis/RegionInfo.cpp @@ -15,6 +15,7 @@ #ifndef NDEBUG #include "llvm/Analysis/RegionPrinter.h" #endif +#include "llvm/Analysis/Passes.h" #include "llvm/Analysis/RegionInfoImpl.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/Function.h" diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index af341c55205de8..487844f000ac69 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -6664,17 +6664,17 @@ const ConstantRange &ScalarEvolution::getRangeRef( WrapType |= OBO::NoSignedWrap; if 
(Add->hasNoUnsignedWrap()) WrapType |= OBO::NoUnsignedWrap; - for (unsigned i = 1, e = Add->getNumOperands(); i != e; ++i) - X = X.addWithNoWrap(getRangeRef(Add->getOperand(i), SignHint, Depth + 1), - WrapType, RangeType); + for (const SCEV *Op : drop_begin(Add->operands())) + X = X.addWithNoWrap(getRangeRef(Op, SignHint, Depth + 1), WrapType, + RangeType); return setRange(Add, SignHint, ConservativeResult.intersectWith(X, RangeType)); } case scMulExpr: { const SCEVMulExpr *Mul = cast(S); ConstantRange X = getRangeRef(Mul->getOperand(0), SignHint, Depth + 1); - for (unsigned i = 1, e = Mul->getNumOperands(); i != e; ++i) - X = X.multiply(getRangeRef(Mul->getOperand(i), SignHint, Depth + 1)); + for (const SCEV *Op : drop_begin(Mul->operands())) + X = X.multiply(getRangeRef(Op, SignHint, Depth + 1)); return setRange(Mul, SignHint, ConservativeResult.intersectWith(X, RangeType)); } diff --git a/llvm/lib/CodeGen/GlobalMerge.cpp b/llvm/lib/CodeGen/GlobalMerge.cpp index 8aa4345cfd6df6..c31ba6b31ad9ac 100644 --- a/llvm/lib/CodeGen/GlobalMerge.cpp +++ b/llvm/lib/CodeGen/GlobalMerge.cpp @@ -196,11 +196,13 @@ class GlobalMerge : public FunctionPass { } explicit GlobalMerge(const TargetMachine *TM, unsigned MaximalOffset, - bool OnlyOptimizeForSize, bool MergeExternalGlobals) + bool OnlyOptimizeForSize, bool MergeExternalGlobals, + bool MergeConstantGlobals) : FunctionPass(ID), TM(TM) { Opt.MaxOffset = MaximalOffset; Opt.SizeOnly = OnlyOptimizeForSize; Opt.MergeExternal = MergeExternalGlobals; + Opt.MergeConstantGlobals = MergeConstantGlobals; initializeGlobalMergePass(*PassRegistry::getPassRegistry()); } @@ -475,7 +477,8 @@ bool GlobalMergeImpl::doMerge(const SmallVectorImpl &Globals, auto &DL = M.getDataLayout(); LLVM_DEBUG(dbgs() << " Trying to merge set, starts with #" - << GlobalSet.find_first() << "\n"); + << GlobalSet.find_first() << ", total of " << Globals.size() + << "\n"); bool Changed = false; ssize_t i = GlobalSet.find_first(); @@ -551,6 +554,8 @@ bool 
GlobalMergeImpl::doMerge(const SmallVectorImpl &Globals, MergedGV->setAlignment(MaxAlign); MergedGV->setSection(Globals[i]->getSection()); + LLVM_DEBUG(dbgs() << "MergedGV: " << *MergedGV << "\n"); + const StructLayout *MergedLayout = DL.getStructLayout(MergedTy); for (ssize_t k = i, idx = 0; k != j; k = GlobalSet.find_next(k), ++idx) { GlobalValue::LinkageTypes Linkage = Globals[k]->getLinkage(); @@ -700,6 +705,11 @@ bool GlobalMergeImpl::run(Module &M) { else Globals[{AddressSpace, Section}].push_back(&GV); } + LLVM_DEBUG(dbgs() << "GV " + << ((DL.getTypeAllocSize(Ty) < Opt.MaxOffset) + ? "to merge: " + : "not to merge: ") + << GV << "\n"); } for (auto &P : Globals) @@ -710,7 +720,7 @@ bool GlobalMergeImpl::run(Module &M) { if (P.second.size() > 1) Changed |= doMerge(P.second, M, false, P.first.first); - if (EnableGlobalMergeOnConst) + if (Opt.MergeConstantGlobals) for (auto &P : ConstGlobals) if (P.second.size() > 1) Changed |= doMerge(P.second, M, true, P.first.first); @@ -720,8 +730,11 @@ bool GlobalMergeImpl::run(Module &M) { Pass *llvm::createGlobalMergePass(const TargetMachine *TM, unsigned Offset, bool OnlyOptimizeForSize, - bool MergeExternalByDefault) { + bool MergeExternalByDefault, + bool MergeConstantByDefault) { bool MergeExternal = (EnableGlobalMergeOnExternal == cl::BOU_UNSET) ? 
MergeExternalByDefault : (EnableGlobalMergeOnExternal == cl::BOU_TRUE); - return new GlobalMerge(TM, Offset, OnlyOptimizeForSize, MergeExternal); + bool MergeConstant = EnableGlobalMergeOnConst || MergeConstantByDefault; + return new GlobalMerge(TM, Offset, OnlyOptimizeForSize, MergeExternal, + MergeConstant); } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 608d17072c39f2..25644c24855a62 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1939,7 +1939,9 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::FMINNUM: case ISD::FMAXNUM: case ISD::FMINIMUM: - case ISD::FMAXIMUM: return visitFMinMax(N); + case ISD::FMAXIMUM: + case ISD::FMINIMUMNUM: + case ISD::FMAXIMUMNUM: return visitFMinMax(N); case ISD::FCEIL: return visitFCEIL(N); case ISD::FTRUNC: return visitFTRUNC(N); case ISD::FFREXP: return visitFFREXP(N); @@ -2278,9 +2280,7 @@ SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) { AddUsersToWorklist(N); do { // Do as a single replacement to avoid rewalking use lists. - SmallVector Ops; - for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) - Ops.push_back(N->getOperand(i)); + SmallVector Ops(N->ops()); DAG.ReplaceAllUsesWith(N, Ops.data()); } while (!N->use_empty()); deleteAndRecombine(N); @@ -6070,6 +6070,7 @@ static bool arebothOperandsNotNan(SDValue Operand1, SDValue Operand2, return DAG.isKnownNeverNaN(Operand2) && DAG.isKnownNeverNaN(Operand1); } +// FIXME: use FMINIMUMNUM if possible, such as for RISC-V. static unsigned getMinMaxOpcodeForFP(SDValue Operand1, SDValue Operand2, ISD::CondCode CC, unsigned OrAndOpcode, SelectionDAG &DAG, @@ -27109,7 +27110,7 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) { /// If a vector binop is performed on splat values, it may be profitable to /// extract, scalarize, and insert/splat. 
static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG, - const SDLoc &DL) { + const SDLoc &DL, bool LegalTypes) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); unsigned Opcode = N->getOpcode(); @@ -27131,7 +27132,12 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG, Src0.getValueType().getVectorElementType() != EltVT || Src1.getValueType().getVectorElementType() != EltVT || !(IsBothSplatVector || TLI.isExtractVecEltCheap(VT, Index0)) || - !TLI.isOperationLegalOrCustom(Opcode, EltVT)) + // If before type legalization, allow scalar types that will eventually be + // made legal. + !TLI.isOperationLegalOrCustom( + Opcode, LegalTypes + ? EltVT + : TLI.getTypeToTransformTo(*DAG.getContext(), EltVT))) return SDValue(); SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL); @@ -27297,7 +27303,7 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N, const SDLoc &DL) { } } - if (SDValue V = scalarizeBinOpOfSplats(N, DAG, DL)) + if (SDValue V = scalarizeBinOpOfSplats(N, DAG, DL, LegalTypes)) return V; return SDValue(); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 3eadfbf51ddaa1..e7f765382b0e46 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3660,6 +3660,11 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(Expanded); break; } + case ISD::FMINIMUMNUM: + case ISD::FMAXIMUMNUM: { + Results.push_back(TLI.expandFMINIMUMNUM_FMAXIMUMNUM(Node, DAG)); + break; + } case ISD::FSIN: case ISD::FCOS: { EVT VT = Node->getValueType(0); @@ -4539,6 +4544,16 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::FMAX_F80, RTLIB::FMAX_F128, RTLIB::FMAX_PPCF128, Results); break; + case ISD::FMINIMUMNUM: + ExpandFPLibCall(Node, RTLIB::FMINIMUMNUM_F32, RTLIB::FMINIMUMNUM_F64, + RTLIB::FMINIMUMNUM_F80, RTLIB::FMINIMUMNUM_F128, + RTLIB::FMINIMUMNUM_PPCF128, Results); + break; + case 
ISD::FMAXIMUMNUM: + ExpandFPLibCall(Node, RTLIB::FMAXIMUMNUM_F32, RTLIB::FMAXIMUMNUM_F64, + RTLIB::FMAXIMUMNUM_F80, RTLIB::FMAXIMUMNUM_F128, + RTLIB::FMAXIMUMNUM_PPCF128, Results); + break; case ISD::FSQRT: case ISD::STRICT_FSQRT: ExpandFPLibCall(Node, RTLIB::SQRT_F32, RTLIB::SQRT_F64, @@ -5464,6 +5479,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { case ISD::FMAXNUM: case ISD::FMINIMUM: case ISD::FMAXIMUM: + case ISD::FMINIMUMNUM: + case ISD::FMAXIMUMNUM: case ISD::FPOW: Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0)); Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1)); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 0c881d81a2c639..ad0c054d3ccd50 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -74,6 +74,8 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::FMINNUM: R = SoftenFloatRes_FMINNUM(N); break; case ISD::STRICT_FMAXNUM: case ISD::FMAXNUM: R = SoftenFloatRes_FMAXNUM(N); break; + case ISD::FMINIMUMNUM: R = SoftenFloatRes_FMINIMUMNUM(N); break; + case ISD::FMAXIMUMNUM: R = SoftenFloatRes_FMAXIMUMNUM(N); break; case ISD::STRICT_FADD: case ISD::FADD: R = SoftenFloatRes_FADD(N); break; case ISD::STRICT_FACOS: @@ -323,6 +325,20 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMAXNUM(SDNode *N) { RTLIB::FMAX_PPCF128)); } +SDValue DAGTypeLegalizer::SoftenFloatRes_FMINIMUMNUM(SDNode *N) { + return SoftenFloatRes_Binary( + N, GetFPLibCall(N->getValueType(0), RTLIB::FMINIMUMNUM_F32, + RTLIB::FMINIMUMNUM_F64, RTLIB::FMINIMUMNUM_F80, + RTLIB::FMINIMUMNUM_F128, RTLIB::FMINIMUMNUM_PPCF128)); +} + +SDValue DAGTypeLegalizer::SoftenFloatRes_FMAXIMUMNUM(SDNode *N) { + return SoftenFloatRes_Binary( + N, GetFPLibCall(N->getValueType(0), RTLIB::FMAXIMUMNUM_F32, + RTLIB::FMAXIMUMNUM_F64, RTLIB::FMAXIMUMNUM_F80, + RTLIB::FMAXIMUMNUM_F128, 
RTLIB::FMAXIMUMNUM_PPCF128)); +} + SDValue DAGTypeLegalizer::SoftenFloatRes_FADD(SDNode *N) { return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0), RTLIB::ADD_F32, @@ -1404,6 +1420,8 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { case ISD::FMINNUM: ExpandFloatRes_FMINNUM(N, Lo, Hi); break; case ISD::STRICT_FMAXNUM: case ISD::FMAXNUM: ExpandFloatRes_FMAXNUM(N, Lo, Hi); break; + case ISD::FMINIMUMNUM: ExpandFloatRes_FMINIMUMNUM(N, Lo, Hi); break; + case ISD::FMAXIMUMNUM: ExpandFloatRes_FMAXIMUMNUM(N, Lo, Hi); break; case ISD::STRICT_FADD: case ISD::FADD: ExpandFloatRes_FADD(N, Lo, Hi); break; case ISD::STRICT_FACOS: @@ -1558,6 +1576,26 @@ void DAGTypeLegalizer::ExpandFloatRes_FMAXNUM(SDNode *N, SDValue &Lo, RTLIB::FMAX_PPCF128), Lo, Hi); } +void DAGTypeLegalizer::ExpandFloatRes_FMINIMUMNUM(SDNode *N, SDValue &Lo, + SDValue &Hi) { + ExpandFloatRes_Binary( + N, + GetFPLibCall(N->getValueType(0), RTLIB::FMINIMUMNUM_F32, + RTLIB::FMINIMUMNUM_F64, RTLIB::FMINIMUMNUM_F80, + RTLIB::FMINIMUMNUM_F128, RTLIB::FMINIMUMNUM_PPCF128), + Lo, Hi); +} + +void DAGTypeLegalizer::ExpandFloatRes_FMAXIMUMNUM(SDNode *N, SDValue &Lo, + SDValue &Hi) { + ExpandFloatRes_Binary( + N, + GetFPLibCall(N->getValueType(0), RTLIB::FMAXIMUMNUM_F32, + RTLIB::FMAXIMUMNUM_F64, RTLIB::FMAXIMUMNUM_F80, + RTLIB::FMAXIMUMNUM_F128, RTLIB::FMAXIMUMNUM_PPCF128), + Lo, Hi); +} + void DAGTypeLegalizer::ExpandFloatRes_FADD(SDNode *N, SDValue &Lo, SDValue &Hi) { ExpandFloatRes_Binary(N, GetFPLibCall(N->getValueType(0), @@ -2621,6 +2659,8 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) { case ISD::FDIV: case ISD::FMAXIMUM: case ISD::FMINIMUM: + case ISD::FMAXIMUMNUM: + case ISD::FMINIMUMNUM: case ISD::FMAXNUM: case ISD::FMINNUM: case ISD::FMAXNUM_IEEE: @@ -3063,6 +3103,8 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) { case ISD::FDIV: case ISD::FMAXIMUM: case ISD::FMINIMUM: + case ISD::FMAXIMUMNUM: + case ISD::FMINIMUMNUM: case 
ISD::FMAXNUM: case ISD::FMINNUM: case ISD::FMUL: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 3a49a8ff10860a..6de1e3eca7feda 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -567,6 +567,8 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SoftenFloatRes_FATAN(SDNode *N); SDValue SoftenFloatRes_FMINNUM(SDNode *N); SDValue SoftenFloatRes_FMAXNUM(SDNode *N); + SDValue SoftenFloatRes_FMINIMUMNUM(SDNode *N); + SDValue SoftenFloatRes_FMAXIMUMNUM(SDNode *N); SDValue SoftenFloatRes_FADD(SDNode *N); SDValue SoftenFloatRes_FCBRT(SDNode *N); SDValue SoftenFloatRes_FCEIL(SDNode *N); @@ -659,6 +661,8 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { void ExpandFloatRes_FATAN (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FMINNUM (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FMAXNUM (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandFloatRes_FMINIMUMNUM(SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandFloatRes_FMAXIMUMNUM(SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FADD (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FCBRT (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FCEIL (SDNode *N, SDValue &Lo, SDValue &Hi); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index ab12c3b0e728a8..7bf90ceb93cb4e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5465,7 +5465,9 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const return false; } case ISD::FMINNUM: - case ISD::FMAXNUM: { + case ISD::FMAXNUM: + case ISD::FMINIMUMNUM: + case ISD::FMAXIMUMNUM: { // Only one needs to be known not-nan, since it will be returned if the // other ends up being one. 
return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) || @@ -6804,6 +6806,10 @@ SDValue SelectionDAG::foldConstantFPMath(unsigned Opcode, const SDLoc &DL, return getConstantFP(minimum(C1, C2), DL, VT); case ISD::FMAXIMUM: return getConstantFP(maximum(C1, C2), DL, VT); + case ISD::FMINIMUMNUM: + return getConstantFP(minimumnum(C1, C2), DL, VT); + case ISD::FMAXIMUMNUM: + return getConstantFP(maximumnum(C1, C2), DL, VT); default: break; } } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 37ba62911ec70b..7cdd3d47b641d7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6882,6 +6882,18 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), Flags)); return; + case Intrinsic::minimumnum: + setValue(&I, DAG.getNode(ISD::FMINIMUMNUM, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)), Flags)); + return; + case Intrinsic::maximumnum: + setValue(&I, DAG.getNode(ISD::FMAXIMUMNUM, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)), Flags)); + return; case Intrinsic::copysign: setValue(&I, DAG.getNode(ISD::FCOPYSIGN, sdl, getValue(I.getArgOperand(0)).getValueType(), @@ -9257,6 +9269,18 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { if (visitBinaryFloatCall(I, ISD::FMAXNUM)) return; break; + case LibFunc_fminimum_num: + case LibFunc_fminimum_numf: + case LibFunc_fminimum_numl: + if (visitBinaryFloatCall(I, ISD::FMINIMUMNUM)) + return; + break; + case LibFunc_fmaximum_num: + case LibFunc_fmaximum_numf: + case LibFunc_fmaximum_numl: + if (visitBinaryFloatCall(I, ISD::FMAXIMUMNUM)) + return; + break; case LibFunc_sin: case LibFunc_sinf: case LibFunc_sinl: diff --git 
a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 46e8e54ee4ed7d..001f782f209fdb 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -203,6 +203,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::STRICT_FMINIMUM: return "strict_fminimum"; case ISD::FMAXIMUM: return "fmaximum"; case ISD::STRICT_FMAXIMUM: return "strict_fmaximum"; + case ISD::FMINIMUMNUM: return "fminimumnum"; + case ISD::FMAXIMUMNUM: return "fmaximumnum"; case ISD::FNEG: return "fneg"; case ISD::FSQRT: return "fsqrt"; case ISD::STRICT_FSQRT: return "strict_fsqrt"; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index c4f4261a708fda..2c939967a5e1d9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1958,6 +1958,22 @@ bool TargetLowering::SimplifyDemandedBits( } } + // If this is (srl (sra X, C1), ShAmt), see if we can combine this into a + // single sra. We can do this if the top bits are never demanded. + if (Op0.getOpcode() == ISD::SRA && Op0.hasOneUse()) { + if (!DemandedBits.intersects(APInt::getHighBitsSet(BitWidth, ShAmt))) { + if (std::optional InnerSA = + TLO.DAG.getValidShiftAmount(Op0, DemandedElts, Depth + 2)) { + unsigned C1 = *InnerSA; + // Clamp the combined shift amount if it exceeds the bit width. 
+ unsigned Combined = std::min(C1 + ShAmt, BitWidth - 1); + SDValue NewSA = TLO.DAG.getConstant(Combined, dl, ShiftVT); + return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRA, dl, VT, + Op0.getOperand(0), NewSA)); + } + } + } + APInt InDemandedMask = (DemandedBits << ShAmt); // If the shift is exact, then it does demand the low bits (and knows that @@ -4901,7 +4917,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && N0.getOpcode() == ISD::SRA && isa(N0.getOperand(1)) && N0.getConstantOperandAPInt(1) == OpVT.getScalarSizeInBits() - 1 && - N1C && N1C->isAllOnes()) { + N1C->isAllOnes()) { return DAG.getSetCC(dl, VT, N0.getOperand(0), DAG.getConstant(0, dl, OpVT), Cond == ISD::SETEQ ? ISD::SETLT : ISD::SETGE); @@ -8542,6 +8558,94 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N, return MinMax; } +SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node, + SelectionDAG &DAG) const { + SDLoc DL(Node); + SDValue LHS = Node->getOperand(0); + SDValue RHS = Node->getOperand(1); + unsigned Opc = Node->getOpcode(); + EVT VT = Node->getValueType(0); + EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + bool IsMax = Opc == ISD::FMAXIMUMNUM; + const TargetOptions &Options = DAG.getTarget().Options; + SDNodeFlags Flags = Node->getFlags(); + + unsigned NewOp = + Opc == ISD::FMINIMUMNUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE; + + if (isOperationLegalOrCustom(NewOp, VT)) { + if (!Flags.hasNoNaNs()) { + // Insert canonicalizes if it's possible we need to quiet to get correct + // sNaN behavior. 
+ if (!DAG.isKnownNeverSNaN(LHS)) { + LHS = DAG.getNode(ISD::FCANONICALIZE, DL, VT, LHS, Flags); + } + if (!DAG.isKnownNeverSNaN(RHS)) { + RHS = DAG.getNode(ISD::FCANONICALIZE, DL, VT, RHS, Flags); + } + } + + return DAG.getNode(NewOp, DL, VT, LHS, RHS, Flags); + } + + // We can use FMINIMUM/FMAXIMUM if there is no NaN, since it has + // same behaviors for all of other cases: +0.0 vs -0.0 included. + if (Flags.hasNoNaNs() || + (DAG.isKnownNeverNaN(LHS) && DAG.isKnownNeverNaN(RHS))) { + unsigned IEEE2019Op = + Opc == ISD::FMINIMUMNUM ? ISD::FMINIMUM : ISD::FMAXIMUM; + if (isOperationLegalOrCustom(IEEE2019Op, VT)) + return DAG.getNode(IEEE2019Op, DL, VT, LHS, RHS, Flags); + } + + // FMINNUM/FMAXMUM returns qNaN if either operand is sNaN, and it may return + // either one for +0.0 vs -0.0. + if ((Flags.hasNoNaNs() || + (DAG.isKnownNeverSNaN(LHS) && DAG.isKnownNeverSNaN(RHS))) && + (Flags.hasNoSignedZeros() || DAG.isKnownNeverZeroFloat(LHS) || + DAG.isKnownNeverZeroFloat(RHS))) { + unsigned IEEE2008Op = Opc == ISD::FMINIMUMNUM ? ISD::FMINNUM : ISD::FMAXNUM; + if (isOperationLegalOrCustom(IEEE2008Op, VT)) + return DAG.getNode(IEEE2008Op, DL, VT, LHS, RHS, Flags); + } + + // If only one operand is NaN, override it with another operand. + if (!Flags.hasNoNaNs() && !DAG.isKnownNeverNaN(LHS)) { + LHS = DAG.getSelectCC(DL, LHS, LHS, RHS, LHS, ISD::SETUO); + } + if (!Flags.hasNoNaNs() && !DAG.isKnownNeverNaN(RHS)) { + RHS = DAG.getSelectCC(DL, RHS, RHS, LHS, RHS, ISD::SETUO); + } + + SDValue MinMax = + DAG.getSelectCC(DL, LHS, RHS, LHS, RHS, IsMax ? ISD::SETGT : ISD::SETLT); + // If MinMax is NaN, let's quiet it. + if (!Flags.hasNoNaNs() && !DAG.isKnownNeverNaN(LHS) && + !DAG.isKnownNeverNaN(RHS)) { + SDValue MinMaxQuiet = + DAG.getNode(ISD::FCANONICALIZE, DL, VT, MinMax, Flags); + MinMax = + DAG.getSelectCC(DL, MinMax, MinMax, MinMaxQuiet, MinMax, ISD::SETUO); + } + + // Fixup signed zero behavior. 
+ if (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros() || + DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS)) { + return MinMax; + } + SDValue TestZero = + DAG.getTargetConstant(IsMax ? fcPosZero : fcNegZero, DL, MVT::i32); + SDValue IsZero = DAG.getSetCC(DL, CCVT, MinMax, + DAG.getConstantFP(0.0, DL, VT), ISD::SETEQ); + SDValue LCmp = DAG.getSelect( + DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, LHS, TestZero), LHS, + MinMax, Flags); + SDValue RCmp = DAG.getSelect( + DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, RHS, TestZero), RHS, LCmp, + Flags); + return DAG.getSelect(DL, VT, IsZero, RCmp, MinMax, Flags); +} + /// Returns a true value if if this FPClassTest can be performed with an ordered /// fcmp to 0, and a false value if it's an unordered fcmp to 0. Returns /// std::nullopt if it cannot be performed as a compare with 0. diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 149b5dabee0565..4ff8617f740c89 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -713,6 +713,7 @@ void TargetLoweringBase::initActions() { ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUM, ISD::FMAXIMUM, + ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM, ISD::FMAD, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX, ISD::ABS, diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 83fec194d73904..f9b070e6f1eae4 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -2022,8 +2022,8 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc, Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; }); } - llvm::for_each(llvm::reverse(ToBeDeleted), - [](Instruction *I) { I->eraseFromParent(); }); + for (Instruction *I : llvm::reverse(ToBeDeleted)) + I->eraseFromParent(); }; addOutlineInfo(std::move(OI)); @@ -7049,8 +7049,8 @@ 
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask( } StaleCI->eraseFromParent(); - llvm::for_each(llvm::reverse(ToBeDeleted), - [](Instruction *I) { I->eraseFromParent(); }); + for (Instruction *I : llvm::reverse(ToBeDeleted)) + I->eraseFromParent(); }; addOutlineInfo(std::move(OI)); @@ -8345,9 +8345,8 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc, omp::RuntimeFunction::OMPRTL___kmpc_fork_teams), Args); - llvm::for_each(llvm::reverse(ToBeDeleted), - [](Instruction *I) { I->eraseFromParent(); }); - + for (Instruction *I : llvm::reverse(ToBeDeleted)) + I->eraseFromParent(); }; if (!Config.isTargetDevice()) diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp index 530979c75063b4..44cd1e69818953 100644 --- a/llvm/lib/IR/DataLayout.cpp +++ b/llvm/lib/IR/DataLayout.cpp @@ -139,53 +139,20 @@ class StructLayoutMap { } // end anonymous namespace //===----------------------------------------------------------------------===// -// LayoutAlignElem, LayoutAlign support -//===----------------------------------------------------------------------===// - -LayoutAlignElem LayoutAlignElem::get(Align ABIAlign, Align PrefAlign, - uint32_t BitWidth) { - assert(ABIAlign <= PrefAlign && "Preferred alignment worse than ABI!"); - LayoutAlignElem retval; - retval.ABIAlign = ABIAlign; - retval.PrefAlign = PrefAlign; - retval.TypeBitWidth = BitWidth; - return retval; -} - -bool LayoutAlignElem::operator==(const LayoutAlignElem &rhs) const { - return ABIAlign == rhs.ABIAlign && PrefAlign == rhs.PrefAlign && - TypeBitWidth == rhs.TypeBitWidth; -} - -//===----------------------------------------------------------------------===// -// PointerAlignElem, PointerAlign support +// DataLayout Class Implementation //===----------------------------------------------------------------------===// -PointerAlignElem PointerAlignElem::getInBits(uint32_t AddressSpace, - Align ABIAlign, Align PrefAlign, - uint32_t TypeBitWidth, - uint32_t IndexBitWidth) { - 
assert(ABIAlign <= PrefAlign && "Preferred alignment worse than ABI!"); - PointerAlignElem retval; - retval.AddressSpace = AddressSpace; - retval.ABIAlign = ABIAlign; - retval.PrefAlign = PrefAlign; - retval.TypeBitWidth = TypeBitWidth; - retval.IndexBitWidth = IndexBitWidth; - return retval; +bool DataLayout::PrimitiveSpec::operator==(const PrimitiveSpec &Other) const { + return BitWidth == Other.BitWidth && ABIAlign == Other.ABIAlign && + PrefAlign == Other.PrefAlign; } -bool -PointerAlignElem::operator==(const PointerAlignElem &rhs) const { - return (ABIAlign == rhs.ABIAlign && AddressSpace == rhs.AddressSpace && - PrefAlign == rhs.PrefAlign && TypeBitWidth == rhs.TypeBitWidth && - IndexBitWidth == rhs.IndexBitWidth); +bool DataLayout::PointerSpec::operator==(const PointerSpec &Other) const { + return AddrSpace == Other.AddrSpace && BitWidth == Other.BitWidth && + ABIAlign == Other.ABIAlign && PrefAlign == Other.PrefAlign && + IndexBitWidth == Other.IndexBitWidth; } -//===----------------------------------------------------------------------===// -// DataLayout Class Implementation -//===----------------------------------------------------------------------===// - const char *DataLayout::getManglingComponent(const Triple &T) { if (T.isOSBinFormatGOFF()) return "-m:l"; @@ -200,34 +167,34 @@ const char *DataLayout::getManglingComponent(const Triple &T) { // Default primitive type specifications. // NOTE: These arrays must be sorted by type bit width. 
-constexpr LayoutAlignElem DefaultIntSpecs[] = { +constexpr DataLayout::PrimitiveSpec DefaultIntSpecs[] = { {1, Align::Constant<1>(), Align::Constant<1>()}, // i1:8:8 {8, Align::Constant<1>(), Align::Constant<1>()}, // i8:8:8 {16, Align::Constant<2>(), Align::Constant<2>()}, // i16:16:16 {32, Align::Constant<4>(), Align::Constant<4>()}, // i32:32:32 {64, Align::Constant<4>(), Align::Constant<8>()}, // i64:32:64 }; -constexpr LayoutAlignElem DefaultFloatSpecs[] = { +constexpr DataLayout::PrimitiveSpec DefaultFloatSpecs[] = { {16, Align::Constant<2>(), Align::Constant<2>()}, // f16:16:16 {32, Align::Constant<4>(), Align::Constant<4>()}, // f32:32:32 {64, Align::Constant<8>(), Align::Constant<8>()}, // f64:64:64 {128, Align::Constant<16>(), Align::Constant<16>()}, // f128:128:128 }; -constexpr LayoutAlignElem DefaultVectorSpecs[] = { +constexpr DataLayout::PrimitiveSpec DefaultVectorSpecs[] = { {64, Align::Constant<8>(), Align::Constant<8>()}, // v64:64:64 {128, Align::Constant<16>(), Align::Constant<16>()}, // v128:128:128 }; // Default pointer type specifications. 
-constexpr PointerAlignElem DefaultPointerSpecs[] = { +constexpr DataLayout::PointerSpec DefaultPointerSpecs[] = { {0, 64, Align::Constant<8>(), Align::Constant<8>(), 64} // p0:64:64:64:64 }; DataLayout::DataLayout() - : IntAlignments(ArrayRef(DefaultIntSpecs)), - FloatAlignments(ArrayRef(DefaultFloatSpecs)), - VectorAlignments(ArrayRef(DefaultVectorSpecs)), - Pointers(ArrayRef(DefaultPointerSpecs)) {} + : IntSpecs(ArrayRef(DefaultIntSpecs)), + FloatSpecs(ArrayRef(DefaultFloatSpecs)), + VectorSpecs(ArrayRef(DefaultVectorSpecs)), + PointerSpecs(ArrayRef(DefaultPointerSpecs)) {} DataLayout::DataLayout(StringRef LayoutString) : DataLayout() { if (Error Err = parseSpecifier(LayoutString)) @@ -247,10 +214,10 @@ DataLayout &DataLayout::operator=(const DataLayout &Other) { TheFunctionPtrAlignType = Other.TheFunctionPtrAlignType; ManglingMode = Other.ManglingMode; LegalIntWidths = Other.LegalIntWidths; - IntAlignments = Other.IntAlignments; - FloatAlignments = Other.FloatAlignments; - VectorAlignments = Other.VectorAlignments; - Pointers = Other.Pointers; + IntSpecs = Other.IntSpecs; + FloatSpecs = Other.FloatSpecs; + VectorSpecs = Other.VectorSpecs; + PointerSpecs = Other.PointerSpecs; StructABIAlignment = Other.StructABIAlignment; StructPrefAlignment = Other.StructPrefAlignment; NonIntegralAddressSpaces = Other.NonIntegralAddressSpaces; @@ -268,11 +235,9 @@ bool DataLayout::operator==(const DataLayout &Other) const { FunctionPtrAlign == Other.FunctionPtrAlign && TheFunctionPtrAlignType == Other.TheFunctionPtrAlignType && ManglingMode == Other.ManglingMode && - LegalIntWidths == Other.LegalIntWidths && - IntAlignments == Other.IntAlignments && - FloatAlignments == Other.FloatAlignments && - VectorAlignments == Other.VectorAlignments && - Pointers == Other.Pointers && + LegalIntWidths == Other.LegalIntWidths && IntSpecs == Other.IntSpecs && + FloatSpecs == Other.FloatSpecs && VectorSpecs == Other.VectorSpecs && + PointerSpecs == Other.PointerSpecs && StructABIAlignment == 
Other.StructABIAlignment && StructPrefAlignment == Other.StructPrefAlignment; } @@ -361,10 +326,10 @@ Error DataLayout::parseSpecifier(StringRef Desc) { continue; } - char Specifier = Tok.front(); + char SpecifierChar = Tok.front(); Tok = Tok.substr(1); - switch (Specifier) { + switch (SpecifierChar) { case 's': // Deprecated, but ignoring here to preserve loading older textual llvm // ASM file @@ -433,9 +398,9 @@ Error DataLayout::parseSpecifier(StringRef Desc) { return reportError("Invalid index size of 0 bytes"); } } - if (Error Err = setPointerAlignmentInBits( - AddrSpace, assumeAligned(PointerABIAlign), - assumeAligned(PointerPrefAlign), PointerMemSize, IndexSize)) + if (Error Err = setPointerSpec( + AddrSpace, PointerMemSize, assumeAligned(PointerABIAlign), + assumeAligned(PointerPrefAlign), IndexSize)) return Err; break; } @@ -443,13 +408,22 @@ Error DataLayout::parseSpecifier(StringRef Desc) { case 'v': case 'f': case 'a': { - AlignTypeEnum AlignType; - switch (Specifier) { - default: llvm_unreachable("Unexpected specifier!"); - case 'i': AlignType = INTEGER_ALIGN; break; - case 'v': AlignType = VECTOR_ALIGN; break; - case 'f': AlignType = FLOAT_ALIGN; break; - case 'a': AlignType = AGGREGATE_ALIGN; break; + TypeSpecifier Specifier; + switch (SpecifierChar) { + default: + llvm_unreachable("Unexpected specifier!"); + case 'i': + Specifier = TypeSpecifier::Integer; + break; + case 'v': + Specifier = TypeSpecifier::Vector; + break; + case 'f': + Specifier = TypeSpecifier::Float; + break; + case 'a': + Specifier = TypeSpecifier::Aggregate; + break; } // Bit size. 
@@ -458,7 +432,7 @@ Error DataLayout::parseSpecifier(StringRef Desc) { if (Error Err = getInt(Tok, Size)) return Err; - if (AlignType == AGGREGATE_ALIGN && Size != 0) + if (Specifier == TypeSpecifier::Aggregate && Size != 0) return reportError( "Sized aggregate specification in datalayout string"); @@ -471,7 +445,7 @@ Error DataLayout::parseSpecifier(StringRef Desc) { unsigned ABIAlign; if (Error Err = getIntInBytes(Tok, ABIAlign)) return Err; - if (AlignType != AGGREGATE_ALIGN && !ABIAlign) + if (Specifier != TypeSpecifier::Aggregate && !ABIAlign) return reportError( "ABI alignment specification must be >0 for non-aggregate types"); @@ -479,7 +453,7 @@ Error DataLayout::parseSpecifier(StringRef Desc) { return reportError("Invalid ABI alignment, must be a 16bit integer"); if (ABIAlign != 0 && !isPowerOf2_64(ABIAlign)) return reportError("Invalid ABI alignment, must be a power of 2"); - if (AlignType == INTEGER_ALIGN && Size == 8 && ABIAlign != 1) + if (Specifier == TypeSpecifier::Integer && Size == 8 && ABIAlign != 1) return reportError( "Invalid ABI alignment, i8 must be naturally aligned"); @@ -498,8 +472,8 @@ Error DataLayout::parseSpecifier(StringRef Desc) { if (PrefAlign != 0 && !isPowerOf2_64(PrefAlign)) return reportError("Invalid preferred alignment, must be a power of 2"); - if (Error Err = setAlignment(AlignType, assumeAligned(ABIAlign), - assumeAligned(PrefAlign), Size)) + if (Error Err = setPrimitiveSpec(Specifier, Size, assumeAligned(ABIAlign), + assumeAligned(PrefAlign))) return Err; break; @@ -607,16 +581,17 @@ Error DataLayout::parseSpecifier(StringRef Desc) { return Error::success(); } -static SmallVectorImpl::const_iterator -findAlignmentLowerBound(const SmallVectorImpl &Alignments, - uint32_t BitWidth) { - return partition_point(Alignments, [BitWidth](const LayoutAlignElem &E) { - return E.TypeBitWidth < BitWidth; +static SmallVectorImpl::const_iterator +findPrimitiveSpecLowerBound( + const SmallVectorImpl &Specs, + uint32_t BitWidth) { + return 
partition_point(Specs, [BitWidth](const DataLayout::PrimitiveSpec &E) { + return E.BitWidth < BitWidth; }); } -Error DataLayout::setAlignment(AlignTypeEnum AlignType, Align ABIAlign, - Align PrefAlign, uint32_t BitWidth) { +Error DataLayout::setPrimitiveSpec(TypeSpecifier Specifier, uint32_t BitWidth, + Align ABIAlign, Align PrefAlign) { // AlignmentsTy::ABIAlign and AlignmentsTy::PrefAlign were once stored as // uint16_t, it is unclear if there are requirements for alignment to be less // than 2^16 other than storage. In the meantime we leave the restriction as @@ -628,74 +603,72 @@ Error DataLayout::setAlignment(AlignTypeEnum AlignType, Align ABIAlign, return reportError( "Preferred alignment cannot be less than the ABI alignment"); - SmallVectorImpl *Alignments; - switch (AlignType) { - case AGGREGATE_ALIGN: + SmallVectorImpl *Specs; + switch (Specifier) { + case TypeSpecifier::Aggregate: StructABIAlignment = ABIAlign; StructPrefAlignment = PrefAlign; return Error::success(); - case INTEGER_ALIGN: - Alignments = &IntAlignments; + case TypeSpecifier::Integer: + Specs = &IntSpecs; break; - case FLOAT_ALIGN: - Alignments = &FloatAlignments; + case TypeSpecifier::Float: + Specs = &FloatSpecs; break; - case VECTOR_ALIGN: - Alignments = &VectorAlignments; + case TypeSpecifier::Vector: + Specs = &VectorSpecs; break; } - auto I = partition_point(*Alignments, [BitWidth](const LayoutAlignElem &E) { - return E.TypeBitWidth < BitWidth; + auto I = partition_point(*Specs, [BitWidth](const PrimitiveSpec &E) { + return E.BitWidth < BitWidth; }); - if (I != Alignments->end() && I->TypeBitWidth == BitWidth) { + if (I != Specs->end() && I->BitWidth == BitWidth) { // Update the abi, preferred alignments. I->ABIAlign = ABIAlign; I->PrefAlign = PrefAlign; } else { // Insert before I to keep the vector sorted. 
- Alignments->insert(I, LayoutAlignElem::get(ABIAlign, PrefAlign, BitWidth)); + Specs->insert(I, PrimitiveSpec{BitWidth, ABIAlign, PrefAlign}); } return Error::success(); } -const PointerAlignElem & -DataLayout::getPointerAlignElem(uint32_t AddressSpace) const { - if (AddressSpace != 0) { - auto I = lower_bound(Pointers, AddressSpace, - [](const PointerAlignElem &A, uint32_t AddressSpace) { - return A.AddressSpace < AddressSpace; - }); - if (I != Pointers.end() && I->AddressSpace == AddressSpace) +const DataLayout::PointerSpec & +DataLayout::getPointerSpec(uint32_t AddrSpace) const { + if (AddrSpace != 0) { + auto I = lower_bound(PointerSpecs, AddrSpace, + [](const PointerSpec &Spec, uint32_t AddrSpace) { + return Spec.AddrSpace < AddrSpace; + }); + if (I != PointerSpecs.end() && I->AddrSpace == AddrSpace) return *I; } - assert(Pointers[0].AddressSpace == 0); - return Pointers[0]; + assert(PointerSpecs[0].AddrSpace == 0); + return PointerSpecs[0]; } -Error DataLayout::setPointerAlignmentInBits(uint32_t AddrSpace, Align ABIAlign, - Align PrefAlign, - uint32_t TypeBitWidth, - uint32_t IndexBitWidth) { +Error DataLayout::setPointerSpec(uint32_t AddrSpace, uint32_t BitWidth, + Align ABIAlign, Align PrefAlign, + uint32_t IndexBitWidth) { if (PrefAlign < ABIAlign) return reportError( "Preferred alignment cannot be less than the ABI alignment"); - if (IndexBitWidth > TypeBitWidth) + if (IndexBitWidth > BitWidth) return reportError("Index width cannot be larger than pointer width"); - auto I = lower_bound(Pointers, AddrSpace, - [](const PointerAlignElem &A, uint32_t AddressSpace) { - return A.AddressSpace < AddressSpace; - }); - if (I == Pointers.end() || I->AddressSpace != AddrSpace) { - Pointers.insert(I, - PointerAlignElem::getInBits(AddrSpace, ABIAlign, PrefAlign, - TypeBitWidth, IndexBitWidth)); + auto I = lower_bound(PointerSpecs, AddrSpace, + [](const PointerSpec &A, uint32_t AddrSpace) { + return A.AddrSpace < AddrSpace; + }); + if (I == PointerSpecs.end() || 
I->AddrSpace != AddrSpace) { + PointerSpecs.insert(I, PointerSpec{AddrSpace, BitWidth, ABIAlign, PrefAlign, + IndexBitWidth}); } else { + I->BitWidth = BitWidth; I->ABIAlign = ABIAlign; I->PrefAlign = PrefAlign; - I->TypeBitWidth = TypeBitWidth; I->IndexBitWidth = IndexBitWidth; } return Error::success(); @@ -703,11 +676,11 @@ Error DataLayout::setPointerAlignmentInBits(uint32_t AddrSpace, Align ABIAlign, Align DataLayout::getIntegerAlignment(uint32_t BitWidth, bool abi_or_pref) const { - auto I = findAlignmentLowerBound(IntAlignments, BitWidth); + auto I = findPrimitiveSpecLowerBound(IntSpecs, BitWidth); // If we don't have an exact match, use alignment of next larger integer // type. If there is none, use alignment of largest integer type by going // back one element. - if (I == IntAlignments.end()) + if (I == IntSpecs.end()) --I; return abi_or_pref ? I->ABIAlign : I->PrefAlign; } @@ -737,22 +710,22 @@ const StructLayout *DataLayout::getStructLayout(StructType *Ty) const { } Align DataLayout::getPointerABIAlignment(unsigned AS) const { - return getPointerAlignElem(AS).ABIAlign; + return getPointerSpec(AS).ABIAlign; } Align DataLayout::getPointerPrefAlignment(unsigned AS) const { - return getPointerAlignElem(AS).PrefAlign; + return getPointerSpec(AS).PrefAlign; } unsigned DataLayout::getPointerSize(unsigned AS) const { - return divideCeil(getPointerAlignElem(AS).TypeBitWidth, 8); + return divideCeil(getPointerSpec(AS).BitWidth, 8); } unsigned DataLayout::getMaxIndexSize() const { unsigned MaxIndexSize = 0; - for (auto &P : Pointers) + for (const PointerSpec &Spec : PointerSpecs) MaxIndexSize = - std::max(MaxIndexSize, (unsigned)divideCeil(P.TypeBitWidth, 8)); + std::max(MaxIndexSize, (unsigned)divideCeil(Spec.BitWidth, 8)); return MaxIndexSize; } @@ -765,7 +738,7 @@ unsigned DataLayout::getPointerTypeSizeInBits(Type *Ty) const { } unsigned DataLayout::getIndexSize(unsigned AS) const { - return divideCeil(getPointerAlignElem(AS).IndexBitWidth, 8); + return 
divideCeil(getPointerSpec(AS).IndexBitWidth, 8); } unsigned DataLayout::getIndexTypeSizeInBits(Type *Ty) const { @@ -819,8 +792,8 @@ Align DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const { case Type::FP128TyID: case Type::X86_FP80TyID: { unsigned BitWidth = getTypeSizeInBits(Ty).getFixedValue(); - auto I = findAlignmentLowerBound(FloatAlignments, BitWidth); - if (I != FloatAlignments.end() && I->TypeBitWidth == BitWidth) + auto I = findPrimitiveSpecLowerBound(FloatSpecs, BitWidth); + if (I != FloatSpecs.end() && I->BitWidth == BitWidth) return abi_or_pref ? I->ABIAlign : I->PrefAlign; // If we still couldn't find a reasonable default alignment, fall back @@ -834,8 +807,8 @@ Align DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const { case Type::FixedVectorTyID: case Type::ScalableVectorTyID: { unsigned BitWidth = getTypeSizeInBits(Ty).getKnownMinValue(); - auto I = findAlignmentLowerBound(VectorAlignments, BitWidth); - if (I != VectorAlignments.end() && I->TypeBitWidth == BitWidth) + auto I = findPrimitiveSpecLowerBound(VectorSpecs, BitWidth); + if (I != VectorSpecs.end() && I->BitWidth == BitWidth) return abi_or_pref ? I->ABIAlign : I->PrefAlign; // By default, use natural alignment for vector types. 
This is consistent diff --git a/llvm/lib/MC/MCDisassembler/MCExternalSymbolizer.cpp b/llvm/lib/MC/MCDisassembler/MCExternalSymbolizer.cpp index e3f4cdd21557e2..52cf1ff1376d08 100644 --- a/llvm/lib/MC/MCDisassembler/MCExternalSymbolizer.cpp +++ b/llvm/lib/MC/MCDisassembler/MCExternalSymbolizer.cpp @@ -10,6 +10,7 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" #include diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 1859fde279c98d..c528863d1870a4 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -33,6 +33,7 @@ #include "llvm/Analysis/DDG.h" #include "llvm/Analysis/DDGPrinter.h" #include "llvm/Analysis/DXILMetadataAnalysis.h" +#include "llvm/Analysis/DXILResource.h" #include "llvm/Analysis/Delinearization.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/DependenceAnalysis.h" diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 6927a2886b962b..0201e69f3e216a 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -18,6 +18,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CGSCCPassManager.h" +#include "llvm/Analysis/CtxProfAnalysis.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InlineAdvisor.h" #include "llvm/Analysis/ProfileSummaryInfo.h" @@ -1196,6 +1197,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, // In pre-link, we just want the instrumented IR. We use the contextual // profile in the post-thinlink phase. // The instrumentation will be removed in post-thinlink after IPO. + // FIXME(mtrofin): move AssignGUIDPass if there is agreement to use this + // mechanism for GUIDs. 
+ MPM.addPass(AssignGUIDPass()); if (IsCtxProfUse) return MPM; addPostPGOLoopRotation(MPM, Level); diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 5ef8ba30944470..18f4aa19224da0 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -22,6 +22,7 @@ MODULE_ANALYSIS("callgraph", CallGraphAnalysis()) MODULE_ANALYSIS("collector-metadata", CollectorMetadataAnalysis()) MODULE_ANALYSIS("ctx-prof-analysis", CtxProfAnalysis(UseCtxProfile)) MODULE_ANALYSIS("dxil-metadata", DXILMetadataAnalysis()) +MODULE_ANALYSIS("dxil-resource", DXILResourceAnalysis()) MODULE_ANALYSIS("inline-advisor", InlineAdvisorAnalysis()) MODULE_ANALYSIS("ir-similarity", IRSimilarityAnalysis()) MODULE_ANALYSIS("lcg", LazyCallGraphAnalysis()) @@ -45,6 +46,7 @@ MODULE_ALIAS_ANALYSIS("globals-aa", GlobalsAA()) #endif MODULE_PASS("always-inline", AlwaysInlinerPass()) MODULE_PASS("annotation2metadata", Annotation2MetadataPass()) +MODULE_PASS("assign-guid", AssignGUIDPass()) MODULE_PASS("attributor", AttributorPass()) MODULE_PASS("attributor-light", AttributorLightPass()) MODULE_PASS("called-value-propagation", CalledValuePropagationPass()) @@ -119,6 +121,7 @@ MODULE_PASS("print-must-be-executed-contexts", MODULE_PASS("print-profile-summary", ProfileSummaryPrinterPass(dbgs())) MODULE_PASS("print-stack-safety", StackSafetyGlobalPrinterPass(dbgs())) MODULE_PASS("print", DXILMetadataAnalysisPrinterPass(dbgs())) +MODULE_PASS("print", DXILResourcePrinterPass(dbgs())) MODULE_PASS("print", InlineAdvisorAnalysisPrinterPass(dbgs())) MODULE_PASS("print", ModuleDebugInfoPrinterPass(dbgs())) MODULE_PASS("pseudo-probe", SampleProfileProbePass(TM)) diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index 80809b23e34b2c..3c7dbb70c9f4b7 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -472,6 +472,103 @@ bool Instruction::classof(const sandboxir::Value *From) { } } +void 
Instruction::setHasNoUnsignedWrap(bool B) { + Ctx.getTracker() + .emplaceIfTracking>( + this); + cast(Val)->setHasNoUnsignedWrap(B); +} + +void Instruction::setHasNoSignedWrap(bool B) { + Ctx.getTracker() + .emplaceIfTracking>(this); + cast(Val)->setHasNoSignedWrap(B); +} + +void Instruction::setFast(bool B) { + Ctx.getTracker() + .emplaceIfTracking< + GenericSetter<&Instruction::isFast, &Instruction::setFast>>(this); + cast(Val)->setFast(B); +} + +void Instruction::setIsExact(bool B) { + Ctx.getTracker() + .emplaceIfTracking< + GenericSetter<&Instruction::isExact, &Instruction::setIsExact>>(this); + cast(Val)->setIsExact(B); +} + +void Instruction::setHasAllowReassoc(bool B) { + Ctx.getTracker() + .emplaceIfTracking>(this); + cast(Val)->setHasAllowReassoc(B); +} + +void Instruction::setHasNoNaNs(bool B) { + Ctx.getTracker() + .emplaceIfTracking< + GenericSetter<&Instruction::hasNoNaNs, &Instruction::setHasNoNaNs>>( + this); + cast(Val)->setHasNoNaNs(B); +} + +void Instruction::setHasNoInfs(bool B) { + Ctx.getTracker() + .emplaceIfTracking< + GenericSetter<&Instruction::hasNoInfs, &Instruction::setHasNoInfs>>( + this); + cast(Val)->setHasNoInfs(B); +} + +void Instruction::setHasNoSignedZeros(bool B) { + Ctx.getTracker() + .emplaceIfTracking>( + this); + cast(Val)->setHasNoSignedZeros(B); +} + +void Instruction::setHasAllowReciprocal(bool B) { + Ctx.getTracker() + .emplaceIfTracking>( + this); + cast(Val)->setHasAllowReciprocal(B); +} + +void Instruction::setHasAllowContract(bool B) { + Ctx.getTracker() + .emplaceIfTracking>( + this); + cast(Val)->setHasAllowContract(B); +} + +void Instruction::setFastMathFlags(FastMathFlags FMF) { + Ctx.getTracker() + .emplaceIfTracking>(this); + cast(Val)->setFastMathFlags(FMF); +} + +void Instruction::copyFastMathFlags(FastMathFlags FMF) { + Ctx.getTracker() + .emplaceIfTracking>(this); + cast(Val)->copyFastMathFlags(FMF); +} + +void Instruction::setHasApproxFunc(bool B) { + Ctx.getTracker() + .emplaceIfTracking>(this); + 
cast(Val)->setHasApproxFunc(B); +} + #ifndef NDEBUG void Instruction::dumpOS(raw_ostream &OS) const { OS << "Unimplemented! Please override dump()."; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 6cee8979596388..d65e0ae92308e6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -14,6 +14,7 @@ #include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/Analysis/CycleAnalysis.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" @@ -1039,13 +1040,26 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID, - &AAUnderlyingObjects::ID, &AAAddressSpace::ID}); + &AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID, + &AAInstanceInfo::ID}); AttributorConfig AC(CGUpdater); AC.IsClosedWorldModule = Options.IsClosedWorld; AC.Allowed = &Allowed; AC.IsModulePass = true; AC.DefaultInitializeLiveInternals = false; + AC.IndirectCalleeSpecializationCallback = + [&TM](Attributor &A, const AbstractAttribute &AA, CallBase &CB, + Function &Callee, unsigned NumAssumedCallees) { + if (AMDGPU::isEntryFunctionCC(Callee.getCallingConv())) + return false; + // Singleton functions can be specialized. + if (NumAssumedCallees == 1) + return true; + // Otherwise specialize uniform values. 
+ const auto &TTI = TM.getTargetTransformInfo(*CB.getCaller()); + return TTI.isAlwaysUniform(CB.getCalledOperand()); + }; AC.IPOAmendableCB = [](const Function &F) { return F.getCallingConv() == CallingConv::AMDGPU_KERNEL; }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.cpp index fb3d3259171aca..36f44a20d95532 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.cpp @@ -9,9 +9,17 @@ #include "AMDGPUCodeGenPassBuilder.h" #include "AMDGPU.h" #include "AMDGPUISelDAGToDAG.h" +#include "AMDGPUPerfHintAnalysis.h" #include "AMDGPUTargetMachine.h" +#include "AMDGPUUnifyDivergentExitNodes.h" #include "SIFixSGPRCopies.h" #include "llvm/Analysis/UniformityAnalysis.h" +#include "llvm/Transforms/Scalar/FlattenCFG.h" +#include "llvm/Transforms/Scalar/Sink.h" +#include "llvm/Transforms/Scalar/StructurizeCFG.h" +#include "llvm/Transforms/Utils/FixIrreducible.h" +#include "llvm/Transforms/Utils/LCSSA.h" +#include "llvm/Transforms/Utils/UnifyLoopExits.h" using namespace llvm; @@ -28,8 +36,51 @@ AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder( } void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const { - // TODO: Add passes pre instruction selection. - // Test only, convert to real IR passes in future. + const bool LateCFGStructurize = AMDGPUTargetMachine::EnableLateStructurizeCFG; + const bool DisableStructurizer = AMDGPUTargetMachine::DisableStructurizer; + const bool EnableStructurizerWorkarounds = + AMDGPUTargetMachine::EnableStructurizerWorkarounds; + + if (TM.getOptLevel() > CodeGenOptLevel::None) + addPass(FlattenCFGPass()); + + if (TM.getOptLevel() > CodeGenOptLevel::None) + addPass(SinkingPass()); + + addPass(AMDGPULateCodeGenPreparePass(TM)); + + // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit + // regions formed by them. 
+ + addPass(AMDGPUUnifyDivergentExitNodesPass()); + + if (!LateCFGStructurize && !DisableStructurizer) { + if (EnableStructurizerWorkarounds) { + addPass(FixIrreduciblePass()); + addPass(UnifyLoopExitsPass()); + } + + addPass(StructurizeCFGPass(/*SkipUniformRegions=*/false)); + } + + addPass(AMDGPUAnnotateUniformValuesPass()); + + if (!LateCFGStructurize && !DisableStructurizer) { + addPass(SIAnnotateControlFlowPass(TM)); + + // TODO: Move this right after structurizeCFG to avoid extra divergence + // analysis. This depends on stopping SIAnnotateControlFlow from making + // control flow modifications. + addPass(AMDGPURewriteUndefForPHIPass()); + } + + addPass(LCSSAPass()); + + if (TM.getOptLevel() > CodeGenOptLevel::Less) + addPass(AMDGPUPerfHintAnalysisPass(TM)); + + // FIXME: Why isn't this queried as required from AMDGPUISelDAGToDAG, and why + // isn't this in addInstSelector? addPass(RequireAnalysisPass()); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 33474e7de01888..76553e99431c11 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -1195,14 +1195,10 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes( WorkList.push_back(ICmp); } - if (UseInst->getOpcode() == Instruction::AddrSpaceCast) { - // Give up if the pointer may be captured. - if (PointerMayBeCaptured(UseInst, true, true)) - return false; - // Don't collect the users of this. - WorkList.push_back(User); - continue; - } + // TODO: If we know the address is only observed through flat pointers, we + // could still promote. + if (UseInst->getOpcode() == Instruction::AddrSpaceCast) + return false; // Do not promote vector/aggregate type instructions. It is hard to track // their users. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index d82be9a7e9041a..b8aa93285ad849 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -338,10 +338,11 @@ static cl::opt EnableScalarIRPasses( cl::init(true), cl::Hidden); -static cl::opt EnableStructurizerWorkarounds( +static cl::opt EnableStructurizerWorkarounds( "amdgpu-enable-structurizer-workarounds", - cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), - cl::Hidden); + cl::desc("Enable workarounds for the StructurizeCFG pass"), + cl::location(AMDGPUTargetMachine::EnableStructurizerWorkarounds), + cl::init(true), cl::Hidden); static cl::opt EnableLowerModuleLDS( "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"), @@ -616,6 +617,7 @@ bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; bool AMDGPUTargetMachine::EnableFunctionCalls = false; bool AMDGPUTargetMachine::EnableLowerModuleLDS = true; bool AMDGPUTargetMachine::DisableStructurizer = false; +bool AMDGPUTargetMachine::EnableStructurizerWorkarounds = true; AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 6bb8788cc73b0c..4d39ad2b415052 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -38,6 +38,7 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { static bool EnableFunctionCalls; static bool EnableLowerModuleLDS; static bool DisableStructurizer; + static bool EnableStructurizerWorkarounds; AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 725f49984483c1..1a10206eea2374 100644 --- 
a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -8546,7 +8546,7 @@ static void cvtVOP3DstOpSelOnly(MCInst &Inst, const MCRegisterInfo &MRI) { uint32_t ModVal = Inst.getOperand(ModIdx).getImm(); if (DstOp.isReg() && MRI.getRegClass(AMDGPU::VGPR_16RegClassID).contains(DstOp.getReg())) { - if (AMDGPU::isHi(DstOp.getReg(), MRI)) + if (AMDGPU::isHi16Reg(DstOp.getReg(), MRI)) ModVal |= SISrcMods::DST_OP_SEL; } else { if ((OpSel & (1 << SrcNum)) != 0) @@ -8826,7 +8826,7 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, if (SrcOp.isReg() && getMRI() ->getRegClass(AMDGPU::VGPR_16RegClassID) .contains(SrcOp.getReg())) { - bool VGPRSuffixIsHi = AMDGPU::isHi(SrcOp.getReg(), *getMRI()); + bool VGPRSuffixIsHi = AMDGPU::isHi16Reg(SrcOp.getReg(), *getMRI()); if (VGPRSuffixIsHi) ModVal |= SISrcMods::OP_SEL_0; } else { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index 2c9d17d448eadd..2af1f919730257 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -608,7 +608,7 @@ void AMDGPUMCCodeEmitter::getMachineOpValueT16( AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst); if (VDstMOIdx != -1) { auto DstReg = MI.getOperand(VDstMOIdx).getReg(); - if (AMDGPU::isHi(DstReg, MRI)) + if (AMDGPU::isHi16Reg(DstReg, MRI)) Op |= SISrcMods::DST_OP_SEL; } } else if ((int)OpNo == AMDGPU::getNamedOperandIdx( @@ -626,7 +626,7 @@ void AMDGPUMCCodeEmitter::getMachineOpValueT16( auto SrcReg = SrcMO.getReg(); if (AMDGPU::isSGPR(SrcReg, &MRI)) return; - if (AMDGPU::isHi(SrcReg, MRI)) + if (AMDGPU::isHi16Reg(SrcReg, MRI)) Op |= SISrcMods::OP_SEL_0; } @@ -637,7 +637,7 @@ void AMDGPUMCCodeEmitter::getMachineOpValueT16Lo128( if (MO.isReg()) { uint16_t Encoding = MRI.getEncodingValue(MO.getReg()); unsigned RegIdx = Encoding & 
AMDGPU::HWEncoding::REG_IDX_MASK; - bool IsHi = Encoding & AMDGPU::HWEncoding::IS_HI; + bool IsHi = Encoding & AMDGPU::HWEncoding::IS_HI16; bool IsVGPR = Encoding & AMDGPU::HWEncoding::IS_VGPR; assert((!IsVGPR || isUInt<7>(RegIdx)) && "VGPR0-VGPR127 expected!"); Op = (IsVGPR ? 0x100 : 0) | (IsHi ? 0x80 : 0) | RegIdx; diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 2f001db776975f..fb3d83ca30d198 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -371,7 +371,7 @@ enum : unsigned { REG_IDX_MASK = 0xff, IS_VGPR = 1 << 8, IS_AGPR = 1 << 9, - IS_HI = 1 << 10, // High 16-bit register. + IS_HI16 = 1 << 10, }; } // namespace HWEncoding diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 86fc100f1c2da0..25fee559faa29c 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16657,18 +16657,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { Value *Val = AI->getValOperand(); Type *ValTy = Val->getType(); Value *Addr = AI->getPointerOperand(); - - auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr, - Value *Val) -> Value * { - AtomicRMWInst *OldVal = - Builder.CreateAtomicRMW(AI->getOperation(), Addr, Val, AI->getAlign(), - AI->getOrdering(), AI->getSyncScopeID()); - SmallVector> MDs; - AI->getAllMetadata(MDs); - for (auto &P : MDs) - OldVal->setMetadata(P.first, P.second); - return OldVal; - }; + Align Alignment = AI->getAlign(); std::prev(BB->end())->eraseFromParent(); Builder.SetInsertPoint(BB); @@ -16679,7 +16668,13 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { Builder.SetInsertPoint(SharedBB); Value *CastToLocal = Builder.CreateAddrSpaceCast( Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS)); - Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val); + + Instruction *Clone = AI->clone(); + 
Clone->insertInto(SharedBB, SharedBB->end()); + Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex()) + .set(CastToLocal); + Instruction *LoadedShared = Clone; + Builder.CreateBr(PhiBB); Builder.SetInsertPoint(CheckPrivateBB); @@ -16690,34 +16685,38 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { Builder.SetInsertPoint(PrivateBB); Value *CastToPrivate = Builder.CreateAddrSpaceCast( Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS)); - Value *LoadedPrivate = - Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private"); + Value *LoadedPrivate = Builder.CreateAlignedLoad(ValTy, CastToPrivate, + Alignment, "loaded.private"); Value *NewVal = buildAtomicRMWValue(Op, Builder, LoadedPrivate, Val); - Builder.CreateStore(NewVal, CastToPrivate); + Builder.CreateAlignedStore(NewVal, CastToPrivate, Alignment); Builder.CreateBr(PhiBB); Builder.SetInsertPoint(GlobalBB); Value *CastToGlobal = Builder.CreateAddrSpaceCast( Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS)); - Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val); + Value *LoadedGlobal = AI; + + AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal); + + AI->removeFromParent(); + AI->insertInto(GlobalBB, GlobalBB->end()); + Builder.CreateBr(PhiBB); Builder.SetInsertPoint(PhiBB); if (ReturnValueIsUsed) { PHINode *Loaded = Builder.CreatePHI(ValTy, 3); + AI->replaceAllUsesWith(Loaded); Loaded->addIncoming(LoadedShared, SharedBB); Loaded->addIncoming(LoadedPrivate, PrivateBB); Loaded->addIncoming(LoadedGlobal, GlobalBB); Loaded->takeName(AI); - AI->replaceAllUsesWith(Loaded); } Builder.CreateBr(ExitBB); - - AI->eraseFromParent(); } LoadInst * diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 8af5c364509f0e..9147242046ceda 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -954,8 +954,8 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, bool 
IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg); bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg); bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg); - bool DstLow = !AMDGPU::isHi(DestReg, RI); - bool SrcLow = !AMDGPU::isHi(SrcReg, RI); + bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI); + bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI); MCRegister NewDestReg = RI.get32BitRegister(DestReg); MCRegister NewSrcReg = RI.get32BitRegister(SrcReg); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index ee72837a50fc43..7523b619748cc7 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -332,7 +332,7 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) RegPressureIgnoredUnits.resize(getNumRegUnits()); RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin()); for (auto Reg : AMDGPU::VGPR_16RegClass) { - if (AMDGPU::isHi(Reg, *this)) + if (AMDGPU::isHi16Reg(Reg, *this)) RegPressureIgnoredUnits.set(*regunits(Reg).begin()); } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 519048356f764d..d3e39464fea396 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -123,7 +123,7 @@ class SIRegisterTuples Indices, RegisterClass RC, // Declarations that describe the SI registers //===----------------------------------------------------------------------===// class SIReg regIdx = 0, bit isVGPR = 0, - bit isAGPR = 0, bit isHi = 0> : Register { + bit isAGPR = 0, bit isHi16 = 0> : Register { let Namespace = "AMDGPU"; // These are generic helper values we use to form actual register @@ -132,7 +132,7 @@ class SIReg regIdx = 0, bit isVGPR = 0, let HWEncoding{7-0} = regIdx; let HWEncoding{8} = isVGPR; let HWEncoding{9} = isAGPR; - let HWEncoding{10} = isHi; + let HWEncoding{10} = isHi16; int Index = !cast(regIdx); } @@ -161,7 +161,7 @@ 
multiclass SIRegLoHi16 regIdx, bit ArtificialHigh = 1, bit isVGPR = 0, bit isAGPR = 0> { def _LO16 : SIReg; def _HI16 : SIReg { + /* isHi16 */ 1> { let isArtificial = ArtificialHigh; } def "" : RegisterWithSubRegs(NAME#"_LO16"), diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 96d4863e94014c..0ca6266cc678b6 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -2237,8 +2237,8 @@ bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { Reg == AMDGPU::SCC; } -bool isHi(unsigned Reg, const MCRegisterInfo &MRI) { - return MRI.getEncodingValue(Reg) & AMDGPU::HWEncoding::IS_HI; +bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI) { + return MRI.getEncodingValue(Reg) & AMDGPU::HWEncoding::IS_HI16; } #define MAP_REG2REG \ diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 429c3ad335d213..a4e6a7ebe0558b 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1314,8 +1314,7 @@ bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST); bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); /// \returns if \p Reg occupies the high 16-bits of a 32-bit register. -/// The bit indicating isHi is the LSB of the encoding. -bool isHi(unsigned Reg, const MCRegisterInfo &MRI); +bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI); /// If \p Reg is a pseudo reg, return the correct hardware register given /// \p STI otherwise return \p Reg. 
diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp index 626321f44c2bfc..e63633b8a1e1ab 100644 --- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp +++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp @@ -252,7 +252,6 @@ static bool expandNormalizeIntrinsic(CallInst *Orig) { return true; } - Value *Elt = Builder.CreateExtractElement(X, (uint64_t)0); unsigned XVecSize = XVec->getNumElements(); Value *DotProduct = nullptr; // use the dot intrinsic corresponding to the vector size diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp index 583bce0f50e700..21089a232783a5 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp @@ -29,7 +29,7 @@ class DXILTranslateMetadata : public ModulePass { static char ID; // Pass identification, replacement for typeid explicit DXILTranslateMetadata() : ModulePass(ID) {} - StringRef getPassName() const override { return "DXIL Metadata Emit"; } + StringRef getPassName() const override { return "DXIL Translate Metadata"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); @@ -67,9 +67,9 @@ ModulePass *llvm::createDXILTranslateMetadataPass() { return new DXILTranslateMetadata(); } -INITIALIZE_PASS_BEGIN(DXILTranslateMetadata, "dxil-metadata-emit", - "DXIL Metadata Emit", false, false) +INITIALIZE_PASS_BEGIN(DXILTranslateMetadata, "dxil-translate-metadata", + "DXIL Translate Metadata", false, false) INITIALIZE_PASS_DEPENDENCY(DXILResourceMDWrapper) INITIALIZE_PASS_DEPENDENCY(ShaderFlagsAnalysisWrapper) -INITIALIZE_PASS_END(DXILTranslateMetadata, "dxil-metadata-emit", - "DXIL Metadata Emit", false, false) +INITIALIZE_PASS_END(DXILTranslateMetadata, "dxil-translate-metadata", + "DXIL Translate Metadata", false, false) diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp 
b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp index e6bc3af6e191a1..78783084ee59a0 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp @@ -236,15 +236,11 @@ void M68kMCCodeEmitter::encodeInstruction(const MCInst &MI, APInt Scratch(64, 0U); // One APInt word is enough. getBinaryCodeForInstr(MI, Fixups, EncodedInst, Scratch, STI); - ArrayRef Data(EncodedInst.getRawData(), EncodedInst.getNumWords()); - int64_t InstSize = EncodedInst.getBitWidth(); - for (uint64_t Word : Data) { - for (int i = 0; i < 4 && InstSize > 0; ++i, InstSize -= 16) { - support::endian::write(CB, static_cast(Word), - llvm::endianness::big); - Word >>= 16; - } - } + unsigned InstSize = EncodedInst.getBitWidth(); + for (unsigned i = 0; i != InstSize; i += 16) + support::endian::write( + CB, static_cast(EncodedInst.extractBitsAsZExtValue(16, i)), + llvm::endianness::big); } MCCodeEmitter *llvm::createM68kMCCodeEmitter(const MCInstrInfo &MCII, diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 25c198f0121e59..4f0bc1a2044642 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1974,8 +1974,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { Ops.append({BasePtr, Chain}); } - SDNode *NVPTXST = NVPTXST = - CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops); + SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops); if (!NVPTXST) return false; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index d75dc8781f7802..b57c86fcf697cd 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -159,6 +159,7 @@ def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">; def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; +def hasDotInstructions : 
Predicate<"Subtarget->hasDotInstructions()">; def True : Predicate<"true">; def False : Predicate<"false">; @@ -3920,6 +3921,33 @@ let isTerminator = 1, isBranch = 1, isIndirectBranch = 1, isNotDuplicable = 1 in } +foreach a_type = ["s", "u"] in { + foreach b_type = ["s", "u"] in { + + def DOT4_ # a_type # b_type : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c), + "dp4a." # a_type # "32." # b_type # "32 \t$dst, $a, $b, $c;", + [(set Int32Regs:$dst, + (!cast("int_nvvm_idp4a_" # a_type # "_" # b_type) + (i32 Int32Regs:$a), (i32 Int32Regs:$b), (i32 Int32Regs:$c)))]>, + Requires<[hasDotInstructions]>; + + foreach is_hi = [0, -1] in { + defvar lohi_suffix = !if(is_hi, "hi", "lo"); + + def DOT2_ # lohi_suffix # _ # a_type # b_type : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c), + "dp2a." # lohi_suffix # "." # a_type # "32." # b_type # "32 \t$dst, $a, $b, $c;", + [(set Int32Regs:$dst, + (!cast("int_nvvm_idp2a_" # a_type # "_" # b_type) + (i32 Int32Regs:$a), (i32 Int32Regs:$b), is_hi, (i32 Int32Regs:$c)))]>, + Requires<[hasDotInstructions]>; + } + } +} + include "NVPTXIntrinsics.td" //----------------------------------- diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 8df41913ff12ef..e47050734aae1e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -90,6 +90,9 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; } // Does SM & PTX support atomic relaxed MMIO operations ? 
bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; } + bool hasDotInstructions() const { + return SmVersion >= 61 && PTXVersion >= 50; + } unsigned int getFullSmVersion() const { return FullSmVersion; } unsigned int getSmVersion() const { return getFullSmVersion() / 10; } // GPUs with "a" suffix have include architecture-accelerated features that diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index 1ef891d1b677a2..763b6edb1c09fb 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -111,6 +111,15 @@ static cl::opt EnablePPCGenScalarMASSEntries( "(scalar) entries"), cl::Hidden); +static cl::opt + EnableGlobalMerge("ppc-global-merge", cl::Hidden, cl::init(false), + cl::desc("Enable the global merge pass")); + +static cl::opt + GlobalMergeMaxOffset("ppc-global-merge-max-offset", cl::Hidden, + cl::init(0x7fff), + cl::desc("Maximum global merge offset")); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTarget() { // Register the targets RegisterTargetMachine A(getThePPC32Target()); @@ -491,6 +500,10 @@ void PPCPassConfig::addIRPasses() { } bool PPCPassConfig::addPreISel() { + if (EnableGlobalMerge) + addPass( + createGlobalMergePass(TM, GlobalMergeMaxOffset, false, false, true)); + if (MergeStringPool && getOptLevel() != CodeGenOptLevel::None) addPass(createPPCMergeStringPoolPass()); diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index fdb1ebace00107..92d00c26bd219c 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -1096,19 +1096,21 @@ bool RISCVInstructionSelector::selectAddr(MachineInstr &MI, bool RISCVInstructionSelector::selectSExtInreg(MachineInstr &MI, MachineIRBuilder &MIB) const { - if (!STI.isRV64()) - return false; - - const MachineOperand 
&Size = MI.getOperand(2); - // Only Size == 32 (i.e. shift by 32 bits) is acceptable at this point. - if (!Size.isImm() || Size.getImm() != 32) - return false; - - const MachineOperand &Src = MI.getOperand(1); - const MachineOperand &Dst = MI.getOperand(0); - // addiw rd, rs, 0 (i.e. sext.w rd, rs) - MachineInstr *NewMI = - MIB.buildInstr(RISCV::ADDIW, {Dst.getReg()}, {Src.getReg()}).addImm(0U); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + unsigned SrcSize = MI.getOperand(2).getImm(); + + MachineInstr *NewMI; + if (SrcSize == 32) { + assert(Subtarget->is64Bit() && "Unexpected extend"); + // addiw rd, rs, 0 (i.e. sext.w rd, rs) + NewMI = MIB.buildInstr(RISCV::ADDIW, {DstReg}, {SrcReg}).addImm(0U); + } else { + assert(Subtarget->hasStdExtZbb() && "Unexpected extension"); + assert((SrcSize == 8 || SrcSize == 16) && "Unexpected size"); + unsigned Opc = SrcSize == 16 ? RISCV::SEXT_H : RISCV::SEXT_B; + NewMI = MIB.buildInstr(Opc, {DstReg}, {SrcReg}); + } if (!constrainSelectedInstRegOperands(*NewMI, TII, TRI, RBI)) return false; diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index 74bfe8b838af77..7491af1aae2a5a 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -171,11 +171,14 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) if (ST.is64Bit()) { ExtActions.legalFor({{sXLen, s32}}); getActionDefinitionsBuilder(G_SEXT_INREG) - .customFor({sXLen}) + .customFor({s32, sXLen}) .maxScalar(0, sXLen) .lower(); } else { - getActionDefinitionsBuilder(G_SEXT_INREG).maxScalar(0, sXLen).lower(); + getActionDefinitionsBuilder(G_SEXT_INREG) + .customFor({s32}) + .maxScalar(0, sXLen) + .lower(); } ExtActions.customIf(typeIsLegalBoolVec(1, BoolVecTys, ST)) .maxScalar(0, sXLen); @@ -500,9 +503,9 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) // as the 
destination. getActionDefinitionsBuilder(G_VAARG) // TODO: Implement narrowScalar and widenScalar for G_VAARG for types - // outside the [s32, sXLen] range. - .clampScalar(0, s32, sXLen) - .lowerForCartesianProduct({s32, sXLen, p0}, {p0}); + // other than sXLen. + .clampScalar(0, sXLen, sXLen) + .lowerForCartesianProduct({sXLen, p0}, {p0}); getActionDefinitionsBuilder(G_VSCALE) .clampScalar(0, sXLen, sXLen) @@ -869,6 +872,7 @@ bool RISCVLegalizerInfo::legalizeCustom( LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const { MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; + MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); GISelChangeObserver &Observer = Helper.Observer; MachineFunction &MF = *MI.getParent()->getParent(); switch (MI.getOpcode()) { @@ -893,9 +897,13 @@ bool RISCVLegalizerInfo::legalizeCustom( case TargetOpcode::G_LSHR: return legalizeShlAshrLshr(MI, MIRBuilder, Observer); case TargetOpcode::G_SEXT_INREG: { - // Source size of 32 is sext.w. + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); int64_t SizeInBits = MI.getOperand(2).getImm(); - if (SizeInBits == 32) + // Source size of 32 is sext.w. 
+ if (DstTy.getSizeInBits() == 64 && SizeInBits == 32) + return true; + + if (STI.hasStdExtZbb() && (SizeInBits == 8 || SizeInBits == 16)) return true; return Helper.lower(MI, 0, /* Unused hint type */ LLT()) == diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index ce3a37e194d545..35681c620eed51 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -2978,8 +2978,8 @@ bool RISCVDAGToDAGISel::selectSHXADDOp(SDValue N, unsigned ShAmt, if (N.getOpcode() == ISD::AND && isa(N.getOperand(1))) { SDValue N0 = N.getOperand(0); - bool LeftShift = N0.getOpcode() == ISD::SHL; - if ((LeftShift || N0.getOpcode() == ISD::SRL) && + if (bool LeftShift = N0.getOpcode() == ISD::SHL; + (LeftShift || N0.getOpcode() == ISD::SRL) && isa(N0.getOperand(1))) { uint64_t Mask = N.getConstantOperandVal(1); unsigned C2 = N0.getConstantOperandVal(1); @@ -3020,11 +3020,9 @@ bool RISCVDAGToDAGISel::selectSHXADDOp(SDValue N, unsigned ShAmt, } } } - } - - bool LeftShift = N.getOpcode() == ISD::SHL; - if ((LeftShift || N.getOpcode() == ISD::SRL) && - isa(N.getOperand(1))) { + } else if (bool LeftShift = N.getOpcode() == ISD::SHL; + (LeftShift || N.getOpcode() == ISD::SRL) && + isa(N.getOperand(1))) { SDValue N0 = N.getOperand(0); if (N0.getOpcode() == ISD::AND && N0.hasOneUse() && isa(N0.getOperand(1))) { diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 2b14deb479bf6f..911fa45d7173e8 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1428,6 +1428,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // Disable strict node mutation. IsStrictFPEnabled = true; + EnableExtLdPromotion = true; // Let the subtarget decide if a predictable select is more expensive than the // corresponding branch. 
This information is used in CGP/SelectOpt to decide @@ -13843,8 +13844,10 @@ performSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); // Fold (sext_inreg (fmv_x_anyexth X), i16) -> (fmv_x_signexth X) + // Don't do this with Zhinx. We need to explicitly sign extend the GPR. if (Src.getOpcode() == RISCVISD::FMV_X_ANYEXTH && - cast(N->getOperand(1))->getVT().bitsGE(MVT::i16)) + cast(N->getOperand(1))->getVT().bitsGE(MVT::i16) && + Subtarget.hasStdExtZfhmin()) return DAG.getNode(RISCVISD::FMV_X_SIGNEXTH, SDLoc(N), VT, Src.getOperand(0)); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index 85715ca9145c35..abdd366741eb04 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -458,7 +458,6 @@ def : Pat<(any_fpextend FPR16INX:$rs1), (FCVT_S_H_INX FPR16INX:$rs1, FRM_RNE)>; // Moves (no conversion) def : Pat<(f16 (riscv_fmv_h_x GPR:$src)), (COPY_TO_REGCLASS GPR:$src, GPR)>; def : Pat<(riscv_fmv_x_anyexth FPR16INX:$src), (COPY_TO_REGCLASS FPR16INX:$src, GPR)>; -def : Pat<(riscv_fmv_x_signexth FPR16INX:$src), (COPY_TO_REGCLASS FPR16INX:$src, GPR)>; def : Pat<(fcopysign FPR32INX:$rs1, FPR16INX:$rs2), (FSGNJ_S_INX $rs1, (FCVT_S_H_INX $rs2, FRM_RNE))>; } // Predicates = [HasStdExtZhinxmin] diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 85683c62064435..781e3d7929aa43 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -2000,3 +2000,35 @@ bool RISCVTTIImpl::areInlineCompatible(const Function *Caller, // target-features. return (CallerBits & CalleeBits) == CalleeBits; } + +/// See if \p I should be considered for address type promotion. We check if \p +/// I is a sext with right type and used in memory accesses. 
If it used in a +/// "complex" getelementptr, we allow it to be promoted without finding other +/// sext instructions that sign extended the same initial value. A getelementptr +/// is considered as "complex" if it has more than 2 operands. +bool RISCVTTIImpl::shouldConsiderAddressTypePromotion( + const Instruction &I, bool &AllowPromotionWithoutCommonHeader) { + bool Considerable = false; + AllowPromotionWithoutCommonHeader = false; + if (!isa(&I)) + return false; + Type *ConsideredSExtType = + Type::getInt64Ty(I.getParent()->getParent()->getContext()); + if (I.getType() != ConsideredSExtType) + return false; + // See if the sext is the one with the right type and used in at least one + // GetElementPtrInst. + for (const User *U : I.users()) { + if (const GetElementPtrInst *GEPInst = dyn_cast(U)) { + Considerable = true; + // A getelementptr is considered as "complex" if it has more than 2 + // operands. We will promote a SExt used in such complex GEP as we + // expect some computation to be merged if they are done on 64 bits. 
+ if (GEPInst->getNumOperands() > 2) { + AllowPromotionWithoutCommonHeader = true; + break; + } + } + } + return Considerable; +} diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 9c37a4f6ec2d04..f5eca2839acd05 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -397,7 +397,9 @@ class RISCVTTIImpl : public BasicTTIImplBase { bool shouldFoldTerminatingConditionAfterLSR() const { return true; } - + bool + shouldConsiderAddressTypePromotion(const Instruction &I, + bool &AllowPromotionWithoutCommonHeader); std::optional getMinPageSize() const { return 4096; } }; diff --git a/llvm/lib/Target/SystemZ/SystemZ.h b/llvm/lib/Target/SystemZ/SystemZ.h index 8824954ce44819..4d6ec7664de192 100644 --- a/llvm/lib/Target/SystemZ/SystemZ.h +++ b/llvm/lib/Target/SystemZ/SystemZ.h @@ -207,6 +207,10 @@ void initializeSystemZPostRewritePass(PassRegistry &); void initializeSystemZShortenInstPass(PassRegistry &); void initializeSystemZTDCPassPass(PassRegistry &); +namespace SYSTEMZAS { +enum : unsigned { PTR32 = 1 }; +} // namespace SYSTEMZAS + } // end namespace llvm #endif diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.h b/llvm/lib/Target/SystemZ/SystemZCallingConv.h index 387411942abaf3..25f4aacd20166f 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.h +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.h @@ -142,6 +142,17 @@ inline bool CC_SystemZ_I128Indirect(unsigned &ValNo, MVT &ValVT, return true; } +// A pointer in 64bit mode is always passed as 64bit. 
+inline bool CC_XPLINK64_Pointer(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { + if (LocVT != MVT::i64) { + LocVT = MVT::i64; + LocInfo = CCValAssign::ZExt; + } + return false; +} + inline bool CC_XPLINK64_Shadow_Reg(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td index 136d3d25472193..b0618aafa5da6e 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td @@ -226,6 +226,8 @@ def CC_SystemZ_XPLINK64 : CallingConv<[ // Although we assign the f32 vararg to be bitcast, it will first be promoted // to an f64 within convertValVTToLocVT(). CCIfType<[f32, f64], CCIfNotFixed>>, + // Pointers are always passed in full 64-bit registers. + CCIfPtr>, // long double, can only be passed in GPR2 and GPR3, if available, // hence R2Q CCIfType<[f128], CCIfNotFixed>>, diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 383393914a1695..6f84bd6c6e4ff4 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -710,6 +710,12 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::VACOPY, MVT::Other, Custom); setOperationAction(ISD::VAEND, MVT::Other, Expand); + if (Subtarget.isTargetzOS()) { + // Handle address space casts between mixed sized pointers. + setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); + setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); + } + setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom); // Codes for which we want to perform some z-specific combinations. 
@@ -6059,6 +6065,34 @@ SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG, return Op; } +static SDValue lowerAddrSpaceCast(SDValue Op, SelectionDAG &DAG) { + SDLoc dl(Op); + SDValue Src = Op.getOperand(0); + MVT DstVT = Op.getSimpleValueType(); + + AddrSpaceCastSDNode *N = cast(Op.getNode()); + unsigned SrcAS = N->getSrcAddressSpace(); + + assert(SrcAS != N->getDestAddressSpace() && + "addrspacecast must be between different address spaces"); + + // addrspacecast [0 <- 1] : Assinging a ptr32 value to a 64-bit pointer. + // addrspacecast [1 <- 0] : Assigining a 64-bit pointer to a ptr32 value. + if (SrcAS == SYSTEMZAS::PTR32 && DstVT == MVT::i64) { + Op = DAG.getNode(ISD::AND, dl, MVT::i32, Src, + DAG.getConstant(0x7fffffff, dl, MVT::i32)); + Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op); + } else if (DstVT == MVT::i32) { + Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src); + Op = DAG.getNode(ISD::AND, dl, MVT::i32, Op, + DAG.getConstant(0x7fffffff, dl, MVT::i32)); + Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op); + } else { + report_fatal_error("Bad address space in addrspacecast"); + } + return Op; +} + SDValue SystemZTargetLowering::lowerIS_FPCLASS(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); @@ -6232,6 +6266,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op, return lowerShift(Op, DAG, SystemZISD::VSRL_BY_SCALAR); case ISD::SRA: return lowerShift(Op, DAG, SystemZISD::VSRA_BY_SCALAR); + case ISD::ADDRSPACECAST: + return lowerAddrSpaceCast(Op, DAG); case ISD::ROTL: return lowerShift(Op, DAG, SystemZISD::VROTL_BY_SCALAR); case ISD::IS_FPCLASS: @@ -6875,6 +6911,20 @@ SDValue SystemZTargetLowering::combineLOAD( SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; EVT LdVT = N->getValueType(0); + if (auto *LN = dyn_cast(N)) { + if (LN->getAddressSpace() == SYSTEMZAS::PTR32) { + MVT PtrVT = getPointerTy(DAG.getDataLayout()); + MVT LoadNodeVT = LN->getBasePtr().getSimpleValueType(); + if (PtrVT != LoadNodeVT) 
{ + SDLoc DL(LN); + SDValue AddrSpaceCast = DAG.getAddrSpaceCast( + DL, PtrVT, LN->getBasePtr(), SYSTEMZAS::PTR32, 0); + return DAG.getExtLoad(LN->getExtensionType(), DL, LN->getValueType(0), + LN->getChain(), AddrSpaceCast, LN->getMemoryVT(), + LN->getMemOperand()); + } + } + } SDLoc DL(N); // Replace a 128-bit load that is used solely to move its value into GPRs @@ -7042,6 +7092,20 @@ SDValue SystemZTargetLowering::combineSTORE( auto *SN = cast(N); auto &Op1 = N->getOperand(1); EVT MemVT = SN->getMemoryVT(); + + if (SN->getAddressSpace() == SYSTEMZAS::PTR32) { + MVT PtrVT = getPointerTy(DAG.getDataLayout()); + MVT StoreNodeVT = SN->getBasePtr().getSimpleValueType(); + if (PtrVT != StoreNodeVT) { + SDLoc DL(SN); + SDValue AddrSpaceCast = DAG.getAddrSpaceCast(DL, PtrVT, SN->getBasePtr(), + SYSTEMZAS::PTR32, 0); + return DAG.getStore(SN->getChain(), DL, SN->getValue(), AddrSpaceCast, + SN->getPointerInfo(), SN->getOriginalAlign(), + SN->getMemOperand()->getFlags(), SN->getAAInfo()); + } + } + // If we have (truncstoreiN (extract_vector_elt X, Y), Z) then it is better // for the extraction to be done on a vMiN value, so that we can use VSTE. 
// If X has wider elements then convert it to: diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp index 21c1556d1d8ed2..ad1f2dc532d1c2 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp @@ -30,7 +30,6 @@ void X86InstPrinterCommon::printCondCode(const MCInst *MI, unsigned Op, raw_ostream &O) { int64_t Imm = MI->getOperand(Op).getImm(); unsigned Opc = MI->getOpcode(); - bool IsCMPCCXADD = X86::isCMPCCXADD(Opc); bool IsCCMPOrCTEST = X86::isCCMPCC(Opc) || X86::isCTESTCC(Opc); // clang-format off @@ -39,19 +38,19 @@ void X86InstPrinterCommon::printCondCode(const MCInst *MI, unsigned Op, case 0: O << "o"; break; case 1: O << "no"; break; case 2: O << "b"; break; - case 3: O << (IsCMPCCXADD ? "nb" : "ae"); break; - case 4: O << (IsCMPCCXADD ? "z" : "e"); break; - case 5: O << (IsCMPCCXADD ? "nz" : "ne"); break; + case 3: O << "ae"; break; + case 4: O << "e"; break; + case 5: O << "ne"; break; case 6: O << "be"; break; - case 7: O << (IsCMPCCXADD ? "nbe" : "a"); break; + case 7: O << "a"; break; case 8: O << "s"; break; case 9: O << "ns"; break; case 0xa: O << (IsCCMPOrCTEST ? "t" : "p"); break; case 0xb: O << (IsCCMPOrCTEST ? "f" : "np"); break; case 0xc: O << "l"; break; - case 0xd: O << (IsCMPCCXADD ? "nl" : "ge"); break; + case 0xd: O << "ge"; break; case 0xe: O << "le"; break; - case 0xf: O << (IsCMPCCXADD ? "nle" : "g"); break; + case 0xf: O << "g"; break; } // clang-format on } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 2759252693f9f8..11c9a992cbdee9 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -56382,8 +56382,12 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, break; for (int M : SubMask) { if (0 <= M) { + int Src = M < NumSrcElts ? 0 : 2; M += M < NumSrcElts ? 
0 : NumSrcElts; - M += i * NumSrcElts; + + // Reference the lowest sub if the upper sub is the same. + if (Ops[0].getOperand(Src) != Ops[i].getOperand(Src)) + M += i * NumSrcElts; } ConcatMask.push_back(M); } @@ -57140,6 +57144,11 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, } auto IsExtractFree = [](SDValue V) { + if (V.hasOneUse()) { + V = peekThroughOneUseBitcasts(V); + if (V.getOpcode() == ISD::LOAD) + return true; + } V = peekThroughBitcasts(V); if (ISD::isBuildVectorOfConstantSDNodes(V.getNode())) return true; @@ -57204,24 +57213,49 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits); return DAG.getNode(InOpcode, DL, VT, Ext); } - if ((InOpcode == X86ISD::CMPP || InOpcode == X86ISD::PCMPEQ || - InOpcode == X86ISD::PCMPGT) && - (IsExtractFree(InVec.getOperand(0)) || - IsExtractFree(InVec.getOperand(1))) && - SizeInBits == 128) { - SDValue Ext0 = - extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits); - SDValue Ext1 = - extractSubVector(InVec.getOperand(1), IdxVal, DAG, DL, SizeInBits); - if (InOpcode == X86ISD::CMPP) - return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, InVec.getOperand(2)); - return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1); - } - if (InOpcode == X86ISD::MOVDDUP && - (SizeInBits == 128 || SizeInBits == 256)) { - SDValue Ext0 = - extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits); - return DAG.getNode(InOpcode, DL, VT, Ext0); + + if (SizeInBits == 128 || SizeInBits == 256) { + switch (InOpcode) { + case X86ISD::MOVDDUP: + return DAG.getNode( + InOpcode, DL, VT, + extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits)); + case X86ISD::PCMPEQ: + case X86ISD::PCMPGT: + case X86ISD::UNPCKH: + case X86ISD::UNPCKL: + if (IsExtractFree(InVec.getOperand(0)) || + IsExtractFree(InVec.getOperand(1))) + return DAG.getNode(InOpcode, DL, VT, + extractSubVector(InVec.getOperand(0), IdxVal, DAG, 
DL, SizeInBits), + extractSubVector(InVec.getOperand(1), IdxVal, DAG, + DL, SizeInBits)); + break; + case X86ISD::CMPP: + if (IsExtractFree(InVec.getOperand(0)) || + IsExtractFree(InVec.getOperand(1))) + return DAG.getNode(InOpcode, DL, VT, + extractSubVector(InVec.getOperand(0), IdxVal, DAG, + DL, SizeInBits), + extractSubVector(InVec.getOperand(1), IdxVal, DAG, + DL, SizeInBits), + InVec.getOperand(2)); + break; + case X86ISD::BLENDI: + if (IsExtractFree(InVec.getOperand(0)) || + IsExtractFree(InVec.getOperand(1))) { + uint64_t M = InVec.getConstantOperandVal(2) & 255; + M = VT.getScalarType() == MVT::i16 ? M : (M >> IdxVal); + return DAG.getNode(InOpcode, DL, VT, + extractSubVector(InVec.getOperand(0), IdxVal, DAG, + DL, SizeInBits), + extractSubVector(InVec.getOperand(1), IdxVal, DAG, + DL, SizeInBits), + DAG.getTargetConstant(M, DL, MVT::i8)); + } + break; + } } } diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 1e609a84673a3c..ce6b0f9c07dc8d 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -2450,10 +2450,16 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, }(); assert(Mask && "Missing call preserved mask for calling convention"); - if (MachineOperand::clobbersPhysReg(Mask, RegInfo->getFrameRegister(MF))) + if (MachineOperand::clobbersPhysReg(Mask, RegInfo->getFramePtr())) { X86Info->setFPClobberedByCall(true); - if (MachineOperand::clobbersPhysReg(Mask, RegInfo->getBaseRegister())) + if (CLI.CB && isa(CLI.CB)) + X86Info->setFPClobberedByInvoke(true); + } + if (MachineOperand::clobbersPhysReg(Mask, RegInfo->getBaseRegister())) { X86Info->setBPClobberedByCall(true); + if (CLI.CB && isa(CLI.CB)) + X86Info->setBPClobberedByInvoke(true); + } // If this is an invoke in a 32-bit function using a funclet-based // personality, assume the function clobbers all registers. 
If an exception diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h index 13d57c2fa9dfbc..24371369d4a452 100644 --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -173,6 +173,8 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { // True if a function clobbers FP/BP according to its calling convention. bool FPClobberedByCall = false; bool BPClobberedByCall = false; + bool FPClobberedByInvoke = false; + bool BPClobberedByInvoke = false; private: /// ForwardedMustTailRegParms - A list of virtual and physical registers @@ -338,6 +340,12 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { bool getBPClobberedByCall() const { return BPClobberedByCall; } void setBPClobberedByCall(bool C) { BPClobberedByCall = C; } + + bool getFPClobberedByInvoke() const { return FPClobberedByInvoke; } + void setFPClobberedByInvoke(bool C) { FPClobberedByInvoke = C; } + + bool getBPClobberedByInvoke() const { return BPClobberedByInvoke; } + void setBPClobberedByInvoke(bool C) { BPClobberedByInvoke = C; } }; } // End llvm namespace diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 3376367cc76b00..638eb1c4f11e41 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -31,6 +31,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" +#include "llvm/MC/MCContext.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetMachine.h" @@ -565,18 +566,22 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Set the frame-pointer register and its aliases as reserved if needed. 
if (TFI->hasFP(MF)) { + if (MF.getInfo()->getFPClobberedByInvoke()) + MF.getContext().reportError( + SMLoc(), + "Frame pointer clobbered by function invoke is not supported."); + for (const MCPhysReg &SubReg : subregs_inclusive(X86::RBP)) Reserved.set(SubReg); } // Set the base-pointer register and its aliases as reserved if needed. if (hasBasePointer(MF)) { - CallingConv::ID CC = MF.getFunction().getCallingConv(); - const uint32_t *RegMask = getCallPreservedMask(MF, CC); - if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister())) - report_fatal_error( - "Stack realignment in presence of dynamic allocas is not supported with" - "this calling convention."); + if (MF.getInfo()->getBPClobberedByInvoke()) + MF.getContext().reportError(SMLoc(), + "Stack realignment in presence of dynamic " + "allocas is not supported with " + "this calling convention."); Register BasePtr = getX86SubSuperRegister(getBaseRegister(), 64); for (const MCPhysReg &SubReg : subregs_inclusive(BasePtr)) diff --git a/llvm/lib/TextAPI/RecordsSlice.cpp b/llvm/lib/TextAPI/RecordsSlice.cpp index 111a1fa6eaf43b..04c48eaa628ea4 100644 --- a/llvm/lib/TextAPI/RecordsSlice.cpp +++ b/llvm/lib/TextAPI/RecordsSlice.cpp @@ -243,16 +243,18 @@ ObjCCategoryRecord *RecordsSlice::addObjCCategory(StringRef ClassToExtend, std::vector ObjCContainerRecord::getObjCIVars() const { std::vector Records; - llvm::for_each(IVars, - [&](auto &Record) { Records.push_back(Record.second.get()); }); + Records.reserve(IVars.size()); + for (const auto &Record : IVars) + Records.push_back(Record.second.get()); return Records; } std::vector ObjCInterfaceRecord::getObjCCategories() const { std::vector Records; - llvm::for_each(Categories, - [&](auto &Record) { Records.push_back(Record.second); }); + Records.reserve(Categories.size()); + for (const auto &Record : Categories) + Records.push_back(Record.second); return Records; } diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index 
910c0aeacc42e0..38b61b6a88357c 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -3836,7 +3836,7 @@ static bool runAttributorOnFunctions(InformationCache &InfoCache, if (MaxSpecializationPerCB.getNumOccurrences()) { AC.IndirectCalleeSpecializationCallback = [&](Attributor &, const AbstractAttribute &AA, CallBase &CB, - Function &Callee) { + Function &Callee, unsigned) { if (MaxSpecializationPerCB == 0) return false; auto &Set = IndirectCalleeTrackingMap[&CB]; diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 8ece5bbdfc77e1..41a7fc0870cf0a 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -12357,7 +12357,8 @@ struct AAIndirectCallInfoCallSite : public AAIndirectCallInfo { SmallVector SkippedAssumedCallees; SmallVector> NewCalls; for (Function *NewCallee : AssumedCallees) { - if (!A.shouldSpecializeCallSiteForCallee(*this, *CB, *NewCallee)) { + if (!A.shouldSpecializeCallSiteForCallee(*this, *CB, *NewCallee, + AssumedCallees.size())) { SkippedAssumedCallees.push_back(NewCallee); SpecializedForAllCallees = false; continue; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 2b0347073b7d1f..10488ecb747a48 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -3077,6 +3077,12 @@ Instruction *InstCombinerImpl::foldICmpAddConstant(ICmpInst &Cmp, return new ICmpInst(Pred, X, ConstantInt::get(Ty, NewC)); } + if (ICmpInst::isUnsigned(Pred) && Add->hasNoSignedWrap() && + C.isNonNegative() && (C - *C2).isNonNegative() && + computeConstantRange(X, /*ForSigned=*/true).add(*C2).isAllNonNegative()) + return new ICmpInst(ICmpInst::getSignedPredicate(Pred), X, + ConstantInt::get(Ty, C - *C2)); + auto CR = 
ConstantRange::makeExactICmpRegion(Pred, C).subtract(*C2); const APInt &Upper = CR.getUpper(); const APInt &Lower = CR.getLower(); diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp index d6ba12465bb328..9b10cbba84075a 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp @@ -8,6 +8,7 @@ // #include "llvm/Transforms/Instrumentation/PGOCtxProfLowering.h" +#include "llvm/Analysis/CtxProfAnalysis.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/IR/Analysis.h" #include "llvm/IR/DiagnosticInfo.h" @@ -16,6 +17,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" +#include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/CommandLine.h" #include @@ -223,8 +225,8 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) { assert(Mark->getIndex()->isZero()); IRBuilder<> Builder(Mark); - // FIXME(mtrofin): use InstrProfSymtab::getCanonicalName - Guid = Builder.getInt64(F.getGUID()); + + Guid = Builder.getInt64(AssignGUIDPass::getGUID(F)); // The type of the context of this function is now knowable since we have // NrCallsites and NrCounters. We delcare it here because it's more // convenient - we have the Builder. 
diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp index ed9c1828ce06a2..6e1bb892f62018 100644 --- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -71,6 +71,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Transforms/Scalar.h" using namespace llvm; diff --git a/llvm/lib/Transforms/Utils/LowerAtomic.cpp b/llvm/lib/Transforms/Utils/LowerAtomic.cpp index b203970ef9c5a0..f9bf419fb02252 100644 --- a/llvm/lib/Transforms/Utils/LowerAtomic.cpp +++ b/llvm/lib/Transforms/Utils/LowerAtomic.cpp @@ -25,10 +25,11 @@ bool llvm::lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { Value *Cmp = CXI->getCompareOperand(); Value *Val = CXI->getNewValOperand(); - LoadInst *Orig = Builder.CreateLoad(Val->getType(), Ptr); + LoadInst *Orig = + Builder.CreateAlignedLoad(Val->getType(), Ptr, CXI->getAlign()); Value *Equal = Builder.CreateICmpEQ(Orig, Cmp); Value *Res = Builder.CreateSelect(Equal, Val, Orig); - Builder.CreateStore(Res, Ptr); + Builder.CreateAlignedStore(Res, Ptr, CXI->getAlign()); Res = Builder.CreateInsertValue(PoisonValue::get(CXI->getType()), Orig, 0); Res = Builder.CreateInsertValue(Res, Equal, 1); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f5337b11edc977..fdf8f7042c4fb8 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9363,46 +9363,6 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { State.set(this, Res, 0); } -void VPWidenStoreRecipe::execute(VPTransformState &State) { - auto *SI = cast(&Ingredient); - - VPValue *StoredVPValue = getStoredValue(); - bool CreateScatter = !isConsecutive(); - const Align Alignment = getLoadStoreAlignment(&Ingredient); - - auto &Builder = State.Builder; - 
State.setDebugLocFrom(getDebugLoc()); - - for (unsigned Part = 0; Part < State.UF; ++Part) { - Instruction *NewSI = nullptr; - Value *Mask = nullptr; - if (auto *VPMask = getMask()) { - // Mask reversal is only needed for non-all-one (null) masks, as reverse - // of a null all-one mask is a null mask. - Mask = State.get(VPMask, Part); - if (isReverse()) - Mask = Builder.CreateVectorReverse(Mask, "reverse"); - } - - Value *StoredVal = State.get(StoredVPValue, Part); - if (isReverse()) { - // If we store to reverse consecutive memory locations, then we need - // to reverse the order of elements in the stored value. - StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); - // We don't want to update the value in the map as it might be used in - // another expression. So don't call resetVectorValue(StoredVal). - } - Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter); - if (CreateScatter) - NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask); - else if (Mask) - NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask); - else - NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment); - State.addMetadata(NewSI, SI); - } -} - void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " "explicit vector length."); @@ -10176,22 +10136,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { return true; } -LoopVectorizeResult LoopVectorizePass::runImpl( - Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, - DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_, - DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_, - OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { - SE = &SE_; - LI = &LI_; - TTI = &TTI_; - DT = &DT_; - BFI = BFI_; - TLI = TLI_; - AC = &AC_; - LAIs = &LAIs_; - DB = &DB_; - ORE = &ORE_; - PSI = PSI_; +LoopVectorizeResult LoopVectorizePass::runImpl(Function 
&F) { // Don't attempt if // 1. the target claims to have no vector registers, and @@ -10251,53 +10196,51 @@ LoopVectorizeResult LoopVectorizePass::runImpl( PreservedAnalyses LoopVectorizePass::run(Function &F, FunctionAnalysisManager &AM) { - auto &LI = AM.getResult(F); - // There are no loops in the function. Return before computing other expensive - // analyses. - if (LI.empty()) - return PreservedAnalyses::all(); - auto &SE = AM.getResult(F); - auto &TTI = AM.getResult(F); - auto &DT = AM.getResult(F); - auto &TLI = AM.getResult(F); - auto &AC = AM.getResult(F); - auto &DB = AM.getResult(F); - auto &ORE = AM.getResult(F); - - LoopAccessInfoManager &LAIs = AM.getResult(F); - auto &MAMProxy = AM.getResult(F); - ProfileSummaryInfo *PSI = - MAMProxy.getCachedResult(*F.getParent()); - BlockFrequencyInfo *BFI = nullptr; - if (PSI && PSI->hasProfileSummary()) - BFI = &AM.getResult(F); - LoopVectorizeResult Result = - runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI); - if (!Result.MadeAnyChange) - return PreservedAnalyses::all(); - PreservedAnalyses PA; - - if (isAssignmentTrackingEnabled(*F.getParent())) { - for (auto &BB : F) - RemoveRedundantDbgInstrs(&BB); - } - - PA.preserve(); - PA.preserve(); - PA.preserve(); - PA.preserve(); - - if (Result.MadeCFGChange) { - // Making CFG changes likely means a loop got vectorized. Indicate that - // extra simplification passes should be run. - // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only - // be run if runtime checks have been added. - AM.getResult(F); - PA.preserve(); - } else { - PA.preserveSet(); - } - return PA; + LI = &AM.getResult(F); + // There are no loops in the function. Return before computing other + // expensive analyses. 
+ if (LI->empty()) + return PreservedAnalyses::all(); + SE = &AM.getResult(F); + TTI = &AM.getResult(F); + DT = &AM.getResult(F); + TLI = &AM.getResult(F); + AC = &AM.getResult(F); + DB = &AM.getResult(F); + ORE = &AM.getResult(F); + LAIs = &AM.getResult(F); + + auto &MAMProxy = AM.getResult(F); + PSI = MAMProxy.getCachedResult(*F.getParent()); + BFI = nullptr; + if (PSI && PSI->hasProfileSummary()) + BFI = &AM.getResult(F); + LoopVectorizeResult Result = runImpl(F); + if (!Result.MadeAnyChange) + return PreservedAnalyses::all(); + PreservedAnalyses PA; + + if (isAssignmentTrackingEnabled(*F.getParent())) { + for (auto &BB : F) + RemoveRedundantDbgInstrs(&BB); + } + + PA.preserve(); + PA.preserve(); + PA.preserve(); + PA.preserve(); + + if (Result.MadeCFGChange) { + // Making CFG changes likely means a loop got vectorized. Indicate that + // extra simplification passes should be run. + // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only + // be run if runtime checks have been added. 
+ AM.getResult(F); + PA.preserve(); + } else { + PA.preserveSet(); + } + return PA; } void LoopVectorizePass::printPipeline( diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index feffd9ae3c99b7..81c4193cfe1081 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1243,13 +1243,11 @@ class BoUpSLP { StridedVectorize }; - using ValueList = SmallVector; - using InstrList = SmallVector; - using ValueSet = SmallPtrSet; - using StoreList = SmallVector; + using ValueList = SmallVector; + using ValueSet = SmallPtrSet; using ExtraValueToDebugLocsMap = MapVector>; - using OrdersType = SmallVector; + using OrdersType = SmallVector; BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, @@ -1471,7 +1469,7 @@ class BoUpSLP { /// \param TryRecursiveCheck used to check if long masked gather can be /// represented as a serie of loads/insert subvector, if profitable. LoadsState canVectorizeLoads(ArrayRef VL, const Value *VL0, - SmallVectorImpl &Order, + OrdersType &Order, SmallVectorImpl &PointerOps, bool TryRecursiveCheck = true) const; @@ -2840,7 +2838,7 @@ class BoUpSLP { /// \param ResizeAllowed indicates whether it is allowed to handle subvector /// extract order. bool canReuseExtract(ArrayRef VL, Value *OpValue, - SmallVectorImpl &CurrentOrder, + OrdersType &CurrentOrder, bool ResizeAllowed = false) const; /// Vectorize a single entry in the tree. @@ -3084,10 +3082,10 @@ class BoUpSLP { CombinedOpcode CombinedOp = NotCombinedOp; /// Does this sequence require some shuffling? - SmallVector ReuseShuffleIndices; + SmallVector ReuseShuffleIndices; /// Does this entry require reordering? - SmallVector ReorderIndices; + OrdersType ReorderIndices; /// Points back to the VectorizableTree. 
/// @@ -3108,7 +3106,7 @@ class BoUpSLP { /// The operands of each instruction in each lane Operands[op_index][lane]. /// Note: This helps avoid the replication of the code that performs the /// reordering of operands during buildTree_rec() and vectorizeTree(). - SmallVector Operands; + SmallVector Operands; /// The main/alternate instruction. Instruction *MainOp = nullptr; @@ -3716,13 +3714,13 @@ class BoUpSLP { /// The dependent memory instructions. /// This list is derived on demand in calculateDependencies(). - SmallVector MemoryDependencies; + SmallVector MemoryDependencies; /// List of instructions which this instruction could be control dependent /// on. Allowing such nodes to be scheduled below this one could introduce /// a runtime fault which didn't exist in the original program. /// ex: this is a load or udiv following a readonly call which inf loops - SmallVector ControlDependencies; + SmallVector ControlDependencies; /// This ScheduleData is in the current scheduling region if this matches /// the current SchedulingRegionID of BlockScheduling. @@ -4300,12 +4298,12 @@ static void reorderReuses(SmallVectorImpl &Reuses, ArrayRef Mask) { /// the original order of the scalars. Procedure transforms the provided order /// in accordance with the given \p Mask. If the resulting \p Order is just an /// identity order, \p Order is cleared. 
-static void reorderOrder(SmallVectorImpl &Order, ArrayRef Mask, +static void reorderOrder(BoUpSLP::OrdersType &Order, ArrayRef Mask, bool BottomOrder = false) { assert(!Mask.empty() && "Expected non-empty mask."); unsigned Sz = Mask.size(); if (BottomOrder) { - SmallVector PrevOrder; + BoUpSLP::OrdersType PrevOrder; if (Order.empty()) { PrevOrder.resize(Sz); std::iota(PrevOrder.begin(), PrevOrder.end(), 0); @@ -4695,7 +4693,7 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, } BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( - ArrayRef VL, const Value *VL0, SmallVectorImpl &Order, + ArrayRef VL, const Value *VL0, OrdersType &Order, SmallVectorImpl &PointerOps, bool TryRecursiveCheck) const { // Check that a vectorized load would load the same memory as a scalar // load. For example, we don't want to vectorize loads that are smaller @@ -4823,7 +4821,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF, ++VectorizedCnt) { ArrayRef Slice = VL.slice(Cnt, VF); - SmallVector Order; + OrdersType Order; SmallVector PointerOps; LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, @@ -5397,7 +5395,7 @@ void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef Mask) const { TE.ReorderIndices.clear(); // Try to improve gathered nodes with clustered reuses, if possible. ArrayRef Slice = ArrayRef(NewMask).slice(0, Sz); - SmallVector NewOrder(Slice); + OrdersType NewOrder(Slice); inversePermutation(NewOrder, NewMask); reorderScalars(TE.Scalars, NewMask); // Fill the reuses mask with the identity submasks. 
@@ -7717,7 +7715,7 @@ unsigned BoUpSLP::canMapToVector(Type *T) const { } bool BoUpSLP::canReuseExtract(ArrayRef VL, Value *OpValue, - SmallVectorImpl &CurrentOrder, + OrdersType &CurrentOrder, bool ResizeAllowed) const { const auto *It = find_if(VL, IsaPred); assert(It != VL.end() && "Expected at least one extract instruction."); @@ -13096,12 +13094,12 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); } else { // Gather all constants. - SmallVector Mask(E->Scalars.size(), PoisonMaskElem); - for (auto [I, V] : enumerate(E->Scalars)) { + SmallVector Mask(GatheredScalars.size(), PoisonMaskElem); + for (auto [I, V] : enumerate(GatheredScalars)) { if (!isa(V)) Mask[I] = I; } - Value *BV = ShuffleBuilder.gather(E->Scalars); + Value *BV = ShuffleBuilder.gather(GatheredScalars); ShuffleBuilder.add(BV, Mask); Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); } @@ -16398,10 +16396,10 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, } R.reorderTopToBottom(); R.reorderBottomToTop(); + R.transformNodes(); R.buildExternalUses(); R.computeMinimumValueSizes(); - R.transformNodes(); Size = R.getTreeSize(); if (S.getOpcode() == Instruction::Load) @@ -16968,10 +16966,10 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, R.reorderBottomToTop( /*IgnoreReorder=*/!isa(Ops.front()) && !R.doesRootHaveInTreeUses()); + R.transformNodes(); R.buildExternalUses(); R.computeMinimumValueSizes(); - R.transformNodes(); InstructionCost Cost = R.getTreeCost(); CandidateFound = true; MinCost = std::min(MinCost, Cost); @@ -17906,10 +17904,10 @@ class HorizontalReduction { for (Value *RdxVal : VL) if (RequiredExtract.contains(RdxVal)) LocalExternallyUsedValues[RdxVal]; + V.transformNodes(); V.buildExternalUses(LocalExternallyUsedValues); V.computeMinimumValueSizes(); - V.transformNodes(); // Estimate cost. 
InstructionCost TreeCost = V.getTreeCost(VL); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 911b2fe9e9a1eb..bc57ea4d52471e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2066,7 +2066,49 @@ void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, O << " = vp.load "; printOperands(O, SlotTracker); } +#endif + +void VPWidenStoreRecipe::execute(VPTransformState &State) { + auto *SI = cast(&Ingredient); + + VPValue *StoredVPValue = getStoredValue(); + bool CreateScatter = !isConsecutive(); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + + auto &Builder = State.Builder; + State.setDebugLocFrom(getDebugLoc()); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Instruction *NewSI = nullptr; + Value *Mask = nullptr; + if (auto *VPMask = getMask()) { + // Mask reversal is only needed for non-all-one (null) masks, as reverse + // of a null all-one mask is a null mask. + Mask = State.get(VPMask, Part); + if (isReverse()) + Mask = Builder.CreateVectorReverse(Mask, "reverse"); + } + + Value *StoredVal = State.get(StoredVPValue, Part); + if (isReverse()) { + // If we store to reverse consecutive memory locations, then we need + // to reverse the order of elements in the stored value. + StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); + // We don't want to update the value in the map as it might be used in + // another expression. So don't call resetVectorValue(StoredVal). 
+ } + Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter); + if (CreateScatter) + NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask); + else if (Mask) + NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask); + else + NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment); + State.addMetadata(NewSI, SI); + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN store "; diff --git a/llvm/test/Analysis/CtxProfAnalysis/full-cycle.ll b/llvm/test/Analysis/CtxProfAnalysis/full-cycle.ll new file mode 100644 index 00000000000000..0cdf82bd96efcb --- /dev/null +++ b/llvm/test/Analysis/CtxProfAnalysis/full-cycle.ll @@ -0,0 +1,119 @@ +; REQUIRES: x86_64-linux +; +; RUN: rm -rf %t +; RUN: split-file %s %t +; +; Test that the GUID metadata survives through thinlink. +; +; RUN: llvm-ctxprof-util fromJSON --input=%t/profile.json --output=%t/profile.ctxprofdata +; +; RUN: opt -module-summary -passes='thinlto-pre-link' -use-ctx-profile=%t/profile.ctxprofdata -o %t/m1.bc %t/m1.ll +; RUN: opt -module-summary -passes='thinlto-pre-link' -use-ctx-profile=%t/profile.ctxprofdata -o %t/m2.bc %t/m2.ll +; +; RUN: rm -rf %t/postlink +; RUN: mkdir %t/postlink +; +; +; RUN: llvm-lto2 run %t/m1.bc %t/m2.bc -o %t/ -thinlto-distributed-indexes \ +; RUN: -use-ctx-profile=%t/profile.ctxprofdata \ +; RUN: -r %t/m1.bc,f1,plx \ +; RUN: -r %t/m2.bc,f1 \ +; RUN: -r %t/m2.bc,entrypoint,plx +; RUN: opt --passes='function-import,require,print' \ +; RUN: -summary-file=%t/m2.bc.thinlto.bc -use-ctx-profile=%t/profile.ctxprofdata %t/m2.bc \ +; RUN: -S -o %t/m2.post.ll 2> %t/profile.txt +; RUN: diff %t/expected.txt %t/profile.txt +;--- m1.ll +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-linux-gnu" + +source_filename = "random_path/m1.cc" + +define private 
void @f2() #0 !guid !0 { + ret void +} + +define void @f1() #0 { + call void @f2() + ret void +} + +attributes #0 = { noinline } +!0 = !{ i64 3087265239403591524 } + +;--- m2.ll +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-linux-gnu" + +source_filename = "random_path/m2.cc" + +declare void @f1() + +define void @entrypoint() { + call void @f1() + ret void +} +;--- profile.json +[ + { + "Callsites": [ + [ + { + "Callsites": [ + [ + { + "Counters": [ + 10 + ], + "Guid": 3087265239403591524 + } + ] + ], + "Counters": [ + 7 + ], + "Guid": 2072045998141807037 + } + ] + ], + "Counters": [ + 1 + ], + "Guid": 10507721908651011566 + } +] +;--- expected.txt +Function Info: +10507721908651011566 : entrypoint. MaxCounterID: 1. MaxCallsiteID: 1 +3087265239403591524 : f2.llvm.0. MaxCounterID: 1. MaxCallsiteID: 0 +2072045998141807037 : f1. MaxCounterID: 1. MaxCallsiteID: 1 + +Current Profile: +[ + { + "Callsites": [ + [ + { + "Callsites": [ + [ + { + "Counters": [ + 10 + ], + "Guid": 3087265239403591524 + } + ] + ], + "Counters": [ + 7 + ], + "Guid": 2072045998141807037 + } + ] + ], + "Counters": [ + 1 + ], + "Guid": 10507721908651011566 + } +] diff --git a/llvm/test/Analysis/CtxProfAnalysis/load.ll b/llvm/test/Analysis/CtxProfAnalysis/load.ll index 9cd78cfef187ba..69806e334aaec9 100644 --- a/llvm/test/Analysis/CtxProfAnalysis/load.ll +++ b/llvm/test/Analysis/CtxProfAnalysis/load.ll @@ -1,16 +1,22 @@ ; REQUIRES: x86_64-linux - +; +; RUN: rm -rf %t ; RUN: split-file %s %t ; RUN: llvm-ctxprof-util fromJSON --input=%t/profile.json --output=%t/profile.ctxprofdata ; RUN: not opt -passes='require,print' \ -; RUN: %t/empty.ll -S 2>&1 | FileCheck %s --check-prefix=NO-FILE +; RUN: %t/example.ll -S 2>&1 | FileCheck %s --check-prefix=NO-FILE ; RUN: not opt -passes='require,print' \ -; RUN: -use-ctx-profile=does_not_exist.ctxprofdata %t/empty.ll -S 2>&1 | FileCheck %s --check-prefix=NO-FILE +; RUN: 
-use-ctx-profile=does_not_exist.ctxprofdata %t/example.ll -S 2>&1 | FileCheck %s --check-prefix=NO-FILE +; RUN: opt -module-summary -passes='thinlto-pre-link' \ +; RUN: -use-ctx-profile=%t/profile.ctxprofdata %t/example.ll -S -o %t/prelink.ll + +; RUN: opt -module-summary -passes='thinlto-pre-link' -use-ctx-profile=%t/profile.ctxprofdata \ +; RUN: %t/example.ll -S -o %t/prelink.ll ; RUN: opt -passes='require,print' \ -; RUN: -use-ctx-profile=%t/profile.ctxprofdata %t/empty.ll -S 2> %t/output.json -; RUN: diff %t/profile.json %t/output.json +; RUN: -use-ctx-profile=%t/profile.ctxprofdata %t/prelink.ll -S 2> %t/output.txt +; RUN: diff %t/expected-profile-output.txt %t/output.txt ; NO-FILE: error: could not open contextual profile file ; @@ -18,41 +24,104 @@ ; output it from opt. ;--- profile.json [ + { + "Counters": [ + 9 + ], + "Guid": 12341 + }, + { + "Counters": [ + 5 + ], + "Guid": 12074870348631550642 + }, { "Callsites": [ - [], [ { "Counters": [ - 4, - 5 + 6, + 7 ], - "Guid": 2000 - }, + "Guid": 728453322856651412 + } + ] + ], + "Counters": [ + 1 + ], + "Guid": 11872291593386833696 + } +] +;--- expected-profile-output.txt +Function Info: +4909520559318251808 : an_entrypoint. MaxCounterID: 2. MaxCallsiteID: 1 +12074870348631550642 : another_entrypoint_no_callees. MaxCounterID: 1. MaxCallsiteID: 0 +11872291593386833696 : foo. MaxCounterID: 1. 
MaxCallsiteID: 1 + +Current Profile: +[ + { + "Callsites": [ + [ { "Counters": [ 6, - 7, - 8 + 7 ], - "Guid": 18446744073709551613 + "Guid": 728453322856651412 } ] ], "Counters": [ - 1, - 2, - 3 + 1 ], - "Guid": 1000 + "Guid": 11872291593386833696 }, { "Counters": [ - 5, - 9, - 10 + 5 ], - "Guid": 18446744073709551612 + "Guid": 12074870348631550642 } ] -;--- empty.ll +;--- example.ll +declare void @bar() + +define private void @foo(i32 %a, ptr %fct) #0 !guid !0 { + %t = icmp eq i32 %a, 0 + br i1 %t, label %yes, label %no +yes: + call void %fct(i32 %a) + br label %exit +no: + call void @bar() + br label %exit +exit: + ret void +} + +define void @an_entrypoint(i32 %a) { + %t = icmp eq i32 %a, 0 + br i1 %t, label %yes, label %no + +yes: + call void @foo(i32 1, ptr null) + ret void +no: + ret void +} + +define void @another_entrypoint_no_callees(i32 %a) { + %t = icmp eq i32 %a, 0 + br i1 %t, label %yes, label %no + +yes: + ret void +no: + ret void +} + +attributes #0 = { noinline } +!0 = !{ i64 11872291593386833696 } \ No newline at end of file diff --git a/llvm/test/Analysis/DXILResource/buffer-frombinding.ll b/llvm/test/Analysis/DXILResource/buffer-frombinding.ll new file mode 100644 index 00000000000000..4349adb8ef8ebb --- /dev/null +++ b/llvm/test/Analysis/DXILResource/buffer-frombinding.ll @@ -0,0 +1,126 @@ +; RUN: opt -S -disable-output -passes="print" < %s 2>&1 | FileCheck %s + +@G = external constant <4 x float>, align 4 + +define void @test_typedbuffer() { + ; RWBuffer Buf : register(u5, space3) + %typed0 = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_f32_1_0( + i32 3, i32 5, i32 1, i32 0, i1 false) + ; CHECK: Binding for %typed0 + ; CHECK: Symbol: ptr undef + ; CHECK: Name: "" + ; CHECK: Binding: + ; CHECK: Record ID: 0 + ; CHECK: Space: 3 + ; CHECK: Lower Bound: 5 + ; CHECK: Size: 1 + ; CHECK: Class: UAV + ; CHECK: Kind: TypedBuffer + ; CHECK: Globally Coherent: 0 + ; CHECK: HasCounter: 0 + ; CHECK: IsROV: 
0 + ; CHECK: Element Type: f32 + ; CHECK: Element Count: 4 + + ; RWBuffer Buf : register(u7, space2) + %typed1 = call target("dx.TypedBuffer", i32, 1, 0, 1) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_i32_1_0t( + i32 2, i32 7, i32 1, i32 0, i1 false) + ; CHECK: Binding for %typed1 + ; CHECK: Symbol: ptr undef + ; CHECK: Name: "" + ; CHECK: Binding: + ; CHECK: Record ID: 1 + ; CHECK: Space: 2 + ; CHECK: Lower Bound: 7 + ; CHECK: Size: 1 + ; CHECK: Class: UAV + ; CHECK: Kind: TypedBuffer + ; CHECK: Globally Coherent: 0 + ; CHECK: HasCounter: 0 + ; CHECK: IsROV: 0 + ; CHECK: Element Type: i32 + ; CHECK: Element Count: 1 + + ; Buffer Buf[24] : register(t3, space5) + %typed2 = call target("dx.TypedBuffer", <4 x i32>, 0, 0, 0) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_i32_0_0t( + i32 2, i32 7, i32 24, i32 0, i1 false) + ; CHECK: Binding for %typed2 + ; CHECK: Symbol: ptr undef + ; CHECK: Name: "" + ; CHECK: Binding: + ; CHECK: Record ID: 0 + ; CHECK: Space: 2 + ; CHECK: Lower Bound: 7 + ; CHECK: Size: 24 + ; CHECK: Class: SRV + ; CHECK: Kind: TypedBuffer + ; CHECK: Element Type: u32 + ; CHECK: Element Count: 4 + + ret void +} + +define void @test_structbuffer() { + ; struct S { float4 a; uint4 b; }; + ; StructuredBuffer Buf : register(t2, space4) + %struct0 = call target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0) + @llvm.dx.handle.fromBinding.tdx.RawBuffer_sl_v4f32v4i32s_0_0t( + i32 4, i32 2, i32 1, i32 0, i1 false) + ; CHECK: Binding for %struct0 + ; CHECK: Symbol: ptr undef + ; CHECK: Name: "" + ; CHECK: Binding: + ; CHECK: Record ID: 1 + ; CHECK: Space: 4 + ; CHECK: Lower Bound: 2 + ; CHECK: Size: 1 + ; CHECK: Class: SRV + ; CHECK: Kind: StructuredBuffer + ; CHECK: Buffer Stride: 32 + ; CHECK: Alignment: 4 + + ret void +} + +define void @test_bytebuffer() { + ; ByteAddressBuffer Buf : register(t8, space1) + %byteaddr0 = call target("dx.RawBuffer", i8, 0, 0) + @llvm.dx.handle.fromBinding.tdx.RawBuffer_i8_0_0t( + i32 1, i32 8, i32 1, i32 0, i1 false) + ; 
CHECK: Binding for %byteaddr0 + ; CHECK: Symbol: ptr undef + ; CHECK: Name: "" + ; CHECK: Binding: + ; CHECK: Record ID: 2 + ; CHECK: Space: 1 + ; CHECK: Lower Bound: 8 + ; CHECK: Size: 1 + ; CHECK: Class: SRV + ; CHECK: Kind: RawBuffer + + ret void +} + +; Note: We need declarations for each handle.fromBinding in the same +; order as they appear in source to ensure that we can put our CHECK +; lines along side the thing they're checking. +declare target("dx.TypedBuffer", <4 x float>, 1, 0, 0) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_1_0_0t( + i32, i32, i32, i32, i1) #0 +declare target("dx.TypedBuffer", i32, 1, 0, 1) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_i32_1_0_1t( + i32, i32, i32, i32, i1) #0 +declare target("dx.TypedBuffer", <4 x i32>, 0, 0, 0) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4i32_0_0_0t( + i32, i32, i32, i32, i1) #0 +declare target("dx.RawBuffer", { <4 x float>, <4 x i32> }, 0, 0) + @llvm.dx.handle.fromBinding.tdx.RawBuffer_sl_v4f32v4i32s_0_0t( + i32, i32, i32, i32, i1) #0 +declare target("dx.RawBuffer", i8, 0, 0) + @llvm.dx.handle.fromBinding.tdx.RawBuffer_i8_0_0t( + i32, i32, i32, i32, i1) #0 + +attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/Assembler/datalayout-invalid-function-ptr-alignment.ll b/llvm/test/Assembler/datalayout-invalid-function-ptr-alignment.ll deleted file mode 100644 index 7c1e070c292d18..00000000000000 --- a/llvm/test/Assembler/datalayout-invalid-function-ptr-alignment.ll +++ /dev/null @@ -1,5 +0,0 @@ -; RUN: not llvm-as %s 2>&1 | FileCheck %s - -; CHECK: error: Alignment is neither 0 nor a power of 2 - -target datalayout = "Fi24" diff --git a/llvm/test/Assembler/datalayout-invalid-i8-alignment.ll b/llvm/test/Assembler/datalayout-invalid-i8-alignment.ll deleted file mode 100644 index e12cfce0309746..00000000000000 --- a/llvm/test/Assembler/datalayout-invalid-i8-alignment.ll +++ /dev/null @@ -1,5 +0,0 @@ -; RUN: not llvm-as %s 2>&1 | FileCheck %s - -; 
CHECK: error: Invalid ABI alignment, i8 must be naturally aligned - -target datalayout = "i8:16" diff --git a/llvm/test/Assembler/datalayout-invalid-stack-natural-alignment.ll b/llvm/test/Assembler/datalayout-invalid-stack-natural-alignment.ll deleted file mode 100644 index 1ccfb7832a50cc..00000000000000 --- a/llvm/test/Assembler/datalayout-invalid-stack-natural-alignment.ll +++ /dev/null @@ -1,5 +0,0 @@ -; RUN: not llvm-as %s 2>&1 | FileCheck %s - -; CHECK: error: Alignment is neither 0 nor a power of 2 - -target datalayout = "S24" diff --git a/llvm/test/Assembler/invalid-datalayout-alloca-addrspace.ll b/llvm/test/Assembler/invalid-datalayout-alloca-addrspace.ll deleted file mode 100644 index f0407da73e4fc2..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout-alloca-addrspace.ll +++ /dev/null @@ -1,4 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s - -target datalayout = "A16777216" -; CHECK: Invalid address space, must be a 24-bit integer diff --git a/llvm/test/Assembler/invalid-datalayout-globals-addrspace.ll b/llvm/test/Assembler/invalid-datalayout-globals-addrspace.ll deleted file mode 100644 index 19bf77db329d2c..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout-globals-addrspace.ll +++ /dev/null @@ -1,4 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s - -; CHECK: Invalid address space, must be a 24-bit integer -target datalayout = "G16777216" diff --git a/llvm/test/Assembler/invalid-datalayout-index-size.ll b/llvm/test/Assembler/invalid-datalayout-index-size.ll deleted file mode 100644 index dc608cdd56a040..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout-index-size.ll +++ /dev/null @@ -1,3 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s -target datalayout = "p:64:64:64:128" -; CHECK: Index width cannot be larger than pointer width diff --git a/llvm/test/Assembler/invalid-datalayout-program-addrspace.ll b/llvm/test/Assembler/invalid-datalayout-program-addrspace.ll deleted file mode 100644 index 
e636b75dee4d04..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout-program-addrspace.ll +++ /dev/null @@ -1,4 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s - -; CHECK: Invalid address space, must be a 24-bit integer -target datalayout = "P16777216" diff --git a/llvm/test/Assembler/invalid-datalayout1.ll b/llvm/test/Assembler/invalid-datalayout1.ll deleted file mode 100644 index d1befdcdf294d5..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout1.ll +++ /dev/null @@ -1,3 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s -target datalayout = "^" -; CHECK: Unknown specifier in datalayout string diff --git a/llvm/test/Assembler/invalid-datalayout10.ll b/llvm/test/Assembler/invalid-datalayout10.ll deleted file mode 100644 index 9f19688f852b4a..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout10.ll +++ /dev/null @@ -1,3 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s -target datalayout = "m" -; CHECK: Expected mangling specifier in datalayout string diff --git a/llvm/test/Assembler/invalid-datalayout11.ll b/llvm/test/Assembler/invalid-datalayout11.ll deleted file mode 100644 index f8fed8ff9ff339..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout11.ll +++ /dev/null @@ -1,3 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s -target datalayout = "m." 
-; CHECK: Unexpected trailing characters after mangling specifier in datalayout string diff --git a/llvm/test/Assembler/invalid-datalayout12.ll b/llvm/test/Assembler/invalid-datalayout12.ll deleted file mode 100644 index d79c196baab16f..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout12.ll +++ /dev/null @@ -1,3 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s -target datalayout = "f" -; CHECK: Missing alignment specification in datalayout string diff --git a/llvm/test/Assembler/invalid-datalayout13.ll b/llvm/test/Assembler/invalid-datalayout13.ll deleted file mode 100644 index 5ac719dbb7a9c0..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout13.ll +++ /dev/null @@ -1,3 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s -target datalayout = ":32" -; CHECK: Expected token before separator in datalayout string diff --git a/llvm/test/Assembler/invalid-datalayout14.ll b/llvm/test/Assembler/invalid-datalayout14.ll deleted file mode 100644 index 84634b52a146ca..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout14.ll +++ /dev/null @@ -1,3 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s -target datalayout = "i64:64:16" -; CHECK: Preferred alignment cannot be less than the ABI alignment diff --git a/llvm/test/Assembler/invalid-datalayout15.ll b/llvm/test/Assembler/invalid-datalayout15.ll deleted file mode 100644 index ea240b73fd25f2..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout15.ll +++ /dev/null @@ -1,3 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s -target datalayout = "i64:16:16777216" -; CHECK: Invalid preferred alignment, must be a 16bit integer diff --git a/llvm/test/Assembler/invalid-datalayout16.ll b/llvm/test/Assembler/invalid-datalayout16.ll deleted file mode 100644 index 0dd1abb629b6fc..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout16.ll +++ /dev/null @@ -1,3 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s -target datalayout = "i64:16777216:16777216" -; CHECK: Invalid ABI alignment, 
must be a 16bit integer diff --git a/llvm/test/Assembler/invalid-datalayout17.ll b/llvm/test/Assembler/invalid-datalayout17.ll deleted file mode 100644 index b7eab74ad2a8ca..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout17.ll +++ /dev/null @@ -1,3 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s -target datalayout = "i16777216:16:16" -; CHECK: Invalid bit width, must be a 24-bit integer diff --git a/llvm/test/Assembler/invalid-datalayout18.ll b/llvm/test/Assembler/invalid-datalayout18.ll deleted file mode 100644 index b9956f98c9c6dc..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout18.ll +++ /dev/null @@ -1,3 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s -target datalayout = "p:32:32:16" -; CHECK: Preferred alignment cannot be less than the ABI alignment diff --git a/llvm/test/Assembler/invalid-datalayout19.ll b/llvm/test/Assembler/invalid-datalayout19.ll deleted file mode 100644 index fc0fc468520928..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout19.ll +++ /dev/null @@ -1,6 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s - -target datalayout = "p:0:32:32" - -; CHECK: Invalid pointer size of 0 bytes - diff --git a/llvm/test/Assembler/invalid-datalayout2.ll b/llvm/test/Assembler/invalid-datalayout2.ll deleted file mode 100644 index a435612bf85459..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout2.ll +++ /dev/null @@ -1,3 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s -target datalayout = "m:v" -; CHECK: Unknown mangling in datalayout string diff --git a/llvm/test/Assembler/invalid-datalayout20.ll b/llvm/test/Assembler/invalid-datalayout20.ll deleted file mode 100644 index a9ac1d7fe0983a..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout20.ll +++ /dev/null @@ -1,6 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s - -target datalayout = "p:64:24:64" - -; CHECK: Pointer ABI alignment must be a power of 2 - diff --git a/llvm/test/Assembler/invalid-datalayout21.ll 
b/llvm/test/Assembler/invalid-datalayout21.ll deleted file mode 100644 index a39d1d7a14a86b..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout21.ll +++ /dev/null @@ -1,6 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s - -target datalayout = "p:64:64:24" - -; CHECK: Pointer preferred alignment must be a power of 2 - diff --git a/llvm/test/Assembler/invalid-datalayout22.ll b/llvm/test/Assembler/invalid-datalayout22.ll deleted file mode 100644 index 14e4c2822ce4b0..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout22.ll +++ /dev/null @@ -1,6 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s - -target datalayout = "v128:0:128" - -; CHECK: ABI alignment specification must be >0 for non-aggregate types - diff --git a/llvm/test/Assembler/invalid-datalayout23.ll b/llvm/test/Assembler/invalid-datalayout23.ll deleted file mode 100644 index 430326327bc116..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout23.ll +++ /dev/null @@ -1,6 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s - -target datalayout = "i32:24:32" - -; CHECK: Invalid ABI alignment, must be a power of 2 - diff --git a/llvm/test/Assembler/invalid-datalayout24.ll b/llvm/test/Assembler/invalid-datalayout24.ll deleted file mode 100644 index 616ec64518a5b9..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout24.ll +++ /dev/null @@ -1,6 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s - -target datalayout = "i32:32:24" - -; CHECK: Invalid preferred alignment, must be a power of 2 - diff --git a/llvm/test/Assembler/invalid-datalayout3.ll b/llvm/test/Assembler/invalid-datalayout3.ll deleted file mode 100644 index 44535fd055b5ea..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout3.ll +++ /dev/null @@ -1,3 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s -target datalayout = "n0" -; CHECK: Zero width native integer type in datalayout string diff --git a/llvm/test/Assembler/invalid-datalayout4.ll b/llvm/test/Assembler/invalid-datalayout4.ll deleted file 
mode 100644 index 99a6a6093954e1..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout4.ll +++ /dev/null @@ -1,3 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s -target datalayout = "p16777216:64:64:64" -; CHECK: Invalid address space, must be a 24-bit integer diff --git a/llvm/test/Assembler/invalid-datalayout5.ll b/llvm/test/Assembler/invalid-datalayout5.ll deleted file mode 100644 index 3ce8791c0870b4..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout5.ll +++ /dev/null @@ -1,3 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s -target datalayout = "a1:64" -; CHECK: Sized aggregate specification in datalayout string diff --git a/llvm/test/Assembler/invalid-datalayout6.ll b/llvm/test/Assembler/invalid-datalayout6.ll deleted file mode 100644 index 425099f7cad869..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout6.ll +++ /dev/null @@ -1,3 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s -target datalayout = "a:" -; CHECK: Trailing separator in datalayout string diff --git a/llvm/test/Assembler/invalid-datalayout7.ll b/llvm/test/Assembler/invalid-datalayout7.ll deleted file mode 100644 index 5e010710889f6d..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout7.ll +++ /dev/null @@ -1,3 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s -target datalayout = "p:48:52" -; CHECK: number of bits must be a byte width multiple diff --git a/llvm/test/Assembler/invalid-datalayout8.ll b/llvm/test/Assembler/invalid-datalayout8.ll deleted file mode 100644 index 28832ffb17dd05..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout8.ll +++ /dev/null @@ -1,3 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s -target datalayout = "e-p" -; CHECK: Missing size specification for pointer in datalayout string diff --git a/llvm/test/Assembler/invalid-datalayout9.ll b/llvm/test/Assembler/invalid-datalayout9.ll deleted file mode 100644 index dfeac65cf604d1..00000000000000 --- a/llvm/test/Assembler/invalid-datalayout9.ll +++ /dev/null @@ 
-1,3 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s -target datalayout = "e-p:64" -; CHECK: Missing alignment specification for pointer in datalayout string diff --git a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll index 764f148ecd3aab..5a5dee0b53d439 100644 --- a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll +++ b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll @@ -16,14 +16,13 @@ define fastcc i8 @allocno_reload_assign() { ; CHECK-NEXT: uzp1 p0.h, p0.h, p0.h ; CHECK-NEXT: uzp1 p0.b, p0.b, p0.b ; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: mov z0.b, #0 // =0x0 -; CHECK-NEXT: sbfx x8, x8, #0, #1 ; CHECK-NEXT: uunpklo z1.h, z0.b ; CHECK-NEXT: uunpkhi z0.h, z0.b -; CHECK-NEXT: whilelo p1.b, xzr, x8 -; CHECK-NEXT: not p0.b, p0/z, p1.b +; CHECK-NEXT: mvn w8, w8 +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: whilelo p0.b, xzr, x8 ; CHECK-NEXT: uunpklo z2.s, z1.h ; CHECK-NEXT: uunpkhi z3.s, z1.h ; CHECK-NEXT: uunpklo z5.s, z0.h @@ -31,15 +30,15 @@ define fastcc i8 @allocno_reload_assign() { ; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: punpklo p2.h, p1.b +; CHECK-NEXT: punpkhi p3.h, p1.b ; CHECK-NEXT: uunpklo z0.d, z2.s ; CHECK-NEXT: uunpkhi z1.d, z2.s -; CHECK-NEXT: punpkhi p3.h, p1.b +; CHECK-NEXT: punpklo p5.h, p0.b ; CHECK-NEXT: uunpklo z2.d, z3.s ; CHECK-NEXT: uunpkhi z3.d, z3.s -; CHECK-NEXT: punpklo p5.h, p0.b +; CHECK-NEXT: punpkhi p7.h, p0.b ; CHECK-NEXT: uunpklo z4.d, z5.s ; CHECK-NEXT: uunpkhi z5.d, z5.s -; CHECK-NEXT: punpkhi p7.h, p0.b ; CHECK-NEXT: uunpklo z6.d, z7.s ; CHECK-NEXT: uunpkhi z7.d, z7.s ; CHECK-NEXT: punpklo p0.h, p2.b diff --git a/llvm/test/CodeGen/AArch64/peephole-sxtw.mir b/llvm/test/CodeGen/AArch64/peephole-sxtw.mir index 274d434bbec674..22eec1a4dc038d 100644 --- a/llvm/test/CodeGen/AArch64/peephole-sxtw.mir +++ 
b/llvm/test/CodeGen/AArch64/peephole-sxtw.mir @@ -95,3 +95,57 @@ body: | B %bb.2 ... +--- +name: removeUxtw +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x0 + ; CHECK-LABEL: name: removeUxtw + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY [[COPY]].sub_32 + ; CHECK-NEXT: [[ORRWrr:%[0-9]+]]:gpr32 = ORRWrr $wzr, [[COPY1]] + ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[ORRWrr]], %subreg.sub_32 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32sp = COPY [[SUBREG_TO_REG]].sub_32 + ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[COPY2]], 1, 0 + ; CHECK-NEXT: $w0 = COPY [[ADDWri]] + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:gpr64 = COPY $x0 + %1:gpr32 = COPY %0.sub_32 + %2:gpr32 = ORRWrr $wzr, %1 + %3:gpr64 = SUBREG_TO_REG 0, %2, %subreg.sub_32 + %4:gpr32sp = COPY %3.sub_32 + %5:gpr32sp = ADDWri %4, 1, 0 + $w0 = COPY %5 + RET_ReallyLR implicit $w0 +... +--- +name: extraUseOrr +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x0 + ; CHECK-LABEL: name: extraUseOrr + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY [[COPY]].sub_32 + ; CHECK-NEXT: [[ORRWrr:%[0-9]+]]:gpr32 = ORRWrr $wzr, [[COPY1]] + ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[ORRWrr]], %subreg.sub_32 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32sp = COPY [[SUBREG_TO_REG]].sub_32 + ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[COPY2]], 1, 0 + ; CHECK-NEXT: $w0 = COPY [[ADDWri]] + ; CHECK-NEXT: $w1 = COPY [[ORRWrr]] + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:gpr64 = COPY $x0 + %1:gpr32 = COPY %0.sub_32 + %2:gpr32 = ORRWrr $wzr, %1 + %3:gpr64 = SUBREG_TO_REG 0, %2, %subreg.sub_32 + %4:gpr32sp = COPY %3.sub_32 + %5:gpr32sp = ADDWri %4, 1, 0 + $w0 = COPY %5 + $w1 = COPY %2 + RET_ReallyLR implicit $w0 +... 
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll index 595991e86a91c7..9fbce05eee1775 100644 --- a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll @@ -41,8 +41,8 @@ define i1 @test_srem_even(i4 %X) nounwind { define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; CHECK-LABEL: test_srem_pow2_setne: ; CHECK: // %bb.0: -; CHECK-NEXT: sbfx w8, w0, #0, #6 -; CHECK-NEXT: ubfx w8, w8, #9, #2 +; CHECK-NEXT: sbfx w8, w0, #5, #1 +; CHECK-NEXT: and w8, w8, #0x3 ; CHECK-NEXT: add w8, w0, w8 ; CHECK-NEXT: and w8, w8, #0x3c ; CHECK-NEXT: sub w8, w0, w8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll deleted file mode 100644 index b0c6e89380d810..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ /dev/null @@ -1,153 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CI %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s - -; TODO: Merge with DAG test - -define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { -; CI-LABEL: is_private_vgpr: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_load_dword s2, s[6:7], 0x32 -; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: 
v_mov_b32_e32 v1, s1 -; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v1 -; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CI-NEXT: flat_store_dword v[0:1], v0 -; CI-NEXT: s_endpgm -; -; GFX9-LABEL: is_private_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s1, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: global_store_dword v[0:1], v0, off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: is_private_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: global_store_dword v[0:1], v0, off -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: is_private_vgpr: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: global_store_b32 v[0:1], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm - %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr inbounds ptr, ptr addrspace(1) %ptr.ptr, i32 %id - %ptr = load volatile ptr, ptr addrspace(1) %gep - %val = call i1 @llvm.amdgcn.is.private(ptr %ptr) - %ext = zext i1 %val to i32 - store i32 %ext, ptr addrspace(1) undef - ret void -} - -define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { -; CI-LABEL: is_private_sgpr: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s0, s[6:7], 0x32 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_cmp_lg_u32 s1, s0 -; CI-NEXT: s_cbranch_scc1 .LBB1_2 -; CI-NEXT: ; %bb.1: ; %bb0 -; CI-NEXT: v_mov_b32_e32 v0, 0 -; CI-NEXT: flat_store_dword v[0:1], v0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: .LBB1_2: ; %bb1 -; CI-NEXT: s_endpgm -; -; GFX9-LABEL: is_private_sgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_mov_b64 s[2:3], src_private_base -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s1, s3 -; GFX9-NEXT: s_cbranch_scc1 .LBB1_2 -; GFX9-NEXT: ; %bb.1: ; %bb0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_store_dword v[0:1], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: .LBB1_2: ; %bb1 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: is_private_sgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX10-NEXT: s_mov_b64 s[2:3], src_private_base -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lg_u32 s1, s3 -; GFX10-NEXT: s_cbranch_scc1 .LBB1_2 -; GFX10-NEXT: ; %bb.1: ; %bb0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dword v[0:1], v0, off -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: .LBB1_2: ; %bb1 -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: is_private_sgpr: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_mov_b64 s[2:3], src_private_base -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-NEXT: s_cmp_lg_u32 s1, s3 -; GFX11-NEXT: s_cbranch_scc1 .LBB1_2 -; GFX11-NEXT: ; %bb.1: ; %bb0 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: .LBB1_2: ; %bb1 -; GFX11-NEXT: s_endpgm - %val = call i1 @llvm.amdgcn.is.private(ptr %ptr) - br i1 %val, label %bb0, label %bb1 - -bb0: - store volatile i32 0, ptr addrspace(1) undef - br label %bb1 - -bb1: - ret void -} - -declare i32 @llvm.amdgcn.workitem.id.x() #0 -declare i1 @llvm.amdgcn.is.private(ptr nocapture) #0 - -attributes #0 = { nounwind readnone speculatable } - -!llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdhsa_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll deleted file mode 100644 index bbcb807a956bee..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ /dev/null @@ -1,153 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CI %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s - -; TODO: Merge with DAG test - -define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { -; CI-LABEL: is_local_vgpr: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_load_dword s2, s[6:7], 0x33 -; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, 
s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v1 -; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CI-NEXT: flat_store_dword v[0:1], v0 -; CI-NEXT: s_endpgm -; -; GFX9-LABEL: is_local_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b64 s[0:1], src_shared_base -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s1, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: global_store_dword v[0:1], v0, off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: is_local_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b64 s[0:1], src_shared_base -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: global_store_dword v[0:1], v0, off -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: is_local_vgpr: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b64 s[0:1], src_shared_base -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: global_store_b32 v[0:1], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm - %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr inbounds ptr, ptr addrspace(1) %ptr.ptr, i32 %id - %ptr = load volatile ptr, ptr addrspace(1) %gep - %val = call i1 @llvm.amdgcn.is.shared(ptr %ptr) - %ext = zext i1 %val to i32 - store i32 %ext, ptr addrspace(1) undef - ret void -} - -define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { -; CI-LABEL: is_local_sgpr: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s0, s[6:7], 0x33 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_cmp_lg_u32 s1, s0 -; CI-NEXT: s_cbranch_scc1 .LBB1_2 -; CI-NEXT: ; %bb.1: ; %bb0 -; CI-NEXT: v_mov_b32_e32 v0, 0 -; CI-NEXT: flat_store_dword v[0:1], v0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: .LBB1_2: ; %bb1 -; CI-NEXT: s_endpgm -; -; GFX9-LABEL: is_local_sgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_mov_b64 s[2:3], src_shared_base -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s1, s3 -; GFX9-NEXT: s_cbranch_scc1 .LBB1_2 -; GFX9-NEXT: ; %bb.1: ; %bb0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_store_dword v[0:1], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: .LBB1_2: ; %bb1 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: is_local_sgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX10-NEXT: s_mov_b64 s[2:3], src_shared_base -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lg_u32 s1, s3 -; GFX10-NEXT: s_cbranch_scc1 .LBB1_2 -; GFX10-NEXT: ; %bb.1: ; %bb0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dword v[0:1], v0, off -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: .LBB1_2: ; %bb1 -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: is_local_sgpr: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_mov_b64 s[2:3], src_shared_base -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: 
s_cmp_lg_u32 s1, s3 -; GFX11-NEXT: s_cbranch_scc1 .LBB1_2 -; GFX11-NEXT: ; %bb.1: ; %bb0 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: .LBB1_2: ; %bb1 -; GFX11-NEXT: s_endpgm - %val = call i1 @llvm.amdgcn.is.shared(ptr %ptr) - br i1 %val, label %bb0, label %bb1 - -bb0: - store volatile i32 0, ptr addrspace(1) undef - br label %bb1 - -bb1: - ret void -} - -declare i32 @llvm.amdgcn.workitem.id.x() #0 -declare i1 @llvm.amdgcn.is.shared(ptr nocapture) #0 - -attributes #0 = { nounwind readnone speculatable } - -!llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdhsa_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/bug-v4f64-subvector.ll b/llvm/test/CodeGen/AMDGPU/bug-v4f64-subvector.ll index 2c7072b8c93b11..2acd2355965a59 100644 --- a/llvm/test/CodeGen/AMDGPU/bug-v4f64-subvector.ll +++ b/llvm/test/CodeGen/AMDGPU/bug-v4f64-subvector.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=amdgpu-isel -stop-after=amdgpu-isel -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK -; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=amdgpu-isel -enable-new-pm | FileCheck %s --check-prefixes=CHECK +; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=amdgpu-isel -stop-after=amdgpu-isel -enable-new-pm -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK ; This caused failure in infinite cycle in Selection DAG (combine) due to missing insert_subvector. 
; diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll index 386f9cd3f9ce73..aa182b720c6042 100644 --- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll @@ -15,7 +15,7 @@ define internal void @direct() { ; CHECK-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5) ; CHECK-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8 ; CHECK-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8 -; CHECK-NEXT: call void [[FP]]() +; CHECK-NEXT: call void @indirect() ; CHECK-NEXT: ret void ; %fptr = alloca ptr, addrspace(5) @@ -36,5 +36,5 @@ define amdgpu_kernel void @test_direct_indirect_call() { } ;. ; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll index 05558c555c581e..848019c8729251 100644 --- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll +++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll @@ -27,7 +27,7 @@ define amdgpu_kernel void @test_simple_indirect_call() #0 { ; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5) ; ATTRIBUTOR_GCN-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8 ; ATTRIBUTOR_GCN-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8 -; ATTRIBUTOR_GCN-NEXT: call void [[FP]]() +; ATTRIBUTOR_GCN-NEXT: call void @indirect() ; ATTRIBUTOR_GCN-NEXT: ret void ; %fptr = alloca ptr, addrspace(5) @@ -43,5 +43,5 @@ attributes #0 = { "amdgpu-no-dispatch-id" } ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" } ;. ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll index 0076079ce17c77..42e8b2608dc1c0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -1,18 +1,106 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIH %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=SI,SI-SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s -; GCN-LABEL: {{^}}is_private_vgpr: -; GCN-DAG: {{flat|global|buffer}}_load_dwordx2 v{{\[[0-9]+}}:[[PTR_HI:[0-9]+]]] -; CI-DAG: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CIT: v_cmp_eq_u32_e32 vcc, s4, v[[PTR_HI]] -; CIH: v_cmp_eq_u32_e32 vcc, s2, v[[PTR_HI]] - -; GFX9: 
s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_private_base -; GFX9: v_cmp_eq_u32_e32 vcc, s[[HI]], v[[PTR_HI]] - -; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { +; SI-LABEL: is_private_vgpr: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s4, s[6:7], 0x32 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, 0x100f000 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; CI-SDAG-LABEL: is_private_vgpr: +; CI-SDAG: ; %bb.0: +; CI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-SDAG-NEXT: s_load_dword s2, s[6:7], 0x32 +; CI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CI-SDAG-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-SDAG-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; CI-SDAG-NEXT: s_waitcnt vmcnt(0) +; CI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s2, v1 +; CI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CI-SDAG-NEXT: flat_store_dword v[0:1], v0 +; CI-SDAG-NEXT: s_endpgm +; +; GFX9-LABEL: is_private_vgpr: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s1, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: global_store_dword v[0:1], v0, off +; GFX9-NEXT: s_endpgm +; +; CI-GISEL-LABEL: is_private_vgpr: +; CI-GISEL: ; %bb.0: +; 
CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-GISEL-NEXT: s_load_dword s2, s[6:7], 0x32 +; CI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; CI-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; CI-GISEL-NEXT: s_waitcnt vmcnt(0) +; CI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s2, v1 +; CI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CI-GISEL-NEXT: flat_store_dword v[0:1], v0 +; CI-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: is_private_vgpr: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-NEXT: global_store_dword v[0:1], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: is_private_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds ptr, ptr addrspace(1) %ptr.ptr, i32 %id %ptr = load volatile 
ptr, ptr addrspace(1) %gep @@ -24,20 +112,113 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; FIXME: setcc (zero_extend (setcc)), 1) not folded out, resulting in ; select and vcc branch. - -; GCN-LABEL: {{^}}is_private_sgpr: -; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x1{{$}} - -; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x32{{$}} -; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x4{{$}} - -; CI: s_cmp_eq_u32 [[APERTURE]], [[PTR_HI]] - -; GFX9: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_private_base -; GFX9: s_cmp_eq_u32 [[PTR_HI]], s[[HI]] - -; GCN: s_cbranch_vccnz define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { +; SI-LABEL: is_private_sgpr: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s0, s[6:7], 0x1 +; SI-NEXT: s_load_dword s1, s[6:7], 0x32 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_eq_u32 s0, s1 +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; SI-NEXT: s_cbranch_vccnz .LBB1_2 +; SI-NEXT: ; %bb.1: ; %bb0 +; SI-NEXT: s_mov_b32 s3, 0x100f000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: .LBB1_2: ; %bb1 +; SI-NEXT: s_endpgm +; +; CI-SDAG-LABEL: is_private_sgpr: +; CI-SDAG: ; %bb.0: +; CI-SDAG-NEXT: s_load_dword s0, s[6:7], 0x1 +; CI-SDAG-NEXT: s_load_dword s1, s[6:7], 0x32 +; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CI-SDAG-NEXT: s_cmp_eq_u32 s0, s1 +; CI-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CI-SDAG-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; CI-SDAG-NEXT: s_cbranch_vccnz .LBB1_2 +; CI-SDAG-NEXT: ; %bb.1: ; %bb0 +; CI-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; CI-SDAG-NEXT: flat_store_dword v[0:1], v0 +; CI-SDAG-NEXT: s_waitcnt vmcnt(0) +; CI-SDAG-NEXT: .LBB1_2: ; %bb1 +; CI-SDAG-NEXT: s_endpgm +; +; GFX9-SDAG-LABEL: is_private_sgpr: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dword s2, s[6:7], 0x4 +; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; 
GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: s_cmp_eq_u32 s2, s1 +; GFX9-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-SDAG-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GFX9-SDAG-NEXT: s_cbranch_vccnz .LBB1_2 +; GFX9-SDAG-NEXT: ; %bb.1: ; %bb0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-SDAG-NEXT: global_store_dword v[0:1], v0, off +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: .LBB1_2: ; %bb1 +; GFX9-SDAG-NEXT: s_endpgm +; +; CI-GISEL-LABEL: is_private_sgpr: +; CI-GISEL: ; %bb.0: +; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x32 +; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CI-GISEL-NEXT: s_cmp_lg_u32 s1, s0 +; CI-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 +; CI-GISEL-NEXT: ; %bb.1: ; %bb0 +; CI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; CI-GISEL-NEXT: flat_store_dword v[0:1], v0 +; CI-GISEL-NEXT: s_waitcnt vmcnt(0) +; CI-GISEL-NEXT: .LBB1_2: ; %bb1 +; CI-GISEL-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: is_private_sgpr: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s1, s3 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX9-GISEL-NEXT: ; %bb.1: ; %bb0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: global_store_dword v[0:1], v0, off +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: .LBB1_2: ; %bb1 +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: is_private_sgpr: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cmp_lg_u32 s1, s3 +; GFX10-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX10-NEXT: ; %bb.1: ; %bb0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_store_dword v[0:1], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: .LBB1_2: ; %bb1 +; GFX10-NEXT: s_endpgm +; +; 
GFX11-LABEL: is_private_sgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, s3 +; GFX11-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX11-NEXT: ; %bb.1: ; %bb0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: .LBB1_2: ; %bb1 +; GFX11-NEXT: s_endpgm %val = call i1 @llvm.amdgcn.is.private(ptr %ptr) br i1 %val, label %bb0, label %bb1 @@ -49,10 +230,10 @@ bb1: ret void } -declare i32 @llvm.amdgcn.workitem.id.x() #0 -declare i1 @llvm.amdgcn.is.private(ptr nocapture) #0 - -attributes #0 = { nounwind readnone speculatable } - !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CI: {{.*}} +; GFX10-GISEL: {{.*}} +; GFX11-GISEL: {{.*}} +; SI-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll index e24c47991fe3d7..f8e60e5eb09a16 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll @@ -1,18 +1,139 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIH %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=SI,SI-SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | 
FileCheck -check-prefixes=CI,CI-SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s -; GCN-LABEL: {{^}}is_local_vgpr: -; GCN-DAG: {{flat|global|buffer}}_load_dwordx2 v{{\[[0-9]+}}:[[PTR_HI:[0-9]+]]] -; CI-DAG: s_load_dwordx2 s[0:1], s[6:7], 0x0 - -; GFX9: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_shared_base -; GFX9: v_cmp_eq_u32_e32 vcc, s[[HI]], v[[PTR_HI]] - -; CIT: v_cmp_eq_u32_e32 vcc, s4, v[[PTR_HI]] -; CIH: v_cmp_eq_u32_e32 vcc, s2, v[[PTR_HI]] -; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { +; CIT-LABEL: is_local_vgpr: +; CIT: ; %bb.0: +; CIT-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CIT-NEXT: s_load_dword s4, s[6:7], 0x33 +; CIT-NEXT: s_mov_b32 s2, 0 +; CIT-NEXT: s_mov_b32 s3, 0x100f000 +; CIT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CIT-NEXT: v_mov_b32_e32 v1, 0 +; CIT-NEXT: s_waitcnt lgkmcnt(0) +; CIT-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 glc +; CIT-NEXT: s_waitcnt vmcnt(0) +; CIT-NEXT: s_mov_b32 s2, -1 +; CIT-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1 +; CIT-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CIT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CIT-NEXT: s_endpgm +; +; CIH-LABEL: is_local_vgpr: +; CIH: ; %bb.0: +; CIH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CIH-NEXT: s_load_dword s2, s[6:7], 0x33 +; CIH-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CIH-NEXT: s_waitcnt lgkmcnt(0) +; CIH-NEXT: v_mov_b32_e32 v1, s1 +; CIH-NEXT: 
v_add_i32_e32 v0, vcc, s0, v0 +; CIH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CIH-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; CIH-NEXT: s_waitcnt vmcnt(0) +; CIH-NEXT: v_cmp_eq_u32_e32 vcc, s2, v1 +; CIH-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CIH-NEXT: flat_store_dword v[0:1], v0 +; CIH-NEXT: s_endpgm +; +; SI-LABEL: is_local_vgpr: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s4, s[6:7], 0x33 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, 0x100f000 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; CI-SDAG-LABEL: is_local_vgpr: +; CI-SDAG: ; %bb.0: +; CI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-SDAG-NEXT: s_load_dword s2, s[6:7], 0x33 +; CI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CI-SDAG-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-SDAG-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; CI-SDAG-NEXT: s_waitcnt vmcnt(0) +; CI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s2, v1 +; CI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CI-SDAG-NEXT: flat_store_dword v[0:1], v0 +; CI-SDAG-NEXT: s_endpgm +; +; GFX9-LABEL: is_local_vgpr: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s1, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: global_store_dword v[0:1], v0, off +; GFX9-NEXT: 
s_endpgm +; +; CI-GISEL-LABEL: is_local_vgpr: +; CI-GISEL: ; %bb.0: +; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-GISEL-NEXT: s_load_dword s2, s[6:7], 0x33 +; CI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; CI-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; CI-GISEL-NEXT: s_waitcnt vmcnt(0) +; CI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s2, v1 +; CI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CI-GISEL-NEXT: flat_store_dword v[0:1], v0 +; CI-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: is_local_vgpr: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-NEXT: global_store_dword v[0:1], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: is_local_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr 
inbounds ptr, ptr addrspace(1) %ptr.ptr, i32 %id %ptr = load volatile ptr, ptr addrspace(1) %gep @@ -24,19 +145,147 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; FIXME: setcc (zero_extend (setcc)), 1) not folded out, resulting in ; select and vcc branch. - -; GCN-LABEL: {{^}}is_local_sgpr: -; CI-DAG: s_load_dword s0, s[6:7], 0x1 - -; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x33{{$}} -; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x4{{$}} - -; GFX9: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_shared_base -; GFX9: s_cmp_eq_u32 [[PTR_HI]], s[[HI]] - -; CI: s_cmp_eq_u32 s0, [[PTR_HI]] -; GCN: s_cbranch_vccnz define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { +; CIT-LABEL: is_local_sgpr: +; CIT: ; %bb.0: +; CIT-NEXT: s_load_dword s0, s[6:7], 0x1 +; CIT-NEXT: s_load_dword s1, s[6:7], 0x33 +; CIT-NEXT: s_waitcnt lgkmcnt(0) +; CIT-NEXT: s_cmp_eq_u32 s0, s1 +; CIT-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CIT-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; CIT-NEXT: s_cbranch_vccnz .LBB1_2 +; CIT-NEXT: ; %bb.1: ; %bb0 +; CIT-NEXT: s_mov_b32 s3, 0x100f000 +; CIT-NEXT: s_mov_b32 s2, -1 +; CIT-NEXT: v_mov_b32_e32 v0, 0 +; CIT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CIT-NEXT: s_waitcnt vmcnt(0) +; CIT-NEXT: .LBB1_2: ; %bb1 +; CIT-NEXT: s_endpgm +; +; CIH-LABEL: is_local_sgpr: +; CIH: ; %bb.0: +; CIH-NEXT: s_load_dword s0, s[6:7], 0x1 +; CIH-NEXT: s_load_dword s1, s[6:7], 0x33 +; CIH-NEXT: s_waitcnt lgkmcnt(0) +; CIH-NEXT: s_cmp_eq_u32 s0, s1 +; CIH-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CIH-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; CIH-NEXT: s_cbranch_vccnz .LBB1_2 +; CIH-NEXT: ; %bb.1: ; %bb0 +; CIH-NEXT: v_mov_b32_e32 v0, 0 +; CIH-NEXT: flat_store_dword v[0:1], v0 +; CIH-NEXT: s_waitcnt vmcnt(0) +; CIH-NEXT: .LBB1_2: ; %bb1 +; CIH-NEXT: s_endpgm +; +; SI-LABEL: is_local_sgpr: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s0, s[6:7], 0x1 +; SI-NEXT: s_load_dword s1, s[6:7], 0x33 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_eq_u32 s0, s1 +; 
SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; SI-NEXT: s_cbranch_vccnz .LBB1_2 +; SI-NEXT: ; %bb.1: ; %bb0 +; SI-NEXT: s_mov_b32 s3, 0x100f000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: .LBB1_2: ; %bb1 +; SI-NEXT: s_endpgm +; +; CI-SDAG-LABEL: is_local_sgpr: +; CI-SDAG: ; %bb.0: +; CI-SDAG-NEXT: s_load_dword s0, s[6:7], 0x1 +; CI-SDAG-NEXT: s_load_dword s1, s[6:7], 0x33 +; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CI-SDAG-NEXT: s_cmp_eq_u32 s0, s1 +; CI-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CI-SDAG-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; CI-SDAG-NEXT: s_cbranch_vccnz .LBB1_2 +; CI-SDAG-NEXT: ; %bb.1: ; %bb0 +; CI-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; CI-SDAG-NEXT: flat_store_dword v[0:1], v0 +; CI-SDAG-NEXT: s_waitcnt vmcnt(0) +; CI-SDAG-NEXT: .LBB1_2: ; %bb1 +; CI-SDAG-NEXT: s_endpgm +; +; GFX9-SDAG-LABEL: is_local_sgpr: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dword s2, s[6:7], 0x4 +; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: s_cmp_eq_u32 s2, s1 +; GFX9-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-SDAG-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GFX9-SDAG-NEXT: s_cbranch_vccnz .LBB1_2 +; GFX9-SDAG-NEXT: ; %bb.1: ; %bb0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-SDAG-NEXT: global_store_dword v[0:1], v0, off +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: .LBB1_2: ; %bb1 +; GFX9-SDAG-NEXT: s_endpgm +; +; CI-GISEL-LABEL: is_local_sgpr: +; CI-GISEL: ; %bb.0: +; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x33 +; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CI-GISEL-NEXT: s_cmp_lg_u32 s1, s0 +; CI-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 +; CI-GISEL-NEXT: ; %bb.1: ; %bb0 +; CI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; CI-GISEL-NEXT: flat_store_dword v[0:1], v0 +; CI-GISEL-NEXT: 
s_waitcnt vmcnt(0) +; CI-GISEL-NEXT: .LBB1_2: ; %bb1 +; CI-GISEL-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: is_local_sgpr: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s1, s3 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX9-GISEL-NEXT: ; %bb.1: ; %bb0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: global_store_dword v[0:1], v0, off +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: .LBB1_2: ; %bb1 +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: is_local_sgpr: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cmp_lg_u32 s1, s3 +; GFX10-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX10-NEXT: ; %bb.1: ; %bb0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_store_dword v[0:1], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: .LBB1_2: ; %bb1 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: is_local_sgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, s3 +; GFX11-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX11-NEXT: ; %bb.1: ; %bb0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: .LBB1_2: ; %bb1 +; GFX11-NEXT: s_endpgm %val = call i1 @llvm.amdgcn.is.shared(ptr %ptr) br i1 %val, label %bb0, label %bb1 @@ -48,10 +297,10 @@ bb1: ret void } -declare i32 @llvm.amdgcn.workitem.id.x() #0 -declare i1 @llvm.amdgcn.is.shared(ptr nocapture) #0 - -attributes #0 = { nounwind readnone speculatable } - !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CI: {{.*}} +; GFX10-GISEL: {{.*}} +; GFX11-GISEL: {{.*}} +; SI-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll index bf98af33dc7b08..050300a69c46bb 100644 --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -1049,15 +1049,14 @@ define hidden void @ashr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_bfe_i32 v1, v9, 0, 8 -; GFX10-NEXT: v_ashrrev_i32_e32 v3, 24, v9 ; GFX10-NEXT: v_ashrrev_i32_sdwa v2, v2, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 25, v9 ; GFX10-NEXT: v_lshlrev_b16 v1, 7, v1 -; GFX10-NEXT: v_lshrrev_b16 v3, 1, v3 +; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ashrrev_i16 v4, 10, v0 ; GFX10-NEXT: v_perm_b32 v0, v9, v0, 0x4010707 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff00, v1 -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: global_store_dword v[5:6], v1, off @@ -1075,23 +1074,22 @@ define hidden void @ashr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: global_load_dword v9, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v0, 26 -; GFX9-NEXT: v_mov_b32_e32 v1, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, 7 +; GFX9-NEXT: v_mov_b32_e32 v1, 7 ; GFX9-NEXT: s_mov_b32 s4, 0x4010707 +; GFX9-NEXT: v_mov_b32_e32 v0, 26 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: 
v_ashrrev_i32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v3, v4, v9, s4 +; GFX9-NEXT: v_perm_b32 v2, v4, v9, s4 +; GFX9-NEXT: v_ashrrev_i32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 25, v4 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 10, v9 -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff00, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff00, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: global_store_dword v[5:6], v0, off -; GFX9-NEXT: global_store_dword v[7:8], v3, off +; GFX9-NEXT: global_store_dword v[7:8], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll index 8a467812ec4850..bf4e02d8d7e1c1 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll @@ -3,10 +3,10 @@ ; The types of the users of the addrspacecast should not be changed. 
; CHECK-LABEL: @invalid_bitcast_addrspace( -; CHECK: [[GEP:%[0-9]+]] = getelementptr inbounds [256 x [1 x i32]], ptr addrspace(3) @invalid_bitcast_addrspace.data, i32 0, i32 %{{[0-9]+}} -; CHECK: [[ASC:%[a-z0-9]+]] = addrspacecast ptr addrspace(3) [[GEP]] to ptr -; CHECK: [[LOAD:%[a-z0-9]+]] = load <2 x i16>, ptr [[ASC]] -; CHECK: bitcast <2 x i16> [[LOAD]] to <2 x half> +; CHECK: alloca +; CHECK: addrspacecast +; CHECK: load +; CHECK: bitcast define amdgpu_kernel void @invalid_bitcast_addrspace() #0 { entry: %data = alloca [1 x i32], addrspace(5) @@ -16,4 +16,22 @@ entry: ret void } +; A callee use is not promotable even if it has a nocapture attribute. +define void @nocapture_callee(ptr nocapture noundef writeonly %flat.observes.addrspace) #0 { + %private.ptr = addrspacecast ptr %flat.observes.addrspace to ptr addrspace(5) + store i32 1, ptr addrspace(5) %private.ptr, align 4 + ret void +} + +; CHECK-LABEL: @kernel_call_nocapture( +; CHECK: alloca i32 +; CHECK-NEXT: addrspacecast +; CHECK-NEXT: call +define amdgpu_kernel void @kernel_call_nocapture() #0 { + %alloca = alloca i32, align 4, addrspace(5) + %flat.alloca = addrspacecast ptr addrspace(5) %alloca to ptr + call void @nocapture_callee(ptr noundef %flat.alloca) + ret void +} + attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll index c2b43f9e000585..850446c414049d 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll @@ -32,14 +32,22 @@ entry: } define amdgpu_kernel void @foo(ptr noundef %fp) { -; CHECK-LABEL: define {{[^@]+}}@foo -; CHECK-SAME: (ptr noundef [[FP:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -; CHECK-NEXT: store ptr [[FP]], ptr addrspace(5) [[FP_ADDR]], align 8 -; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(5) 
[[FP_ADDR]], align 8 -; CHECK-NEXT: call void [[LOAD]]() -; CHECK-NEXT: ret void +; OW-LABEL: define {{[^@]+}}@foo +; OW-SAME: (ptr noundef [[FP:%.*]]) #[[ATTR1:[0-9]+]] { +; OW-NEXT: entry: +; OW-NEXT: [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +; OW-NEXT: store ptr [[FP]], ptr addrspace(5) [[FP_ADDR]], align 8 +; OW-NEXT: call void [[FP]]() +; OW-NEXT: ret void +; +; CW-LABEL: define {{[^@]+}}@foo +; CW-SAME: (ptr noundef [[FP:%.*]]) #[[ATTR1:[0-9]+]] { +; CW-NEXT: entry: +; CW-NEXT: [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +; CW-NEXT: store ptr [[FP]], ptr addrspace(5) [[FP_ADDR]], align 8 +; CW-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(5) [[FP_ADDR]], align 8 +; CW-NEXT: call void @bar() +; CW-NEXT: ret void ; entry: %fp.addr = alloca ptr, addrspace(5) @@ -50,9 +58,9 @@ entry: } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } +; OW: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; OW: attributes #[[ATTR1]] = { 
"uniform-work-group-size"="false" } +;. +; CW: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CW: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; CW: {{.*}} -; OW: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index 3a6b0485d24174..cca7b49996ff3b 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -36,7 +36,7 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5) ; ATTRIBUTOR_GCN-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8 ; ATTRIBUTOR_GCN-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8 -; ATTRIBUTOR_GCN-NEXT: call void [[FP]]() +; ATTRIBUTOR_GCN-NEXT: call void @indirect() ; ATTRIBUTOR_GCN-NEXT: ret void ; ; GFX9-LABEL: test_simple_indirect_call: @@ -81,7 +81,7 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-stack-objects" } ;. ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" 
"amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. ; AKF_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. diff --git a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll index 126b17e718b59f..2efe27df2d10d1 100644 --- a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll @@ -43,8 +43,8 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; CHECK-LABEL: test_srem_pow2_setne: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_bfe_i32 v1, v0, 0, 6 -; CHECK-NEXT: v_bfe_u32 v1, v1, 9, 2 +; CHECK-NEXT: v_bfe_i32 v1, v0, 5, 1 +; CHECK-NEXT: v_and_b32_e32 v1, 3, v1 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v0, v1 ; CHECK-NEXT: v_and_b32_e32 v1, 60, v1 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 diff --git a/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll index a4e081d5384e5e..7f56215b9b4123 100644 --- a/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll @@ -209,8 +209,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; ARM5: @ %bb.0: ; ARM5-NEXT: lsl r1, r0, #26 ; ARM5-NEXT: mov r2, #3 -; ARM5-NEXT: asr r1, r1, #26 -; ARM5-NEXT: and r1, r2, r1, lsr #9 +; ARM5-NEXT: and r1, r2, r1, asr #31 ; ARM5-NEXT: add r1, r0, r1 ; ARM5-NEXT: and r1, r1, #60 ; ARM5-NEXT: sub r0, r0, r1 @@ -222,8 +221,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; ARM6: @ %bb.0: ; ARM6-NEXT: lsl r1, r0, #26 ; ARM6-NEXT: mov r2, #3 -; ARM6-NEXT: asr r1, r1, #26 -; ARM6-NEXT: and r1, r2, r1, lsr #9 +; ARM6-NEXT: and r1, r2, r1, asr #31 ; ARM6-NEXT: add r1, r0, r1 ; ARM6-NEXT: and r1, r1, #60 ; ARM6-NEXT: sub r0, r0, r1 @@ -233,8 +231,9 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; ; ARM7-LABEL: test_srem_pow2_setne: ; ARM7: @ %bb.0: -; ARM7-NEXT: sbfx r1, r0, #0, #6 -; ARM7-NEXT: ubfx 
r1, r1, #9, #2 +; ARM7-NEXT: lsl r1, r0, #26 +; ARM7-NEXT: mov r2, #3 +; ARM7-NEXT: and r1, r2, r1, asr #31 ; ARM7-NEXT: add r1, r0, r1 ; ARM7-NEXT: and r1, r1, #60 ; ARM7-NEXT: sub r0, r0, r1 @@ -244,8 +243,9 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; ; ARM8-LABEL: test_srem_pow2_setne: ; ARM8: @ %bb.0: -; ARM8-NEXT: sbfx r1, r0, #0, #6 -; ARM8-NEXT: ubfx r1, r1, #9, #2 +; ARM8-NEXT: lsl r1, r0, #26 +; ARM8-NEXT: mov r2, #3 +; ARM8-NEXT: and r1, r2, r1, asr #31 ; ARM8-NEXT: add r1, r0, r1 ; ARM8-NEXT: and r1, r1, #60 ; ARM8-NEXT: sub r0, r0, r1 @@ -255,8 +255,9 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; ; NEON7-LABEL: test_srem_pow2_setne: ; NEON7: @ %bb.0: -; NEON7-NEXT: sbfx r1, r0, #0, #6 -; NEON7-NEXT: ubfx r1, r1, #9, #2 +; NEON7-NEXT: lsl r1, r0, #26 +; NEON7-NEXT: mov r2, #3 +; NEON7-NEXT: and r1, r2, r1, asr #31 ; NEON7-NEXT: add r1, r0, r1 ; NEON7-NEXT: and r1, r1, #60 ; NEON7-NEXT: sub r0, r0, r1 @@ -266,8 +267,9 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; ; NEON8-LABEL: test_srem_pow2_setne: ; NEON8: @ %bb.0: -; NEON8-NEXT: sbfx r1, r0, #0, #6 -; NEON8-NEXT: ubfx r1, r1, #9, #2 +; NEON8-NEXT: lsl r1, r0, #26 +; NEON8-NEXT: mov r2, #3 +; NEON8-NEXT: and r1, r2, r1, asr #31 ; NEON8-NEXT: add r1, r0, r1 ; NEON8-NEXT: and r1, r1, #60 ; NEON8-NEXT: sub r0, r0, r1 diff --git a/llvm/test/CodeGen/DirectX/Metadata/dxilVer-1.0.ll b/llvm/test/CodeGen/DirectX/Metadata/dxilVer-1.0.ll index b9a8e3e80567e0..318d5a6210eeea 100644 --- a/llvm/test/CodeGen/DirectX/Metadata/dxilVer-1.0.ll +++ b/llvm/test/CodeGen/DirectX/Metadata/dxilVer-1.0.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -dxil-metadata-emit %s | FileCheck %s +; RUN: opt -S -dxil-translate-metadata %s | FileCheck %s ; RUN: opt -S -passes="print" -disable-output %s 2>&1 | FileCheck %s --check-prefix=ANALYSIS target triple = "dxil-pc-shadermodel6.0-vertex" diff --git a/llvm/test/CodeGen/DirectX/Metadata/dxilVer-1.8.ll b/llvm/test/CodeGen/DirectX/Metadata/dxilVer-1.8.ll index 
fdd21d627829b9..fb54fa916f33f9 100644 --- a/llvm/test/CodeGen/DirectX/Metadata/dxilVer-1.8.ll +++ b/llvm/test/CodeGen/DirectX/Metadata/dxilVer-1.8.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -dxil-metadata-emit %s | FileCheck %s +; RUN: opt -S -dxil-translate-metadata %s | FileCheck %s ; RUN: opt -S -passes="print" -disable-output %s 2>&1 | FileCheck %s --check-prefix=ANALYSIS target triple = "dxil-pc-shadermodel6.8-compute" diff --git a/llvm/test/CodeGen/DirectX/Metadata/shaderModel-as.ll b/llvm/test/CodeGen/DirectX/Metadata/shaderModel-as.ll index d2625fc8b96a9d..96d04f948c9b83 100644 --- a/llvm/test/CodeGen/DirectX/Metadata/shaderModel-as.ll +++ b/llvm/test/CodeGen/DirectX/Metadata/shaderModel-as.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -dxil-metadata-emit %s | FileCheck %s +; RUN: opt -S -dxil-translate-metadata %s | FileCheck %s ; RUN: opt -S -passes="print" -disable-output %s 2>&1 | FileCheck %s --check-prefix=ANALYSIS target triple = "dxil-pc-shadermodel6-amplification" diff --git a/llvm/test/CodeGen/DirectX/Metadata/shaderModel-cs.ll b/llvm/test/CodeGen/DirectX/Metadata/shaderModel-cs.ll index 24eb0d608d8bb6..8cba445bcb01e8 100644 --- a/llvm/test/CodeGen/DirectX/Metadata/shaderModel-cs.ll +++ b/llvm/test/CodeGen/DirectX/Metadata/shaderModel-cs.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -dxil-metadata-emit %s | FileCheck %s +; RUN: opt -S -dxil-translate-metadata %s | FileCheck %s ; RUN: opt -S -dxil-prepare %s | FileCheck %s --check-prefix=REMOVE_EXTRA_ATTRIBUTE ; RUN: opt -S -passes="print" -disable-output %s 2>&1 | FileCheck %s --check-prefix=ANALYSIS diff --git a/llvm/test/CodeGen/DirectX/Metadata/shaderModel-gs.ll b/llvm/test/CodeGen/DirectX/Metadata/shaderModel-gs.ll index 5c28c9305f01bf..662620cf9f95cb 100644 --- a/llvm/test/CodeGen/DirectX/Metadata/shaderModel-gs.ll +++ b/llvm/test/CodeGen/DirectX/Metadata/shaderModel-gs.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -dxil-metadata-emit %s | FileCheck %s +; RUN: opt -S -dxil-translate-metadata %s | FileCheck %s ; RUN: opt -S 
-passes="print" -disable-output %s 2>&1 | FileCheck %s --check-prefix=ANALYSIS target triple = "dxil-pc-shadermodel6.6-geometry" diff --git a/llvm/test/CodeGen/DirectX/Metadata/shaderModel-hs.ll b/llvm/test/CodeGen/DirectX/Metadata/shaderModel-hs.ll index e60023d1b3a5fb..b405f8e915a329 100644 --- a/llvm/test/CodeGen/DirectX/Metadata/shaderModel-hs.ll +++ b/llvm/test/CodeGen/DirectX/Metadata/shaderModel-hs.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -dxil-metadata-emit %s | FileCheck %s +; RUN: opt -S -dxil-translate-metadata %s | FileCheck %s ; RUN: opt -S -passes="print" -disable-output %s 2>&1 | FileCheck %s --check-prefix=ANALYSIS target triple = "dxil-pc-shadermodel6.6-hull" diff --git a/llvm/test/CodeGen/DirectX/Metadata/shaderModel-lib.ll b/llvm/test/CodeGen/DirectX/Metadata/shaderModel-lib.ll index 7f0bea95c04822..26f3d287242edd 100644 --- a/llvm/test/CodeGen/DirectX/Metadata/shaderModel-lib.ll +++ b/llvm/test/CodeGen/DirectX/Metadata/shaderModel-lib.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -dxil-metadata-emit %s | FileCheck %s +; RUN: opt -S -dxil-translate-metadata %s | FileCheck %s ; RUN: opt -S -passes="print" -disable-output %s 2>&1 | FileCheck %s --check-prefix=ANALYSIS target triple = "dxil-pc-shadermodel6.3-library" diff --git a/llvm/test/CodeGen/DirectX/Metadata/shaderModel-ms.ll b/llvm/test/CodeGen/DirectX/Metadata/shaderModel-ms.ll index dd033b9a9722b9..422d4add912f3f 100644 --- a/llvm/test/CodeGen/DirectX/Metadata/shaderModel-ms.ll +++ b/llvm/test/CodeGen/DirectX/Metadata/shaderModel-ms.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -dxil-metadata-emit %s | FileCheck %s +; RUN: opt -S -dxil-translate-metadata %s | FileCheck %s ; RUN: opt -S -passes="print" -disable-output %s 2>&1 | FileCheck %s --check-prefix=ANALYSIS target triple = "dxil-pc-shadermodel6.6-mesh" diff --git a/llvm/test/CodeGen/DirectX/Metadata/shaderModel-ps.ll b/llvm/test/CodeGen/DirectX/Metadata/shaderModel-ps.ll index 47da321df3e42e..cdb9a6f0f6a4f4 100644 --- 
a/llvm/test/CodeGen/DirectX/Metadata/shaderModel-ps.ll +++ b/llvm/test/CodeGen/DirectX/Metadata/shaderModel-ps.ll @@ -1,5 +1,4 @@ - -; RUN: opt -S -dxil-metadata-emit %s | FileCheck %s +; RUN: opt -S -dxil-translate-metadata %s | FileCheck %s ; RUN: opt -S -passes="print" -disable-output %s 2>&1 | FileCheck %s --check-prefix=ANALYSIS target triple = "dxil-pc-shadermodel5.0-pixel" diff --git a/llvm/test/CodeGen/DirectX/Metadata/shaderModel-vs.ll b/llvm/test/CodeGen/DirectX/Metadata/shaderModel-vs.ll index dcc68586dd4b9f..6b3501cc1dbaf7 100644 --- a/llvm/test/CodeGen/DirectX/Metadata/shaderModel-vs.ll +++ b/llvm/test/CodeGen/DirectX/Metadata/shaderModel-vs.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -dxil-metadata-emit %s | FileCheck %s +; RUN: opt -S -dxil-translate-metadata %s | FileCheck %s ; RUN: opt -S -passes="print" -disable-output %s 2>&1 | FileCheck %s --check-prefix=ANALYSIS target triple = "dxil-pc-shadermodel-vertex" diff --git a/llvm/test/CodeGen/DirectX/UAVMetadata.ll b/llvm/test/CodeGen/DirectX/UAVMetadata.ll index bdad9fd40c9bd3..b10112a044df58 100644 --- a/llvm/test/CodeGen/DirectX/UAVMetadata.ll +++ b/llvm/test/CodeGen/DirectX/UAVMetadata.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -dxil-metadata-emit < %s | FileCheck %s +; RUN: opt -S -dxil-translate-metadata < %s | FileCheck %s ; RUN: opt -S --passes="print-dxil-resource-md" < %s 2>&1 | FileCheck %s --check-prefix=PRINT ; RUN: llc %s --filetype=asm -o - < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,PRINT diff --git a/llvm/test/CodeGen/DirectX/cbuf.ll b/llvm/test/CodeGen/DirectX/cbuf.ll index 38f08fad995d1f..e31a659728fcf2 100644 --- a/llvm/test/CodeGen/DirectX/cbuf.ll +++ b/llvm/test/CodeGen/DirectX/cbuf.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -dxil-metadata-emit < %s | FileCheck %s --check-prefix=DXILMD +; RUN: opt -S -dxil-translate-metadata < %s | FileCheck %s --check-prefix=DXILMD ; RUN: opt -S --passes="print-dxil-resource-md" < %s 2>&1 | FileCheck %s --check-prefix=PRINT target datalayout = 
"e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64" diff --git a/llvm/test/CodeGen/DirectX/dxil_ver.ll b/llvm/test/CodeGen/DirectX/dxil_ver.ll index e9923a3abce02d..3c1d2e81020098 100644 --- a/llvm/test/CodeGen/DirectX/dxil_ver.ll +++ b/llvm/test/CodeGen/DirectX/dxil_ver.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -dxil-metadata-emit < %s | FileCheck %s +; RUN: opt -S -dxil-translate-metadata < %s | FileCheck %s target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64" target triple = "dxil-pc-shadermodel6.3-library" diff --git a/llvm/test/CodeGen/DirectX/legacy_cb_layout_0.ll b/llvm/test/CodeGen/DirectX/legacy_cb_layout_0.ll index 0cfb839746b93e..1a618092c5fed3 100644 --- a/llvm/test/CodeGen/DirectX/legacy_cb_layout_0.ll +++ b/llvm/test/CodeGen/DirectX/legacy_cb_layout_0.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -dxil-metadata-emit < %s | FileCheck %s --check-prefix=DXILMD +; RUN: opt -S -dxil-translate-metadata < %s | FileCheck %s --check-prefix=DXILMD target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64" target triple = "dxil-unknown-shadermodel6.7-library" diff --git a/llvm/test/CodeGen/DirectX/legacy_cb_layout_1.ll b/llvm/test/CodeGen/DirectX/legacy_cb_layout_1.ll index b6d29f8d18d79f..6886f2690209dc 100644 --- a/llvm/test/CodeGen/DirectX/legacy_cb_layout_1.ll +++ b/llvm/test/CodeGen/DirectX/legacy_cb_layout_1.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -dxil-metadata-emit < %s | FileCheck %s --check-prefix=DXILMD +; RUN: opt -S -dxil-translate-metadata < %s | FileCheck %s --check-prefix=DXILMD target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64" target triple = "dxil-unknown-shadermodel6.7-library" diff --git a/llvm/test/CodeGen/DirectX/legacy_cb_layout_2.ll b/llvm/test/CodeGen/DirectX/legacy_cb_layout_2.ll index d023d7906fdc52..3b08b25542201c 100644 --- a/llvm/test/CodeGen/DirectX/legacy_cb_layout_2.ll +++ 
b/llvm/test/CodeGen/DirectX/legacy_cb_layout_2.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -dxil-metadata-emit < %s | FileCheck %s --check-prefix=DXILMD +; RUN: opt -S -dxil-translate-metadata < %s | FileCheck %s --check-prefix=DXILMD target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64" target triple = "dxil-unknown-shadermodel6.7-library" diff --git a/llvm/test/CodeGen/DirectX/legacy_cb_layout_3.ll b/llvm/test/CodeGen/DirectX/legacy_cb_layout_3.ll index 38c2cd18b5ca1d..f01afbdab96733 100644 --- a/llvm/test/CodeGen/DirectX/legacy_cb_layout_3.ll +++ b/llvm/test/CodeGen/DirectX/legacy_cb_layout_3.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -dxil-metadata-emit < %s | FileCheck %s --check-prefix=DXILMD +; RUN: opt -S -dxil-translate-metadata < %s | FileCheck %s --check-prefix=DXILMD target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64" target triple = "dxil-unknown-shadermodel6.7-library" diff --git a/llvm/test/CodeGen/DirectX/lib_entry.ll b/llvm/test/CodeGen/DirectX/lib_entry.ll index 5254a088055888..9aa63c26ce845a 100644 --- a/llvm/test/CodeGen/DirectX/lib_entry.ll +++ b/llvm/test/CodeGen/DirectX/lib_entry.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -dxil-metadata-emit < %s | FileCheck %s +; RUN: opt -S -dxil-translate-metadata < %s | FileCheck %s target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64" target triple = "dxil-unknown-shadermodel6.7-library" diff --git a/llvm/test/CodeGen/DirectX/normalize.ll b/llvm/test/CodeGen/DirectX/normalize.ll index f3533cc56e7c25..e2c8a5d4656a65 100644 --- a/llvm/test/CodeGen/DirectX/normalize.ll +++ b/llvm/test/CodeGen/DirectX/normalize.ll @@ -22,7 +22,6 @@ entry: define noundef <2 x half> @test_normalize_half2(<2 x half> noundef %p0) { entry: - ; CHECK: extractelement <2 x half> %{{.*}}, i64 0 ; EXPCHECK: [[doth2:%.*]] = call half @llvm.dx.dot2.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}) ; DOPCHECK: [[doth2:%.*]] = 
call half @dx.op.dot2.f16(i32 54, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) ; EXPCHECK: [[rsqrt:%.*]] = call half @llvm.dx.rsqrt.f16(half [[doth2]]) @@ -37,7 +36,6 @@ entry: define noundef <3 x half> @test_normalize_half3(<3 x half> noundef %p0) { entry: - ; CHECK: extractelement <3 x half> %{{.*}}, i64 0 ; EXPCHECK: [[doth3:%.*]] = call half @llvm.dx.dot3.v3f16(<3 x half> %{{.*}}, <3 x half> %{{.*}}) ; DOPCHECK: [[doth3:%.*]] = call half @dx.op.dot3.f16(i32 55, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) ; EXPCHECK: [[rsqrt:%.*]] = call half @llvm.dx.rsqrt.f16(half [[doth3]]) @@ -52,7 +50,6 @@ entry: define noundef <4 x half> @test_normalize_half4(<4 x half> noundef %p0) { entry: - ; CHECK: extractelement <4 x half> %{{.*}}, i64 0 ; EXPCHECK: [[doth4:%.*]] = call half @llvm.dx.dot4.v4f16(<4 x half> %{{.*}}, <4 x half> %{{.*}}) ; DOPCHECK: [[doth4:%.*]] = call half @dx.op.dot4.f16(i32 56, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) ; EXPCHECK: [[rsqrt:%.*]] = call half @llvm.dx.rsqrt.f16(half [[doth4]]) @@ -74,7 +71,6 @@ entry: define noundef <2 x float> @test_normalize_float2(<2 x float> noundef %p0) { entry: - ; CHECK: extractelement <2 x float> %{{.*}}, i64 0 ; EXPCHECK: [[dotf2:%.*]] = call float @llvm.dx.dot2.v2f32(<2 x float> %{{.*}}, <2 x float> %{{.*}}) ; DOPCHECK: [[dotf2:%.*]] = call float @dx.op.dot2.f32(i32 54, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; EXPCHECK: [[rsqrt:%.*]] = call float @llvm.dx.rsqrt.f32(float [[dotf2]]) @@ -89,7 +85,6 @@ entry: define noundef <3 x float> @test_normalize_float3(<3 x float> noundef %p0) { entry: - ; CHECK: extractelement <3 x float> %{{.*}}, i64 0 ; EXPCHECK: [[dotf3:%.*]] = call float @llvm.dx.dot3.v3f32(<3 x float> %{{.*}}, <3 x float> %{{.*}}) ; DOPCHECK: [[dotf3:%.*]] = call float @dx.op.dot3.f32(i32 55, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; EXPCHECK: [[rsqrt:%.*]] = call float @llvm.dx.rsqrt.f32(float [[dotf3]]) @@ -104,7 +99,6 @@ 
entry: define noundef <4 x float> @test_normalize_float4(<4 x float> noundef %p0) { entry: - ; CHECK: extractelement <4 x float> %{{.*}}, i64 0 ; EXPCHECK: [[dotf4:%.*]] = call float @llvm.dx.dot4.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}) ; DOPCHECK: [[dotf4:%.*]] = call float @dx.op.dot4.f32(i32 56, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; EXPCHECK: [[rsqrt:%.*]] = call float @llvm.dx.rsqrt.f32(float [[dotf4]]) diff --git a/llvm/test/CodeGen/LoongArch/fp-maximumnum-minimumnum.ll b/llvm/test/CodeGen/LoongArch/fp-maximumnum-minimumnum.ll new file mode 100644 index 00000000000000..b4fdd954b856c8 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/fp-maximumnum-minimumnum.ll @@ -0,0 +1,431 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+f,-d < %s | FileCheck %s --check-prefix=LA32F +; RUN: llc --mtriple=loongarch32 --mattr=+d < %s | FileCheck %s --check-prefix=LA32D +; RUN: llc --mtriple=loongarch64 --mattr=+f,-d < %s | FileCheck %s --check-prefix=LA64F +; RUN: llc --mtriple=loongarch64 --mattr=+d < %s | FileCheck %s --check-prefix=LA64D + +declare float @llvm.maximumnum.f32(float, float) +declare double @llvm.maximumnum.f64(double, double) +declare float @llvm.minimumnum.f32(float, float) +declare double @llvm.minimumnum.f64(double, double) + +define float @maximumnum_float(float %x, float %y) { +; +; LA32F-LABEL: maximumnum_float: +; LA32F: # %bb.0: +; LA32F-NEXT: fmax.s $fa1, $fa1, $fa1 +; LA32F-NEXT: fmax.s $fa0, $fa0, $fa0 +; LA32F-NEXT: fmax.s $fa0, $fa0, $fa1 +; LA32F-NEXT: ret +; +; LA32D-LABEL: maximumnum_float: +; LA32D: # %bb.0: +; LA32D-NEXT: fmax.s $fa1, $fa1, $fa1 +; LA32D-NEXT: fmax.s $fa0, $fa0, $fa0 +; LA32D-NEXT: fmax.s $fa0, $fa0, $fa1 +; LA32D-NEXT: ret +; +; LA64F-LABEL: maximumnum_float: +; LA64F: # %bb.0: +; LA64F-NEXT: fmax.s $fa1, $fa1, $fa1 +; LA64F-NEXT: fmax.s $fa0, $fa0, $fa0 +; LA64F-NEXT: fmax.s $fa0, $fa0, $fa1 +; LA64F-NEXT: ret +; +; 
LA64D-LABEL: maximumnum_float: +; LA64D: # %bb.0: +; LA64D-NEXT: fmax.s $fa1, $fa1, $fa1 +; LA64D-NEXT: fmax.s $fa0, $fa0, $fa0 +; LA64D-NEXT: fmax.s $fa0, $fa0, $fa1 +; LA64D-NEXT: ret + %z = call float @llvm.maximumnum.f32(float %x, float %y) + ret float %z +} + +define float @maximumnum_float_nsz(float %x, float %y) { +; +; LA32F-LABEL: maximumnum_float_nsz: +; LA32F: # %bb.0: +; LA32F-NEXT: fmax.s $fa1, $fa1, $fa1 +; LA32F-NEXT: fmax.s $fa0, $fa0, $fa0 +; LA32F-NEXT: fmax.s $fa0, $fa0, $fa1 +; LA32F-NEXT: ret +; +; LA32D-LABEL: maximumnum_float_nsz: +; LA32D: # %bb.0: +; LA32D-NEXT: fmax.s $fa1, $fa1, $fa1 +; LA32D-NEXT: fmax.s $fa0, $fa0, $fa0 +; LA32D-NEXT: fmax.s $fa0, $fa0, $fa1 +; LA32D-NEXT: ret +; +; LA64F-LABEL: maximumnum_float_nsz: +; LA64F: # %bb.0: +; LA64F-NEXT: fmax.s $fa1, $fa1, $fa1 +; LA64F-NEXT: fmax.s $fa0, $fa0, $fa0 +; LA64F-NEXT: fmax.s $fa0, $fa0, $fa1 +; LA64F-NEXT: ret +; +; LA64D-LABEL: maximumnum_float_nsz: +; LA64D: # %bb.0: +; LA64D-NEXT: fmax.s $fa1, $fa1, $fa1 +; LA64D-NEXT: fmax.s $fa0, $fa0, $fa0 +; LA64D-NEXT: fmax.s $fa0, $fa0, $fa1 +; LA64D-NEXT: ret + %z = call nsz float @llvm.maximumnum.f32(float %x, float %y) + ret float %z +} + +define float @maximumnum_float_nnan(float %x, float %y) { +; +; LA32F-LABEL: maximumnum_float_nnan: +; LA32F: # %bb.0: +; LA32F-NEXT: fmax.s $fa0, $fa0, $fa1 +; LA32F-NEXT: ret +; +; LA32D-LABEL: maximumnum_float_nnan: +; LA32D: # %bb.0: +; LA32D-NEXT: fmax.s $fa0, $fa0, $fa1 +; LA32D-NEXT: ret +; +; LA64F-LABEL: maximumnum_float_nnan: +; LA64F: # %bb.0: +; LA64F-NEXT: fmax.s $fa0, $fa0, $fa1 +; LA64F-NEXT: ret +; +; LA64D-LABEL: maximumnum_float_nnan: +; LA64D: # %bb.0: +; LA64D-NEXT: fmax.s $fa0, $fa0, $fa1 +; LA64D-NEXT: ret + %z = call nnan float @llvm.maximumnum.f32(float %x, float %y) + ret float %z +} + + +define double @maximumnum_double(double %x, double %y) { +; +; LA32F-LABEL: maximumnum_double: +; LA32F: # %bb.0: +; LA32F-NEXT: addi.w $sp, $sp, -16 +; LA32F-NEXT: .cfi_def_cfa_offset 16 
+; LA32F-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-NEXT: .cfi_offset 1, -4 +; LA32F-NEXT: bl %plt(fmaximum_num) +; LA32F-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32F-NEXT: addi.w $sp, $sp, 16 +; LA32F-NEXT: ret +; +; LA32D-LABEL: maximumnum_double: +; LA32D: # %bb.0: +; LA32D-NEXT: fmax.d $fa1, $fa1, $fa1 +; LA32D-NEXT: fmax.d $fa0, $fa0, $fa0 +; LA32D-NEXT: fmax.d $fa0, $fa0, $fa1 +; LA32D-NEXT: ret +; +; LA64F-LABEL: maximumnum_double: +; LA64F: # %bb.0: +; LA64F-NEXT: addi.d $sp, $sp, -16 +; LA64F-NEXT: .cfi_def_cfa_offset 16 +; LA64F-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64F-NEXT: .cfi_offset 1, -8 +; LA64F-NEXT: bl %plt(fmaximum_num) +; LA64F-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64F-NEXT: addi.d $sp, $sp, 16 +; LA64F-NEXT: ret +; +; LA64D-LABEL: maximumnum_double: +; LA64D: # %bb.0: +; LA64D-NEXT: fmax.d $fa1, $fa1, $fa1 +; LA64D-NEXT: fmax.d $fa0, $fa0, $fa0 +; LA64D-NEXT: fmax.d $fa0, $fa0, $fa1 +; LA64D-NEXT: ret + %z = call double @llvm.maximumnum.f64(double %x, double %y) + ret double %z +} + +define double @maximumnum_double_nsz(double %x, double %y) { +; +; LA32F-LABEL: maximumnum_double_nsz: +; LA32F: # %bb.0: +; LA32F-NEXT: addi.w $sp, $sp, -16 +; LA32F-NEXT: .cfi_def_cfa_offset 16 +; LA32F-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-NEXT: .cfi_offset 1, -4 +; LA32F-NEXT: bl %plt(fmaximum_num) +; LA32F-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32F-NEXT: addi.w $sp, $sp, 16 +; LA32F-NEXT: ret +; +; LA32D-LABEL: maximumnum_double_nsz: +; LA32D: # %bb.0: +; LA32D-NEXT: fmax.d $fa1, $fa1, $fa1 +; LA32D-NEXT: fmax.d $fa0, $fa0, $fa0 +; LA32D-NEXT: fmax.d $fa0, $fa0, $fa1 +; LA32D-NEXT: ret +; +; LA64F-LABEL: maximumnum_double_nsz: +; LA64F: # %bb.0: +; LA64F-NEXT: addi.d $sp, $sp, -16 +; LA64F-NEXT: .cfi_def_cfa_offset 16 +; LA64F-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64F-NEXT: .cfi_offset 1, -8 +; LA64F-NEXT: bl %plt(fmaximum_num) +; LA64F-NEXT: ld.d $ra, $sp, 8 # 8-byte 
Folded Reload +; LA64F-NEXT: addi.d $sp, $sp, 16 +; LA64F-NEXT: ret +; +; LA64D-LABEL: maximumnum_double_nsz: +; LA64D: # %bb.0: +; LA64D-NEXT: fmax.d $fa1, $fa1, $fa1 +; LA64D-NEXT: fmax.d $fa0, $fa0, $fa0 +; LA64D-NEXT: fmax.d $fa0, $fa0, $fa1 +; LA64D-NEXT: ret + %z = call nsz double @llvm.maximumnum.f64(double %x, double %y) + ret double %z +} + +define double @maximumnum_double_nnan(double %x, double %y) { +; +; LA32F-LABEL: maximumnum_double_nnan: +; LA32F: # %bb.0: +; LA32F-NEXT: addi.w $sp, $sp, -16 +; LA32F-NEXT: .cfi_def_cfa_offset 16 +; LA32F-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-NEXT: .cfi_offset 1, -4 +; LA32F-NEXT: bl %plt(fmaximum_num) +; LA32F-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32F-NEXT: addi.w $sp, $sp, 16 +; LA32F-NEXT: ret +; +; LA32D-LABEL: maximumnum_double_nnan: +; LA32D: # %bb.0: +; LA32D-NEXT: fmax.d $fa0, $fa0, $fa1 +; LA32D-NEXT: ret +; +; LA64F-LABEL: maximumnum_double_nnan: +; LA64F: # %bb.0: +; LA64F-NEXT: addi.d $sp, $sp, -16 +; LA64F-NEXT: .cfi_def_cfa_offset 16 +; LA64F-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64F-NEXT: .cfi_offset 1, -8 +; LA64F-NEXT: bl %plt(fmaximum_num) +; LA64F-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64F-NEXT: addi.d $sp, $sp, 16 +; LA64F-NEXT: ret +; +; LA64D-LABEL: maximumnum_double_nnan: +; LA64D: # %bb.0: +; LA64D-NEXT: fmax.d $fa0, $fa0, $fa1 +; LA64D-NEXT: ret + %z = call nnan double @llvm.maximumnum.f64(double %x, double %y) + ret double %z +} + +define float @minimumnum_float(float %x, float %y) { +; +; LA32F-LABEL: minimumnum_float: +; LA32F: # %bb.0: +; LA32F-NEXT: fmax.s $fa1, $fa1, $fa1 +; LA32F-NEXT: fmax.s $fa0, $fa0, $fa0 +; LA32F-NEXT: fmin.s $fa0, $fa0, $fa1 +; LA32F-NEXT: ret +; +; LA32D-LABEL: minimumnum_float: +; LA32D: # %bb.0: +; LA32D-NEXT: fmax.s $fa1, $fa1, $fa1 +; LA32D-NEXT: fmax.s $fa0, $fa0, $fa0 +; LA32D-NEXT: fmin.s $fa0, $fa0, $fa1 +; LA32D-NEXT: ret +; +; LA64F-LABEL: minimumnum_float: +; LA64F: # %bb.0: +; LA64F-NEXT: fmax.s 
$fa1, $fa1, $fa1 +; LA64F-NEXT: fmax.s $fa0, $fa0, $fa0 +; LA64F-NEXT: fmin.s $fa0, $fa0, $fa1 +; LA64F-NEXT: ret +; +; LA64D-LABEL: minimumnum_float: +; LA64D: # %bb.0: +; LA64D-NEXT: fmax.s $fa1, $fa1, $fa1 +; LA64D-NEXT: fmax.s $fa0, $fa0, $fa0 +; LA64D-NEXT: fmin.s $fa0, $fa0, $fa1 +; LA64D-NEXT: ret + %z = call float @llvm.minimumnum.f32(float %x, float %y) + ret float %z +} + +define float @minimumnum_float_nsz(float %x, float %y) { +; +; LA32F-LABEL: minimumnum_float_nsz: +; LA32F: # %bb.0: +; LA32F-NEXT: fmax.s $fa1, $fa1, $fa1 +; LA32F-NEXT: fmax.s $fa0, $fa0, $fa0 +; LA32F-NEXT: fmin.s $fa0, $fa0, $fa1 +; LA32F-NEXT: ret +; +; LA32D-LABEL: minimumnum_float_nsz: +; LA32D: # %bb.0: +; LA32D-NEXT: fmax.s $fa1, $fa1, $fa1 +; LA32D-NEXT: fmax.s $fa0, $fa0, $fa0 +; LA32D-NEXT: fmin.s $fa0, $fa0, $fa1 +; LA32D-NEXT: ret +; +; LA64F-LABEL: minimumnum_float_nsz: +; LA64F: # %bb.0: +; LA64F-NEXT: fmax.s $fa1, $fa1, $fa1 +; LA64F-NEXT: fmax.s $fa0, $fa0, $fa0 +; LA64F-NEXT: fmin.s $fa0, $fa0, $fa1 +; LA64F-NEXT: ret +; +; LA64D-LABEL: minimumnum_float_nsz: +; LA64D: # %bb.0: +; LA64D-NEXT: fmax.s $fa1, $fa1, $fa1 +; LA64D-NEXT: fmax.s $fa0, $fa0, $fa0 +; LA64D-NEXT: fmin.s $fa0, $fa0, $fa1 +; LA64D-NEXT: ret + %z = call nsz float @llvm.minimumnum.f32(float %x, float %y) + ret float %z +} + +define float @minimumnum_float_nnan(float %x, float %y) { +; +; LA32F-LABEL: minimumnum_float_nnan: +; LA32F: # %bb.0: +; LA32F-NEXT: fmin.s $fa0, $fa0, $fa1 +; LA32F-NEXT: ret +; +; LA32D-LABEL: minimumnum_float_nnan: +; LA32D: # %bb.0: +; LA32D-NEXT: fmin.s $fa0, $fa0, $fa1 +; LA32D-NEXT: ret +; +; LA64F-LABEL: minimumnum_float_nnan: +; LA64F: # %bb.0: +; LA64F-NEXT: fmin.s $fa0, $fa0, $fa1 +; LA64F-NEXT: ret +; +; LA64D-LABEL: minimumnum_float_nnan: +; LA64D: # %bb.0: +; LA64D-NEXT: fmin.s $fa0, $fa0, $fa1 +; LA64D-NEXT: ret + %z = call nnan float @llvm.minimumnum.f32(float %x, float %y) + ret float %z +} + +define double @minimumnum_double(double %x, double %y) { +; +; 
LA32F-LABEL: minimumnum_double: +; LA32F: # %bb.0: +; LA32F-NEXT: addi.w $sp, $sp, -16 +; LA32F-NEXT: .cfi_def_cfa_offset 16 +; LA32F-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-NEXT: .cfi_offset 1, -4 +; LA32F-NEXT: bl %plt(fminimum_num) +; LA32F-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32F-NEXT: addi.w $sp, $sp, 16 +; LA32F-NEXT: ret +; +; LA32D-LABEL: minimumnum_double: +; LA32D: # %bb.0: +; LA32D-NEXT: fmax.d $fa1, $fa1, $fa1 +; LA32D-NEXT: fmax.d $fa0, $fa0, $fa0 +; LA32D-NEXT: fmin.d $fa0, $fa0, $fa1 +; LA32D-NEXT: ret +; +; LA64F-LABEL: minimumnum_double: +; LA64F: # %bb.0: +; LA64F-NEXT: addi.d $sp, $sp, -16 +; LA64F-NEXT: .cfi_def_cfa_offset 16 +; LA64F-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64F-NEXT: .cfi_offset 1, -8 +; LA64F-NEXT: bl %plt(fminimum_num) +; LA64F-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64F-NEXT: addi.d $sp, $sp, 16 +; LA64F-NEXT: ret +; +; LA64D-LABEL: minimumnum_double: +; LA64D: # %bb.0: +; LA64D-NEXT: fmax.d $fa1, $fa1, $fa1 +; LA64D-NEXT: fmax.d $fa0, $fa0, $fa0 +; LA64D-NEXT: fmin.d $fa0, $fa0, $fa1 +; LA64D-NEXT: ret + %z = call double @llvm.minimumnum.f64(double %x, double %y) + ret double %z +} + +define double @minimumnum_double_nsz(double %x, double %y) { +; +; LA32F-LABEL: minimumnum_double_nsz: +; LA32F: # %bb.0: +; LA32F-NEXT: addi.w $sp, $sp, -16 +; LA32F-NEXT: .cfi_def_cfa_offset 16 +; LA32F-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-NEXT: .cfi_offset 1, -4 +; LA32F-NEXT: bl %plt(fminimum_num) +; LA32F-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32F-NEXT: addi.w $sp, $sp, 16 +; LA32F-NEXT: ret +; +; LA32D-LABEL: minimumnum_double_nsz: +; LA32D: # %bb.0: +; LA32D-NEXT: fmax.d $fa1, $fa1, $fa1 +; LA32D-NEXT: fmax.d $fa0, $fa0, $fa0 +; LA32D-NEXT: fmin.d $fa0, $fa0, $fa1 +; LA32D-NEXT: ret +; +; LA64F-LABEL: minimumnum_double_nsz: +; LA64F: # %bb.0: +; LA64F-NEXT: addi.d $sp, $sp, -16 +; LA64F-NEXT: .cfi_def_cfa_offset 16 +; LA64F-NEXT: st.d $ra, $sp, 8 # 8-byte 
Folded Spill +; LA64F-NEXT: .cfi_offset 1, -8 +; LA64F-NEXT: bl %plt(fminimum_num) +; LA64F-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64F-NEXT: addi.d $sp, $sp, 16 +; LA64F-NEXT: ret +; +; LA64D-LABEL: minimumnum_double_nsz: +; LA64D: # %bb.0: +; LA64D-NEXT: fmax.d $fa1, $fa1, $fa1 +; LA64D-NEXT: fmax.d $fa0, $fa0, $fa0 +; LA64D-NEXT: fmin.d $fa0, $fa0, $fa1 +; LA64D-NEXT: ret + %z = call nsz double @llvm.minimumnum.f64(double %x, double %y) + ret double %z +} + +define double @minimumnum_double_nnan(double %x, double %y) { +; +; LA32F-LABEL: minimumnum_double_nnan: +; LA32F: # %bb.0: +; LA32F-NEXT: addi.w $sp, $sp, -16 +; LA32F-NEXT: .cfi_def_cfa_offset 16 +; LA32F-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-NEXT: .cfi_offset 1, -4 +; LA32F-NEXT: bl %plt(fminimum_num) +; LA32F-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32F-NEXT: addi.w $sp, $sp, 16 +; LA32F-NEXT: ret +; +; LA32D-LABEL: minimumnum_double_nnan: +; LA32D: # %bb.0: +; LA32D-NEXT: fmin.d $fa0, $fa0, $fa1 +; LA32D-NEXT: ret +; +; LA64F-LABEL: minimumnum_double_nnan: +; LA64F: # %bb.0: +; LA64F-NEXT: addi.d $sp, $sp, -16 +; LA64F-NEXT: .cfi_def_cfa_offset 16 +; LA64F-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64F-NEXT: .cfi_offset 1, -8 +; LA64F-NEXT: bl %plt(fminimum_num) +; LA64F-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64F-NEXT: addi.d $sp, $sp, 16 +; LA64F-NEXT: ret +; +; LA64D-LABEL: minimumnum_double_nnan: +; LA64D: # %bb.0: +; LA64D-NEXT: fmin.d $fa0, $fa0, $fa1 +; LA64D-NEXT: ret + %z = call nnan double @llvm.minimumnum.f64(double %x, double %y) + ret double %z +} diff --git a/llvm/test/CodeGen/Mips/fp-maximumnum-minimumnum.ll b/llvm/test/CodeGen/Mips/fp-maximumnum-minimumnum.ll new file mode 100644 index 00000000000000..bc81966ca0f5c9 --- /dev/null +++ b/llvm/test/CodeGen/Mips/fp-maximumnum-minimumnum.ll @@ -0,0 +1,132 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=mipsisa32r6 < %s | FileCheck 
%s --check-prefix=MIPS32R6 + +declare float @llvm.maximumnum.f32(float, float) +declare double @llvm.maximumnum.f64(double, double) +declare float @llvm.minimumnum.f32(float, float) +declare double @llvm.minimumnum.f64(double, double) + +define float @maximumnum_float(float %x, float %y) { +; MIPS32R6-LABEL: maximumnum_float: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: min.s $f0, $f14, $f14 +; MIPS32R6-NEXT: min.s $f1, $f12, $f12 +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: max.s $f0, $f1, $f0 + %z = call float @llvm.maximumnum.f32(float %x, float %y) + ret float %z +} + +define float @maximumnum_float_nsz(float %x, float %y) { +; MIPS32R6-LABEL: maximumnum_float_nsz: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: min.s $f0, $f14, $f14 +; MIPS32R6-NEXT: min.s $f1, $f12, $f12 +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: max.s $f0, $f1, $f0 + %z = call nsz float @llvm.maximumnum.f32(float %x, float %y) + ret float %z +} + +define float @maximumnum_float_nnan(float %x, float %y) { +; MIPS32R6-LABEL: maximumnum_float_nnan: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: max.s $f0, $f12, $f14 + %z = call nnan float @llvm.maximumnum.f32(float %x, float %y) + ret float %z +} + + +define double @maximumnum_double(double %x, double %y) { +; MIPS32R6-LABEL: maximumnum_double: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: min.d $f0, $f14, $f14 +; MIPS32R6-NEXT: min.d $f1, $f12, $f12 +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: max.d $f0, $f1, $f0 + %z = call double @llvm.maximumnum.f64(double %x, double %y) + ret double %z +} + +define double @maximumnum_double_nsz(double %x, double %y) { +; MIPS32R6-LABEL: maximumnum_double_nsz: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: min.d $f0, $f14, $f14 +; MIPS32R6-NEXT: min.d $f1, $f12, $f12 +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: max.d $f0, $f1, $f0 + %z = call nsz double @llvm.maximumnum.f64(double %x, double %y) + ret double %z +} + +define double @maximumnum_double_nnan(double %x, double %y) { +; MIPS32R6-LABEL: maximumnum_double_nnan: 
+; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: max.d $f0, $f12, $f14 + %z = call nnan double @llvm.maximumnum.f64(double %x, double %y) + ret double %z +} + +define float @minimumnum_float(float %x, float %y) { +; MIPS32R6-LABEL: minimumnum_float: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: min.s $f0, $f14, $f14 +; MIPS32R6-NEXT: min.s $f1, $f12, $f12 +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: min.s $f0, $f1, $f0 + %z = call float @llvm.minimumnum.f32(float %x, float %y) + ret float %z +} + +define float @minimumnum_float_nsz(float %x, float %y) { +; MIPS32R6-LABEL: minimumnum_float_nsz: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: min.s $f0, $f14, $f14 +; MIPS32R6-NEXT: min.s $f1, $f12, $f12 +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: min.s $f0, $f1, $f0 + %z = call nsz float @llvm.minimumnum.f32(float %x, float %y) + ret float %z +} + +define float @minimumnum_float_nnan(float %x, float %y) { +; MIPS32R6-LABEL: minimumnum_float_nnan: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: min.s $f0, $f12, $f14 + %z = call nnan float @llvm.minimumnum.f32(float %x, float %y) + ret float %z +} + +define double @minimumnum_double(double %x, double %y) { +; MIPS32R6-LABEL: minimumnum_double: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: min.d $f0, $f14, $f14 +; MIPS32R6-NEXT: min.d $f1, $f12, $f12 +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: min.d $f0, $f1, $f0 + %z = call double @llvm.minimumnum.f64(double %x, double %y) + ret double %z +} + +define double @minimumnum_double_nsz(double %x, double %y) { +; MIPS32R6-LABEL: minimumnum_double_nsz: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: min.d $f0, $f14, $f14 +; MIPS32R6-NEXT: min.d $f1, $f12, $f12 +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: min.d $f0, $f1, $f0 + %z = call nsz double @llvm.minimumnum.f64(double %x, double %y) + ret double %z +} + +define double @minimumnum_double_nnan(double %x, double %y) { +; MIPS32R6-LABEL: minimumnum_double_nnan: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: 
min.d $f0, $f12, $f14 + %z = call nnan double @llvm.minimumnum.f64(double %x, double %y) + ret double %z +} diff --git a/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll index 1a9fa27c263deb..37cca8687890a6 100644 --- a/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll @@ -90,8 +90,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; MIPSEL-LABEL: test_srem_pow2_setne: ; MIPSEL: # %bb.0: ; MIPSEL-NEXT: sll $1, $4, 26 -; MIPSEL-NEXT: sra $1, $1, 26 -; MIPSEL-NEXT: srl $1, $1, 9 +; MIPSEL-NEXT: sra $1, $1, 31 ; MIPSEL-NEXT: andi $1, $1, 3 ; MIPSEL-NEXT: addu $1, $4, $1 ; MIPSEL-NEXT: andi $1, $1, 60 @@ -104,8 +103,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; MIPS64EL: # %bb.0: ; MIPS64EL-NEXT: sll $1, $4, 0 ; MIPS64EL-NEXT: sll $2, $1, 26 -; MIPS64EL-NEXT: sra $2, $2, 26 -; MIPS64EL-NEXT: srl $2, $2, 9 +; MIPS64EL-NEXT: sra $2, $2, 31 ; MIPS64EL-NEXT: andi $2, $2, 3 ; MIPS64EL-NEXT: addu $2, $1, $2 ; MIPS64EL-NEXT: andi $2, $2, 60 diff --git a/llvm/test/CodeGen/NVPTX/dot-product.ll b/llvm/test/CodeGen/NVPTX/dot-product.ll new file mode 100644 index 00000000000000..36529bbef90332 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/dot-product.ll @@ -0,0 +1,222 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx -mcpu=sm_61 | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_61 | FileCheck %s + +target triple = "nvptx-nvidia-cuda" + +declare i32 @llvm.nvvm.idp4a.s.s(i32, i32, i32) +declare i32 @llvm.nvvm.idp4a.s.u(i32, i32, i32) +declare i32 @llvm.nvvm.idp4a.u.s(i32, i32, i32) +declare i32 @llvm.nvvm.idp4a.u.u(i32, i32, i32) + +define i32 @test_dp4a_u32_u32(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test_dp4a_u32_u32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_dp4a_u32_u32_param_0]; +; 
CHECK-NEXT: ld.param.u32 %r2, [test_dp4a_u32_u32_param_1]; +; CHECK-NEXT: ld.param.u32 %r3, [test_dp4a_u32_u32_param_2]; +; CHECK-NEXT: dp4a.u32.u32 %r4, %r1, %r2, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: ret; + %call = call i32 @llvm.nvvm.idp4a.u.u(i32 %a, i32 %b, i32 %c) + ret i32 %call +} + +define i32 @test_dp4a_u32imm_u32imm(i32 %c) { +; CHECK-LABEL: test_dp4a_u32imm_u32imm( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_dp4a_u32imm_u32imm_param_0]; +; CHECK-NEXT: mov.b32 %r2, 0; +; CHECK-NEXT: dp4a.u32.u32 %r3, %r2, %r2, %r1; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3; +; CHECK-NEXT: ret; + %call = call i32 @llvm.nvvm.idp4a.u.u(i32 0, i32 0, i32 %c) + ret i32 %call +} + +define i32 @test_dp4a_u32_s32(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test_dp4a_u32_s32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_dp4a_u32_s32_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [test_dp4a_u32_s32_param_1]; +; CHECK-NEXT: ld.param.u32 %r3, [test_dp4a_u32_s32_param_2]; +; CHECK-NEXT: dp4a.u32.s32 %r4, %r1, %r2, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: ret; + %call = call i32 @llvm.nvvm.idp4a.u.s(i32 %a, i32 %b, i32 %c) + ret i32 %call +} + +define i32 @test_dp4a_s32_u32(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test_dp4a_s32_u32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_dp4a_s32_u32_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [test_dp4a_s32_u32_param_1]; +; CHECK-NEXT: ld.param.u32 %r3, [test_dp4a_s32_u32_param_2]; +; CHECK-NEXT: dp4a.s32.u32 %r4, %r1, %r2, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: ret; + %call = call i32 @llvm.nvvm.idp4a.s.u(i32 %a, i32 %b, i32 %c) + ret i32 %call +} + +define i32 @test_dp4a_s32_s32(i32 %a, i32 %b, i32 %c) { +; 
CHECK-LABEL: test_dp4a_s32_s32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_dp4a_s32_s32_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [test_dp4a_s32_s32_param_1]; +; CHECK-NEXT: ld.param.u32 %r3, [test_dp4a_s32_s32_param_2]; +; CHECK-NEXT: dp4a.s32.s32 %r4, %r1, %r2, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: ret; + %call = call i32 @llvm.nvvm.idp4a.s.s(i32 %a, i32 %b, i32 %c) + ret i32 %call +} + +declare i32 @llvm.nvvm.idp2a.s.s(i32, i32, i1 immarg, i32) +declare i32 @llvm.nvvm.idp2a.s.u(i32, i32, i1 immarg, i32) +declare i32 @llvm.nvvm.idp2a.u.s(i32, i32, i1 immarg, i32) +declare i32 @llvm.nvvm.idp2a.u.u(i32, i32, i1 immarg, i32) + +define i32 @test_dp2a_lo_u32_u32(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test_dp2a_lo_u32_u32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_lo_u32_u32_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_lo_u32_u32_param_1]; +; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_lo_u32_u32_param_2]; +; CHECK-NEXT: dp2a.lo.u32.u32 %r4, %r1, %r2, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: ret; + %call = call i32 @llvm.nvvm.idp2a.u.u(i32 %a, i32 %b, i1 0, i32 %c) + ret i32 %call +} + +define i32 @test_dp2a_lo_u32_s32(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test_dp2a_lo_u32_s32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_lo_u32_s32_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_lo_u32_s32_param_1]; +; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_lo_u32_s32_param_2]; +; CHECK-NEXT: dp2a.lo.u32.s32 %r4, %r1, %r2, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: ret; + %call = call i32 @llvm.nvvm.idp2a.u.s(i32 %a, i32 %b, i1 0, i32 %c) + ret i32 %call +} + +define i32 @test_dp2a_lo_s32_u32(i32 %a, i32 %b, i32 %c) { +; 
CHECK-LABEL: test_dp2a_lo_s32_u32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_lo_s32_u32_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_lo_s32_u32_param_1]; +; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_lo_s32_u32_param_2]; +; CHECK-NEXT: dp2a.lo.s32.u32 %r4, %r1, %r2, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: ret; + %call = call i32 @llvm.nvvm.idp2a.s.u(i32 %a, i32 %b, i1 0, i32 %c) + ret i32 %call +} + +define i32 @test_dp2a_lo_s32_s32(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test_dp2a_lo_s32_s32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_lo_s32_s32_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_lo_s32_s32_param_1]; +; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_lo_s32_s32_param_2]; +; CHECK-NEXT: dp2a.lo.s32.s32 %r4, %r1, %r2, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: ret; + %call = call i32 @llvm.nvvm.idp2a.s.s(i32 %a, i32 %b, i1 0, i32 %c) + ret i32 %call +} + +define i32 @test_dp2a_hi_u32_u32(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test_dp2a_hi_u32_u32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_hi_u32_u32_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_hi_u32_u32_param_1]; +; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_hi_u32_u32_param_2]; +; CHECK-NEXT: dp2a.hi.u32.u32 %r4, %r1, %r2, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: ret; + %call = call i32 @llvm.nvvm.idp2a.u.u(i32 %a, i32 %b, i1 1, i32 %c) + ret i32 %call +} + +define i32 @test_dp2a_hi_u32_s32(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test_dp2a_hi_u32_s32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_hi_u32_s32_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, 
[test_dp2a_hi_u32_s32_param_1]; +; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_hi_u32_s32_param_2]; +; CHECK-NEXT: dp2a.hi.u32.s32 %r4, %r1, %r2, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: ret; + %call = call i32 @llvm.nvvm.idp2a.u.s(i32 %a, i32 %b, i1 1, i32 %c) + ret i32 %call +} + +define i32 @test_dp2a_hi_s32_u32(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test_dp2a_hi_s32_u32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_hi_s32_u32_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_hi_s32_u32_param_1]; +; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_hi_s32_u32_param_2]; +; CHECK-NEXT: dp2a.hi.s32.u32 %r4, %r1, %r2, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: ret; + %call = call i32 @llvm.nvvm.idp2a.s.u(i32 %a, i32 %b, i1 1, i32 %c) + ret i32 %call +} + +define i32 @test_dp2a_hi_s32_s32(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: test_dp2a_hi_s32_s32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_hi_s32_s32_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_hi_s32_s32_param_1]; +; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_hi_s32_s32_param_2]; +; CHECK-NEXT: dp2a.hi.s32.s32 %r4, %r1, %r2, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: ret; + %call = call i32 @llvm.nvvm.idp2a.s.s(i32 %a, i32 %b, i1 1, i32 %c) + ret i32 %call +} diff --git a/llvm/test/CodeGen/PowerPC/merge-private.ll b/llvm/test/CodeGen/PowerPC/merge-private.ll index 6cf276990d7ea2..6ed2d6dfc542b7 100644 --- a/llvm/test/CodeGen/PowerPC/merge-private.ll +++ b/llvm/test/CodeGen/PowerPC/merge-private.ll @@ -1,15 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff -mcpu=pwr8 \ -; RUN: -ppc-asm-full-reg-names < %s | FileCheck %s \ +; RUN: 
-ppc-asm-full-reg-names -ppc-global-merge=true < %s | FileCheck %s \ ; RUN: --check-prefix=AIX64 ; RUN: llc -verify-machineinstrs -mtriple powerpc-ibm-aix-xcoff -mcpu=pwr8 \ -; RUN: -ppc-asm-full-reg-names < %s | FileCheck %s \ +; RUN: -ppc-asm-full-reg-names -ppc-global-merge=true < %s | FileCheck %s \ ; RUN: --check-prefix=AIX32 ; RUN: llc -verify-machineinstrs -mtriple powerpc64le-unknown-linux -mcpu=pwr8 \ -; RUN: -ppc-asm-full-reg-names < %s | FileCheck %s \ +; RUN: -ppc-asm-full-reg-names -ppc-global-merge=true < %s | FileCheck %s \ ; RUN: --check-prefix=LINUX64LE ; RUN: llc -verify-machineinstrs -mtriple powerpc64-unknown-linux -mcpu=pwr8 \ -; RUN: -ppc-asm-full-reg-names < %s | FileCheck %s \ +; RUN: -ppc-asm-full-reg-names -ppc-global-merge=true < %s | FileCheck %s \ ; RUN: --check-prefix=LINUX64BE @.str = private unnamed_addr constant [15 x i8] c"Private global\00", align 1 @@ -24,7 +24,7 @@ define dso_local void @print_func() { ; AIX64-NEXT: stdu r1, -128(r1) ; AIX64-NEXT: std r0, 144(r1) ; AIX64-NEXT: std r31, 120(r1) # 8-byte Folded Spill -; AIX64-NEXT: ld r31, L..C0(r2) # @__ModuleStringPool +; AIX64-NEXT: ld r31, L..C0(r2) # @_MergedGlobals ; AIX64-NEXT: mr r3, r31 ; AIX64-NEXT: bl .puts[PR] ; AIX64-NEXT: nop @@ -43,7 +43,7 @@ define dso_local void @print_func() { ; AIX32-NEXT: stwu r1, -64(r1) ; AIX32-NEXT: stw r0, 72(r1) ; AIX32-NEXT: stw r31, 60(r1) # 4-byte Folded Spill -; AIX32-NEXT: lwz r31, L..C0(r2) # @__ModuleStringPool +; AIX32-NEXT: lwz r31, L..C0(r2) # @_MergedGlobals ; AIX32-NEXT: mr r3, r31 ; AIX32-NEXT: bl .puts[PR] ; AIX32-NEXT: nop @@ -64,9 +64,9 @@ define dso_local void @print_func() { ; LINUX64LE-NEXT: .cfi_offset r30, -16 ; LINUX64LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; LINUX64LE-NEXT: stdu r1, -48(r1) -; LINUX64LE-NEXT: addis r3, r2, .L__ModuleStringPool@toc@ha +; LINUX64LE-NEXT: addis r3, r2, .L_MergedGlobals@toc@ha ; LINUX64LE-NEXT: std r0, 64(r1) -; LINUX64LE-NEXT: addi r30, r3, .L__ModuleStringPool@toc@l +; 
LINUX64LE-NEXT: addi r30, r3, .L_MergedGlobals@toc@l ; LINUX64LE-NEXT: mr r3, r30 ; LINUX64LE-NEXT: bl puts ; LINUX64LE-NEXT: nop @@ -87,9 +87,9 @@ define dso_local void @print_func() { ; LINUX64BE-NEXT: .cfi_def_cfa_offset 128 ; LINUX64BE-NEXT: .cfi_offset lr, 16 ; LINUX64BE-NEXT: .cfi_offset r30, -16 -; LINUX64BE-NEXT: addis r3, r2, .L__ModuleStringPool@toc@ha +; LINUX64BE-NEXT: addis r3, r2, .L_MergedGlobals@toc@ha ; LINUX64BE-NEXT: std r30, 112(r1) # 8-byte Folded Spill -; LINUX64BE-NEXT: addi r30, r3, .L__ModuleStringPool@toc@l +; LINUX64BE-NEXT: addi r30, r3, .L_MergedGlobals@toc@l ; LINUX64BE-NEXT: mr r3, r30 ; LINUX64BE-NEXT: bl puts ; LINUX64BE-NEXT: nop diff --git a/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll index b0cc89d1828eda..2b07f27be021b1 100644 --- a/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll @@ -85,8 +85,8 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; PPC-LABEL: test_srem_pow2_setne: ; PPC: # %bb.0: ; PPC-NEXT: slwi 4, 3, 26 -; PPC-NEXT: srawi 4, 4, 26 -; PPC-NEXT: rlwinm 4, 4, 23, 30, 31 +; PPC-NEXT: srawi 4, 4, 31 +; PPC-NEXT: clrlwi 4, 4, 30 ; PPC-NEXT: add 4, 3, 4 ; PPC-NEXT: rlwinm 4, 4, 0, 26, 29 ; PPC-NEXT: sub 3, 3, 4 @@ -99,8 +99,8 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; PPC64LE-LABEL: test_srem_pow2_setne: ; PPC64LE: # %bb.0: ; PPC64LE-NEXT: slwi 4, 3, 26 -; PPC64LE-NEXT: srawi 4, 4, 26 -; PPC64LE-NEXT: rlwinm 4, 4, 23, 30, 31 +; PPC64LE-NEXT: srawi 4, 4, 31 +; PPC64LE-NEXT: clrlwi 4, 4, 30 ; PPC64LE-NEXT: add 4, 3, 4 ; PPC64LE-NEXT: rlwinm 4, 4, 0, 26, 29 ; PPC64LE-NEXT: sub 3, 3, 4 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll b/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll index a2983c21e6c92d..54fc79216ec00f 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll @@ -25,8 +25,7 @@ define i8 @abs8(i8 %x) { ; ; 
RV32ZBB-LABEL: abs8: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: slli a0, a0, 24 -; RV32ZBB-NEXT: srai a0, a0, 24 +; RV32ZBB-NEXT: sext.b a0, a0 ; RV32ZBB-NEXT: neg a1, a0 ; RV32ZBB-NEXT: max a0, a0, a1 ; RV32ZBB-NEXT: ret @@ -42,8 +41,7 @@ define i8 @abs8(i8 %x) { ; ; RV64ZBB-LABEL: abs8: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: slli a0, a0, 56 -; RV64ZBB-NEXT: srai a0, a0, 56 +; RV64ZBB-NEXT: sext.b a0, a0 ; RV64ZBB-NEXT: neg a1, a0 ; RV64ZBB-NEXT: max a0, a0, a1 ; RV64ZBB-NEXT: ret @@ -63,8 +61,7 @@ define i16 @abs16(i16 %x) { ; ; RV32ZBB-LABEL: abs16: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: slli a0, a0, 16 -; RV32ZBB-NEXT: srai a0, a0, 16 +; RV32ZBB-NEXT: sext.h a0, a0 ; RV32ZBB-NEXT: neg a1, a0 ; RV32ZBB-NEXT: max a0, a0, a1 ; RV32ZBB-NEXT: ret @@ -80,8 +77,7 @@ define i16 @abs16(i16 %x) { ; ; RV64ZBB-LABEL: abs16: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: slli a0, a0, 48 -; RV64ZBB-NEXT: srai a0, a0, 48 +; RV64ZBB-NEXT: sext.h a0, a0 ; RV64ZBB-NEXT: neg a1, a0 ; RV64ZBB-NEXT: max a0, a0, a1 ; RV64ZBB-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv32.mir index c2258642c9dd2a..cbafa76ed4cd42 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv32.mir @@ -26,14 +26,12 @@ body: | ; RV32ZBB-LABEL: name: abs_i8 ; RV32ZBB: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 ; RV32ZBB-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY]], 8 - ; RV32ZBB-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 - ; RV32ZBB-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ASSERT_ZEXT]], [[C]](s32) - ; RV32ZBB-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]](s32) - ; RV32ZBB-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; RV32ZBB-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[ASHR]] - ; RV32ZBB-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[ASHR]], [[SUB]] - ; RV32ZBB-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 - ; 
RV32ZBB-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SMAX]], [[C2]] + ; RV32ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASSERT_ZEXT]], 8 + ; RV32ZBB-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV32ZBB-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[SEXT_INREG]] + ; RV32ZBB-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[SUB]] + ; RV32ZBB-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; RV32ZBB-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SMAX]], [[C1]] ; RV32ZBB-NEXT: $x10 = COPY [[AND]](s32) ; RV32ZBB-NEXT: PseudoRET implicit $x10 %1:_(s32) = COPY $x10 @@ -67,10 +65,8 @@ body: | ; RV32ZBB-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; RV32ZBB-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[ASSERT_SEXT]] ; RV32ZBB-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[ASSERT_SEXT]], [[SUB]] - ; RV32ZBB-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; RV32ZBB-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[SMAX]], [[C1]](s32) - ; RV32ZBB-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C1]](s32) - ; RV32ZBB-NEXT: $x10 = COPY [[ASHR]](s32) + ; RV32ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SMAX]], 16 + ; RV32ZBB-NEXT: $x10 = COPY [[SEXT_INREG]](s32) ; RV32ZBB-NEXT: PseudoRET implicit $x10 %1:_(s32) = COPY $x10 %2:_(s32) = G_ASSERT_SEXT %1, 16 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir index f855317ad4fedc..81da754b7ecc52 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir @@ -30,14 +30,12 @@ body: | ; RV64ZBB-LABEL: name: abs_i8 ; RV64ZBB: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; RV64ZBB-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s64) = G_ASSERT_ZEXT [[COPY]], 8 - ; RV64ZBB-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 56 - ; RV64ZBB-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ASSERT_ZEXT]], [[C]](s64) - ; RV64ZBB-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C]](s64) 
- ; RV64ZBB-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; RV64ZBB-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[C1]], [[ASHR]] - ; RV64ZBB-NEXT: [[SMAX:%[0-9]+]]:_(s64) = G_SMAX [[ASHR]], [[SUB]] - ; RV64ZBB-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 255 - ; RV64ZBB-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[SMAX]], [[C2]] + ; RV64ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ASSERT_ZEXT]], 8 + ; RV64ZBB-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; RV64ZBB-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[C]], [[SEXT_INREG]] + ; RV64ZBB-NEXT: [[SMAX:%[0-9]+]]:_(s64) = G_SMAX [[SEXT_INREG]], [[SUB]] + ; RV64ZBB-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 255 + ; RV64ZBB-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[SMAX]], [[C1]] ; RV64ZBB-NEXT: $x10 = COPY [[AND]](s64) ; RV64ZBB-NEXT: PseudoRET implicit $x10 %1:_(s64) = COPY $x10 @@ -74,10 +72,8 @@ body: | ; RV64ZBB-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; RV64ZBB-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[C]], [[ASSERT_SEXT]] ; RV64ZBB-NEXT: [[SMAX:%[0-9]+]]:_(s64) = G_SMAX [[ASSERT_SEXT]], [[SUB]] - ; RV64ZBB-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 - ; RV64ZBB-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SMAX]], [[C1]](s64) - ; RV64ZBB-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C1]](s64) - ; RV64ZBB-NEXT: $x10 = COPY [[ASHR]](s64) + ; RV64ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SMAX]], 16 + ; RV64ZBB-NEXT: $x10 = COPY [[SEXT_INREG]](s64) ; RV64ZBB-NEXT: PseudoRET implicit $x10 %1:_(s64) = COPY $x10 %2:_(s64) = G_ASSERT_SEXT %1, 16 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smax-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smax-rv32.mir index 3ddc0f87760dc2..4f66e8c8f5abf2 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smax-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smax-rv32.mir @@ -27,17 +27,11 @@ body: | ; RV32ZBB-LABEL: name: smax_i8 ; RV32ZBB: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 ; RV32ZBB-NEXT: 
[[COPY1:%[0-9]+]]:_(s32) = COPY $x11 - ; RV32ZBB-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 - ; RV32ZBB-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32) - ; RV32ZBB-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]](s32) - ; RV32ZBB-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 - ; RV32ZBB-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY1]], [[C1]](s32) - ; RV32ZBB-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C1]](s32) - ; RV32ZBB-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[ASHR]], [[ASHR1]] - ; RV32ZBB-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 - ; RV32ZBB-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[SMAX]], [[C2]](s32) - ; RV32ZBB-NEXT: [[ASHR2:%[0-9]+]]:_(s32) = G_ASHR [[SHL2]], [[C2]](s32) - ; RV32ZBB-NEXT: $x10 = COPY [[ASHR2]](s32) + ; RV32ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 8 + ; RV32ZBB-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 8 + ; RV32ZBB-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[SEXT_INREG1]] + ; RV32ZBB-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SMAX]], 8 + ; RV32ZBB-NEXT: $x10 = COPY [[SEXT_INREG2]](s32) ; RV32ZBB-NEXT: PseudoRET implicit $x10 %0:_(s32) = COPY $x10 %1:_(s32) = COPY $x11 @@ -73,17 +67,11 @@ body: | ; RV32ZBB-LABEL: name: smax_i16 ; RV32ZBB: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 ; RV32ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 - ; RV32ZBB-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; RV32ZBB-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32) - ; RV32ZBB-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]](s32) - ; RV32ZBB-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; RV32ZBB-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY1]], [[C1]](s32) - ; RV32ZBB-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C1]](s32) - ; RV32ZBB-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[ASHR]], [[ASHR1]] - ; RV32ZBB-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; RV32ZBB-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[SMAX]], [[C2]](s32) - ; RV32ZBB-NEXT: 
[[ASHR2:%[0-9]+]]:_(s32) = G_ASHR [[SHL2]], [[C2]](s32) - ; RV32ZBB-NEXT: $x10 = COPY [[ASHR2]](s32) + ; RV32ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 16 + ; RV32ZBB-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 16 + ; RV32ZBB-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[SEXT_INREG1]] + ; RV32ZBB-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SMAX]], 16 + ; RV32ZBB-NEXT: $x10 = COPY [[SEXT_INREG2]](s32) ; RV32ZBB-NEXT: PseudoRET implicit $x10 %0:_(s32) = COPY $x10 %1:_(s32) = COPY $x11 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smax-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smax-rv64.mir index 4372d4e19c36f1..6e82d1b38b3621 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smax-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smax-rv64.mir @@ -30,17 +30,11 @@ body: | ; RV64ZBB-LABEL: name: smax_i8 ; RV64ZBB: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; RV64ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; RV64ZBB-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 56 - ; RV64ZBB-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s64) - ; RV64ZBB-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C]](s64) - ; RV64ZBB-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 56 - ; RV64ZBB-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[COPY1]], [[C1]](s64) - ; RV64ZBB-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[SHL1]], [[C1]](s64) - ; RV64ZBB-NEXT: [[SMAX:%[0-9]+]]:_(s64) = G_SMAX [[ASHR]], [[ASHR1]] - ; RV64ZBB-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 56 - ; RV64ZBB-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[SMAX]], [[C2]](s64) - ; RV64ZBB-NEXT: [[ASHR2:%[0-9]+]]:_(s64) = G_ASHR [[SHL2]], [[C2]](s64) - ; RV64ZBB-NEXT: $x10 = COPY [[ASHR2]](s64) + ; RV64ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 8 + ; RV64ZBB-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 8 + ; RV64ZBB-NEXT: [[SMAX:%[0-9]+]]:_(s64) = G_SMAX [[SEXT_INREG]], 
[[SEXT_INREG1]] + ; RV64ZBB-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SMAX]], 8 + ; RV64ZBB-NEXT: $x10 = COPY [[SEXT_INREG2]](s64) ; RV64ZBB-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -79,17 +73,11 @@ body: | ; RV64ZBB-LABEL: name: smax_i16 ; RV64ZBB: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; RV64ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; RV64ZBB-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 - ; RV64ZBB-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s64) - ; RV64ZBB-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C]](s64) - ; RV64ZBB-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 - ; RV64ZBB-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[COPY1]], [[C1]](s64) - ; RV64ZBB-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[SHL1]], [[C1]](s64) - ; RV64ZBB-NEXT: [[SMAX:%[0-9]+]]:_(s64) = G_SMAX [[ASHR]], [[ASHR1]] - ; RV64ZBB-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 - ; RV64ZBB-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[SMAX]], [[C2]](s64) - ; RV64ZBB-NEXT: [[ASHR2:%[0-9]+]]:_(s64) = G_ASHR [[SHL2]], [[C2]](s64) - ; RV64ZBB-NEXT: $x10 = COPY [[ASHR2]](s64) + ; RV64ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 16 + ; RV64ZBB-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 16 + ; RV64ZBB-NEXT: [[SMAX:%[0-9]+]]:_(s64) = G_SMAX [[SEXT_INREG]], [[SEXT_INREG1]] + ; RV64ZBB-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SMAX]], 16 + ; RV64ZBB-NEXT: $x10 = COPY [[SEXT_INREG2]](s64) ; RV64ZBB-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smin-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smin-rv32.mir index a8c80520376943..c02df74bd25e30 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smin-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smin-rv32.mir @@ -27,17 +27,11 @@ body: | ; RV32ZBB-LABEL: name: smin_i8 ; RV32ZBB: 
[[COPY:%[0-9]+]]:_(s32) = COPY $x10 ; RV32ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 - ; RV32ZBB-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 - ; RV32ZBB-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32) - ; RV32ZBB-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]](s32) - ; RV32ZBB-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 - ; RV32ZBB-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY1]], [[C1]](s32) - ; RV32ZBB-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C1]](s32) - ; RV32ZBB-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[ASHR]], [[ASHR1]] - ; RV32ZBB-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 - ; RV32ZBB-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[SMIN]], [[C2]](s32) - ; RV32ZBB-NEXT: [[ASHR2:%[0-9]+]]:_(s32) = G_ASHR [[SHL2]], [[C2]](s32) - ; RV32ZBB-NEXT: $x10 = COPY [[ASHR2]](s32) + ; RV32ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 8 + ; RV32ZBB-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 8 + ; RV32ZBB-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[SEXT_INREG1]] + ; RV32ZBB-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SMIN]], 8 + ; RV32ZBB-NEXT: $x10 = COPY [[SEXT_INREG2]](s32) ; RV32ZBB-NEXT: PseudoRET implicit $x10 %0:_(s32) = COPY $x10 %1:_(s32) = COPY $x11 @@ -73,17 +67,11 @@ body: | ; RV32ZBB-LABEL: name: smin_i16 ; RV32ZBB: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 ; RV32ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 - ; RV32ZBB-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; RV32ZBB-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32) - ; RV32ZBB-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]](s32) - ; RV32ZBB-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; RV32ZBB-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY1]], [[C1]](s32) - ; RV32ZBB-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C1]](s32) - ; RV32ZBB-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[ASHR]], [[ASHR1]] - ; RV32ZBB-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; RV32ZBB-NEXT: 
[[SHL2:%[0-9]+]]:_(s32) = G_SHL [[SMIN]], [[C2]](s32) - ; RV32ZBB-NEXT: [[ASHR2:%[0-9]+]]:_(s32) = G_ASHR [[SHL2]], [[C2]](s32) - ; RV32ZBB-NEXT: $x10 = COPY [[ASHR2]](s32) + ; RV32ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 16 + ; RV32ZBB-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 16 + ; RV32ZBB-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[SEXT_INREG1]] + ; RV32ZBB-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SMIN]], 16 + ; RV32ZBB-NEXT: $x10 = COPY [[SEXT_INREG2]](s32) ; RV32ZBB-NEXT: PseudoRET implicit $x10 %0:_(s32) = COPY $x10 %1:_(s32) = COPY $x11 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smin-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smin-rv64.mir index 46dde69905f2d2..bb7f32bc2eaa02 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smin-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-smin-rv64.mir @@ -30,17 +30,11 @@ body: | ; RV64ZBB-LABEL: name: smin_i8 ; RV64ZBB: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; RV64ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; RV64ZBB-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 56 - ; RV64ZBB-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s64) - ; RV64ZBB-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C]](s64) - ; RV64ZBB-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 56 - ; RV64ZBB-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[COPY1]], [[C1]](s64) - ; RV64ZBB-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[SHL1]], [[C1]](s64) - ; RV64ZBB-NEXT: [[SMIN:%[0-9]+]]:_(s64) = G_SMIN [[ASHR]], [[ASHR1]] - ; RV64ZBB-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 56 - ; RV64ZBB-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[SMIN]], [[C2]](s64) - ; RV64ZBB-NEXT: [[ASHR2:%[0-9]+]]:_(s64) = G_ASHR [[SHL2]], [[C2]](s64) - ; RV64ZBB-NEXT: $x10 = COPY [[ASHR2]](s64) + ; RV64ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 8 + ; RV64ZBB-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG 
[[COPY1]], 8 + ; RV64ZBB-NEXT: [[SMIN:%[0-9]+]]:_(s64) = G_SMIN [[SEXT_INREG]], [[SEXT_INREG1]] + ; RV64ZBB-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SMIN]], 8 + ; RV64ZBB-NEXT: $x10 = COPY [[SEXT_INREG2]](s64) ; RV64ZBB-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -79,17 +73,11 @@ body: | ; RV64ZBB-LABEL: name: smin_i16 ; RV64ZBB: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; RV64ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; RV64ZBB-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 - ; RV64ZBB-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s64) - ; RV64ZBB-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C]](s64) - ; RV64ZBB-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 - ; RV64ZBB-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[COPY1]], [[C1]](s64) - ; RV64ZBB-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[SHL1]], [[C1]](s64) - ; RV64ZBB-NEXT: [[SMIN:%[0-9]+]]:_(s64) = G_SMIN [[ASHR]], [[ASHR1]] - ; RV64ZBB-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 - ; RV64ZBB-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[SMIN]], [[C2]](s64) - ; RV64ZBB-NEXT: [[ASHR2:%[0-9]+]]:_(s64) = G_ASHR [[SHL2]], [[C2]](s64) - ; RV64ZBB-NEXT: $x10 = COPY [[ASHR2]](s64) + ; RV64ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 16 + ; RV64ZBB-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 16 + ; RV64ZBB-NEXT: [[SMIN:%[0-9]+]]:_(s64) = G_SMIN [[SEXT_INREG]], [[SEXT_INREG1]] + ; RV64ZBB-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SMIN]], 16 + ; RV64ZBB-NEXT: $x10 = COPY [[SEXT_INREG2]](s64) ; RV64ZBB-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-vaarg-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-vaarg-rv64.mir index 537c0fb14d260d..8b0f0916e1b8c0 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-vaarg-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-vaarg-rv64.mir @@ 
-4,35 +4,6 @@ # On RISC-V, the MinStackArgumentAlignment is 1 and the ABI Alignment for p0 is # greater than 1, so we will always generate code to adjust for this alignment. ---- -name: va_arg_i32 -legalized: false -tracksRegLiveness: true -fixedStack: - - { id: 0, type: default, offset: 0, size: 8, alignment: 16, - isImmutable: true, isAliased: false } -stack: - - { id: 0, type: default, offset: 0, size: 8, alignment: 8 } -machineFunctionInfo: - varArgsFrameIndex: -1 - varArgsSaveSize: 0 -body: | - bb.1: - ; CHECK-LABEL: name: va_arg_i32 - ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (load (p0)) - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[LOAD]], [[C]](s64) - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -4 - ; CHECK-NEXT: [[PTRMASK:%[0-9]+]]:_(p0) = G_PTRMASK [[PTR_ADD]], [[C1]](s64) - ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTRMASK]], [[C2]](s64) - ; CHECK-NEXT: G_STORE [[PTR_ADD1]](p0), [[FRAME_INDEX]](p0) :: (store (p0)) - ; CHECK-NEXT: PseudoRET - %0:_(p0) = G_FRAME_INDEX %stack.0 - %1:_(s32) = G_VAARG %0(p0), 4 - PseudoRET -... 
--- name: va_arg_i64 legalized: false diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll index d55adf371119b5..a49d4de6e9cf0d 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll @@ -17,7 +17,7 @@ ; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -global-isel -mattr=+d -target-abi lp64d \ ; RUN: -verify-machineinstrs \ ; RUN: | FileCheck -check-prefixes=RV64,LP64D %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv32 -global-isel \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -global-isel \ ; RUN: -frame-pointer=all -target-abi ilp32 -verify-machineinstrs \ ; RUN: | FileCheck -check-prefixes=RV32-WITHFP %s ; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -global-isel \ @@ -156,7 +156,7 @@ define i32 @va1(ptr %fmt, ...) { ret i32 %1 } -define i32 @va1_va_arg(ptr %fmt, ...) nounwind { +define iXLen @va1_va_arg(ptr %fmt, ...) nounwind { ; RV32-LABEL: va1_va_arg: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -48 @@ -191,11 +191,11 @@ define i32 @va1_va_arg(ptr %fmt, ...) nounwind { ; RV64-NEXT: addi a0, sp, 24 ; RV64-NEXT: sd a0, 8(sp) ; RV64-NEXT: ld a0, 8(sp) -; RV64-NEXT: addi a0, a0, 3 -; RV64-NEXT: andi a0, a0, -4 -; RV64-NEXT: addi a1, a0, 4 +; RV64-NEXT: addi a0, a0, 7 +; RV64-NEXT: andi a0, a0, -8 +; RV64-NEXT: addi a1, a0, 8 ; RV64-NEXT: sd a1, 8(sp) -; RV64-NEXT: lw a0, 0(a0) +; RV64-NEXT: ld a0, 0(a0) ; RV64-NEXT: addi sp, sp, 80 ; RV64-NEXT: ret ; @@ -241,25 +241,25 @@ define i32 @va1_va_arg(ptr %fmt, ...) 
nounwind { ; RV64-WITHFP-NEXT: addi a0, s0, 8 ; RV64-WITHFP-NEXT: sd a0, -24(s0) ; RV64-WITHFP-NEXT: ld a0, -24(s0) -; RV64-WITHFP-NEXT: addi a0, a0, 3 -; RV64-WITHFP-NEXT: andi a0, a0, -4 -; RV64-WITHFP-NEXT: addi a1, a0, 4 +; RV64-WITHFP-NEXT: addi a0, a0, 7 +; RV64-WITHFP-NEXT: andi a0, a0, -8 +; RV64-WITHFP-NEXT: addi a1, a0, 8 ; RV64-WITHFP-NEXT: sd a1, -24(s0) -; RV64-WITHFP-NEXT: lw a0, 0(a0) +; RV64-WITHFP-NEXT: ld a0, 0(a0) ; RV64-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64-WITHFP-NEXT: addi sp, sp, 96 ; RV64-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) - %1 = va_arg ptr %va, i32 + %1 = va_arg ptr %va, iXLen call void @llvm.va_end(ptr %va) - ret i32 %1 + ret iXLen %1 } ; Ensure the adjustment when restoring the stack pointer using the frame ; pointer is correct -define i32 @va1_va_arg_alloca(ptr %fmt, ...) nounwind { +define iXLen @va1_va_arg_alloca(ptr %fmt, ...) nounwind { ; RV32-LABEL: va1_va_arg_alloca: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -48 @@ -312,14 +312,12 @@ define i32 @va1_va_arg_alloca(ptr %fmt, ...) nounwind { ; RV64-NEXT: addi a0, s0, 8 ; RV64-NEXT: sd a0, -32(s0) ; RV64-NEXT: ld a0, -32(s0) -; RV64-NEXT: addi a0, a0, 3 -; RV64-NEXT: andi a0, a0, -4 -; RV64-NEXT: addi a1, a0, 4 +; RV64-NEXT: addi a0, a0, 7 +; RV64-NEXT: andi a0, a0, -8 +; RV64-NEXT: addi a1, a0, 8 ; RV64-NEXT: sd a1, -32(s0) -; RV64-NEXT: lw s1, 0(a0) -; RV64-NEXT: slli a0, s1, 32 -; RV64-NEXT: srli a0, a0, 32 -; RV64-NEXT: addi a0, a0, 15 +; RV64-NEXT: ld s1, 0(a0) +; RV64-NEXT: addi a0, s1, 15 ; RV64-NEXT: andi a0, a0, -16 ; RV64-NEXT: sub a0, sp, a0 ; RV64-NEXT: mv sp, a0 @@ -384,14 +382,12 @@ define i32 @va1_va_arg_alloca(ptr %fmt, ...) 
nounwind { ; RV64-WITHFP-NEXT: addi a0, s0, 8 ; RV64-WITHFP-NEXT: sd a0, -32(s0) ; RV64-WITHFP-NEXT: ld a0, -32(s0) -; RV64-WITHFP-NEXT: addi a0, a0, 3 -; RV64-WITHFP-NEXT: andi a0, a0, -4 -; RV64-WITHFP-NEXT: addi a1, a0, 4 +; RV64-WITHFP-NEXT: addi a0, a0, 7 +; RV64-WITHFP-NEXT: andi a0, a0, -8 +; RV64-WITHFP-NEXT: addi a1, a0, 8 ; RV64-WITHFP-NEXT: sd a1, -32(s0) -; RV64-WITHFP-NEXT: lw s1, 0(a0) -; RV64-WITHFP-NEXT: slli a0, s1, 32 -; RV64-WITHFP-NEXT: srli a0, a0, 32 -; RV64-WITHFP-NEXT: addi a0, a0, 15 +; RV64-WITHFP-NEXT: ld s1, 0(a0) +; RV64-WITHFP-NEXT: addi a0, s1, 15 ; RV64-WITHFP-NEXT: andi a0, a0, -16 ; RV64-WITHFP-NEXT: sub a0, sp, a0 ; RV64-WITHFP-NEXT: mv sp, a0 @@ -405,11 +401,11 @@ define i32 @va1_va_arg_alloca(ptr %fmt, ...) nounwind { ; RV64-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) - %1 = va_arg ptr %va, i32 - %2 = alloca i8, i32 %1 + %1 = va_arg ptr %va, iXLen + %2 = alloca i8, iXLen %1 call void @notdead(ptr %2) call void @llvm.va_end(ptr %va) - ret i32 %1 + ret iXLen %1 } define void @va1_caller() nounwind { @@ -633,9 +629,9 @@ define i64 @va2(ptr %fmt, ...) nounwind { ; RV32-WITHFP-NEXT: sw a5, 20(s0) ; RV32-WITHFP-NEXT: sw a6, 24(s0) ; RV32-WITHFP-NEXT: sw a7, 28(s0) -; RV32-WITHFP-NEXT: addi a0, a0, 7 -; RV32-WITHFP-NEXT: andi a1, a0, -8 -; RV32-WITHFP-NEXT: addi a0, a0, 8 +; RV32-WITHFP-NEXT: addi a1, a0, 7 +; RV32-WITHFP-NEXT: andi a1, a1, -8 +; RV32-WITHFP-NEXT: addi a0, a0, 15 ; RV32-WITHFP-NEXT: sw a0, -12(s0) ; RV32-WITHFP-NEXT: lw a0, 0(a1) ; RV32-WITHFP-NEXT: lw a1, 4(a1) @@ -686,9 +682,9 @@ define i64 @va2(ptr %fmt, ...) nounwind { } ; This test is slightly different than the SelectionDAG counterpart because -; narrowScalar and widenScalar for G_VAARG on types outside of [s32, sXLen] +; narrowScalar and widenScalar for G_VAARG on types other than sXLen ; are not implemented yet. -define i64 @va2_va_arg(ptr %fmt, ...) nounwind { +define iXLen @va2_va_arg(ptr %fmt, ...) 
nounwind { ; RV32-LABEL: va2_va_arg: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -48 @@ -707,7 +703,6 @@ define i64 @va2_va_arg(ptr %fmt, ...) nounwind { ; RV32-NEXT: addi a1, a0, 4 ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lw a0, 0(a0) -; RV32-NEXT: li a1, 0 ; RV32-NEXT: addi sp, sp, 48 ; RV32-NEXT: ret ; @@ -724,13 +719,11 @@ define i64 @va2_va_arg(ptr %fmt, ...) nounwind { ; RV64-NEXT: addi a0, sp, 24 ; RV64-NEXT: sd a0, 8(sp) ; RV64-NEXT: ld a0, 8(sp) -; RV64-NEXT: addi a0, a0, 3 -; RV64-NEXT: andi a0, a0, -4 -; RV64-NEXT: addi a1, a0, 4 +; RV64-NEXT: addi a0, a0, 7 +; RV64-NEXT: andi a0, a0, -8 +; RV64-NEXT: addi a1, a0, 8 ; RV64-NEXT: sd a1, 8(sp) -; RV64-NEXT: lw a0, 0(a0) -; RV64-NEXT: slli a0, a0, 32 -; RV64-NEXT: srli a0, a0, 32 +; RV64-NEXT: ld a0, 0(a0) ; RV64-NEXT: addi sp, sp, 80 ; RV64-NEXT: ret ; @@ -755,7 +748,6 @@ define i64 @va2_va_arg(ptr %fmt, ...) nounwind { ; RV32-WITHFP-NEXT: addi a1, a0, 4 ; RV32-WITHFP-NEXT: sw a1, -12(s0) ; RV32-WITHFP-NEXT: lw a0, 0(a0) -; RV32-WITHFP-NEXT: li a1, 0 ; RV32-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32-WITHFP-NEXT: addi sp, sp, 48 @@ -777,23 +769,20 @@ define i64 @va2_va_arg(ptr %fmt, ...) 
nounwind { ; RV64-WITHFP-NEXT: addi a0, s0, 8 ; RV64-WITHFP-NEXT: sd a0, -24(s0) ; RV64-WITHFP-NEXT: ld a0, -24(s0) -; RV64-WITHFP-NEXT: addi a0, a0, 3 -; RV64-WITHFP-NEXT: andi a0, a0, -4 -; RV64-WITHFP-NEXT: addi a1, a0, 4 +; RV64-WITHFP-NEXT: addi a0, a0, 7 +; RV64-WITHFP-NEXT: andi a0, a0, -8 +; RV64-WITHFP-NEXT: addi a1, a0, 8 ; RV64-WITHFP-NEXT: sd a1, -24(s0) -; RV64-WITHFP-NEXT: lw a0, 0(a0) -; RV64-WITHFP-NEXT: slli a0, a0, 32 -; RV64-WITHFP-NEXT: srli a0, a0, 32 +; RV64-WITHFP-NEXT: ld a0, 0(a0) ; RV64-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64-WITHFP-NEXT: addi sp, sp, 96 ; RV64-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) - %1 = va_arg ptr %va, i32 + %1 = va_arg ptr %va, iXLen call void @llvm.va_end(ptr %va) - %2 = zext i32 %1 to i64 - ret i64 %2 + ret iXLen %1 } define void @va2_caller() nounwind { @@ -987,9 +976,9 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; RV32-WITHFP-NEXT: sw a5, 12(s0) ; RV32-WITHFP-NEXT: sw a6, 16(s0) ; RV32-WITHFP-NEXT: sw a7, 20(s0) -; RV32-WITHFP-NEXT: addi a0, a0, 7 -; RV32-WITHFP-NEXT: andi a3, a0, -8 -; RV32-WITHFP-NEXT: addi a0, a0, 8 +; RV32-WITHFP-NEXT: addi a3, a0, 7 +; RV32-WITHFP-NEXT: andi a3, a3, -8 +; RV32-WITHFP-NEXT: addi a0, a0, 15 ; RV32-WITHFP-NEXT: sw a0, -12(s0) ; RV32-WITHFP-NEXT: lw a4, 0(a3) ; RV32-WITHFP-NEXT: lw a3, 4(a3) @@ -1047,26 +1036,25 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; This test is slightly different than the SelectionDAG counterpart because ; narrowScalar and widenScalar for G_VAARG on types outside of [s32, sXLen] ; are not implemented yet. -define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind { +define iXLen @va3_va_arg(iXLen %a, iXLen %b, ...) 
nounwind { ; RV32-LABEL: va3_va_arg: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: sw a2, 8(sp) ; RV32-NEXT: sw a3, 12(sp) ; RV32-NEXT: sw a4, 16(sp) ; RV32-NEXT: sw a5, 20(sp) ; RV32-NEXT: sw a6, 24(sp) ; RV32-NEXT: sw a7, 28(sp) -; RV32-NEXT: addi a0, sp, 12 +; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: sw a0, 4(sp) ; RV32-NEXT: lw a0, 4(sp) ; RV32-NEXT: addi a0, a0, 3 ; RV32-NEXT: andi a0, a0, -4 -; RV32-NEXT: addi a3, a0, 4 -; RV32-NEXT: sw a3, 4(sp) -; RV32-NEXT: lw a3, 0(a0) -; RV32-NEXT: add a0, a1, a3 -; RV32-NEXT: sltu a1, a0, a3 -; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: addi a2, a0, 4 +; RV32-NEXT: sw a2, 4(sp) +; RV32-NEXT: lw a0, 0(a0) +; RV32-NEXT: add a0, a1, a0 ; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; @@ -1082,13 +1070,11 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind { ; RV64-NEXT: addi a0, sp, 16 ; RV64-NEXT: sd a0, 8(sp) ; RV64-NEXT: ld a0, 8(sp) -; RV64-NEXT: addi a0, a0, 3 -; RV64-NEXT: andi a0, a0, -4 -; RV64-NEXT: addi a2, a0, 4 +; RV64-NEXT: addi a0, a0, 7 +; RV64-NEXT: andi a0, a0, -8 +; RV64-NEXT: addi a2, a0, 8 ; RV64-NEXT: sd a2, 8(sp) -; RV64-NEXT: lw a0, 0(a0) -; RV64-NEXT: slli a0, a0, 32 -; RV64-NEXT: srli a0, a0, 32 +; RV64-NEXT: ld a0, 0(a0) ; RV64-NEXT: add a0, a1, a0 ; RV64-NEXT: addi sp, sp, 64 ; RV64-NEXT: ret @@ -1099,22 +1085,21 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) 
nounwind { ; RV32-WITHFP-NEXT: sw ra, 20(sp) # 4-byte Folded Spill ; RV32-WITHFP-NEXT: sw s0, 16(sp) # 4-byte Folded Spill ; RV32-WITHFP-NEXT: addi s0, sp, 24 +; RV32-WITHFP-NEXT: sw a2, 0(s0) ; RV32-WITHFP-NEXT: sw a3, 4(s0) ; RV32-WITHFP-NEXT: sw a4, 8(s0) ; RV32-WITHFP-NEXT: sw a5, 12(s0) ; RV32-WITHFP-NEXT: sw a6, 16(s0) ; RV32-WITHFP-NEXT: sw a7, 20(s0) -; RV32-WITHFP-NEXT: addi a0, s0, 4 +; RV32-WITHFP-NEXT: mv a0, s0 ; RV32-WITHFP-NEXT: sw a0, -12(s0) ; RV32-WITHFP-NEXT: lw a0, -12(s0) ; RV32-WITHFP-NEXT: addi a0, a0, 3 ; RV32-WITHFP-NEXT: andi a0, a0, -4 -; RV32-WITHFP-NEXT: addi a3, a0, 4 -; RV32-WITHFP-NEXT: sw a3, -12(s0) -; RV32-WITHFP-NEXT: lw a3, 0(a0) -; RV32-WITHFP-NEXT: add a0, a1, a3 -; RV32-WITHFP-NEXT: sltu a1, a0, a3 -; RV32-WITHFP-NEXT: add a1, a2, a1 +; RV32-WITHFP-NEXT: addi a2, a0, 4 +; RV32-WITHFP-NEXT: sw a2, -12(s0) +; RV32-WITHFP-NEXT: lw a0, 0(a0) +; RV32-WITHFP-NEXT: add a0, a1, a0 ; RV32-WITHFP-NEXT: lw ra, 20(sp) # 4-byte Folded Reload ; RV32-WITHFP-NEXT: lw s0, 16(sp) # 4-byte Folded Reload ; RV32-WITHFP-NEXT: addi sp, sp, 48 @@ -1135,13 +1120,11 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind { ; RV64-WITHFP-NEXT: mv a0, s0 ; RV64-WITHFP-NEXT: sd a0, -24(s0) ; RV64-WITHFP-NEXT: ld a0, -24(s0) -; RV64-WITHFP-NEXT: addi a0, a0, 3 -; RV64-WITHFP-NEXT: andi a0, a0, -4 -; RV64-WITHFP-NEXT: addi a2, a0, 4 +; RV64-WITHFP-NEXT: addi a0, a0, 7 +; RV64-WITHFP-NEXT: andi a0, a0, -8 +; RV64-WITHFP-NEXT: addi a2, a0, 8 ; RV64-WITHFP-NEXT: sd a2, -24(s0) -; RV64-WITHFP-NEXT: lw a0, 0(a0) -; RV64-WITHFP-NEXT: slli a0, a0, 32 -; RV64-WITHFP-NEXT: srli a0, a0, 32 +; RV64-WITHFP-NEXT: ld a0, 0(a0) ; RV64-WITHFP-NEXT: add a0, a1, a0 ; RV64-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload @@ -1149,11 +1132,10 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) 
nounwind { ; RV64-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) - %1 = va_arg ptr %va, i32 + %1 = va_arg ptr %va, iXLen call void @llvm.va_end(ptr %va) - %2 = zext i32 %1 to i64 - %3 = add i64 %b, %2 - ret i64 %3 + %3 = add iXLen %b, %1 + ret iXLen %3 } define void @va3_caller() nounwind { @@ -1222,7 +1204,7 @@ define void @va3_caller() nounwind { declare void @llvm.va_copy(ptr, ptr) -define i32 @va4_va_copy(i32 %argno, ...) nounwind { +define iXLen @va4_va_copy(i32 %argno, ...) nounwind { ; RV32-LABEL: va4_va_copy: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -48 @@ -1288,12 +1270,12 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind { ; RV64-NEXT: addi a0, sp, 40 ; RV64-NEXT: sd a0, 8(sp) ; RV64-NEXT: ld a0, 8(sp) -; RV64-NEXT: addi a0, a0, 3 -; RV64-NEXT: andi a0, a0, -4 -; RV64-NEXT: addi a1, a0, 4 +; RV64-NEXT: addi a0, a0, 7 +; RV64-NEXT: andi a0, a0, -8 +; RV64-NEXT: addi a1, a0, 8 ; RV64-NEXT: sd a1, 8(sp) ; RV64-NEXT: ld a1, 8(sp) -; RV64-NEXT: lw s0, 0(a0) +; RV64-NEXT: ld s0, 0(a0) ; RV64-NEXT: sd a1, 0(sp) ; RV64-NEXT: lw a0, 4(sp) ; RV64-NEXT: lwu a1, 0(sp) @@ -1301,26 +1283,26 @@ define i32 @va4_va_copy(i32 %argno, ...) 
nounwind { ; RV64-NEXT: or a0, a0, a1 ; RV64-NEXT: call notdead ; RV64-NEXT: ld a0, 8(sp) -; RV64-NEXT: addi a0, a0, 3 -; RV64-NEXT: andi a0, a0, -4 -; RV64-NEXT: addi a1, a0, 4 +; RV64-NEXT: addi a0, a0, 7 +; RV64-NEXT: andi a0, a0, -8 +; RV64-NEXT: addi a1, a0, 8 ; RV64-NEXT: sd a1, 8(sp) ; RV64-NEXT: ld a1, 8(sp) -; RV64-NEXT: lw a0, 0(a0) -; RV64-NEXT: addi a1, a1, 3 -; RV64-NEXT: andi a1, a1, -4 -; RV64-NEXT: addi a2, a1, 4 +; RV64-NEXT: ld a0, 0(a0) +; RV64-NEXT: addi a1, a1, 7 +; RV64-NEXT: andi a1, a1, -8 +; RV64-NEXT: addi a2, a1, 8 ; RV64-NEXT: sd a2, 8(sp) ; RV64-NEXT: ld a2, 8(sp) -; RV64-NEXT: lw a1, 0(a1) -; RV64-NEXT: addi a2, a2, 3 -; RV64-NEXT: andi a2, a2, -4 -; RV64-NEXT: addi a3, a2, 4 +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: addi a2, a2, 7 +; RV64-NEXT: andi a2, a2, -8 +; RV64-NEXT: addi a3, a2, 8 ; RV64-NEXT: sd a3, 8(sp) -; RV64-NEXT: lw a2, 0(a2) +; RV64-NEXT: ld a2, 0(a2) ; RV64-NEXT: add a0, a0, s0 ; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: addw a0, a0, a1 +; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64-NEXT: addi sp, sp, 96 @@ -1396,12 +1378,12 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind { ; RV64-WITHFP-NEXT: addi a0, s0, 8 ; RV64-WITHFP-NEXT: sd a0, -32(s0) ; RV64-WITHFP-NEXT: ld a0, -32(s0) -; RV64-WITHFP-NEXT: addi a0, a0, 3 -; RV64-WITHFP-NEXT: andi a0, a0, -4 -; RV64-WITHFP-NEXT: addi a1, a0, 4 +; RV64-WITHFP-NEXT: addi a0, a0, 7 +; RV64-WITHFP-NEXT: andi a0, a0, -8 +; RV64-WITHFP-NEXT: addi a1, a0, 8 ; RV64-WITHFP-NEXT: sd a1, -32(s0) ; RV64-WITHFP-NEXT: ld a1, -32(s0) -; RV64-WITHFP-NEXT: lw s1, 0(a0) +; RV64-WITHFP-NEXT: ld s1, 0(a0) ; RV64-WITHFP-NEXT: sd a1, -40(s0) ; RV64-WITHFP-NEXT: lw a0, -36(s0) ; RV64-WITHFP-NEXT: lwu a1, -40(s0) @@ -1409,26 +1391,26 @@ define i32 @va4_va_copy(i32 %argno, ...) 
nounwind { ; RV64-WITHFP-NEXT: or a0, a0, a1 ; RV64-WITHFP-NEXT: call notdead ; RV64-WITHFP-NEXT: ld a0, -32(s0) -; RV64-WITHFP-NEXT: addi a0, a0, 3 -; RV64-WITHFP-NEXT: andi a0, a0, -4 -; RV64-WITHFP-NEXT: addi a1, a0, 4 +; RV64-WITHFP-NEXT: addi a0, a0, 7 +; RV64-WITHFP-NEXT: andi a0, a0, -8 +; RV64-WITHFP-NEXT: addi a1, a0, 8 ; RV64-WITHFP-NEXT: sd a1, -32(s0) ; RV64-WITHFP-NEXT: ld a1, -32(s0) -; RV64-WITHFP-NEXT: lw a0, 0(a0) -; RV64-WITHFP-NEXT: addi a1, a1, 3 -; RV64-WITHFP-NEXT: andi a1, a1, -4 -; RV64-WITHFP-NEXT: addi a2, a1, 4 +; RV64-WITHFP-NEXT: ld a0, 0(a0) +; RV64-WITHFP-NEXT: addi a1, a1, 7 +; RV64-WITHFP-NEXT: andi a1, a1, -8 +; RV64-WITHFP-NEXT: addi a2, a1, 8 ; RV64-WITHFP-NEXT: sd a2, -32(s0) ; RV64-WITHFP-NEXT: ld a2, -32(s0) -; RV64-WITHFP-NEXT: lw a1, 0(a1) -; RV64-WITHFP-NEXT: addi a2, a2, 3 -; RV64-WITHFP-NEXT: andi a2, a2, -4 -; RV64-WITHFP-NEXT: addi a3, a2, 4 +; RV64-WITHFP-NEXT: ld a1, 0(a1) +; RV64-WITHFP-NEXT: addi a2, a2, 7 +; RV64-WITHFP-NEXT: andi a2, a2, -8 +; RV64-WITHFP-NEXT: addi a3, a2, 8 ; RV64-WITHFP-NEXT: sd a3, -32(s0) -; RV64-WITHFP-NEXT: lw a2, 0(a2) +; RV64-WITHFP-NEXT: ld a2, 0(a2) ; RV64-WITHFP-NEXT: add a0, a0, s1 ; RV64-WITHFP-NEXT: add a1, a1, a2 -; RV64-WITHFP-NEXT: addw a0, a0, a1 +; RV64-WITHFP-NEXT: add a0, a0, a1 ; RV64-WITHFP-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64-WITHFP-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64-WITHFP-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -1437,19 +1419,19 @@ define i32 @va4_va_copy(i32 %argno, ...) 
nounwind { %vargs = alloca ptr %wargs = alloca ptr call void @llvm.va_start(ptr %vargs) - %1 = va_arg ptr %vargs, i32 + %1 = va_arg ptr %vargs, iXLen call void @llvm.va_copy(ptr %wargs, ptr %vargs) %2 = load ptr, ptr %wargs, align 4 call void @notdead(ptr %2) - %3 = va_arg ptr %vargs, i32 - %4 = va_arg ptr %vargs, i32 - %5 = va_arg ptr %vargs, i32 + %3 = va_arg ptr %vargs, iXLen + %4 = va_arg ptr %vargs, iXLen + %5 = va_arg ptr %vargs, iXLen call void @llvm.va_end(ptr %vargs) call void @llvm.va_end(ptr %wargs) - %add1 = add i32 %3, %1 - %add2 = add i32 %add1, %4 - %add3 = add i32 %add2, %5 - ret i32 %add3 + %add1 = add iXLen %3, %1 + %add2 = add iXLen %add1, %4 + %add3 = add iXLen %add2, %5 + ret iXLen %add3 } ; The va5_aligned_stack_callee and caller function are ommitted from this file @@ -1460,7 +1442,7 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind { ; specified in LLVM IR. We must ensure the vararg save area is ; still set up correctly. -define i32 @va6_no_fixed_args(...) nounwind { +define iXLen @va6_no_fixed_args(...) nounwind { ; RV32-LABEL: va6_no_fixed_args: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -48 @@ -1497,11 +1479,11 @@ define i32 @va6_no_fixed_args(...) nounwind { ; RV64-NEXT: addi a0, sp, 16 ; RV64-NEXT: sd a0, 8(sp) ; RV64-NEXT: ld a0, 8(sp) -; RV64-NEXT: addi a0, a0, 3 -; RV64-NEXT: andi a0, a0, -4 -; RV64-NEXT: addi a1, a0, 4 +; RV64-NEXT: addi a0, a0, 7 +; RV64-NEXT: andi a0, a0, -8 +; RV64-NEXT: addi a1, a0, 8 ; RV64-NEXT: sd a1, 8(sp) -; RV64-NEXT: lw a0, 0(a0) +; RV64-NEXT: ld a0, 0(a0) ; RV64-NEXT: addi sp, sp, 80 ; RV64-NEXT: ret ; @@ -1549,20 +1531,20 @@ define i32 @va6_no_fixed_args(...) 
nounwind { ; RV64-WITHFP-NEXT: mv a0, s0 ; RV64-WITHFP-NEXT: sd a0, -24(s0) ; RV64-WITHFP-NEXT: ld a0, -24(s0) -; RV64-WITHFP-NEXT: addi a0, a0, 3 -; RV64-WITHFP-NEXT: andi a0, a0, -4 -; RV64-WITHFP-NEXT: addi a1, a0, 4 +; RV64-WITHFP-NEXT: addi a0, a0, 7 +; RV64-WITHFP-NEXT: andi a0, a0, -8 +; RV64-WITHFP-NEXT: addi a1, a0, 8 ; RV64-WITHFP-NEXT: sd a1, -24(s0) -; RV64-WITHFP-NEXT: lw a0, 0(a0) +; RV64-WITHFP-NEXT: ld a0, 0(a0) ; RV64-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64-WITHFP-NEXT: addi sp, sp, 96 ; RV64-WITHFP-NEXT: ret %va = alloca ptr call void @llvm.va_start(ptr %va) - %1 = va_arg ptr %va, i32 + %1 = va_arg ptr %va, iXLen call void @llvm.va_end(ptr %va) - ret i32 %1 + ret iXLen %1 } ; TODO: improve constant materialization of stack addresses @@ -1742,7 +1724,7 @@ define i32 @va_large_stack(ptr %fmt, ...) { ret i32 %1 } -define i32 @va_vprintf(ptr %fmt, ptr %arg_start) { +define iXLen @va_vprintf(ptr %fmt, ptr %arg_start) { ; RV32-LABEL: va_vprintf: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 @@ -1767,11 +1749,11 @@ define i32 @va_vprintf(ptr %fmt, ptr %arg_start) { ; RV64-NEXT: ld a0, 8(sp) ; RV64-NEXT: sd a0, 0(sp) ; RV64-NEXT: ld a0, 0(sp) -; RV64-NEXT: addi a0, a0, 3 -; RV64-NEXT: andi a0, a0, -4 -; RV64-NEXT: addi a1, a0, 4 +; RV64-NEXT: addi a0, a0, 7 +; RV64-NEXT: andi a0, a0, -8 +; RV64-NEXT: addi a1, a0, 8 ; RV64-NEXT: sd a1, 0(sp) -; RV64-NEXT: lw a0, 0(a0) +; RV64-NEXT: ld a0, 0(a0) ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret ; @@ -1813,11 +1795,11 @@ define i32 @va_vprintf(ptr %fmt, ptr %arg_start) { ; RV64-WITHFP-NEXT: ld a0, -24(s0) ; RV64-WITHFP-NEXT: sd a0, -32(s0) ; RV64-WITHFP-NEXT: ld a0, -32(s0) -; RV64-WITHFP-NEXT: addi a0, a0, 3 -; RV64-WITHFP-NEXT: andi a0, a0, -4 -; RV64-WITHFP-NEXT: addi a1, a0, 4 +; RV64-WITHFP-NEXT: addi a0, a0, 7 +; RV64-WITHFP-NEXT: andi a0, a0, -8 +; RV64-WITHFP-NEXT: addi a1, a0, 8 ; RV64-WITHFP-NEXT: sd a1, -32(s0) -; 
RV64-WITHFP-NEXT: lw a0, 0(a0) +; RV64-WITHFP-NEXT: ld a0, 0(a0) ; RV64-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64-WITHFP-NEXT: addi sp, sp, 32 @@ -1826,9 +1808,9 @@ define i32 @va_vprintf(ptr %fmt, ptr %arg_start) { %args_cp = alloca ptr store ptr %arg_start, ptr %args call void @llvm.va_copy(ptr %args_cp, ptr %args) - %width = va_arg ptr %args_cp, i32 + %width = va_arg ptr %args_cp, iXLen call void @llvm.va_end(ptr %args_cp) - ret i32 %width + ret iXLen %width } define i32 @va_printf(ptr %fmt, ...) { diff --git a/llvm/test/CodeGen/RISCV/div.ll b/llvm/test/CodeGen/RISCV/div.ll index 99c83b99497dd3..f4e67698473151 100644 --- a/llvm/test/CodeGen/RISCV/div.ll +++ b/llvm/test/CodeGen/RISCV/div.ll @@ -1017,8 +1017,7 @@ define i8 @sdiv8_pow2(i8 %a) nounwind { ; RV32I-LABEL: sdiv8_pow2: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a0, 24 -; RV32I-NEXT: srai a1, a1, 24 -; RV32I-NEXT: slli a1, a1, 17 +; RV32I-NEXT: srai a1, a1, 2 ; RV32I-NEXT: srli a1, a1, 29 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: slli a0, a0, 24 @@ -1028,8 +1027,7 @@ define i8 @sdiv8_pow2(i8 %a) nounwind { ; RV32IM-LABEL: sdiv8_pow2: ; RV32IM: # %bb.0: ; RV32IM-NEXT: slli a1, a0, 24 -; RV32IM-NEXT: srai a1, a1, 24 -; RV32IM-NEXT: slli a1, a1, 17 +; RV32IM-NEXT: srai a1, a1, 2 ; RV32IM-NEXT: srli a1, a1, 29 ; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: slli a0, a0, 24 @@ -1039,8 +1037,7 @@ define i8 @sdiv8_pow2(i8 %a) nounwind { ; RV64I-LABEL: sdiv8_pow2: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a0, 56 -; RV64I-NEXT: srai a1, a1, 56 -; RV64I-NEXT: slli a1, a1, 49 +; RV64I-NEXT: srai a1, a1, 2 ; RV64I-NEXT: srli a1, a1, 61 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: slli a0, a0, 56 @@ -1050,8 +1047,7 @@ define i8 @sdiv8_pow2(i8 %a) nounwind { ; RV64IM-LABEL: sdiv8_pow2: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a1, a0, 56 -; RV64IM-NEXT: srai a1, a1, 56 -; RV64IM-NEXT: slli a1, a1, 49 +; RV64IM-NEXT: srai a1, a1, 2 ; RV64IM-NEXT: srli a1, a1, 
61 ; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: slli a0, a0, 56 @@ -1209,8 +1205,7 @@ define i16 @sdiv16_pow2(i16 %a) nounwind { ; RV32I-LABEL: sdiv16_pow2: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a0, 16 -; RV32I-NEXT: srai a1, a1, 16 -; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: srai a1, a1, 2 ; RV32I-NEXT: srli a1, a1, 29 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: slli a0, a0, 16 @@ -1220,8 +1215,7 @@ define i16 @sdiv16_pow2(i16 %a) nounwind { ; RV32IM-LABEL: sdiv16_pow2: ; RV32IM: # %bb.0: ; RV32IM-NEXT: slli a1, a0, 16 -; RV32IM-NEXT: srai a1, a1, 16 -; RV32IM-NEXT: slli a1, a1, 1 +; RV32IM-NEXT: srai a1, a1, 2 ; RV32IM-NEXT: srli a1, a1, 29 ; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: slli a0, a0, 16 @@ -1231,8 +1225,7 @@ define i16 @sdiv16_pow2(i16 %a) nounwind { ; RV64I-LABEL: sdiv16_pow2: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a0, 48 -; RV64I-NEXT: srai a1, a1, 48 -; RV64I-NEXT: slli a1, a1, 33 +; RV64I-NEXT: srai a1, a1, 2 ; RV64I-NEXT: srli a1, a1, 61 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: slli a0, a0, 48 @@ -1242,8 +1235,7 @@ define i16 @sdiv16_pow2(i16 %a) nounwind { ; RV64IM-LABEL: sdiv16_pow2: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a1, a0, 48 -; RV64IM-NEXT: srai a1, a1, 48 -; RV64IM-NEXT: slli a1, a1, 33 +; RV64IM-NEXT: srai a1, a1, 2 ; RV64IM-NEXT: srli a1, a1, 61 ; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: slli a0, a0, 48 diff --git a/llvm/test/CodeGen/RISCV/float-convert.ll b/llvm/test/CodeGen/RISCV/float-convert.ll index 21bf6618c52a26..805ddee4ac3f6f 100644 --- a/llvm/test/CodeGen/RISCV/float-convert.ll +++ b/llvm/test/CodeGen/RISCV/float-convert.ll @@ -336,17 +336,23 @@ start: } declare i32 @llvm.fptoui.sat.i32.f32(float) -define i32 @fmv_x_w(float %a, float %b) nounwind { +define signext i32 @fmv_x_w(float %a, float %b) nounwind { ; CHECKIF-LABEL: fmv_x_w: ; CHECKIF: # %bb.0: ; CHECKIF-NEXT: fadd.s fa5, fa0, fa1 ; CHECKIF-NEXT: fmv.x.w a0, fa5 ; CHECKIF-NEXT: ret ; -; CHECKIZFINX-LABEL: fmv_x_w: -; CHECKIZFINX: # %bb.0: -; 
CHECKIZFINX-NEXT: fadd.s a0, a0, a1 -; CHECKIZFINX-NEXT: ret +; RV32IZFINX-LABEL: fmv_x_w: +; RV32IZFINX: # %bb.0: +; RV32IZFINX-NEXT: fadd.s a0, a0, a1 +; RV32IZFINX-NEXT: ret +; +; RV64IZFINX-LABEL: fmv_x_w: +; RV64IZFINX: # %bb.0: +; RV64IZFINX-NEXT: fadd.s a0, a0, a1 +; RV64IZFINX-NEXT: sext.w a0, a0 +; RV64IZFINX-NEXT: ret ; ; RV32I-LABEL: fmv_x_w: ; RV32I: # %bb.0: @@ -362,6 +368,7 @@ define i32 @fmv_x_w(float %a, float %b) nounwind { ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: call __addsf3 +; RV64I-NEXT: sext.w a0, a0 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/riscv-codegen-prepare-atp.ll b/llvm/test/CodeGen/RISCV/riscv-codegen-prepare-atp.ll new file mode 100644 index 00000000000000..b733c6a1c787ba --- /dev/null +++ b/llvm/test/CodeGen/RISCV/riscv-codegen-prepare-atp.ll @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes='require,function(codegenprepare)' < %s -S | FileCheck %s + +target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "riscv64" + +%struct.match_state = type { i64, i64 } + +; %add is also promoted by forking an extra sext. 
+define void @promoteTwoOne(i32 %i, i32 %j, ptr %P1, ptr %P2 ) { +; CHECK-LABEL: define void @promoteTwoOne( +; CHECK-SAME: i32 [[I:%.*]], i32 [[J:%.*]], ptr [[P1:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[S2:%.*]] = sext i32 [[I]] to i64 +; CHECK-NEXT: [[PROMOTED2:%.*]] = sext i32 [[J]] to i64 +; CHECK-NEXT: [[S:%.*]] = add nsw i64 [[S2]], [[PROMOTED2]] +; CHECK-NEXT: [[ADDR1:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[S]] +; CHECK-NEXT: store i64 [[S]], ptr [[ADDR1]], align 8 +; CHECK-NEXT: [[ADDR2:%.*]] = getelementptr inbounds i64, ptr [[P2]], i64 [[S2]] +; CHECK-NEXT: store i64 [[S2]], ptr [[ADDR2]], align 8 +; CHECK-NEXT: ret void +; +entry: + %add = add nsw i32 %i, %j + %s = sext i32 %add to i64 + %addr1 = getelementptr inbounds i64, ptr %P1, i64 %s + store i64 %s, ptr %addr1 + %s2 = sext i32 %i to i64 + %addr2 = getelementptr inbounds i64, ptr %P2, i64 %s2 + store i64 %s2, ptr %addr2 + ret void +} + +; Both %add1 and %add2 are promoted by forking extra sexts. 
+define void @promoteTwoTwo(i32 %i, i32 %j, i32 %k, ptr %P1, ptr %P2) { +; CHECK-LABEL: define void @promoteTwoTwo( +; CHECK-SAME: i32 [[I:%.*]], i32 [[J:%.*]], i32 [[K:%.*]], ptr [[P1:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[PROMOTED3:%.*]] = sext i32 [[J]] to i64 +; CHECK-NEXT: [[PROMOTED4:%.*]] = sext i32 [[I]] to i64 +; CHECK-NEXT: [[S:%.*]] = add nsw i64 [[PROMOTED3]], [[PROMOTED4]] +; CHECK-NEXT: [[ADDR1:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[S]] +; CHECK-NEXT: store i64 [[S]], ptr [[ADDR1]], align 8 +; CHECK-NEXT: [[PROMOTED2:%.*]] = sext i32 [[K]] to i64 +; CHECK-NEXT: [[S2:%.*]] = add nsw i64 [[PROMOTED3]], [[PROMOTED2]] +; CHECK-NEXT: [[ADDR2:%.*]] = getelementptr inbounds i64, ptr [[P2]], i64 [[S2]] +; CHECK-NEXT: store i64 [[S2]], ptr [[ADDR2]], align 8 +; CHECK-NEXT: ret void +; +entry: + %add1 = add nsw i32 %j, %i + %s = sext i32 %add1 to i64 + %addr1 = getelementptr inbounds i64, ptr %P1, i64 %s + store i64 %s, ptr %addr1 + %add2 = add nsw i32 %j, %k + %s2 = sext i32 %add2 to i64 + %addr2 = getelementptr inbounds i64, ptr %P2, i64 %s2 + store i64 %s2, ptr %addr2 + ret void +} + +define i64 @promoteGEPSunk(i1 %cond, ptr %base, i32 %i) { +; CHECK-LABEL: define i64 @promoteGEPSunk( +; CHECK-SAME: i1 [[COND:%.*]], ptr [[BASE:%.*]], i32 [[I:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[PROMOTED1:%.*]] = sext i32 [[I]] to i64 +; CHECK-NEXT: [[S:%.*]] = add nsw i64 [[PROMOTED1]], 1 +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i64, ptr [[BASE]], i64 [[S]] +; CHECK-NEXT: [[S2:%.*]] = add nsw i64 [[PROMOTED1]], 2 +; CHECK-NEXT: [[ADDR2:%.*]] = getelementptr inbounds i64, ptr [[BASE]], i64 [[S2]] +; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_THEN2:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[ADDR]], align 8 +; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[ADDR2]], align 8 +; CHECK-NEXT: [[R:%.*]] = add i64 [[V]], [[V2]] +; CHECK-NEXT: ret i64 [[R]] +; CHECK: if.then2: +; 
CHECK-NEXT: ret i64 0 +; +entry: + %add = add nsw i32 %i, 1 + %s = sext i32 %add to i64 + %addr = getelementptr inbounds i64, ptr %base, i64 %s + %add2 = add nsw i32 %i, 2 + %s2 = sext i32 %add2 to i64 + %addr2 = getelementptr inbounds i64, ptr %base, i64 %s2 + br i1 %cond, label %if.then, label %if.then2 +if.then: + %v = load i64, ptr %addr + %v2 = load i64, ptr %addr2 + %r = add i64 %v, %v2 + ret i64 %r +if.then2: + ret i64 0; +} diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll index 20a04844640188..87796e2c7b72e9 100644 --- a/llvm/test/CodeGen/RISCV/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/rv64zba.ll @@ -1555,16 +1555,14 @@ define zeroext i32 @sext_ashr_zext_i8(i8 %a) nounwind { ; RV64I-LABEL: sext_ashr_zext_i8: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 56 -; RV64I-NEXT: srai a0, a0, 56 -; RV64I-NEXT: slli a0, a0, 23 +; RV64I-NEXT: srai a0, a0, 31 ; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: ret ; ; RV64ZBANOZBB-LABEL: sext_ashr_zext_i8: ; RV64ZBANOZBB: # %bb.0: ; RV64ZBANOZBB-NEXT: slli a0, a0, 56 -; RV64ZBANOZBB-NEXT: srai a0, a0, 56 -; RV64ZBANOZBB-NEXT: slli a0, a0, 23 +; RV64ZBANOZBB-NEXT: srai a0, a0, 31 ; RV64ZBANOZBB-NEXT: srli a0, a0, 32 ; RV64ZBANOZBB-NEXT: ret ; @@ -1674,16 +1672,14 @@ define zeroext i32 @sext_ashr_zext_i16(i16 %a) nounwind { ; RV64I-LABEL: sext_ashr_zext_i16: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 48 -; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: slli a0, a0, 23 +; RV64I-NEXT: srai a0, a0, 25 ; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: ret ; ; RV64ZBANOZBB-LABEL: sext_ashr_zext_i16: ; RV64ZBANOZBB: # %bb.0: ; RV64ZBANOZBB-NEXT: slli a0, a0, 48 -; RV64ZBANOZBB-NEXT: srai a0, a0, 48 -; RV64ZBANOZBB-NEXT: slli a0, a0, 23 +; RV64ZBANOZBB-NEXT: srai a0, a0, 25 ; RV64ZBANOZBB-NEXT: srli a0, a0, 32 ; RV64ZBANOZBB-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rv64zfh-half-convert.ll b/llvm/test/CodeGen/RISCV/rv64zfh-half-convert.ll index 08dcefa0464030..9aec4dea63b9d2 100644 --- 
a/llvm/test/CodeGen/RISCV/rv64zfh-half-convert.ll +++ b/llvm/test/CodeGen/RISCV/rv64zfh-half-convert.ll @@ -123,6 +123,8 @@ define signext i16 @bcvt_f16_to_sext_i16(half %a, half %b) nounwind { ; RV64IZHINX-LABEL: bcvt_f16_to_sext_i16: ; RV64IZHINX: # %bb.0: ; RV64IZHINX-NEXT: fadd.h a0, a0, a1 +; RV64IZHINX-NEXT: slli a0, a0, 48 +; RV64IZHINX-NEXT: srai a0, a0, 48 ; RV64IZHINX-NEXT: ret %1 = fadd half %a, %b %2 = bitcast half %1 to i16 diff --git a/llvm/test/CodeGen/RISCV/rv64zfhmin-half-convert.ll b/llvm/test/CodeGen/RISCV/rv64zfhmin-half-convert.ll index f867fe46f0ec33..aac1a65e6c4fec 100644 --- a/llvm/test/CodeGen/RISCV/rv64zfhmin-half-convert.ll +++ b/llvm/test/CodeGen/RISCV/rv64zfhmin-half-convert.ll @@ -144,6 +144,8 @@ define signext i16 @bcvt_f16_to_sext_i16(half %a, half %b) nounwind { ; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 ; RV64IZHINXMIN-NEXT: fadd.s a0, a0, a1 ; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: slli a0, a0, 48 +; RV64IZHINXMIN-NEXT: srai a0, a0, 48 ; RV64IZHINXMIN-NEXT: ret %1 = fadd half %a, %b %2 = bitcast half %1 to i16 diff --git a/llvm/test/CodeGen/RISCV/rvv/binop-splats.ll b/llvm/test/CodeGen/RISCV/rvv/binop-splats.ll index 6875925adad834..f26e57b5a0b733 100644 --- a/llvm/test/CodeGen/RISCV/rvv/binop-splats.ll +++ b/llvm/test/CodeGen/RISCV/rvv/binop-splats.ll @@ -5,14 +5,11 @@ define @nxv1i1(i1 %x, i1 %y) { ; CHECK-LABEL: nxv1i1: ; CHECK: # %bb.0: +; CHECK-NEXT: xor a0, a0, a1 ; CHECK-NEXT: andi a0, a0, 1 -; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vmsne.vi v8, v8, 0 -; CHECK-NEXT: andi a1, a1, 1 -; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vmsne.vi v9, v9, 0 -; CHECK-NEXT: vmxor.mm v0, v8, v9 +; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret %head.x = insertelement poison, i1 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer @@ -25,14 +22,11 @@ define @nxv1i1(i1 %x, i1 %y) { define @nxv2i1(i1 %x, i1 %y) { 
; CHECK-LABEL: nxv2i1: ; CHECK: # %bb.0: +; CHECK-NEXT: xor a0, a0, a1 ; CHECK-NEXT: andi a0, a0, 1 -; CHECK-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vmsne.vi v8, v8, 0 -; CHECK-NEXT: andi a1, a1, 1 -; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vmsne.vi v9, v9, 0 -; CHECK-NEXT: vmxor.mm v0, v8, v9 +; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret %head.x = insertelement poison, i1 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer @@ -45,14 +39,11 @@ define @nxv2i1(i1 %x, i1 %y) { define @nxv4i1(i1 %x, i1 %y) { ; CHECK-LABEL: nxv4i1: ; CHECK: # %bb.0: +; CHECK-NEXT: xor a0, a0, a1 ; CHECK-NEXT: andi a0, a0, 1 -; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vmsne.vi v8, v8, 0 -; CHECK-NEXT: andi a1, a1, 1 -; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vmsne.vi v9, v9, 0 -; CHECK-NEXT: vmxor.mm v0, v8, v9 +; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret %head.x = insertelement poison, i1 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer @@ -65,14 +56,11 @@ define @nxv4i1(i1 %x, i1 %y) { define @nxv8i1(i1 %x, i1 %y) { ; CHECK-LABEL: nxv8i1: ; CHECK: # %bb.0: +; CHECK-NEXT: xor a0, a0, a1 ; CHECK-NEXT: andi a0, a0, 1 -; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vmsne.vi v8, v8, 0 -; CHECK-NEXT: andi a1, a1, 1 -; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vmsne.vi v9, v9, 0 -; CHECK-NEXT: vmxor.mm v0, v8, v9 +; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret %head.x = insertelement poison, i1 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer @@ -85,14 +73,11 @@ define @nxv8i1(i1 %x, i1 %y) { define @nxv16i1(i1 %x, i1 %y) { ; CHECK-LABEL: nxv16i1: ; CHECK: # %bb.0: +; CHECK-NEXT: xor a0, a0, a1 ; CHECK-NEXT: andi a0, a0, 1 -; CHECK-NEXT: 
vsetvli a2, zero, e8, m2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vmsne.vi v10, v8, 0 -; CHECK-NEXT: andi a1, a1, 1 -; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: vmsne.vi v11, v8, 0 -; CHECK-NEXT: vmxor.mm v0, v10, v11 +; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret %head.x = insertelement poison, i1 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer @@ -105,14 +90,11 @@ define @nxv16i1(i1 %x, i1 %y) { define @nxv32i1(i1 %x, i1 %y) { ; CHECK-LABEL: nxv32i1: ; CHECK: # %bb.0: +; CHECK-NEXT: xor a0, a0, a1 ; CHECK-NEXT: andi a0, a0, 1 -; CHECK-NEXT: vsetvli a2, zero, e8, m4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vmsne.vi v12, v8, 0 -; CHECK-NEXT: andi a1, a1, 1 -; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: vmsne.vi v13, v8, 0 -; CHECK-NEXT: vmxor.mm v0, v12, v13 +; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret %head.x = insertelement poison, i1 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer @@ -125,14 +107,11 @@ define @nxv32i1(i1 %x, i1 %y) { define @nxv64i1(i1 %x, i1 %y) { ; CHECK-LABEL: nxv64i1: ; CHECK: # %bb.0: +; CHECK-NEXT: xor a0, a0, a1 ; CHECK-NEXT: andi a0, a0, 1 -; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vmsne.vi v16, v8, 0 -; CHECK-NEXT: andi a1, a1, 1 -; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: vmsne.vi v17, v8, 0 -; CHECK-NEXT: vmxor.mm v0, v16, v17 +; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret %head.x = insertelement poison, i1 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer @@ -145,9 +124,9 @@ define @nxv64i1(i1 %x, i1 %y) { define @nxv1i8(i8 %x, i8 %y) { ; CHECK-LABEL: nxv1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, ma +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: 
vadd.vx v8, v8, a1 ; CHECK-NEXT: ret %head.x = insertelement poison, i8 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer @@ -160,9 +139,9 @@ define @nxv1i8(i8 %x, i8 %y) { define @nxv2i8(i8 %x, i8 %y) { ; CHECK-LABEL: nxv2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vadd.vx v8, v8, a1 ; CHECK-NEXT: ret %head.x = insertelement poison, i8 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer @@ -175,9 +154,9 @@ define @nxv2i8(i8 %x, i8 %y) { define @nxv4i8(i8 %x, i8 %y) { ; CHECK-LABEL: nxv4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vadd.vx v8, v8, a1 ; CHECK-NEXT: ret %head.x = insertelement poison, i8 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer @@ -190,9 +169,9 @@ define @nxv4i8(i8 %x, i8 %y) { define @nxv8i8(i8 %x, i8 %y) { ; CHECK-LABEL: nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vadd.vx v8, v8, a1 ; CHECK-NEXT: ret %head.x = insertelement poison, i8 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer @@ -205,9 +184,9 @@ define @nxv8i8(i8 %x, i8 %y) { define @nxv16i8(i8 %x, i8 %y) { ; CHECK-LABEL: nxv16i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vadd.vx v8, v8, a1 ; CHECK-NEXT: ret %head.x = insertelement poison, i8 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer @@ -220,9 +199,9 @@ define @nxv16i8(i8 %x, i8 %y) { define @nxv32i8(i8 %x, i8 %y) { ; CHECK-LABEL: nxv32i8: ; CHECK: # 
%bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, m4, ta, ma +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vadd.vx v8, v8, a1 ; CHECK-NEXT: ret %head.x = insertelement poison, i8 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer @@ -235,9 +214,9 @@ define @nxv32i8(i8 %x, i8 %y) { define @nxv64i8(i8 %x, i8 %y) { ; CHECK-LABEL: nxv64i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, ma +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vadd.vx v8, v8, a1 ; CHECK-NEXT: ret %head.x = insertelement poison, i8 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer @@ -250,9 +229,9 @@ define @nxv64i8(i8 %x, i8 %y) { define @nxv1i16(i16 %x, i16 %y) { ; CHECK-LABEL: nxv1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vadd.vx v8, v8, a1 ; CHECK-NEXT: ret %head.x = insertelement poison, i16 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer @@ -265,9 +244,9 @@ define @nxv1i16(i16 %x, i16 %y) { define @nxv2i16(i16 %x, i16 %y) { ; CHECK-LABEL: nxv2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vadd.vx v8, v8, a1 ; CHECK-NEXT: ret %head.x = insertelement poison, i16 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer @@ -280,9 +259,9 @@ define @nxv2i16(i16 %x, i16 %y) { define @nxv4i16(i16 %x, i16 %y) { ; CHECK-LABEL: nxv4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vadd.vx v8, v8, a1 ; CHECK-NEXT: ret %head.x = 
insertelement poison, i16 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer @@ -295,9 +274,9 @@ define @nxv4i16(i16 %x, i16 %y) { define @nxv8i16(i16 %x, i16 %y) { ; CHECK-LABEL: nxv8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vadd.vx v8, v8, a1 ; CHECK-NEXT: ret %head.x = insertelement poison, i16 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer @@ -310,9 +289,9 @@ define @nxv8i16(i16 %x, i16 %y) { define @nxv16i16(i16 %x, i16 %y) { ; CHECK-LABEL: nxv16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vadd.vx v8, v8, a1 ; CHECK-NEXT: ret %head.x = insertelement poison, i16 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer @@ -325,9 +304,9 @@ define @nxv16i16(i16 %x, i16 %y) { define @nxv32i16(i16 %x, i16 %y) { ; CHECK-LABEL: nxv32i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vadd.vx v8, v8, a1 ; CHECK-NEXT: ret %head.x = insertelement poison, i16 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer @@ -338,19 +317,12 @@ define @nxv32i16(i16 %x, i16 %y) { } define @nxv1i32(i32 %x, i32 %y) { -; RV32-LABEL: nxv1i32: -; RV32: # %bb.0: -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v8, a0 -; RV32-NEXT: ret -; -; RV64-LABEL: nxv1i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a2, zero, e32, mf2, ta, ma -; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: vadd.vx v8, v8, a1 -; RV64-NEXT: ret +; CHECK-LABEL: nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma +; 
CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret %head.x = insertelement poison, i32 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer %head.y = insertelement poison, i32 %y, i32 0 @@ -360,19 +332,12 @@ define @nxv1i32(i32 %x, i32 %y) { } define @nxv2i32(i32 %x, i32 %y) { -; RV32-LABEL: nxv2i32: -; RV32: # %bb.0: -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v8, a0 -; RV32-NEXT: ret -; -; RV64-LABEL: nxv2i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: vadd.vx v8, v8, a1 -; RV64-NEXT: ret +; CHECK-LABEL: nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret %head.x = insertelement poison, i32 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer %head.y = insertelement poison, i32 %y, i32 0 @@ -382,19 +347,12 @@ define @nxv2i32(i32 %x, i32 %y) { } define @nxv4i32(i32 %x, i32 %y) { -; RV32-LABEL: nxv4i32: -; RV32: # %bb.0: -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v8, a0 -; RV32-NEXT: ret -; -; RV64-LABEL: nxv4i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a2, zero, e32, m2, ta, ma -; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: vadd.vx v8, v8, a1 -; RV64-NEXT: ret +; CHECK-LABEL: nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret %head.x = insertelement poison, i32 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer %head.y = insertelement poison, i32 %y, i32 0 @@ -404,19 +362,12 @@ define @nxv4i32(i32 %x, i32 %y) { } define @nxv8i32(i32 %x, i32 %y) { -; RV32-LABEL: nxv8i32: -; RV32: # %bb.0: -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v8, a0 -; RV32-NEXT: ret -; -; RV64-LABEL: nxv8i32: -; RV64: # 
%bb.0: -; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma -; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: vadd.vx v8, v8, a1 -; RV64-NEXT: ret +; CHECK-LABEL: nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret %head.x = insertelement poison, i32 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer %head.y = insertelement poison, i32 %y, i32 0 @@ -426,19 +377,12 @@ define @nxv8i32(i32 %x, i32 %y) { } define @nxv16i32(i32 %x, i32 %y) { -; RV32-LABEL: nxv16i32: -; RV32: # %bb.0: -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a0 -; RV32-NEXT: ret -; -; RV64-LABEL: nxv16i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: vadd.vx v8, v8, a1 -; RV64-NEXT: ret +; CHECK-LABEL: nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret %head.x = insertelement poison, i32 %x, i32 0 %splat.x = shufflevector %head.x, poison, zeroinitializer %head.y = insertelement poison, i32 %y, i32 0 @@ -452,16 +396,15 @@ define @nxv1i64(i64 %x, i64 %y) { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: sltu a0, a2, a0 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: sw a0, 12(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: sw a3, 4(sp) -; RV32-NEXT: sw a2, 0(sp) -; RV32-NEXT: mv a0, sp -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; @@ -484,16 +427,15 @@ define @nxv2i64(i64 %x, i64 %y) { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; 
RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: sltu a0, a2, a0 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: sw a0, 12(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma ; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: sw a3, 4(sp) -; RV32-NEXT: sw a2, 0(sp) -; RV32-NEXT: mv a0, sp -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; @@ -516,16 +458,15 @@ define @nxv4i64(i64 %x, i64 %y) { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: sltu a0, a2, a0 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: sw a0, 12(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma ; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: sw a3, 4(sp) -; RV32-NEXT: sw a2, 0(sp) -; RV32-NEXT: mv a0, sp -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; @@ -548,16 +489,15 @@ define @nxv8i64(i64 %x, i64 %y) { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: sltu a0, a2, a0 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: sw a0, 12(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: sw a3, 4(sp) -; RV32-NEXT: sw a2, 0(sp) -; RV32-NEXT: mv a0, sp -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; @@ -619,3 +559,50 @@ define @nxv2f64(double %x, double %y) { %v = fadd 
%splat.x, %splat.y ret %v } + +define @uaddsatnxv4i8(i8 %x, i8 %y) { +; CHECK-LABEL: uaddsatnxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: ret + %head.x = insertelement poison, i8 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i8 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = call @llvm.uadd.sat.nxv4i8( %splat.x, %splat.y) + ret %v +} + +define @uaddsatnxv1i64(i64 %x, i64 %y) { +; RV32-LABEL: uaddsatnxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v8, (a0), zero +; RV32-NEXT: sw a3, 4(sp) +; RV32-NEXT: sw a2, 0(sp) +; RV32-NEXT: mv a0, sp +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsaddu.vv v8, v8, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: uaddsatnxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: vsaddu.vx v8, v8, a1 +; RV64-NEXT: ret + %head.x = insertelement poison, i64 %x, i32 0 + %splat.x = shufflevector %head.x, poison, zeroinitializer + %head.y = insertelement poison, i64 %y, i32 0 + %splat.y = shufflevector %head.y, poison, zeroinitializer + %v = call @llvm.uadd.sat.nxv4i8( %splat.x, %splat.y) + ret %v +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll index ee8c322961c7bd..8f40b02423094a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll @@ -461,16 +461,15 @@ define <1 x i64> @v1i64(i64 %x, i64 %y) { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: sw a1, 
12(sp) -; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: sltu a0, a2, a0 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: sw a0, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: sw a3, 4(sp) -; RV32-NEXT: sw a2, 0(sp) -; RV32-NEXT: mv a0, sp -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; @@ -493,17 +492,15 @@ define <2 x i64> @v2i64(i64 %x, i64 %y) { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: sltu a0, a2, a0 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: sw a0, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: sw a3, 4(sp) -; RV32-NEXT: sw a2, 0(sp) -; RV32-NEXT: mv a0, sp -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vadd.vv v9, v8, v9 -; RV32-NEXT: vrgather.vi v8, v9, 0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; @@ -527,17 +524,15 @@ define <4 x i64> @v4i64(i64 %x, i64 %y) { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: sltu a0, a2, a0 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: sw a0, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma ; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: sw a3, 4(sp) -; RV32-NEXT: sw a2, 0(sp) -; RV32-NEXT: mv a0, sp -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: 
vadd.vv v10, v8, v10 -; RV32-NEXT: vrgather.vi v8, v10, 0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; @@ -561,17 +556,15 @@ define <8 x i64> @v8i64(i64 %x, i64 %y) { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: sltu a0, a2, a0 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: sw a0, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma ; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: sw a3, 4(sp) -; RV32-NEXT: sw a2, 0(sp) -; RV32-NEXT: mv a0, sp -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vadd.vv v12, v8, v12 -; RV32-NEXT: vrgather.vi v8, v12, 0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/vadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vadd-sdnode.ll index 27fceb0112ae32..2b141097366cfb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vadd-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vadd-sdnode.ll @@ -773,16 +773,15 @@ define @vadd_xx_nxv8i64(i64 %a, i64 %b) nounwind { ; RV32-LABEL: vadd_xx_nxv8i64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: sltu a0, a2, a0 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: sw a0, 12(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: sw a3, 4(sp) -; RV32-NEXT: sw a2, 0(sp) -; RV32-NEXT: mv a0, sp -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/vand-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vand-sdnode.ll index 40d0d9aa9d1d6b..a84e2c984f669c 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/vand-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vand-sdnode.ll @@ -1224,16 +1224,13 @@ define @vand_xx_nxv8i64(i64 %a, i64 %b) nounwind { ; RV32-LABEL: vand_xx_nxv8i64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: and a1, a1, a3 ; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: sw a3, 4(sp) -; RV32-NEXT: sw a2, 0(sp) -; RV32-NEXT: mv a0, sp -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll index 1a6d5a1d0029da..0b8620c90c62e0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32NOM ; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64NOM -; RUN: llc -mtriple=riscv32 -mattr=+v,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv32 -mattr=+v,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32M ; RUN: llc -mtriple=riscv64 -mattr=+v,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64M define @vmul_vv_nxv1i8( %va, %vb) { @@ -864,21 +864,21 @@ define @vmul_vi_nxv8i64_2( %va) { } define @vmul_xx_nxv8i64(i64 %a, i64 %b) nounwind { -; RV32-LABEL: vmul_xx_nxv8i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: addi a0, 
sp, 8 -; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: sw a3, 4(sp) -; RV32-NEXT: sw a2, 0(sp) -; RV32-NEXT: mv a0, sp -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vmul.vv v8, v8, v16 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32NOM-LABEL: vmul_xx_nxv8i64: +; RV32NOM: # %bb.0: +; RV32NOM-NEXT: addi sp, sp, -16 +; RV32NOM-NEXT: sw a1, 12(sp) +; RV32NOM-NEXT: sw a0, 8(sp) +; RV32NOM-NEXT: addi a0, sp, 8 +; RV32NOM-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32NOM-NEXT: vlse64.v v8, (a0), zero +; RV32NOM-NEXT: sw a3, 4(sp) +; RV32NOM-NEXT: sw a2, 0(sp) +; RV32NOM-NEXT: mv a0, sp +; RV32NOM-NEXT: vlse64.v v16, (a0), zero +; RV32NOM-NEXT: vmul.vv v8, v8, v16 +; RV32NOM-NEXT: addi sp, sp, 16 +; RV32NOM-NEXT: ret ; ; RV64NOM-LABEL: vmul_xx_nxv8i64: ; RV64NOM: # %bb.0: @@ -887,6 +887,23 @@ define @vmul_xx_nxv8i64(i64 %a, i64 %b) nounwind { ; RV64NOM-NEXT: vmul.vx v8, v8, a1 ; RV64NOM-NEXT: ret ; +; RV32M-LABEL: vmul_xx_nxv8i64: +; RV32M: # %bb.0: +; RV32M-NEXT: addi sp, sp, -16 +; RV32M-NEXT: mul a4, a0, a2 +; RV32M-NEXT: sw a4, 8(sp) +; RV32M-NEXT: mul a3, a0, a3 +; RV32M-NEXT: mulhu a0, a0, a2 +; RV32M-NEXT: add a0, a0, a3 +; RV32M-NEXT: mul a1, a1, a2 +; RV32M-NEXT: add a0, a0, a1 +; RV32M-NEXT: sw a0, 12(sp) +; RV32M-NEXT: addi a0, sp, 8 +; RV32M-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32M-NEXT: vlse64.v v8, (a0), zero +; RV32M-NEXT: addi sp, sp, 16 +; RV32M-NEXT: ret +; ; RV64M-LABEL: vmul_xx_nxv8i64: ; RV64M: # %bb.0: ; RV64M-NEXT: mul a0, a0, a1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vor-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vor-sdnode.ll index fbbd71cb35445f..dcfe07c1fba658 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vor-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vor-sdnode.ll @@ -1015,16 +1015,13 @@ define @vor_xx_nxv8i64(i64 %a, i64 %b) nounwind { ; RV32-LABEL: vor_xx_nxv8i64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: or a1, a1, a3 ; RV32-NEXT: sw a1, 12(sp) +; 
RV32-NEXT: or a0, a0, a2 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: sw a3, 4(sp) -; RV32-NEXT: sw a2, 0(sp) -; RV32-NEXT: mv a0, sp -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/vsub-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vsub-sdnode.ll index b7f404c8e5ac92..c2173c9a291fcf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsub-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsub-sdnode.ll @@ -795,16 +795,15 @@ define @vsub_xx_nxv8i64(i64 %a, i64 %b) nounwind { ; RV32-LABEL: vsub_xx_nxv8i64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sub a4, a0, a2 +; RV32-NEXT: sw a4, 8(sp) +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: sub a1, a1, a3 +; RV32-NEXT: sub a1, a1, a0 ; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: sw a3, 4(sp) -; RV32-NEXT: sw a2, 0(sp) -; RV32-NEXT: mv a0, sp -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/vxor-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vxor-sdnode.ll index 3f10b10675ca70..b03a105610dfdf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vxor-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vxor-sdnode.ll @@ -1224,16 +1224,13 @@ define @vxor_xx_nxv8i64(i64 %a, i64 %b) nounwind { ; RV32-LABEL: vxor_xx_nxv8i64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: xor a1, a1, a3 ; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: xor a0, a0, a2 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: sw a3, 4(sp) -; RV32-NEXT: sw a2, 0(sp) -; RV32-NEXT: mv a0, sp -; RV32-NEXT: vlse64.v v16, (a0), zero 
-; RV32-NEXT: vxor.vv v8, v8, v16 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll index 457d0380ca8a83..dc27158cfb31f3 100644 --- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll @@ -222,8 +222,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; RV32-LABEL: test_srem_pow2_setne: ; RV32: # %bb.0: ; RV32-NEXT: slli a1, a0, 26 -; RV32-NEXT: srai a1, a1, 26 -; RV32-NEXT: slli a1, a1, 21 +; RV32-NEXT: srai a1, a1, 1 ; RV32-NEXT: srli a1, a1, 30 ; RV32-NEXT: add a1, a0, a1 ; RV32-NEXT: andi a1, a1, 60 @@ -235,8 +234,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; RV64-LABEL: test_srem_pow2_setne: ; RV64: # %bb.0: ; RV64-NEXT: slli a1, a0, 58 -; RV64-NEXT: srai a1, a1, 58 -; RV64-NEXT: slli a1, a1, 53 +; RV64-NEXT: srai a1, a1, 1 ; RV64-NEXT: srli a1, a1, 62 ; RV64-NEXT: add a1, a0, a1 ; RV64-NEXT: andi a1, a1, 60 @@ -248,8 +246,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; RV32M-LABEL: test_srem_pow2_setne: ; RV32M: # %bb.0: ; RV32M-NEXT: slli a1, a0, 26 -; RV32M-NEXT: srai a1, a1, 26 -; RV32M-NEXT: slli a1, a1, 21 +; RV32M-NEXT: srai a1, a1, 1 ; RV32M-NEXT: srli a1, a1, 30 ; RV32M-NEXT: add a1, a0, a1 ; RV32M-NEXT: andi a1, a1, 60 @@ -261,8 +258,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; RV64M-LABEL: test_srem_pow2_setne: ; RV64M: # %bb.0: ; RV64M-NEXT: slli a1, a0, 58 -; RV64M-NEXT: srai a1, a1, 58 -; RV64M-NEXT: slli a1, a1, 53 +; RV64M-NEXT: srai a1, a1, 1 ; RV64M-NEXT: srli a1, a1, 62 ; RV64M-NEXT: add a1, a0, a1 ; RV64M-NEXT: andi a1, a1, 60 @@ -274,8 +270,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; RV32MV-LABEL: test_srem_pow2_setne: ; RV32MV: # %bb.0: ; RV32MV-NEXT: slli a1, a0, 26 -; RV32MV-NEXT: srai a1, a1, 26 -; RV32MV-NEXT: slli a1, a1, 21 +; RV32MV-NEXT: srai a1, a1, 1 ; RV32MV-NEXT: srli a1, a1, 30 ; RV32MV-NEXT: add a1, a0, 
a1 ; RV32MV-NEXT: andi a1, a1, 60 @@ -287,8 +282,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; RV64MV-LABEL: test_srem_pow2_setne: ; RV64MV: # %bb.0: ; RV64MV-NEXT: slli a1, a0, 58 -; RV64MV-NEXT: srai a1, a1, 58 -; RV64MV-NEXT: slli a1, a1, 53 +; RV64MV-NEXT: srai a1, a1, 1 ; RV64MV-NEXT: srli a1, a1, 62 ; RV64MV-NEXT: add a1, a0, a1 ; RV64MV-NEXT: andi a1, a1, 60 diff --git a/llvm/test/CodeGen/SystemZ/mixed-ptr-sizes.ll b/llvm/test/CodeGen/SystemZ/mixed-ptr-sizes.ll new file mode 100644 index 00000000000000..7a0c132dcb28f0 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/mixed-ptr-sizes.ll @@ -0,0 +1,375 @@ +; RUN: llc < %s -mtriple s390x-ibm-zos | FileCheck %s +; Source to regenerate: +; struct Foo { +; int * __ptr32 p32; +; int *p64; +; char *cp64; +; }; +; void use_foo(Foo *f); +; +; // Assiging a ptr32 value to a 64-bit pointer +; void ptr32_to_ptr(Foo *f, int * __ptr32 i) { +; f->p64 = i; +; use_foo(f); +; } +; +; // Assigning a 64-bit ptr value to a ptr32 +; void ptr_to_ptr32(Foo *f, int *i) { +; f->p32 = i; +; use_foo(f); +; } +; +; // Assigning a ptr32 value to a ptr32 value +; void ptr32_to_ptr32(Foo *f, int * __ptr32 i) { +; f->p32 = i; +; use_foo(f); +; } +; +; void ptr_to_ptr(Foo *f, int *i) { +; f->p64 = i; +; use_foo(f); +; } +; +; void test_indexing(Foo *f) { +; f->cp64 = ((char * __ptr32 *)1028)[1]; +; use_foo(f); +; } +; +; void test_indexing_2(Foo *f) { +; f->cp64 = ((char *** __ptr32 *)1028)[1][2][3]; +; use_foo(f); +; } +; +; unsigned long* test_misc() { +; unsigned long* x = (unsigned long*)((char***** __ptr32*)1208)[0][11][1][113][149]; +; return x; +; } +; +; char* __ptr32* __ptr32 test_misc_2() { +; static char* __ptr32* __ptr32 res = 0; +; if (res == 0) { +; res = ((char* __ptr32* __ptr32* __ptr32* __ptr32*)0)[4][136][6]; +; } +; return res; +; } +; +; unsigned short test_misc_3() { +; unsigned short this_asid = ((unsigned short*)(*(char* __ptr32*)(0x224)))[18]; +; return this_asid; +; } +; +; int test_misc_4() { +; int a = 
(*(int*)(80 + ((char**** __ptr32*)1208)[0][11][1][123]) > 0x040202FF); +; return a; +; } +; +; void test_misc_5(struct Foo *f) { +; f->cp64 = *(char* __ptr32 *)(PSA_PTR + PSAAOLD); +; use_foo(f); +; } +; +; int get_processor_count() { +; return ((char * __ptr32 * __ptr32 *)0)[4][165][53]; +; } +; +; void spill_ptr32_args_to_registers( char *__ptr32 p ) { +; void g ( int, ... ); +; g ( 5, p, p, p, p, p ); +; } +; +; $ clang -cc1 -triple s390x-ibm-zos -fzos-extensions -O2 -S t.cpp +; +; For the last test case: +; +;#include +; +;int foo(); +; +;typedef struct qa_area {/* Area descriptor */ +; char* __ptr32 text; /* Start address of area */ +; int length; /* Size of area in bytes */ +;} qa_area; +; +;int main() { +; qa_area* __ptr32 fap_asm_option_a = (qa_area*)__malloc31(sizeof(qa_area)); +; +; //((qa_area*)fap_asm_option_a)->length = foo(); //PASSES +; fap_asm_option_a->length = foo(); //CRASHES +; return 0; +;} + +%struct.Foo = type { ptr addrspace(1), ptr, ptr } +declare void @use_foo(ptr) + +define void @ptr32_to_ptr(ptr %f, ptr addrspace(1) %i) { +entry: +; CHECK-LABEL: ptr32_to_ptr: +; CHECK: llgtr 0, 2 +; CHECK-NEXT: stg 0, 8(1) + %0 = addrspacecast ptr addrspace(1) %i to ptr + %p64 = getelementptr inbounds %struct.Foo, ptr %f, i64 0, i32 1 + store ptr %0, ptr %p64, align 8 + tail call void @use_foo(ptr %f) + ret void +} + +define void @ptr_to_ptr32(ptr %f, ptr %i) { +entry: +; CHECK-LABEL: ptr_to_ptr32: +; CHECK: nilh 2, 32767 +; CHECK-NEXT: st 2, 0(1) + %0 = addrspacecast ptr %i to ptr addrspace(1) + %p32 = getelementptr inbounds %struct.Foo, ptr %f, i64 0, i32 0 + store ptr addrspace(1) %0, ptr %p32, align 8 + tail call void @use_foo(ptr %f) + ret void +} + +define void @ptr32_to_ptr32(ptr %f, ptr addrspace(1) %i) { +entry: +; CHECK-LABEL: ptr32_to_ptr32: +; CHECK: st 2, 0(1) + %p32 = getelementptr inbounds %struct.Foo, ptr %f, i64 0, i32 0 + store ptr addrspace(1) %i, ptr %p32, align 8 + tail call void @use_foo(ptr %f) + ret void +} + +define void 
@ptr_to_ptr(ptr %f, ptr %i) { +; CHECK-LABEL: ptr_to_ptr: +; CHECK: stg 2, 8(1) + %p64 = getelementptr inbounds %struct.Foo, ptr %f, i64 0, i32 1 + store ptr %i, ptr %p64, align 8 + tail call void @use_foo(ptr %f) + ret void +} + +define void @test_indexing(ptr %f) { +entry: +; CHECK-LABEL: test_indexing: +; CHECK: l 0, 1032 +; CHECK: llgtr 0, 0 +; CHECK: stg 0, 16(1) + %0 = load ptr addrspace(1), ptr inttoptr (i64 1032 to ptr), align 8 + %1 = addrspacecast ptr addrspace(1) %0 to ptr + %cp64 = getelementptr inbounds %struct.Foo, ptr %f, i64 0, i32 2 + store ptr %1, ptr %cp64, align 8 + tail call void @use_foo(ptr %f) + ret void +} + +define void @test_indexing_2(ptr %f) { +entry: +; CHECK-LABEL: test_indexing_2: +; CHECK: lhi 0, 16 +; CHECK-NEXT: a 0, 1032 +; CHECK-NEXT: llgtr 2, 0 +; CHECK: lg 0, 24(2) +; CHECK: stg 0, 16(1) + %0 = load ptr addrspace(1), ptr inttoptr (i64 1032 to ptr), align 8 + %arrayidx = getelementptr inbounds ptr, ptr addrspace(1) %0, i32 2 + %1 = load ptr, ptr addrspace(1) %arrayidx, align 8 + %arrayidx1 = getelementptr inbounds ptr, ptr %1, i64 3 + %2 = bitcast ptr %arrayidx1 to ptr + %3 = load i64, ptr %2, align 8 + %cp64 = getelementptr inbounds %struct.Foo, ptr %f, i64 0, i32 2 + %4 = bitcast ptr %cp64 to ptr + store i64 %3, ptr %4, align 8 + tail call void @use_foo(ptr %f) + ret void +} + +define ptr @test_misc() { +entry: +; CHECK-LABEL: test_misc: +; CHECK: lhi 0, 88 +; CHECK-NEXT: a 0, 1208 +; CHECK-NEXT: llgtr 1, 0 +; CHECK-NEXT: lg 1, 0(1) +; CHECK-NEXT: lg 1, 8(1) +; CHECK-NEXT: lg 1, 904(1) +; CHECK-NEXT: lg 3, 1192(1) + %0 = load ptr addrspace(1), ptr inttoptr (i64 1208 to ptr), align 8 + %arrayidx = getelementptr inbounds ptr, ptr addrspace(1) %0, i32 11 + %1 = load ptr, ptr addrspace(1) %arrayidx, align 8 + %arrayidx1 = getelementptr inbounds ptr, ptr %1, i64 1 + %2 = load ptr, ptr %arrayidx1, align 8 + %arrayidx2 = getelementptr inbounds ptr, ptr %2, i64 113 + %3 = load ptr, ptr %arrayidx2, align 8 + %arrayidx3 = getelementptr 
inbounds ptr, ptr %3, i64 149 + %4 = bitcast ptr %arrayidx3 to ptr + %5 = load ptr, ptr %4, align 8 + ret ptr %5 +} + +define ptr addrspace(1) @test_misc_2() { +entry: +; CHECK-LABEL: test_misc_2: +; CHECK: lhi 0, 544 +; CHECK: a 0, 16 +; CHECK: llgtr 1, 0 +; CHECK: lhi 0, 24 +; CHECK: a 0, 0(1) +; CHECK: llgtr 1, 0 + %0 = load ptr addrspace(1), ptr inttoptr (i64 16 to ptr), align 16 + %arrayidx = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) %0, i32 136 + %1 = load ptr addrspace(1), ptr addrspace(1) %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) %1, i32 6 + %2 = load ptr addrspace(1), ptr addrspace(1) %arrayidx1, align 4 + ret ptr addrspace(1) %2 +} + +define zeroext i16 @test_misc_3() { +entry: +; CHECK-LABEL: test_misc_3: +; CHECK: a 0, 548 +; CHECK-NEXT: llgtr 1, 0 +; CHECK-NEXT: llgh 3, 0(1) +; CHECK-NEXT: b 2(7) + %0 = load ptr addrspace(1), ptr inttoptr (i64 548 to ptr), align 4 + %arrayidx2 = getelementptr inbounds i16, ptr addrspace(1) %0, i32 18 + %arrayidx = addrspacecast ptr addrspace(1) %arrayidx2 to ptr + %1 = load i16, ptr %arrayidx, align 2 + ret i16 %1 +} + +define signext i32 @test_misc_4() { +entry: +; CHECK-LABEL: test_misc_4: +; CHECK: lhi 0, 88 +; CHECK-NEXT: a 0, 1208 +; CHECK-NEXT: llgtr 1, 0 +; CHECK-NEXT: lg 1, 0(1) +; CHECK-NEXT: lg 1, 8(1) +; CHECK-NEXT: lg 1, 984(1) +; CHECK-NEXT: iilf 0, 67240703 +; CHECK-NEXT: c 0, 80(1) + %0 = load ptr addrspace(1), ptr inttoptr (i64 1208 to ptr), align 8 + %arrayidx = getelementptr inbounds ptr, ptr addrspace(1) %0, i32 11 + %1 = load ptr, ptr addrspace(1) %arrayidx, align 8 + %arrayidx1 = getelementptr inbounds ptr, ptr %1, i64 1 + %2 = load ptr, ptr %arrayidx1, align 8 + %arrayidx2 = getelementptr inbounds ptr, ptr %2, i64 123 + %3 = load ptr, ptr %arrayidx2, align 8 + %add.ptr = getelementptr inbounds i8, ptr %3, i64 80 + %4 = bitcast ptr %add.ptr to ptr + %5 = load i32, ptr %4, align 4 + %cmp = icmp sgt i32 %5, 67240703 + %conv = zext i1 
%cmp to i32 + ret i32 %conv +} + +define void @test_misc_5(ptr %f) { +entry: +; CHECK-LABEL: test_misc_5: +; CHECK: l 0, 548 +; CHECK-NEXT: lg 6, 8(5) +; CHECK-NEXT: lg 5, 0(5) +; CHECK-NEXT: llgtr 0, 0 +; CHECK-NEXT: stg 0, 16(1) + %0 = load ptr addrspace(1), ptr inttoptr (i64 548 to ptr), align 4 + %1 = addrspacecast ptr addrspace(1) %0 to ptr + %cp64 = getelementptr inbounds %struct.Foo, ptr %f, i64 0, i32 2 + store ptr %1, ptr %cp64, align 8 + tail call void @use_foo(ptr %f) + ret void +} + +define signext i32 @get_processor_count() { +entry: +; CHECK-LABEL: get_processor_count: +; CHECK: lhi 0, 660 +; CHECK-NEXT: a 0, 16 +; CHECK-NEXT: llgtr 1, 0 +; CHECK-NEXT: lhi 0, 53 +; CHECK-NEXT: a 0, 0(1) +; CHECK-NEXT: llgtr 1, 0 +; CHECK-NEXT: lgb 3, 0(1) + %0 = load ptr addrspace(1), ptr inttoptr (i64 16 to ptr), align 16 + %arrayidx = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) %0, i32 165 + %1 = load ptr addrspace(1), ptr addrspace(1) %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i8, ptr addrspace(1) %1, i32 53 + %2 = load i8, ptr addrspace(1) %arrayidx1, align 1 + %conv = sext i8 %2 to i32 + ret i32 %conv +} + +define void @spill_ptr32_args_to_registers(i8 addrspace(1)* %p) { +entry: +; CHECK-LABEL: spill_ptr32_args_to_registers: +; CHECK: stmg 6, 7, 1872(4) +; CHECK-NEXT: aghi 4, -192 +; CHECK-NEXT: lgr 2, 1 +; CHECK-NEXT: lg 6, 24(5) +; CHECK-NEXT: lg 5, 16(5) +; CHECK-NEXT: stg 1, 2216(4) +; CHECK-NEXT: stg 1, 2208(4) +; CHECK-NEXT: lghi 1, 5 +; CHECK-NEXT: stg 2, 2200(4) +; CHECK-NEXT: lgr 3, 2 +; CHECK-NEXT: basr 7, 6 +; CHECK-NEXT: bcr 0, 0 +; CHECK-NEXT: lg 7, 2072(4) +; CHECK-NEXT: aghi 4, 192 +; CHECK-NEXT: b 2(7) + tail call void (i32, ...) @g(i32 noundef signext 5, ptr addrspace(1) noundef %p, ptr addrspace(1) noundef %p, ptr addrspace(1) noundef %p, ptr addrspace(1) noundef %p, ptr addrspace(1) noundef %p) + ret void +} +declare void @g(i32 signext, ...) 
+ +; The resulting instructions may look odd on first view but it is a result of +; the C code. __malloc31() returns a 64 bit pointer, thus the sequence +; +; la 1, 4(8) +; llgtr 1, 1 +; +; references the length attribute via the 64 bit pointer, and performs the +; cast to __ptr32, setting the upper 32 bit to zero. +; +define signext i32 @setlength() { +; CHECK-LABEL: setlength: +; CHECK: basr 7, 6 +; CHECK: lgr [[MALLOC:[0-9]+]], 3 +; CHECK: basr 7, 6 +; CHECK: lgr [[LENGTH:[0-9]+]], 3 +; CHECK: la [[ADDR:[0-9]+]], 4([[MALLOC]]) +; CHECK: llgtr [[ADDR]], [[ADDR]] +; CHECK: stg [[LENGTH]], 0([[ADDR]]) +entry: + %call = tail call ptr @__malloc31(i64 noundef 8) + %call1 = tail call signext i32 @foo() + %length = getelementptr inbounds i8, ptr %call, i64 4 + %0 = bitcast ptr %length to ptr + %1 = addrspacecast ptr %0 to ptr addrspace(1) + store i32 %call1, ptr addrspace(1) %1, align 4 + ret i32 0 +} + +; Same as test before, but this time calling +; extern char* __ptr32 domalloc(unsigned long); +; instead of __malloc31(). Note the different instruction sequence, because +; the function now returns a __ptr32. +; +define signext i32 @setlength2() { +; CHECK-LABEL: setlength2: +; CHECK: basr 7, 6 +; CHECK: lgr [[MALLOC:[0-9]+]], 3 +; CHECK: basr 7, 6 +; CHECK: lgr [[LENGTH:[0-9]+]], 3 +; CHECK: ahi [[MALLOC]], 4 +; CHECK: llgtr [[ADDR]], [[MALLOC]] +; CHECK: stg [[LENGTH]], 0([[ADDR]]) +entry: + %call = tail call ptr addrspace(1) @domalloc(i64 noundef 8) + %call1 = tail call signext i32 @foo() + %length = getelementptr inbounds i8, ptr addrspace(1) %call, i32 4 + %0 = bitcast ptr addrspace(1) %length to ptr addrspace(1) + store i32 %call1, ptr addrspace(1) %0, align 4 + ret i32 0 +} + +declare ptr @__malloc31(i64) + +declare signext i32 @foo(...) 
+ +declare ptr addrspace(1) @domalloc(i64) diff --git a/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll index 58bafebd5b702f..e3d65a336978b3 100644 --- a/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll @@ -44,8 +44,9 @@ define i1 @test_srem_even(i4 %X) nounwind { define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; CHECK-LABEL: test_srem_pow2_setne: ; CHECK: @ %bb.0: -; CHECK-NEXT: sbfx r1, r0, #0, #6 -; CHECK-NEXT: ubfx r1, r1, #9, #2 +; CHECK-NEXT: lsls r1, r0, #26 +; CHECK-NEXT: movs r2, #3 +; CHECK-NEXT: and.w r1, r2, r1, asr #31 ; CHECK-NEXT: add r1, r0 ; CHECK-NEXT: and r1, r1, #60 ; CHECK-NEXT: subs r0, r0, r1 diff --git a/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll b/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll index 2b08f1c23b59ad..8b30473983d8c8 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll @@ -5,16 +5,11 @@ target triple = "wasm32-unknown-unknown" -;; TODO: Optimize this further by scalarizing the add - ; CHECK-LABEL: shl_add: ; CHECK-NEXT: .functype shl_add (v128, i32, i32) -> (v128) -; CHECK-NEXT: i8x16.splat $push1=, $1 -; CHECK-NEXT: i8x16.splat $push0=, $2 -; CHECK-NEXT: i8x16.add $push2=, $pop1, $pop0 -; CHECK-NEXT: i8x16.extract_lane_u $push3=, $pop2, 0 -; CHECK-NEXT: i8x16.shl $push4=, $0, $pop3 -; CHECK-NEXT: return $pop4 +; CHECK-NEXT: i32.add $push0=, $1, $2 +; CHECK-NEXT: i8x16.shl $push1=, $0, $pop0 +; CHECK-NEXT: return $pop1 define <16 x i8> @shl_add(<16 x i8> %v, i8 %a, i8 %b) { %t1 = insertelement <16 x i8> undef, i8 %a, i32 0 %va = shufflevector <16 x i8> %t1, <16 x i8> undef, <16 x i32> zeroinitializer @@ -44,13 +39,13 @@ define <16 x i8> @shl_abs(<16 x i8> %v, i8 %a) { ; CHECK-LABEL: shl_abs_add: ; CHECK-NEXT: .functype shl_abs_add (v128, i32, i32) -> (v128) -; CHECK-NEXT: i8x16.splat $push1=, $1 
-; CHECK-NEXT: i8x16.splat $push0=, $2 -; CHECK-NEXT: i8x16.add $push2=, $pop1, $pop0 -; CHECK-NEXT: i8x16.abs $push3=, $pop2 -; CHECK-NEXT: i8x16.extract_lane_u $push4=, $pop3, 0 -; CHECK-NEXT: i8x16.shl $push5=, $0, $pop4 -; CHECK-NEXT: return $pop5 +; CHECK-NEXT: i32.add $push0=, $1, $2 +; CHECK-NEXT: i8x16.splat $push1=, $pop0 +; CHECK-NEXT: i8x16.abs $push2=, $pop1 +; CHECK-NEXT: i8x16.extract_lane_u $push3=, $pop2, 0 +; CHECK-NEXT: i8x16.shl $push4=, $0, $pop3 +; CHECK-NEXT: return $pop4 + define <16 x i8> @shl_abs_add(<16 x i8> %v, i8 %a, i8 %b) { %t1 = insertelement <16 x i8> undef, i8 %a, i32 0 %va = shufflevector <16 x i8> %t1, <16 x i8> undef, <16 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll index 73edceb3c3ede3..add0592661db67 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -2749,12 +2749,9 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vpunpcklqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] -; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2 +; CHECK-NEXT: vmovdqa 48(%rdi), %xmm2 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1} -; CHECK-NEXT: vzeroupper +; CHECK-NEXT: vpunpcklqdq 16(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0] ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> @@ -2766,12 +2763,9 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> % define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %mask) { ; 
CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vpunpcklqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 +; CHECK-NEXT: vmovdqa 48(%rdi), %xmm1 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} -; CHECK-NEXT: vzeroupper +; CHECK-NEXT: vpunpcklqdq 16(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0] ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> diff --git a/llvm/test/CodeGen/X86/cmpccxadd-intrinsics.ll b/llvm/test/CodeGen/X86/cmpccxadd-intrinsics.ll index f88216f95a7614..561289c1b77465 100644 --- a/llvm/test/CodeGen/X86/cmpccxadd-intrinsics.ll +++ b/llvm/test/CodeGen/X86/cmpccxadd-intrinsics.ll @@ -112,13 +112,13 @@ define dso_local i32 @test_cmplxadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { ; CHECK-LABEL: test_cmplxadd32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] -; CHECK-NEXT: cmpnbxadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xe3,0x07] +; CHECK-NEXT: cmpaexadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xe3,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; EGPR-LABEL: test_cmplxadd32: ; EGPR: # %bb.0: # %entry ; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] -; EGPR-NEXT: cmpnbxadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xe3,0x07] +; EGPR-NEXT: cmpaexadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xe3,0x07] ; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i32 @llvm.x86.cmpccxadd32(ptr %__A, i32 %__B, i32 %__C, i32 3) @@ -129,95 +129,95 @@ define dso_local i64 @test_cmplxadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { ; CHECK-LABEL: test_cmplxadd64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] -; CHECK-NEXT: cmpnbxadd %rdx, %rax, (%rdi) # 
encoding: [0xc4,0xe2,0xe9,0xe3,0x07] +; CHECK-NEXT: cmpaexadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xe3,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; EGPR-LABEL: test_cmplxadd64: ; EGPR: # %bb.0: # %entry ; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] -; EGPR-NEXT: cmpnbxadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xe3,0x07] +; EGPR-NEXT: cmpaexadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xe3,0x07] ; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i64 @llvm.x86.cmpccxadd64(ptr %__A, i64 %__B, i64 %__C, i32 3) ret i64 %0 } -define dso_local i32 @test_cmpnbexadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { -; CHECK-LABEL: test_cmpnbexadd32: +define dso_local i32 @test_cmpaxadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { +; CHECK-LABEL: test_cmpaxadd32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] -; CHECK-NEXT: cmpzxadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xe4,0x07] +; CHECK-NEXT: cmpexadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xe4,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] ; -; EGPR-LABEL: test_cmpnbexadd32: +; EGPR-LABEL: test_cmpaxadd32: ; EGPR: # %bb.0: # %entry ; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] -; EGPR-NEXT: cmpzxadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xe4,0x07] +; EGPR-NEXT: cmpexadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xe4,0x07] ; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i32 @llvm.x86.cmpccxadd32(ptr %__A, i32 %__B, i32 %__C, i32 4) ret i32 %0 } -define dso_local i64 @test_cmpnbexadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { -; CHECK-LABEL: test_cmpnbexadd64: +define dso_local i64 @test_cmpaxadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { +; CHECK-LABEL: test_cmpaxadd64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] -; CHECK-NEXT: cmpzxadd %rdx, %rax, (%rdi) # encoding: 
[0xc4,0xe2,0xe9,0xe4,0x07] +; CHECK-NEXT: cmpexadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xe4,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] ; -; EGPR-LABEL: test_cmpnbexadd64: +; EGPR-LABEL: test_cmpaxadd64: ; EGPR: # %bb.0: # %entry ; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] -; EGPR-NEXT: cmpzxadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xe4,0x07] +; EGPR-NEXT: cmpexadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xe4,0x07] ; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i64 @llvm.x86.cmpccxadd64(ptr %__A, i64 %__B, i64 %__C, i32 4) ret i64 %0 } -define dso_local i32 @test_cmpnbxadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { -; CHECK-LABEL: test_cmpnbxadd32: +define dso_local i32 @test_cmpaexadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { +; CHECK-LABEL: test_cmpaexadd32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] -; CHECK-NEXT: cmpnzxadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xe5,0x07] +; CHECK-NEXT: cmpnexadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xe5,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] ; -; EGPR-LABEL: test_cmpnbxadd32: +; EGPR-LABEL: test_cmpaexadd32: ; EGPR: # %bb.0: # %entry ; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] -; EGPR-NEXT: cmpnzxadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xe5,0x07] +; EGPR-NEXT: cmpnexadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xe5,0x07] ; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i32 @llvm.x86.cmpccxadd32(ptr %__A, i32 %__B, i32 %__C, i32 5) ret i32 %0 } -define dso_local i64 @test_cmpnbxadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { -; CHECK-LABEL: test_cmpnbxadd64: +define dso_local i64 @test_cmpaexadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { +; CHECK-LABEL: test_cmpaexadd64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] -; CHECK-NEXT: cmpnzxadd %rdx, 
%rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xe5,0x07] +; CHECK-NEXT: cmpnexadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xe5,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] ; -; EGPR-LABEL: test_cmpnbxadd64: +; EGPR-LABEL: test_cmpaexadd64: ; EGPR: # %bb.0: # %entry ; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] -; EGPR-NEXT: cmpnzxadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xe5,0x07] +; EGPR-NEXT: cmpnexadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xe5,0x07] ; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i64 @llvm.x86.cmpccxadd64(ptr %__A, i64 %__B, i64 %__C, i32 5) ret i64 %0 } -define dso_local i32 @test_cmpnlexadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { -; CHECK-LABEL: test_cmpnlexadd32: +define dso_local i32 @test_cmpgxadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { +; CHECK-LABEL: test_cmpgxadd32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] ; CHECK-NEXT: cmpbexadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xe6,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] ; -; EGPR-LABEL: test_cmpnlexadd32: +; EGPR-LABEL: test_cmpgxadd32: ; EGPR: # %bb.0: # %entry ; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] ; EGPR-NEXT: cmpbexadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xe6,0x07] @@ -227,14 +227,14 @@ entry: ret i32 %0 } -define dso_local i64 @test_cmpnlexadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { -; CHECK-LABEL: test_cmpnlexadd64: +define dso_local i64 @test_cmpgxadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { +; CHECK-LABEL: test_cmpgxadd64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] ; CHECK-NEXT: cmpbexadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xe6,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] ; -; EGPR-LABEL: test_cmpnlexadd64: +; EGPR-LABEL: test_cmpgxadd64: ; EGPR: # %bb.0: # %entry ; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] ; EGPR-NEXT: 
cmpbexadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xe6,0x07] @@ -244,34 +244,34 @@ entry: ret i64 %0 } -define dso_local i32 @test_cmpnlxadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { -; CHECK-LABEL: test_cmpnlxadd32: +define dso_local i32 @test_cmpgexadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { +; CHECK-LABEL: test_cmpgexadd32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] -; CHECK-NEXT: cmpnbexadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xe7,0x07] +; CHECK-NEXT: cmpaxadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xe7,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] ; -; EGPR-LABEL: test_cmpnlxadd32: +; EGPR-LABEL: test_cmpgexadd32: ; EGPR: # %bb.0: # %entry ; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] -; EGPR-NEXT: cmpnbexadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xe7,0x07] +; EGPR-NEXT: cmpaxadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xe7,0x07] ; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i32 @llvm.x86.cmpccxadd32(ptr %__A, i32 %__B, i32 %__C, i32 7) ret i32 %0 } -define dso_local i64 @test_cmpnlxadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { -; CHECK-LABEL: test_cmpnlxadd64: +define dso_local i64 @test_cmpgexadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { +; CHECK-LABEL: test_cmpgexadd64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] -; CHECK-NEXT: cmpnbexadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xe7,0x07] +; CHECK-NEXT: cmpaxadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xe7,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] ; -; EGPR-LABEL: test_cmpnlxadd64: +; EGPR-LABEL: test_cmpgexadd64: ; EGPR: # %bb.0: # %entry ; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] -; EGPR-NEXT: cmpnbexadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xe7,0x07] +; EGPR-NEXT: cmpaxadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: 
[0xc4,0xe2,0xe9,0xe7,0x07] ; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i64 @llvm.x86.cmpccxadd64(ptr %__A, i64 %__B, i64 %__C, i32 7) @@ -380,14 +380,14 @@ entry: ret i64 %0 } -define dso_local i32 @test_cmpnzxadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { -; CHECK-LABEL: test_cmpnzxadd32: +define dso_local i32 @test_cmpnexadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { +; CHECK-LABEL: test_cmpnexadd32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] ; CHECK-NEXT: cmpnpxadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xeb,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] ; -; EGPR-LABEL: test_cmpnzxadd32: +; EGPR-LABEL: test_cmpnexadd32: ; EGPR: # %bb.0: # %entry ; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] ; EGPR-NEXT: cmpnpxadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xeb,0x07] @@ -397,14 +397,14 @@ entry: ret i32 %0 } -define dso_local i64 @test_cmpnzxadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { -; CHECK-LABEL: test_cmpnzxadd64: +define dso_local i64 @test_cmpnexadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { +; CHECK-LABEL: test_cmpnexadd64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] ; CHECK-NEXT: cmpnpxadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xeb,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] ; -; EGPR-LABEL: test_cmpnzxadd64: +; EGPR-LABEL: test_cmpnexadd64: ; EGPR: # %bb.0: # %entry ; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] ; EGPR-NEXT: cmpnpxadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xeb,0x07] @@ -452,13 +452,13 @@ define dso_local i32 @test_cmppxadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { ; CHECK-LABEL: test_cmppxadd32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] -; CHECK-NEXT: cmpnlxadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xed,0x07] +; CHECK-NEXT: cmpgexadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xed,0x07] ; 
CHECK-NEXT: retq # encoding: [0xc3] ; ; EGPR-LABEL: test_cmppxadd32: ; EGPR: # %bb.0: # %entry ; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] -; EGPR-NEXT: cmpnlxadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xed,0x07] +; EGPR-NEXT: cmpgexadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xed,0x07] ; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i32 @llvm.x86.cmpccxadd32(ptr %__A, i32 %__B, i32 %__C, i32 13) @@ -469,13 +469,13 @@ define dso_local i64 @test_cmppxadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { ; CHECK-LABEL: test_cmppxadd64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] -; CHECK-NEXT: cmpnlxadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xed,0x07] +; CHECK-NEXT: cmpgexadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xed,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; EGPR-LABEL: test_cmppxadd64: ; EGPR: # %bb.0: # %entry ; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] -; EGPR-NEXT: cmpnlxadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xed,0x07] +; EGPR-NEXT: cmpgexadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xed,0x07] ; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i64 @llvm.x86.cmpccxadd64(ptr %__A, i64 %__B, i64 %__C, i32 13) @@ -516,34 +516,34 @@ entry: ret i64 %0 } -define dso_local i32 @test_cmpzxadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { -; CHECK-LABEL: test_cmpzxadd32: +define dso_local i32 @test_cmpexadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { +; CHECK-LABEL: test_cmpexadd32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] -; CHECK-NEXT: cmpnlexadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xef,0x07] +; CHECK-NEXT: cmpgxadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xef,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] ; -; EGPR-LABEL: test_cmpzxadd32: +; EGPR-LABEL: test_cmpexadd32: ; EGPR: # %bb.0: # 
%entry ; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] -; EGPR-NEXT: cmpnlexadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xef,0x07] +; EGPR-NEXT: cmpgxadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xef,0x07] ; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i32 @llvm.x86.cmpccxadd32(ptr %__A, i32 %__B, i32 %__C, i32 15) ret i32 %0 } -define dso_local i64 @test_cmpzxadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { -; CHECK-LABEL: test_cmpzxadd64: +define dso_local i64 @test_cmpexadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { +; CHECK-LABEL: test_cmpexadd64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] -; CHECK-NEXT: cmpnlexadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xef,0x07] +; CHECK-NEXT: cmpgxadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xef,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] ; -; EGPR-LABEL: test_cmpzxadd64: +; EGPR-LABEL: test_cmpexadd64: ; EGPR: # %bb.0: # %entry ; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] -; EGPR-NEXT: cmpnlexadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xef,0x07] +; EGPR-NEXT: cmpgxadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xef,0x07] ; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i64 @llvm.x86.cmpccxadd64(ptr %__A, i64 %__B, i64 %__C, i32 15) diff --git a/llvm/test/CodeGen/X86/fp-clobbered-by-eh.ll b/llvm/test/CodeGen/X86/fp-clobbered-by-eh.ll new file mode 100644 index 00000000000000..03f227a590d5e6 --- /dev/null +++ b/llvm/test/CodeGen/X86/fp-clobbered-by-eh.ll @@ -0,0 +1,27 @@ +; RUN: not llc -mtriple=x86_64-unknown-unknown -stackrealign -verify-machineinstrs %s -o - 2>&1 | FileCheck %s + +declare ghccc void @may_throw_or_crash() +declare i32 @_except_handler3(...) 
+ +define internal i64 @catchall_filt() { + ret i64 1 +} + +; If the invoked function clobbers frame pointer and/or base pointer according +; to its calling convention, we can't handle it currently, so reports an error +; message. + +; CHECK: :0: error: Frame pointer clobbered by function invoke is not supported +; CHECK: :0: error: Stack realignment in presence of dynamic allocas is not supported with this calling convention +define void @use_except_handler3() personality ptr @_except_handler3 { +entry: + invoke ghccc void @may_throw_or_crash() + to label %cont unwind label %lpad +cont: + ret void +lpad: + %cs = catchswitch within none [label %catch] unwind to caller +catch: + %p = catchpad within %cs [ptr @catchall_filt] + catchret from %p to label %cont +} diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll index b40b2c82843ccd..90c1d42a929c81 100644 --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -2401,109 +2401,106 @@ define void @D107009(ptr %input, ptr %output) { ; AVX1-LABEL: D107009: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovups 96(%rdi), %ymm0 -; AVX1-NEXT: vmovups (%rdi), %ymm1 -; AVX1-NEXT: vmovups 128(%rdi), %ymm2 -; AVX1-NEXT: vmovups 224(%rdi), %ymm3 -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX1-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5] -; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,0],ymm2[4,5],ymm3[6,4] +; AVX1-NEXT: vmovups 128(%rdi), %ymm1 +; AVX1-NEXT: vmovups 224(%rdi), %ymm2 +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] ; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,0],ymm1[4,5],ymm2[6,4] +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX1-NEXT: 
vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vmovshdup {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7] -; AVX1-NEXT: vshufpd {{.*#+}} ymm5 = ymm1[0,0,3,2] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm1[3,3,3,3,7,7,7,7] +; AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[0,0,3,2] +; AVX1-NEXT: vmovshdup {{.*#+}} ymm5 = ymm1[1,1,3,3,5,5,7,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] ; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi) -; AVX1-NEXT: vmovdqa %xmm7, 112(%rsi) -; AVX1-NEXT: vmovdqa %xmm6, 48(%rsi) +; AVX1-NEXT: vmovdqa %xmm7, 48(%rsi) +; AVX1-NEXT: vmovdqa %xmm6, 112(%rsi) ; AVX1-NEXT: vmovups %ymm1, 128(%rsi) -; AVX1-NEXT: vmovupd %ymm5, 192(%rsi) -; AVX1-NEXT: vmovups %ymm4, 224(%rsi) -; AVX1-NEXT: vmovups %ymm3, 160(%rsi) +; AVX1-NEXT: vmovups %ymm5, 160(%rsi) +; AVX1-NEXT: vmovupd %ymm4, 192(%rsi) +; AVX1-NEXT: vmovupd %ymm3, 224(%rsi) ; AVX1-NEXT: vmovups %ymm2, 64(%rsi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: D107009: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; AVX2-NEXT: vmovdqu 64(%rdi), %ymm1 -; AVX2-NEXT: vmovdqu 128(%rdi), %ymm2 -; AVX2-NEXT: vmovdqu 192(%rdi), %ymm3 -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm3 = 
ymm3[0],mem[0],ymm3[1],mem[1],ymm3[4],mem[4],ymm3[5],mem[5] +; AVX2-NEXT: vmovdqu 64(%rdi), %ymm0 +; AVX2-NEXT: vmovdqu 128(%rdi), %ymm1 +; AVX2-NEXT: vmovdqu 192(%rdi), %ymm2 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[3,3,3,3,7,7,7,7] -; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; AVX2-NEXT: vpbroadcastd %xmm0, %ymm4 +; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[3,3,3,3,7,7,7,7] +; AVX2-NEXT: 
vpshufd {{.*#+}} ymm6 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[1,1,1,1,5,5,5,5] ; AVX2-NEXT: vmovdqu %ymm0, 128(%rsi) -; AVX2-NEXT: vmovdqu %ymm7, 192(%rsi) -; AVX2-NEXT: vmovdqu %ymm6, 224(%rsi) -; AVX2-NEXT: vmovdqu %ymm5, 160(%rsi) -; AVX2-NEXT: vmovdqu %ymm4, 64(%rsi) -; AVX2-NEXT: vmovdqa %xmm3, 112(%rsi) -; AVX2-NEXT: vmovdqu %ymm2, (%rsi) -; AVX2-NEXT: vmovdqa %xmm1, 48(%rsi) +; AVX2-NEXT: vmovdqu %ymm7, 160(%rsi) +; AVX2-NEXT: vmovdqu %ymm6, 192(%rsi) +; AVX2-NEXT: vmovdqu %ymm5, 224(%rsi) +; AVX2-NEXT: vmovdqu %ymm4, (%rsi) +; AVX2-NEXT: vmovdqa %xmm3, 48(%rsi) +; AVX2-NEXT: vmovdqa %xmm2, 112(%rsi) +; AVX2-NEXT: vmovdqu %ymm1, 64(%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; XOP-LABEL: D107009: ; XOP: # %bb.0: ; XOP-NEXT: vmovups 96(%rdi), %ymm0 -; XOP-NEXT: vmovups (%rdi), %ymm1 -; XOP-NEXT: vmovups 128(%rdi), %ymm2 -; XOP-NEXT: vmovups 224(%rdi), %ymm3 -; XOP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2] -; XOP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5] -; XOP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,0],ymm2[4,5],ymm3[6,4] +; XOP-NEXT: vmovups 128(%rdi), %ymm1 +; XOP-NEXT: vmovups 224(%rdi), %ymm2 +; XOP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] ; XOP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1 +; XOP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,0],ymm1[4,5],ymm2[6,4] +; XOP-NEXT: vmovdqa 16(%rdi), %xmm2 +; XOP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; XOP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; XOP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm0 -; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] ; XOP-NEXT: vpsrld $16, %xmm0, %xmm0 -; XOP-NEXT: vextractf128 $1, %ymm2, 
%xmm1 +; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1 ; XOP-NEXT: vpsrld $16, %xmm1, %xmm1 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] ; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; XOP-NEXT: vmovshdup {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] -; XOP-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7] -; XOP-NEXT: vshufpd {{.*#+}} ymm5 = ymm1[0,0,3,2] -; XOP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] -; XOP-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; XOP-NEXT: vshufps {{.*#+}} ymm3 = ymm1[3,3,3,3,7,7,7,7] +; XOP-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[0,0,3,2] +; XOP-NEXT: vmovshdup {{.*#+}} ymm5 = ymm1[1,1,3,3,5,5,7,7] +; XOP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] +; XOP-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] ; XOP-NEXT: vmovdqa %xmm0, 16(%rsi) -; XOP-NEXT: vmovdqa %xmm7, 112(%rsi) -; XOP-NEXT: vmovdqa %xmm6, 48(%rsi) +; XOP-NEXT: vmovdqa %xmm7, 48(%rsi) +; XOP-NEXT: vmovdqa %xmm6, 112(%rsi) ; XOP-NEXT: vmovups %ymm1, 128(%rsi) -; XOP-NEXT: vmovupd %ymm5, 192(%rsi) -; XOP-NEXT: vmovups %ymm4, 224(%rsi) -; XOP-NEXT: vmovups %ymm3, 160(%rsi) +; XOP-NEXT: vmovups %ymm5, 160(%rsi) +; XOP-NEXT: vmovupd %ymm4, 192(%rsi) +; XOP-NEXT: vmovupd %ymm3, 224(%rsi) ; XOP-NEXT: vmovups %ymm2, 64(%rsi) ; XOP-NEXT: vzeroupper ; XOP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll index d644ed87c3c108..cc4bda81bef527 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll @@ -82,8 +82,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: shlb $2, %cl -; X86-NEXT: sarb $5, %cl -; X86-NEXT: shrb $4, %cl +; X86-NEXT: sarb $7, %cl ; X86-NEXT: andb $3, %cl ; X86-NEXT: addb %al, %cl ; X86-NEXT: andb $60, %cl @@ -96,8 +95,7 @@ define i1 @test_srem_pow2_setne(i6 
%X) nounwind { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: leal (,%rdi,4), %eax -; X64-NEXT: sarb $5, %al -; X64-NEXT: shrb $4, %al +; X64-NEXT: sarb $7, %al ; X64-NEXT: andb $3, %al ; X64-NEXT: addb %dil, %al ; X64-NEXT: andb $60, %al diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll index f27619738a0eab..70164cff890729 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll @@ -57,222 +57,211 @@ define void @load_i64_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-LABEL: load_i64_stride5_vf2: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm2[2,3] -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-NEXT: vmovdqa %xmm5, (%rsi) -; AVX2-NEXT: vmovdqa %xmm1, (%rdx) -; AVX2-NEXT: vextractf128 $1, %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %xmm3, (%r8) +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3] +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-NEXT: vmovdqa %xmm4, (%rsi) +; AVX2-NEXT: vmovdqa %xmm0, (%rdx) +; 
AVX2-NEXT: vmovaps %xmm1, (%rcx) +; AVX2-NEXT: vmovdqa %xmm5, (%r8) ; AVX2-NEXT: vmovdqa %xmm2, (%r9) -; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i64_stride5_vf2: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-FP-NEXT: vmovdqa %xmm5, (%rsi) -; AVX2-FP-NEXT: vmovdqa %xmm1, (%rdx) -; AVX2-FP-NEXT: vextractf128 $1, %ymm0, (%rcx) -; AVX2-FP-NEXT: vmovdqa %xmm3, (%r8) +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3] +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-FP-NEXT: vmovdqa %xmm4, (%rsi) +; AVX2-FP-NEXT: vmovdqa %xmm0, (%rdx) +; AVX2-FP-NEXT: vmovaps %xmm1, (%rcx) +; AVX2-FP-NEXT: vmovdqa %xmm5, (%r8) ; AVX2-FP-NEXT: vmovdqa %xmm2, (%r9) -; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i64_stride5_vf2: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm3 -; 
AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rsi) -; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rdx) -; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, (%rcx) -; AVX2-FCP-NEXT: vmovdqa %xmm3, (%r8) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-FCP-NEXT: vmovdqa %xmm4, (%rsi) +; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rdx) +; AVX2-FCP-NEXT: vmovaps %xmm1, (%rcx) +; AVX2-FCP-NEXT: vmovdqa %xmm5, (%r8) ; AVX2-FCP-NEXT: vmovdqa %xmm2, (%r9) -; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i64_stride5_vf2: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] -; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX512-NEXT: vmovaps (%rdi), %ymm2 -; AVX512-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] -; AVX512-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512-NEXT: 
vmovdqa 48(%rdi), %xmm3 +; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3] +; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3] +; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3] ; AVX512-NEXT: vmovdqa %xmm4, (%rsi) ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512-NEXT: vextractf128 $1, %ymm2, (%rcx) -; AVX512-NEXT: vmovdqa %xmm5, (%r8) -; AVX512-NEXT: vmovdqa %xmm1, (%r9) -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vmovdqa %xmm3, (%rcx) +; AVX512-NEXT: vmovdqa %xmm1, (%r8) +; AVX512-NEXT: vmovdqa %xmm2, (%r9) ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i64_stride5_vf2: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] -; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm2 -; AVX512-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] -; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3] +; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3] +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd 
{{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3] ; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512-FCP-NEXT: vextractf128 $1, %ymm2, (%rcx) -; AVX512-FCP-NEXT: vmovdqa %xmm5, (%r8) -; AVX512-FCP-NEXT: vmovdqa %xmm1, (%r9) -; AVX512-FCP-NEXT: vzeroupper +; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rcx) +; AVX512-FCP-NEXT: vmovdqa %xmm1, (%r8) +; AVX512-FCP-NEXT: vmovdqa %xmm2, (%r9) ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i64_stride5_vf2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] -; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vmovaps (%rdi), %ymm2 -; AVX512DQ-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] -; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3] +; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3] +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3] ; AVX512DQ-NEXT: vmovdqa %xmm4, (%rsi) ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512DQ-NEXT: vextractf128 $1, %ymm2, (%rcx) -; AVX512DQ-NEXT: vmovdqa %xmm5, (%r8) -; AVX512DQ-NEXT: vmovdqa %xmm1, (%r9) -; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: vmovdqa %xmm3, (%rcx) +; AVX512DQ-NEXT: vmovdqa %xmm1, (%r8) +; AVX512DQ-NEXT: vmovdqa %xmm2, (%r9) ; 
AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i64_stride5_vf2: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3] +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm2, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%r9) -; AVX512DQ-FCP-NEXT: vzeroupper +; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%r9) ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride5_vf2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = 
xmm0[0,1],xmm1[2,3] -; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX512BW-NEXT: vmovaps (%rdi), %ymm2 -; AVX512BW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] -; AVX512BW-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3] +; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3] +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3] ; AVX512BW-NEXT: vmovdqa %xmm4, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512BW-NEXT: vextractf128 $1, %ymm2, (%rcx) -; AVX512BW-NEXT: vmovdqa %xmm5, (%r8) -; AVX512BW-NEXT: vmovdqa %xmm1, (%r9) -; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: vmovdqa %xmm3, (%rcx) +; AVX512BW-NEXT: vmovdqa %xmm1, (%r8) +; AVX512BW-NEXT: vmovdqa %xmm2, (%r9) ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i64_stride5_vf2: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = 
xmm1[0,1],xmm3[2,3] +; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BW-FCP-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3] +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3] +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3] ; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm2, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%r9) -; AVX512BW-FCP-NEXT: vzeroupper +; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%r9) ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i64_stride5_vf2: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQ-BW-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3] +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = 
xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3] +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3] ; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm2, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa %xmm1, (%r9) -; AVX512DQ-BW-NEXT: vzeroupper +; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa %xmm1, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%r9) ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride5_vf2: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3] +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = 
xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm2, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%r9) -; AVX512DQ-BW-FCP-NEXT: vzeroupper +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%r9) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <10 x i64>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <10 x i64> %wide.vec, <10 x i64> poison, <2 x i32> diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll index 13c3c6a9939c1f..64f5761b31d64f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll @@ -2261,83 +2261,70 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512-NEXT: vmovdqa (%rdx), %ymm9 -; AVX512-NEXT: vmovdqa (%rcx), %ymm10 -; AVX512-NEXT: vmovdqa (%r8), %ymm15 -; AVX512-NEXT: vmovdqa (%r9), %ymm3 -; AVX512-NEXT: vmovdqa (%r10), %ymm4 -; AVX512-NEXT: vmovdqa (%rax), %ymm1 -; AVX512-NEXT: vmovdqa (%rax), %xmm5 -; AVX512-NEXT: vmovdqa (%r10), %xmm6 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512-NEXT: vmovdqa64 %xmm6, %xmm21 -; AVX512-NEXT: vmovdqa64 %xmm5, %xmm22 -; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm16 -; AVX512-NEXT: vmovdqa (%r9), %xmm5 -; AVX512-NEXT: vmovdqa (%r8), %xmm8 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; 
AVX512-NEXT: vmovdqa64 %xmm5, %xmm23 -; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20 -; AVX512-NEXT: vmovdqa (%rcx), %xmm11 -; AVX512-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm17 -; AVX512-NEXT: vmovdqa (%rsi), %xmm13 -; AVX512-NEXT: vmovdqa (%rdi), %xmm14 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm18 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm15[0],ymm3[0],ymm15[1],ymm3[1],ymm15[2],ymm3[2],ymm15[3],ymm3[3],ymm15[8],ymm3[8],ymm15[9],ymm3[9],ymm15[10],ymm3[10],ymm15[11],ymm3[11] -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm5 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm19 -; AVX512-NEXT: vmovdqa (%rsi), %ymm6 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11] -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm7 -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm15[4],ymm3[4],ymm15[5],ymm3[5],ymm15[6],ymm3[6],ymm15[7],ymm3[7],ymm15[12],ymm3[12],ymm15[13],ymm3[13],ymm15[14],ymm3[14],ymm15[15],ymm3[15] -; AVX512-NEXT: vinserti64x4 
$1, %ymm3, %zmm3, %zmm3 -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[12],ymm6[12],ymm2[13],ymm6[13],ymm2[14],ymm6[14],ymm2[15],ymm6[15] -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm6 -; AVX512-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512-NEXT: vmovdqa64 %xmm22, %xmm9 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX512-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 +; AVX512-NEXT: vmovdqa64 (%rdi), %ymm20 +; AVX512-NEXT: vmovdqa (%r8), %ymm9 +; AVX512-NEXT: vmovdqa (%r9), %ymm11 +; AVX512-NEXT: vmovdqa (%r10), %ymm12 +; AVX512-NEXT: vmovdqa (%rax), %ymm13 +; AVX512-NEXT: vmovdqa (%rax), %xmm0 +; AVX512-NEXT: vmovdqa (%r10), %xmm2 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm22 +; AVX512-NEXT: vmovdqa (%r9), %xmm0 +; AVX512-NEXT: vmovdqa (%r8), %xmm2 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm24 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512-NEXT: vpermt2d %zmm1, %zmm16, %zmm14 +; AVX512-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = 
[0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512-NEXT: vpermt2d %zmm15, %zmm17, %zmm5 +; AVX512-NEXT: movb $-86, %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm5 {%k1} +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[8],ymm11[8],ymm9[9],ymm11[9],ymm9[10],ymm11[10],ymm9[11],ymm11[11] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] +; AVX512-NEXT: vpermt2d %zmm14, %zmm18, %zmm15 +; AVX512-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; AVX512-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11] +; AVX512-NEXT: vpermt2d %zmm3, %zmm19, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm9[4],ymm11[4],ymm9[5],ymm11[5],ymm9[6],ymm11[6],ymm9[7],ymm11[7],ymm9[12],ymm11[12],ymm9[13],ymm11[13],ymm9[14],ymm11[14],ymm9[15],ymm11[15] +; AVX512-NEXT: vpermt2d %zmm3, %zmm18, %zmm9 +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; 
AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15] +; AVX512-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX512-NEXT: vmovdqa64 %xmm23, %xmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; AVX512-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX512-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 +; AVX512-NEXT: vmovdqa64 %xmm24, %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vpermt2d %zmm0, %zmm16, %zmm2 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX512-NEXT: vpermt2d %zmm0, %zmm17, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,16,0,0,1,17,10,10,10,26,0,0,11,27] -; AVX512-NEXT: vpermt2d %zmm16, %zmm12, %zmm20 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,16,1,0,1,17,0,0,10,26,11,11,11,27,0,0] -; AVX512-NEXT: vpermt2d %zmm17, %zmm13, %zmm0 -; AVX512-NEXT: movb $-86, %cl -; AVX512-NEXT: kmovw %ecx, %k1 -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = 
[0,1,4,20,4,5,5,21,10,9,14,30,14,13,15,31] -; AVX512-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [4,20,1,3,5,21,5,7,14,30,11,11,15,31,15,15] -; AVX512-NEXT: vpermt2d %zmm19, %zmm14, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 -; AVX512-NEXT: vpermt2d %zmm4, %zmm14, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512-NEXT: vpermt2d %zmm9, %zmm12, %zmm8 -; AVX512-NEXT: vpermt2d %zmm10, %zmm13, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -2345,83 +2332,70 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm9 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm10 -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm15 -; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm3 -; AVX512-FCP-NEXT: vmovdqa (%r10), %ymm4 -; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm1 -; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm5 -; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm6 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm21 -; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm22 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm16 -; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm5 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm8 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; AVX512-FCP-NEXT: 
vmovdqa64 %xmm5, %xmm23 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm11 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm17 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm13 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm14 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm18 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm15[0],ymm3[0],ymm15[1],ymm3[1],ymm15[2],ymm3[2],ymm15[3],ymm3[3],ymm15[8],ymm3[8],ymm15[9],ymm3[9],ymm15[10],ymm3[10],ymm15[11],ymm3[11] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm5 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm19 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm6 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm7 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = 
ymm15[4],ymm3[4],ymm15[5],ymm3[5],ymm15[6],ymm3[6],ymm15[7],ymm3[7],ymm15[12],ymm3[12],ymm15[13],ymm3[13],ymm15[14],ymm3[14],ymm15[15],ymm3[15] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[12],ymm6[12],ymm2[13],ymm6[13],ymm2[14],ymm6[14],ymm2[15],ymm6[15] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm9 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm20 +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm9 +; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm11 +; AVX512-FCP-NEXT: vmovdqa (%r10), %ymm12 +; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm13 +; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm0 +; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm2 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm22 +; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm0 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm24 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm16, %zmm14 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = 
xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm17, %zmm5 +; AVX512-FCP-NEXT: movb $-86, %al +; AVX512-FCP-NEXT: kmovw %eax, %k1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 {%k1} +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[8],ymm11[8],ymm9[9],ymm11[9],ymm9[10],ymm11[10],ymm9[11],ymm11[11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] +; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm18, %zmm15 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11] +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm19, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} 
ymm9 = ymm9[4],ymm11[4],ymm9[5],ymm11[5],ymm9[6],ymm11[6],ymm9[7],ymm11[7],ymm9[12],ymm11[12],ymm9[13],ymm11[13],ymm9[14],ymm11[14],ymm9[15],ymm11[15] +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm18, %zmm9 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm2 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm3 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm2 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512-FCP-NEXT: movq 
{{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,16,0,0,1,17,10,10,10,26,0,0,11,27] -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm12, %zmm20 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,16,1,0,1,17,0,0,10,26,11,11,11,27,0,0] -; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm13, %zmm0 -; AVX512-FCP-NEXT: movb $-86, %cl -; AVX512-FCP-NEXT: kmovw %ecx, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,4,20,4,5,5,21,10,9,14,30,14,13,15,31] -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [4,20,1,3,5,21,5,7,14,30,11,11,15,31,15,15] -; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm14, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm12, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm13, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -2429,83 +2403,70 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm9 -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm10 -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm15 -; AVX512DQ-NEXT: vmovdqa (%r9), %ymm3 -; AVX512DQ-NEXT: vmovdqa (%r10), %ymm4 -; AVX512DQ-NEXT: vmovdqa (%rax), %ymm1 -; AVX512DQ-NEXT: 
vmovdqa (%rax), %xmm5 -; AVX512DQ-NEXT: vmovdqa (%r10), %xmm6 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm21 -; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm22 -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm16 -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm5 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm8 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm23 -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20 -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm11 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm17 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm13 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm14 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm18 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm15[0],ymm3[0],ymm15[1],ymm3[1],ymm15[2],ymm3[2],ymm15[3],ymm3[3],ymm15[8],ymm3[8],ymm15[9],ymm3[9],ymm15[10],ymm3[10],ymm15[11],ymm3[11] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm5 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm19 -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm6 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm7 = 
ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm15[4],ymm3[4],ymm15[5],ymm3[5],ymm15[6],ymm3[6],ymm15[7],ymm3[7],ymm15[12],ymm3[12],ymm15[13],ymm3[13],ymm15[14],ymm3[14],ymm15[15],ymm3[15] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[12],ymm6[12],ymm2[13],ymm6[13],ymm2[14],ymm6[14],ymm2[15],ymm6[15] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm9 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm20 +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm9 +; AVX512DQ-NEXT: vmovdqa (%r9), %ymm11 +; AVX512DQ-NEXT: vmovdqa (%r10), %ymm12 +; AVX512DQ-NEXT: vmovdqa (%rax), %ymm13 +; AVX512DQ-NEXT: vmovdqa (%rax), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%r10), %xmm2 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm22 +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm14 = 
xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm24 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm16, %zmm14 +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm17, %zmm5 +; AVX512DQ-NEXT: movb $-86, %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm5 {%k1} +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[8],ymm11[8],ymm9[9],ymm11[9],ymm9[10],ymm11[10],ymm9[11],ymm11[11] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] +; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm18, %zmm15 +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm14 = 
ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11] +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm19, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm9[4],ymm11[4],ymm9[5],ymm11[5],ymm9[6],ymm11[6],ymm9[7],ymm11[7],ymm9[12],ymm11[12],ymm9[13],ymm11[13],ymm9[14],ymm11[14],ymm9[15],ymm11[15] +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm18, %zmm9 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm2 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm3 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm16, %zmm2 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm17, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,16,0,0,1,17,10,10,10,26,0,0,11,27] -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm12, %zmm20 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,16,1,0,1,17,0,0,10,26,11,11,11,27,0,0] -; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm13, %zmm0 -; AVX512DQ-NEXT: movb $-86, %cl -; AVX512DQ-NEXT: kmovw %ecx, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,4,20,4,5,5,21,10,9,14,30,14,13,15,31] -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [4,20,1,3,5,21,5,7,14,30,11,11,15,31,15,15] -; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm14, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm14, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm12, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm13, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2513,83 +2474,70 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP: # 
%bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm6 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm22 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm8 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm23 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm11 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm13 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm14 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm18 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = 
ymm15[0],ymm3[0],ymm15[1],ymm3[1],ymm15[2],ymm3[2],ymm15[3],ymm3[3],ymm15[8],ymm3[8],ymm15[9],ymm3[9],ymm15[10],ymm3[10],ymm15[11],ymm3[11] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm5 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm6 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm7 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm15[4],ymm3[4],ymm15[5],ymm3[5],ymm15[6],ymm3[6],ymm15[7],ymm3[7],ymm15[12],ymm3[12],ymm15[13],ymm3[13],ymm15[14],ymm3[14],ymm15[15],ymm3[15] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[12],ymm6[12],ymm2[13],ymm6[13],ymm2[14],ymm6[14],ymm2[15],ymm6[15] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm9 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 +; 
AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm20 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm2 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm22 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm24 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm16, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm17, %zmm5 +; AVX512DQ-FCP-NEXT: movb $-86, %al +; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm15 = 
ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[8],ymm11[8],ymm9[9],ymm11[9],ymm9[10],ymm11[10],ymm9[11],ymm11[11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm18, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm19, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm9[4],ymm11[4],ymm9[5],ymm11[5],ymm9[6],ymm11[6],ymm9[7],ymm11[7],ymm9[12],ymm11[12],ymm9[13],ymm11[13],ymm9[14],ymm11[14],ymm9[15],ymm11[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm18, %zmm9 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm0 +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm2 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm3 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm2 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,16,0,0,1,17,10,10,10,26,0,0,11,27] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm12, %zmm20 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,16,1,0,1,17,0,0,10,26,11,11,11,27,0,0] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm13, %zmm0 -; AVX512DQ-FCP-NEXT: movb $-86, %cl -; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,4,20,4,5,5,21,10,9,14,30,14,13,15,31] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [4,20,1,3,5,21,5,7,14,30,11,11,15,31,15,15] -; 
AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm14, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm12, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm13, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll index 8ca0e0cb971861..89642492f83a85 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll @@ -6487,7 +6487,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-NEXT: vmovaps (%r8), %ymm13 +; AVX2-NEXT: vmovaps (%r8), %ymm12 ; AVX2-NEXT: vmovaps (%r9), %ymm9 ; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps (%rsi), %xmm3 @@ -6517,9 +6517,10 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm13[0],ymm9[0],ymm13[2],ymm9[2] -; AVX2-NEXT: vunpcklpd 
{{.*#+}} ymm6 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX2-NEXT: vmovaps 16(%rdx), %xmm6 +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 32(%rax), %xmm4 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] @@ -6686,8 +6687,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] ; AVX2-NEXT: vmovaps 160(%rax), %xmm2 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-NEXT: vmovaps 160(%rdi), %xmm12 -; AVX2-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm3 +; AVX2-NEXT: vmovaps 160(%rdi), %xmm13 +; AVX2-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 160(%rdx), %xmm11 @@ -6706,9 +6707,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 160(%rdx), %ymm0 -; AVX2-NEXT: vmovaps 160(%r8), %ymm9 -; AVX2-NEXT: vmovaps 160(%r9), %ymm8 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX2-NEXT: vmovaps 160(%r8), %ymm10 +; AVX2-NEXT: vmovaps 160(%r9), %ymm9 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6743,8 +6744,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 192(%r8), %ymm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] +; AVX2-NEXT: vmovaps 208(%rdx), %xmm6 +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] @@ -6770,8 +6772,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 224(%rsi), %ymm3 ; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,2,3,3] @@ -6785,8 +6787,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = ymm12[1],mem[1],ymm12[3],mem[3] ; 
AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vbroadcastsd 24(%rcx), %ymm6 @@ -6824,14 +6826,14 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload ; AVX2-NEXT: vbroadcastsd 96(%rcx), %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm6[6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vbroadcastsd 120(%rcx), %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm6[0,1],ymm3[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1],ymm3[2,3,4,5,6,7] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload @@ -6843,15 +6845,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vbroadcastsd 152(%rcx), %ymm15 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5,6,7] -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 -; AVX2-NEXT: vbroadcastsd 160(%rcx), %ymm12 -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] 
-; AVX2-NEXT: vbroadcastsd 184(%rcx), %ymm9 -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 +; AVX2-NEXT: vbroadcastsd 160(%rcx), %ymm13 +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vbroadcastsd 184(%rcx), %ymm10 +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5,6,7] ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 ; AVX2-NEXT: vbroadcastsd %xmm4, %ymm4 @@ -6874,15 +6876,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vbroadcastsd 248(%r9), %ymm7 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX2-NEXT: vmovaps 224(%rax), %ymm7 -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX2-NEXT: # ymm9 = mem[0,1],ymm7[2,3],mem[4,5,6,7] +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload +; AVX2-NEXT: # ymm10 = mem[0,1],ymm7[2,3],mem[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vmovaps %ymm2, 1760(%rcx) ; AVX2-NEXT: vmovaps %ymm14, 1728(%rcx) ; AVX2-NEXT: vmovaps %ymm0, 1696(%rcx) -; AVX2-NEXT: vmovaps %ymm9, 1664(%rcx) +; AVX2-NEXT: vmovaps %ymm10, 1664(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1632(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6900,7 +6902,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: 
vmovaps %ymm4, 1376(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1344(%rcx) -; AVX2-NEXT: vmovaps %ymm8, 1312(%rcx) +; AVX2-NEXT: vmovaps %ymm9, 1312(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1280(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6924,7 +6926,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm6, 928(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 896(%rcx) -; AVX2-NEXT: vmovaps %ymm10, 864(%rcx) +; AVX2-NEXT: vmovaps %ymm8, 864(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 832(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6935,7 +6937,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 736(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 704(%rcx) -; AVX2-NEXT: vmovaps %ymm13, 672(%rcx) +; AVX2-NEXT: vmovaps %ymm12, 672(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 640(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6989,7 +6991,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-FP-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-FP-NEXT: vmovaps (%r8), %ymm13 +; AVX2-FP-NEXT: vmovaps (%r8), %ymm12 ; AVX2-FP-NEXT: vmovaps (%r9), %ymm9 ; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps (%rsi), %xmm3 @@ -7019,9 +7021,10 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm6 = 
xmm6[1],xmm8[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm13[0],ymm9[0],ymm13[2],ymm9[2] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX2-FP-NEXT: vmovaps 16(%rdx), %xmm6 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%rax), %xmm4 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] @@ -7188,8 +7191,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] ; AVX2-FP-NEXT: vmovaps 160(%rax), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm12 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm3 +; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm13 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 160(%rdx), %xmm11 @@ -7208,9 +7211,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 160(%rdx), %ymm0 -; AVX2-FP-NEXT: vmovaps 160(%r8), %ymm9 -; AVX2-FP-NEXT: vmovaps 160(%r9), %ymm8 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX2-FP-NEXT: vmovaps 160(%r8), %ymm10 +; AVX2-FP-NEXT: vmovaps 160(%r9), %ymm9 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 
= ymm10[0],ymm9[0],ymm10[2],ymm9[2] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7245,8 +7248,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 192(%r8), %ymm1 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] +; AVX2-FP-NEXT: vmovaps 208(%rdx), %xmm6 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] @@ -7272,8 +7276,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 224(%rsi), %ymm3 ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,2,3,3] @@ -7287,8 +7291,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload ; AVX2-FP-NEXT: 
vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm12[1],mem[1],ymm12[3],mem[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vbroadcastsd 24(%rcx), %ymm6 @@ -7326,14 +7330,14 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload ; AVX2-FP-NEXT: vbroadcastsd 96(%rcx), %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vbroadcastsd 120(%rcx), %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm6[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1],ymm3[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload @@ -7345,15 +7349,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vbroadcastsd 
152(%rcx), %ymm15 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 -; AVX2-FP-NEXT: vbroadcastsd 160(%rcx), %ymm12 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vbroadcastsd 184(%rcx), %ymm9 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 +; AVX2-FP-NEXT: vbroadcastsd 160(%rcx), %ymm13 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vbroadcastsd 184(%rcx), %ymm10 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5,6,7] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 ; AVX2-FP-NEXT: vbroadcastsd %xmm4, %ymm4 @@ -7376,15 +7380,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vbroadcastsd 248(%r9), %ymm7 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 224(%rax), %ymm7 -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm9 = mem[0,1],ymm7[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm10 = mem[0,1],ymm7[2,3],mem[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] ; 
AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vmovaps %ymm2, 1760(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm14, 1728(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm0, 1696(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm9, 1664(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm10, 1664(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1632(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7402,7 +7406,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm4, 1376(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1344(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm8, 1312(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm9, 1312(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1280(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7426,7 +7430,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm6, 928(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 896(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm10, 864(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm8, 864(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 832(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7437,7 +7441,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 736(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 704(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm13, 672(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm12, 672(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; 
AVX2-FP-NEXT: vmovaps %ymm0, 640(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7491,7 +7495,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-FCP-NEXT: vmovaps (%r8), %ymm13 +; AVX2-FCP-NEXT: vmovaps (%r8), %ymm12 ; AVX2-FCP-NEXT: vmovaps (%r9), %ymm9 ; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm3 @@ -7521,9 +7525,10 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm13[0],ymm9[0],ymm13[2],ymm9[2] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX2-FCP-NEXT: vmovaps 16(%rdx), %xmm6 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 32(%rax), %xmm4 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] @@ -7690,8 +7695,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] ; AVX2-FCP-NEXT: vmovaps 160(%rax), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm12 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm13 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 ; AVX2-FCP-NEXT: 
vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 160(%rdx), %xmm11 @@ -7710,9 +7715,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 160(%rdx), %ymm0 -; AVX2-FCP-NEXT: vmovaps 160(%r8), %ymm9 -; AVX2-FCP-NEXT: vmovaps 160(%r9), %ymm8 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX2-FCP-NEXT: vmovaps 160(%r8), %ymm10 +; AVX2-FCP-NEXT: vmovaps 160(%r9), %ymm9 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7747,8 +7752,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 192(%r8), %ymm1 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] +; AVX2-FCP-NEXT: vmovaps 208(%rdx), %xmm6 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] @@ -7774,8 +7780,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 
224(%rsi), %ymm3 ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,2,3,3] @@ -7789,8 +7795,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = ymm12[1],mem[1],ymm12[3],mem[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vbroadcastsd 24(%rcx), %ymm6 @@ -7828,14 +7834,14 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vbroadcastsd 96(%rcx), %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; 
AVX2-FCP-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vbroadcastsd 120(%rcx), %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm6[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1],ymm3[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload @@ -7847,15 +7853,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vbroadcastsd 152(%rcx), %ymm15 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 -; AVX2-FCP-NEXT: vbroadcastsd 160(%rcx), %ymm12 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vbroadcastsd 184(%rcx), %ymm9 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 +; AVX2-FCP-NEXT: vbroadcastsd 160(%rcx), %ymm13 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 184(%rcx), %ymm10 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = 
ymm10[0,1],ymm9[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vbroadcastsd %xmm4, %ymm4 @@ -7878,15 +7884,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vbroadcastsd 248(%r9), %ymm7 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 224(%rax), %ymm7 -; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm9 = mem[0,1],ymm7[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = mem[0,1],ymm7[2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vmovaps %ymm2, 1760(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm14, 1728(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm0, 1696(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm9, 1664(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm10, 1664(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1632(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7904,7 +7910,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm4, 1376(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1344(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm8, 1312(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm9, 1312(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1280(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7928,7 +7934,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX2-FCP-NEXT: vmovaps %ymm6, 928(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 896(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm10, 864(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm8, 864(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 832(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7939,7 +7945,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm0, 736(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 704(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm13, 672(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm12, 672(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 640(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -13822,8 +13828,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX2-NEXT: vmovaps 16(%rdx), %xmm4 +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 16(%rax), %xmm3 ; AVX2-NEXT: vmovaps 32(%rax), %xmm4 @@ -14055,13 +14062,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 192(%r8), %ymm4 +; AVX2-NEXT: vmovaps 208(%rdx), %xmm1 +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-NEXT: vmovaps 192(%r8), %ymm5 +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 192(%r9), %ymm4 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 192(%r9), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] @@ -14135,13 +14143,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 256(%r8), %ymm4 +; AVX2-NEXT: vmovaps 272(%rdx), %xmm1 +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-NEXT: vmovaps 256(%r8), %ymm5 +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 256(%r9), %ymm4 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 256(%r9), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] 
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] @@ -14214,12 +14223,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 336(%rdx), %xmm1 +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX2-NEXT: vmovaps 320(%r8), %ymm9 -; AVX2-NEXT: vmovaps 320(%r9), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX2-NEXT: vmovaps 320(%r9), %ymm4 +; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] @@ -14291,12 +14301,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 400(%rdx), %xmm1 +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX2-NEXT: vmovaps 384(%r8), %ymm15 -; AVX2-NEXT: vmovaps 384(%r9), 
%ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX2-NEXT: vmovaps 384(%r9), %ymm4 +; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm15[0],ymm4[0],ymm15[2],ymm4[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] @@ -14862,8 +14873,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX2-FP-NEXT: vmovaps 16(%rdx), %xmm4 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 16(%rax), %xmm3 ; AVX2-FP-NEXT: vmovaps 32(%rax), %xmm4 @@ -15095,13 +15107,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 192(%r8), %ymm4 +; AVX2-FP-NEXT: vmovaps 208(%rdx), %xmm1 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-FP-NEXT: vmovaps 192(%r8), 
%ymm5 +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 192(%r9), %ymm4 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 192(%r9), %ymm1 -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] @@ -15175,13 +15188,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 256(%r8), %ymm4 +; AVX2-FP-NEXT: vmovaps 272(%rdx), %xmm1 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-FP-NEXT: vmovaps 256(%r8), %ymm5 +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 256(%r9), %ymm4 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 256(%r9), %ymm1 -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] @@ -15254,12 +15268,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 336(%rdx), %xmm1 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX2-FP-NEXT: vmovaps 320(%r8), %ymm9 -; AVX2-FP-NEXT: vmovaps 320(%r9), %ymm1 -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX2-FP-NEXT: vmovaps 320(%r9), %ymm4 +; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] @@ -15331,12 +15346,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 400(%rdx), %xmm1 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX2-FP-NEXT: vmovaps 384(%r8), %ymm15 -; AVX2-FP-NEXT: vmovaps 384(%r9), %ymm1 -; AVX2-FP-NEXT: 
vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX2-FP-NEXT: vmovaps 384(%r9), %ymm4 +; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm15[0],ymm4[0],ymm15[2],ymm4[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] @@ -15902,8 +15918,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX2-FCP-NEXT: vmovaps 16(%rdx), %xmm4 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 16(%rax), %xmm3 ; AVX2-FCP-NEXT: vmovaps 32(%rax), %xmm4 @@ -16135,13 +16152,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 192(%r8), %ymm4 +; AVX2-FCP-NEXT: vmovaps 208(%rdx), %xmm1 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; 
AVX2-FCP-NEXT: vmovaps 192(%r8), %ymm5 +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 192(%r9), %ymm4 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 192(%r9), %ymm1 -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] @@ -16215,13 +16233,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 256(%r8), %ymm4 +; AVX2-FCP-NEXT: vmovaps 272(%rdx), %xmm1 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-FCP-NEXT: vmovaps 256(%r8), %ymm5 +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 256(%r9), %ymm4 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 256(%r9), %ymm1 -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = 
ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] @@ -16294,12 +16313,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 336(%rdx), %xmm1 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX2-FCP-NEXT: vmovaps 320(%r8), %ymm9 -; AVX2-FCP-NEXT: vmovaps 320(%r9), %ymm1 -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX2-FCP-NEXT: vmovaps 320(%r9), %ymm4 +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] @@ -16371,12 +16391,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 400(%rdx), %xmm1 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; 
AVX2-FCP-NEXT: vmovaps 384(%r8), %ymm15 -; AVX2-FCP-NEXT: vmovaps 384(%r9), %ymm1 -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX2-FCP-NEXT: vmovaps 384(%r9), %ymm4 +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm15[0],ymm4[0],ymm15[2],ymm4[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index f5a6b9f59aacf6..311166ef60dda0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -4283,110 +4283,102 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-NEXT: vmovdqa64 (%r11), %xmm25 -; AVX512BW-NEXT: vmovdqa 16(%r11), %xmm11 -; AVX512BW-NEXT: vmovdqa (%r10), %xmm1 -; AVX512BW-NEXT: vmovdqa 16(%r10), %xmm12 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm25[8],xmm1[9],xmm25[9],xmm1[10],xmm25[10],xmm1[11],xmm25[11],xmm1[12],xmm25[12],xmm1[13],xmm25[13],xmm1[14],xmm25[14],xmm1[15],xmm25[15] -; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa (%r9), %xmm2 -; AVX512BW-NEXT: vmovdqa 16(%r9), %xmm13 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm3 -; AVX512BW-NEXT: vmovdqa 16(%r8), %xmm14 -; AVX512BW-NEXT: 
vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX512BW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm19 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm23 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,16,17,20,52,20,21,21,53,16,17,22,54,22,21,23,55] -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm23, %zmm19 -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512BW-NEXT: vmovdqa 16(%rsi), %xmm15 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm6 -; AVX512BW-NEXT: vmovdqa64 16(%rdi), %xmm16 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm10 -; AVX512BW-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512BW-NEXT: vmovdqa64 16(%rcx), %xmm17 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512BW-NEXT: vmovdqa64 16(%rdx), %xmm18 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm21 = xmm20[0,0,2,1,4,5,6,7] -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm21 = xmm21[0],zero,xmm21[1],zero -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm22 = xmm20[0,2,2,3,4,5,6,7] -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm22 = xmm22[0],zero,xmm22[1],zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm21, %ymm4 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0],ymm4[1],ymm10[2,3,4],ymm4[5],ymm10[6,7,8],ymm4[9],ymm10[10,11,12],ymm4[13],ymm10[14,15] +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512BW-NEXT: vmovdqa 16(%rsi), %xmm11 +; AVX512BW-NEXT: vmovdqa (%rdi), 
%xmm2 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5 +; AVX512BW-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512BW-NEXT: vmovdqa 16(%rcx), %xmm13 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512BW-NEXT: vmovdqa 16(%rdx), %xmm14 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,0,2,1,4,5,6,7] +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,2,2,3,4,5,6,7] +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero +; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7,8],ymm7[9],ymm5[10,11,12],ymm7[13],ymm5[14,15] ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm22 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] -; AVX512BW-NEXT: vpermt2w %ymm20, %ymm22, %ymm7 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermt2w %ymm6, %ymm22, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa (%r11), %xmm6 +; AVX512BW-NEXT: vmovdqa 16(%r11), %xmm15 +; AVX512BW-NEXT: vmovdqa (%r10), %xmm7 +; AVX512BW-NEXT: vmovdqa64 16(%r10), %xmm17 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX512BW-NEXT: 
vmovdqa (%r9), %xmm8 +; AVX512BW-NEXT: vmovdqa64 16(%r9), %xmm18 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm9 +; AVX512BW-NEXT: vmovdqa64 16(%r8), %xmm19 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm24, %zmm20 ; AVX512BW-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm7 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512BW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm4 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512BW-NEXT: vinserti32x4 $2, %xmm19, %zmm19, %zmm20 -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm23, %zmm20 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm4[1,1,1,1] +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm0 {%k1} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm16[1,1,1,1] ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; 
AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm10 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm21 = xmm19[0,0,2,1,4,5,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm5 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm21 = xmm20[0,0,2,1,4,5,6,7] ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm21 = xmm21[0],zero,xmm21[1],zero -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm24 = xmm19[0,2,2,3,4,5,6,7] -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm24 = xmm24[0],zero,xmm24[1],zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm0 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7,8],ymm0[9],ymm10[10,11,12],ymm0[13],ymm10[14,15] -; AVX512BW-NEXT: vpermt2w %ymm19, %ymm22, %ymm4 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm19 {%k1} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] -; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512BW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm4 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm23, %zmm4 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[1,1,1,1] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm23 = xmm20[0,2,2,3,4,5,6,7] +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm23 = xmm23[0],zero,xmm23[1],zero +; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm21, %ymm10 +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7,8],ymm10[9],ymm5[10,11,12],ymm10[13],ymm5[14,15] +; AVX512BW-NEXT: vpermt2w %ymm20, %ymm22, %ymm16 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm16, %zmm5, %zmm16 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm19[0],xmm18[0],xmm19[1],xmm18[1],xmm19[2],xmm18[2],xmm19[3],xmm18[3],xmm19[4],xmm18[4],xmm19[5],xmm18[5],xmm19[6],xmm18[6],xmm19[7],xmm18[7] +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm24, %zmm10 +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm16 {%k1} +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[1,1,1,1] ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero ; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,0,2,1,4,5,6,7] ; 
AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[0,2,2,3,4,5,6,7] ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero ; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7,8],ymm12[9],ymm10[10,11,12],ymm12[13],ymm10[14,15] -; AVX512BW-NEXT: vpermt2w %ymm11, %ymm22, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm25[0],xmm1[1],xmm25[1],xmm1[2],xmm25[2],xmm1[3],xmm25[3],xmm1[4],xmm25[4],xmm1[5],xmm25[5],xmm1[6],xmm25[6],xmm1[7],xmm25[7] -; AVX512BW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm23, %zmm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7] -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,2,2,3,4,5,6,7] -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX512BW-NEXT: 
vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15] -; AVX512BW-NEXT: vpermt2w %ymm4, %ymm22, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} +; AVX512BW-NEXT: vpermt2w %ymm11, %ymm22, %ymm5 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm5 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm19[8],xmm18[8],xmm19[9],xmm18[9],xmm19[10],xmm18[10],xmm19[11],xmm18[11],xmm19[12],xmm18[12],xmm19[13],xmm18[13],xmm19[14],xmm18[14],xmm19[15],xmm18[15] +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm24, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm2, %ymm2 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,2,2,3,4,5,6,7] +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero +; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4 +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] +; AVX512BW-NEXT: vpermt2w %ymm3, %ymm22, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm24, %zmm3 +; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -4396,73 +4388,57 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-FCP-NEXT: vmovdqa (%r11), %xmm0 -; AVX512BW-FCP-NEXT: vmovdqa 16(%r11), %xmm7 -; AVX512BW-FCP-NEXT: vmovdqa (%r10), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa 16(%r10), %xmm8 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm2 -; AVX512BW-FCP-NEXT: vmovdqa 16(%r9), %xmm9 -; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm3 -; AVX512BW-FCP-NEXT: vmovdqa 16(%r8), %xmm10 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; 
AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,16,17,20,52,20,21,21,53,16,17,22,54,22,21,23,55] -; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm4, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm5 -; AVX512BW-FCP-NEXT: vmovdqa 16(%rcx), %xmm12 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm13 -; AVX512BW-FCP-NEXT: vmovdqa 16(%rdx), %xmm14 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 16(%rsi), %xmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm19 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55,0,0] -; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm20, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa 16(%r11), %xmm1 +; AVX512BW-FCP-NEXT: vmovdqa (%r10), %xmm2 +; AVX512BW-FCP-NEXT: vmovdqa 16(%r10), %xmm3 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm5 +; AVX512BW-FCP-NEXT: vmovdqa 16(%r9), %xmm6 +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm7 +; AVX512BW-FCP-NEXT: vmovdqa 16(%r8), %xmm8 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = 
xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] +; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512BW-FCP-NEXT: vmovdqa 16(%rcx), %xmm11 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512BW-FCP-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15] +; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 16(%rsi), %xmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm18 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] +; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm20, %zmm19 ; AVX512BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm4, %zmm15 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} 
xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm19[0],xmm17[0],xmm19[1],xmm17[1],xmm19[2],xmm17[2],xmm19[3],xmm17[3],xmm19[4],xmm17[4],xmm19[5],xmm17[5],xmm19[6],xmm17[6],xmm19[7],xmm17[7] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm21, %zmm21, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm20, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm21 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm7 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm7 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm19[8],xmm17[8],xmm19[9],xmm17[9],xmm19[10],xmm17[10],xmm19[11],xmm17[11],xmm19[12],xmm17[12],xmm19[13],xmm17[13],xmm19[14],xmm17[14],xmm19[15],xmm17[15] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm20, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 
-; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3],xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm19 {%k1} +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm10, %zmm14 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7] +; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm20, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm21 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = 
xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm10, %zmm3 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15] +; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm20, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm3, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7] ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm20, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -4471,110 +4447,102 @@ define void 
@store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r11), %xmm25 -; AVX512DQ-BW-NEXT: vmovdqa 16(%r11), %xmm11 -; AVX512DQ-BW-NEXT: vmovdqa (%r10), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa 16(%r10), %xmm12 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm25[8],xmm1[9],xmm25[9],xmm1[10],xmm25[10],xmm1[11],xmm25[11],xmm1[12],xmm25[12],xmm1[13],xmm25[13],xmm1[14],xmm25[14],xmm1[15],xmm25[15] -; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm2 -; AVX512DQ-BW-NEXT: vmovdqa 16(%r9), %xmm13 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm3 -; AVX512DQ-BW-NEXT: vmovdqa 16(%r8), %xmm14 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm19 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm23 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,16,17,20,52,20,21,21,53,16,17,22,54,22,21,23,55] -; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm23, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512DQ-BW-NEXT: vmovdqa 16(%rsi), %xmm15 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdi), %xmm16 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm10 -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 
16(%rcx), %xmm17 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdx), %xmm18 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm21 = xmm20[0,0,2,1,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm21 = xmm21[0],zero,xmm21[1],zero -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm22 = xmm20[0,2,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm22 = xmm22[0],zero,xmm22[1],zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm21, %ymm4 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0],ymm4[1],ymm10[2,3,4],ymm4[5],ymm10[6,7,8],ymm4[9],ymm10[10,11,12],ymm4[13],ymm10[14,15] +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-BW-NEXT: vmovdqa 16(%rsi), %xmm11 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5 +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512DQ-BW-NEXT: vmovdqa 16(%rcx), %xmm13 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512DQ-BW-NEXT: vmovdqa 16(%rdx), %xmm14 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,0,2,1,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero +; AVX512DQ-BW-NEXT: vpshuflw 
{{.*#+}} xmm8 = xmm6[0,2,2,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7,8],ymm7[9],ymm5[10,11,12],ymm7[13],ymm5[14,15] ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm22 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] -; AVX512DQ-BW-NEXT: vpermt2w %ymm20, %ymm22, %ymm7 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2w %ymm6, %ymm22, %ymm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa (%r11), %xmm6 +; AVX512DQ-BW-NEXT: vmovdqa 16(%r11), %xmm15 +; AVX512DQ-BW-NEXT: vmovdqa (%r10), %xmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%r10), %xmm17 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%r9), %xmm18 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%r8), %xmm19 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] +; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm24, %zmm20 ; AVX512DQ-BW-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm19, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm4 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = 
xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm19, %zmm19, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm23, %zmm20 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm4[1,1,1,1] +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm16[1,1,1,1] ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm10 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm21 = xmm19[0,0,2,1,4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm5 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm21 = xmm20[0,0,2,1,4,5,6,7] ; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm21 = xmm21[0],zero,xmm21[1],zero -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm24 = xmm19[0,2,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm24 = xmm24[0],zero,xmm24[1],zero -; AVX512DQ-BW-NEXT: 
vinserti32x4 $1, %xmm24, %ymm21, %ymm0 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7,8],ymm0[9],ymm10[10,11,12],ymm0[13],ymm10[14,15] -; AVX512DQ-BW-NEXT: vpermt2w %ymm19, %ymm22, %ymm4 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] -; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm23, %zmm4 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[1,1,1,1] +; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm23 = xmm20[0,2,2,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm23 = xmm23[0],zero,xmm23[1],zero +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm21, %ymm10 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7,8],ymm10[9],ymm5[10,11,12],ymm10[13],ymm5[14,15] +; AVX512DQ-BW-NEXT: vpermt2w %ymm20, %ymm22, %ymm16 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm16, %zmm5, %zmm16 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm10 = 
xmm19[0],xmm18[0],xmm19[1],xmm18[1],xmm19[2],xmm18[2],xmm19[3],xmm18[3],xmm19[4],xmm18[4],xmm19[5],xmm18[5],xmm19[6],xmm18[6],xmm19[7],xmm18[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm24, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm16 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[1,1,1,1] ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,0,2,1,4,5,6,7] ; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[0,2,2,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7,8],ymm12[9],ymm10[10,11,12],ymm12[13],ymm10[14,15] -; AVX512DQ-BW-NEXT: vpermt2w %ymm11, %ymm22, %ymm0 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm25[0],xmm1[1],xmm25[1],xmm1[2],xmm25[2],xmm1[3],xmm25[3],xmm1[4],xmm25[4],xmm1[5],xmm25[5],xmm1[6],xmm25[6],xmm1[7],xmm25[7] -; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm23, %zmm2 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,2,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15] -; AVX512DQ-BW-NEXT: vpermt2w %ymm4, %ymm22, %ymm1 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vpermt2w %ymm11, %ymm22, %ymm5 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm5 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = 
xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm19[8],xmm18[8],xmm19[9],xmm18[9],xmm19[10],xmm18[10],xmm19[11],xmm18[11],xmm19[12],xmm18[12],xmm19[13],xmm18[13],xmm19[14],xmm18[14],xmm19[15],xmm18[15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm24, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm2, %ymm2 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,2,2,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] +; AVX512DQ-BW-NEXT: vpermt2w %ymm3, %ymm22, %ymm1 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm24, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -4584,73 +4552,57 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r11), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r11), %xmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r10), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r10), %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r9), %xmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r8), %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,16,17,20,52,20,21,21,53,16,17,22,54,22,21,23,55] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm4, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rcx), 
%xmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdx), %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rsi), %xmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm19 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm20, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r11), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r10), %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r10), %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r9), %xmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r8), %xmm8 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa 16(%rcx), %xmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rsi), %xmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm18 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm20, %zmm19 ; AVX512DQ-BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm4, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm21 = 
xmm19[0],xmm17[0],xmm19[1],xmm17[1],xmm19[2],xmm17[2],xmm19[3],xmm17[3],xmm19[4],xmm17[4],xmm19[5],xmm17[5],xmm19[6],xmm17[6],xmm19[7],xmm17[7] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm21, %zmm21, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm20, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm21 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm19[8],xmm17[8],xmm19[9],xmm17[9],xmm19[10],xmm17[10],xmm19[11],xmm17[11],xmm19[12],xmm17[12],xmm19[13],xmm17[13],xmm19[14],xmm17[14],xmm19[15],xmm17[15] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm20, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512DQ-BW-FCP-NEXT: 
vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3],xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm10, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm20, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm21 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX512DQ-BW-FCP-NEXT: vpermt2w 
%zmm1, %zmm10, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm20, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm3, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm20, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-pack-512.ll b/llvm/test/CodeGen/X86/vector-pack-512.ll index 
aeab2a1931c2fc..a3430358e65620 100644 --- a/llvm/test/CodeGen/X86/vector-pack-512.ll +++ b/llvm/test/CodeGen/X86/vector-pack-512.ll @@ -143,11 +143,9 @@ define <32 x i16> @concat_trunc_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) nou ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrad $17, %zmm0, %zmm0 ; AVX512-NEXT: vpsrad $23, %zmm1, %zmm1 -; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm2 ; AVX512-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,1,9,2,10,3,11] ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512-NEXT: retq %1 = ashr <16 x i32> %a0, @@ -163,11 +161,9 @@ define <32 x i16> @concat_trunc_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) nou ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrld $17, %zmm0, %zmm0 ; AVX512-NEXT: vpsrld $23, %zmm1, %zmm1 -; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm2 ; AVX512-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,1,9,2,10,3,11] ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512-NEXT: retq %1 = lshr <16 x i32> %a0, @@ -184,29 +180,25 @@ define <64 x i8> @concat_trunc_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) noun ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2 ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0 -; AVX512F-NEXT: vpacksswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512F-NEXT: vpacksswb 
%ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,2,9,5,14,7,15] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,2,9,1,10,3,11] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: concat_trunc_packsswb_512: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsraw $15, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15] +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm2 +; AVX512BW-NEXT: vpmovwb %zmm1, %ymm0 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,1,9,2,10,3,11] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512BW-NEXT: retq %1 = ashr <32 x i16> %a0, @@ -223,29 +215,25 @@ define <64 x i8> 
@concat_trunc_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) noun ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpsrlw $15, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,2,9,5,14,7,15] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,2,9,1,10,3,11] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: concat_trunc_packuswb_512: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $15, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpandd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15] +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm2 +; AVX512BW-NEXT: vpmovwb %zmm1, %ymm0 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,1,9,2,10,3,11] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512BW-NEXT: retq %1 = lshr <32 x i16> %a0, diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index 56170c5c7e6996..8a6e3c244a1cb6 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -5187,6 +5187,73 @@ define <4 x i64> @PR66150(ptr %b) { ret <4 x i64> %tmp6 } +define <64 x i8> @PR103564(<32 x i8> %a0, <32 x i8> %a1) { +; AVX1-LABEL: PR103564: +; AVX1: # %bb.0: +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR103564: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhbw 
{{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm2[0,1] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: PR103564: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLBW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512VLBW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512VLBW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: PR103564: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512VLVBMI-NEXT: # kill: def $ymm0 
killed $ymm0 def $zmm0 +; AVX512VLVBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95] +; AVX512VLVBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0 +; AVX512VLVBMI-NEXT: retq +; +; XOPAVX1-LABEL: PR103564: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; XOPAVX1-NEXT: vmovaps %ymm2, %ymm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: PR103564: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; XOPAVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm2[0,1] +; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; XOPAVX2-NEXT: retq + %r = shufflevector <32 x i8> %a0, <32 x i8> %a1, <64 x i32> + ret <64 x i8> %r +} + define <32 x i8> @insert_dup_mem_v32i8_i32(ptr %ptr) { ; AVX1-LABEL: insert_dup_mem_v32i8_i32: ; AVX1: # %bb.0: diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index cb038b3211abd1..49947eddc61b9d 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -1890,19 +1890,11 @@ define void @splat4_v4i64_load_store(ptr %s, ptr %d) nounwind { } define <2 x i64> @PR37616(ptr %a0) nounwind { -; AVX1-LABEL: PR37616: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX1-NEXT: vunpcklpd 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0] -; AVX1-NEXT: retq -; -; AVX2OR512-LABEL: PR37616: -; AVX2OR512: # %bb.0: -; AVX2OR512-NEXT: vmovaps (%rdi), %ymm0 -; AVX2OR512-NEXT: vunpcklpd 32(%rdi), %ymm0, %ymm0 # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2OR512-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2OR512-NEXT: vzeroupper -; AVX2OR512-NEXT: retq +; AVX-LABEL: PR37616: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps 16(%rdi), %xmm0 +; AVX-NEXT: vunpcklpd 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: retq %load = load <16 x i64>, ptr %a0, align 128 %shuffle = shufflevector <16 x i64> %load, <16 x i64> undef, <2 x i32> ret <2 x i64> %shuffle diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll index e6234224a12ac3..53de286cc5cf12 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll 
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll @@ -6805,7 +6805,7 @@ define void @vec512_v32i16_to_v4i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,34,17,18,19,20,21,22,23,35,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,34,1,2,3,4,5,6,7,35,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index 9290f9f17b0532..2ea01230ca02db 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -4649,12 +4649,11 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[1,3],ymm2[4,4],ymm1[5,7] ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,1,3] -; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 @@ -4849,21 +4848,17 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,1,1] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6],ymm3[7] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: @@ -5050,18 +5045,14 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; 
AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: @@ -5192,12 +5183,10 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6],ymm2[7] ; AVX-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7] ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) @@ -5324,18 +5313,13 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX-NEXT: vmovq {{.*#+}} xmm2 = xmm0[0],zero -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa 
%xmm2, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: @@ -5473,12 +5457,9 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] ; AVX-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll index c0fa13f1a30084..70d12b2e89770d 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -3743,15 +3743,14 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. 
; AVX-NEXT: vbroadcastss (%rdi), %xmm0 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[1,3],ymm2[4,4],ymm1[5,7] -; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,1,3] -; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX-NEXT: vmovaps 48(%rdi), %xmm2 +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,0],ymm2[1,3],ymm3[4,4],ymm2[5,7] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2,1,3] +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rsi), %xmm3, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) ; AVX-NEXT: vmovdqa %xmm2, (%rdx) @@ -3897,17 +3896,15 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; ; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX-NEXT: vbroadcastss (%rdi), %ymm2 -; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1] +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; AVX-NEXT: vbroadcastss (%rdi), %ymm3 ; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] ; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rdx) @@ -4053,10 +4050,9 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3] -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3] ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 @@ -4168,10 +4164,9 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = 
mem[0,1],xmm0[2,3,4,5,6,7] ; AVX-NEXT: vbroadcastss (%rdi), %ymm1 -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7] ; AVX-NEXT: vmovaps 32(%rsi), %ymm2 -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovaps %ymm2, 32(%rdx) @@ -4274,10 +4269,8 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3] -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 @@ -4382,14 +4375,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; ; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX-NEXT: vmovdqa (%rdi), %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],mem[4,5,6,7] ; AVX-NEXT: vmovaps 32(%rsi), %ymm2 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vmovaps %ymm2, 32(%rdx) diff --git 
a/llvm/test/Frontend/HLSL/empty_cs_entry.ll b/llvm/test/Frontend/HLSL/empty_cs_entry.ll index 45b0faeaa44d44..32736aeeb542ce 100644 --- a/llvm/test/Frontend/HLSL/empty_cs_entry.ll +++ b/llvm/test/Frontend/HLSL/empty_cs_entry.ll @@ -1,4 +1,4 @@ -; RUN: %if directx-registered-target %{ opt -S -dxil-metadata-emit < %s | FileCheck %s --check-prefix=DXIL-CHECK %} +; RUN: %if directx-registered-target %{ opt -S -dxil-translate-metadata < %s | FileCheck %s --check-prefix=DXIL-CHECK %} ; RUN: %if spirv-registered-target %{ llc %s -mtriple=spirv-unknown-unknown -o - | FileCheck %s --check-prefix=SPIRV-CHECK %} target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64" diff --git a/llvm/test/MC/Disassembler/X86/apx/cmpccxadd.txt b/llvm/test/MC/Disassembler/X86/apx/cmpccxadd.txt index 2a54bebd5212c9..7a2e09af5b3db3 100644 --- a/llvm/test/MC/Disassembler/X86/apx/cmpccxadd.txt +++ b/llvm/test/MC/Disassembler/X86/apx/cmpccxadd.txt @@ -1,20 +1,20 @@ # RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT # RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL -# ATT: cmpnbexadd %ecx, %edx, 123(%rax,%rbx,4) -# INTEL: cmpnbexadd dword ptr [rax + 4*rbx + 123], edx, ecx +# ATT: cmpaxadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmpaxadd dword ptr [rax + 4*rbx + 123], edx, ecx 0x62,0xf2,0x75,0x08,0xe7,0x54,0x98,0x7b -# ATT: cmpnbexadd %r9, %r15, 123(%rax,%rbx,4) -# INTEL: cmpnbexadd qword ptr [rax + 4*rbx + 123], r15, r9 +# ATT: cmpaxadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmpaxadd qword ptr [rax + 4*rbx + 123], r15, r9 0x62,0x72,0xb5,0x08,0xe7,0x7c,0x98,0x7b -# ATT: cmpnbexadd %r18d, %r22d, 291(%r28,%r29,4) -# INTEL: cmpnbexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# ATT: cmpaxadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpaxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d 0x62,0x8a,0x69,0x00,0xe7,0xb4,0xac,0x23,0x01,0x00,0x00 -# ATT: 
cmpnbexadd %r19, %r23, 291(%r28,%r29,4) -# INTEL: cmpnbexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# ATT: cmpaxadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpaxadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xe7,0xbc,0xac,0x23,0x01,0x00,0x00 # ATT: cmpbexadd %ecx, %edx, 123(%rax,%rbx,4) @@ -49,52 +49,52 @@ # INTEL: cmpbxadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xe2,0xbc,0xac,0x23,0x01,0x00,0x00 -# ATT: cmpzxadd %ecx, %edx, 123(%rax,%rbx,4) -# INTEL: cmpzxadd dword ptr [rax + 4*rbx + 123], edx, ecx +# ATT: cmpexadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmpexadd dword ptr [rax + 4*rbx + 123], edx, ecx 0x62,0xf2,0x75,0x08,0xe4,0x54,0x98,0x7b -# ATT: cmpzxadd %r9, %r15, 123(%rax,%rbx,4) -# INTEL: cmpzxadd qword ptr [rax + 4*rbx + 123], r15, r9 +# ATT: cmpexadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmpexadd qword ptr [rax + 4*rbx + 123], r15, r9 0x62,0x72,0xb5,0x08,0xe4,0x7c,0x98,0x7b -# ATT: cmpzxadd %r18d, %r22d, 291(%r28,%r29,4) -# INTEL: cmpzxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# ATT: cmpexadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d 0x62,0x8a,0x69,0x00,0xe4,0xb4,0xac,0x23,0x01,0x00,0x00 -# ATT: cmpzxadd %r19, %r23, 291(%r28,%r29,4) -# INTEL: cmpzxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# ATT: cmpexadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpexadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xe4,0xbc,0xac,0x23,0x01,0x00,0x00 -# ATT: cmpnlxadd %ecx, %edx, 123(%rax,%rbx,4) -# INTEL: cmpnlxadd dword ptr [rax + 4*rbx + 123], edx, ecx +# ATT: cmpgexadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmpgexadd dword ptr [rax + 4*rbx + 123], edx, ecx 0x62,0xf2,0x75,0x08,0xed,0x54,0x98,0x7b -# ATT: cmpnlxadd %r9, %r15, 123(%rax,%rbx,4) -# INTEL: cmpnlxadd qword ptr [rax + 4*rbx + 123], r15, r9 +# ATT: cmpgexadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmpgexadd qword ptr [rax + 4*rbx + 123], r15, r9 0x62,0x72,0xb5,0x08,0xed,0x7c,0x98,0x7b -# ATT: cmpnlxadd 
%r18d, %r22d, 291(%r28,%r29,4) -# INTEL: cmpnlxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# ATT: cmpgexadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpgexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d 0x62,0x8a,0x69,0x00,0xed,0xb4,0xac,0x23,0x01,0x00,0x00 -# ATT: cmpnlxadd %r19, %r23, 291(%r28,%r29,4) -# INTEL: cmpnlxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# ATT: cmpgexadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpgexadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xed,0xbc,0xac,0x23,0x01,0x00,0x00 -# ATT: cmpnlexadd %ecx, %edx, 123(%rax,%rbx,4) -# INTEL: cmpnlexadd dword ptr [rax + 4*rbx + 123], edx, ecx +# ATT: cmpgxadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmpgxadd dword ptr [rax + 4*rbx + 123], edx, ecx 0x62,0xf2,0x75,0x08,0xef,0x54,0x98,0x7b -# ATT: cmpnlexadd %r9, %r15, 123(%rax,%rbx,4) -# INTEL: cmpnlexadd qword ptr [rax + 4*rbx + 123], r15, r9 +# ATT: cmpgxadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmpgxadd qword ptr [rax + 4*rbx + 123], r15, r9 0x62,0x72,0xb5,0x08,0xef,0x7c,0x98,0x7b -# ATT: cmpnlexadd %r18d, %r22d, 291(%r28,%r29,4) -# INTEL: cmpnlexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# ATT: cmpgxadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpgxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d 0x62,0x8a,0x69,0x00,0xef,0xb4,0xac,0x23,0x01,0x00,0x00 -# ATT: cmpnlexadd %r19, %r23, 291(%r28,%r29,4) -# INTEL: cmpnlexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# ATT: cmpgxadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpgxadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xef,0xbc,0xac,0x23,0x01,0x00,0x00 # ATT: cmplexadd %ecx, %edx, 123(%rax,%rbx,4) @@ -129,20 +129,20 @@ # INTEL: cmplxadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xec,0xbc,0xac,0x23,0x01,0x00,0x00 -# ATT: cmpnzxadd %ecx, %edx, 123(%rax,%rbx,4) -# INTEL: cmpnzxadd dword ptr [rax + 4*rbx + 123], edx, ecx +# ATT: cmpnexadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmpnexadd dword ptr [rax + 4*rbx + 123], edx, ecx 
0x62,0xf2,0x75,0x08,0xe5,0x54,0x98,0x7b -# ATT: cmpnzxadd %r9, %r15, 123(%rax,%rbx,4) -# INTEL: cmpnzxadd qword ptr [rax + 4*rbx + 123], r15, r9 +# ATT: cmpnexadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmpnexadd qword ptr [rax + 4*rbx + 123], r15, r9 0x62,0x72,0xb5,0x08,0xe5,0x7c,0x98,0x7b -# ATT: cmpnzxadd %r18d, %r22d, 291(%r28,%r29,4) -# INTEL: cmpnzxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# ATT: cmpnexadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpnexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d 0x62,0x8a,0x69,0x00,0xe5,0xb4,0xac,0x23,0x01,0x00,0x00 -# ATT: cmpnzxadd %r19, %r23, 291(%r28,%r29,4) -# INTEL: cmpnzxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# ATT: cmpnexadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpnexadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xe5,0xbc,0xac,0x23,0x01,0x00,0x00 # ATT: cmpnoxadd %ecx, %edx, 123(%rax,%rbx,4) diff --git a/llvm/test/MC/Disassembler/X86/cmpccxadd-64.txt b/llvm/test/MC/Disassembler/X86/cmpccxadd-64.txt index 62420db37f40d7..7b1599de263263 100644 --- a/llvm/test/MC/Disassembler/X86/cmpccxadd-64.txt +++ b/llvm/test/MC/Disassembler/X86/cmpccxadd-64.txt @@ -193,196 +193,196 @@ # INTEL: cmplxadd qword ptr [rdx - 1024], r9, r10 0xc4,0x62,0xa9,0xec,0x8a,0x00,0xfc,0xff,0xff -# ATT: cmpnbexadd %eax, %ecx, 268435456(%rbp,%r14,8) -# INTEL: cmpnbexadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax +# ATT: cmpaxadd %eax, %ecx, 268435456(%rbp,%r14,8) +# INTEL: cmpaxadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax 0xc4,0xa2,0x79,0xe7,0x8c,0xf5,0x00,0x00,0x00,0x10 -# ATT: cmpnbexadd %eax, %ecx, 291(%r8,%rax,4) -# INTEL: cmpnbexadd dword ptr [r8 + 4*rax + 291], ecx, eax +# ATT: cmpaxadd %eax, %ecx, 291(%r8,%rax,4) +# INTEL: cmpaxadd dword ptr [r8 + 4*rax + 291], ecx, eax 0xc4,0xc2,0x79,0xe7,0x8c,0x80,0x23,0x01,0x00,0x00 -# ATT: cmpnbexadd %eax, %ecx, (%rip) -# INTEL: cmpnbexadd dword ptr [rip], ecx, eax +# ATT: cmpaxadd %eax, %ecx, (%rip) +# INTEL: cmpaxadd dword ptr [rip], ecx, eax 
0xc4,0xe2,0x79,0xe7,0x0d,0x00,0x00,0x00,0x00 -# ATT: cmpnbexadd %eax, %ecx, -128(,%rbp,2) -# INTEL: cmpnbexadd dword ptr [2*rbp - 128], ecx, eax +# ATT: cmpaxadd %eax, %ecx, -128(,%rbp,2) +# INTEL: cmpaxadd dword ptr [2*rbp - 128], ecx, eax 0xc4,0xe2,0x79,0xe7,0x0c,0x6d,0x80,0xff,0xff,0xff -# ATT: cmpnbexadd %eax, %ecx, 508(%rcx) -# INTEL: cmpnbexadd dword ptr [rcx + 508], ecx, eax +# ATT: cmpaxadd %eax, %ecx, 508(%rcx) +# INTEL: cmpaxadd dword ptr [rcx + 508], ecx, eax 0xc4,0xe2,0x79,0xe7,0x89,0xfc,0x01,0x00,0x00 -# ATT: cmpnbexadd %eax, %ecx, -512(%rdx) -# INTEL: cmpnbexadd dword ptr [rdx - 512], ecx, eax +# ATT: cmpaxadd %eax, %ecx, -512(%rdx) +# INTEL: cmpaxadd dword ptr [rdx - 512], ecx, eax 0xc4,0xe2,0x79,0xe7,0x8a,0x00,0xfe,0xff,0xff -# ATT: cmpnbexadd %r10, %r9, 268435456(%rbp,%r14,8) -# INTEL: cmpnbexadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 +# ATT: cmpaxadd %r10, %r9, 268435456(%rbp,%r14,8) +# INTEL: cmpaxadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 0xc4,0x22,0xa9,0xe7,0x8c,0xf5,0x00,0x00,0x00,0x10 -# ATT: cmpnbexadd %r10, %r9, 291(%r8,%rax,4) -# INTEL: cmpnbexadd qword ptr [r8 + 4*rax + 291], r9, r10 +# ATT: cmpaxadd %r10, %r9, 291(%r8,%rax,4) +# INTEL: cmpaxadd qword ptr [r8 + 4*rax + 291], r9, r10 0xc4,0x42,0xa9,0xe7,0x8c,0x80,0x23,0x01,0x00,0x00 -# ATT: cmpnbexadd %r10, %r9, (%rip) -# INTEL: cmpnbexadd qword ptr [rip], r9, r10 +# ATT: cmpaxadd %r10, %r9, (%rip) +# INTEL: cmpaxadd qword ptr [rip], r9, r10 0xc4,0x62,0xa9,0xe7,0x0d,0x00,0x00,0x00,0x00 -# ATT: cmpnbexadd %r10, %r9, -256(,%rbp,2) -# INTEL: cmpnbexadd qword ptr [2*rbp - 256], r9, r10 +# ATT: cmpaxadd %r10, %r9, -256(,%rbp,2) +# INTEL: cmpaxadd qword ptr [2*rbp - 256], r9, r10 0xc4,0x62,0xa9,0xe7,0x0c,0x6d,0x00,0xff,0xff,0xff -# ATT: cmpnbexadd %r10, %r9, 1016(%rcx) -# INTEL: cmpnbexadd qword ptr [rcx + 1016], r9, r10 +# ATT: cmpaxadd %r10, %r9, 1016(%rcx) +# INTEL: cmpaxadd qword ptr [rcx + 1016], r9, r10 0xc4,0x62,0xa9,0xe7,0x89,0xf8,0x03,0x00,0x00 -# ATT: cmpnbexadd %r10, %r9, 
-1024(%rdx) -# INTEL: cmpnbexadd qword ptr [rdx - 1024], r9, r10 +# ATT: cmpaxadd %r10, %r9, -1024(%rdx) +# INTEL: cmpaxadd qword ptr [rdx - 1024], r9, r10 0xc4,0x62,0xa9,0xe7,0x8a,0x00,0xfc,0xff,0xff -# ATT: cmpnbxadd %eax, %ecx, 268435456(%rbp,%r14,8) -# INTEL: cmpnbxadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax +# ATT: cmpaexadd %eax, %ecx, 268435456(%rbp,%r14,8) +# INTEL: cmpaexadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax 0xc4,0xa2,0x79,0xe3,0x8c,0xf5,0x00,0x00,0x00,0x10 -# ATT: cmpnbxadd %eax, %ecx, 291(%r8,%rax,4) -# INTEL: cmpnbxadd dword ptr [r8 + 4*rax + 291], ecx, eax +# ATT: cmpaexadd %eax, %ecx, 291(%r8,%rax,4) +# INTEL: cmpaexadd dword ptr [r8 + 4*rax + 291], ecx, eax 0xc4,0xc2,0x79,0xe3,0x8c,0x80,0x23,0x01,0x00,0x00 -# ATT: cmpnbxadd %eax, %ecx, (%rip) -# INTEL: cmpnbxadd dword ptr [rip], ecx, eax +# ATT: cmpaexadd %eax, %ecx, (%rip) +# INTEL: cmpaexadd dword ptr [rip], ecx, eax 0xc4,0xe2,0x79,0xe3,0x0d,0x00,0x00,0x00,0x00 -# ATT: cmpnbxadd %eax, %ecx, -128(,%rbp,2) -# INTEL: cmpnbxadd dword ptr [2*rbp - 128], ecx, eax +# ATT: cmpaexadd %eax, %ecx, -128(,%rbp,2) +# INTEL: cmpaexadd dword ptr [2*rbp - 128], ecx, eax 0xc4,0xe2,0x79,0xe3,0x0c,0x6d,0x80,0xff,0xff,0xff -# ATT: cmpnbxadd %eax, %ecx, 508(%rcx) -# INTEL: cmpnbxadd dword ptr [rcx + 508], ecx, eax +# ATT: cmpaexadd %eax, %ecx, 508(%rcx) +# INTEL: cmpaexadd dword ptr [rcx + 508], ecx, eax 0xc4,0xe2,0x79,0xe3,0x89,0xfc,0x01,0x00,0x00 -# ATT: cmpnbxadd %eax, %ecx, -512(%rdx) -# INTEL: cmpnbxadd dword ptr [rdx - 512], ecx, eax +# ATT: cmpaexadd %eax, %ecx, -512(%rdx) +# INTEL: cmpaexadd dword ptr [rdx - 512], ecx, eax 0xc4,0xe2,0x79,0xe3,0x8a,0x00,0xfe,0xff,0xff -# ATT: cmpnbxadd %r10, %r9, 268435456(%rbp,%r14,8) -# INTEL: cmpnbxadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 +# ATT: cmpaexadd %r10, %r9, 268435456(%rbp,%r14,8) +# INTEL: cmpaexadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 0xc4,0x22,0xa9,0xe3,0x8c,0xf5,0x00,0x00,0x00,0x10 -# ATT: cmpnbxadd %r10, %r9, 291(%r8,%rax,4) -# 
INTEL: cmpnbxadd qword ptr [r8 + 4*rax + 291], r9, r10 +# ATT: cmpaexadd %r10, %r9, 291(%r8,%rax,4) +# INTEL: cmpaexadd qword ptr [r8 + 4*rax + 291], r9, r10 0xc4,0x42,0xa9,0xe3,0x8c,0x80,0x23,0x01,0x00,0x00 -# ATT: cmpnbxadd %r10, %r9, (%rip) -# INTEL: cmpnbxadd qword ptr [rip], r9, r10 +# ATT: cmpaexadd %r10, %r9, (%rip) +# INTEL: cmpaexadd qword ptr [rip], r9, r10 0xc4,0x62,0xa9,0xe3,0x0d,0x00,0x00,0x00,0x00 -# ATT: cmpnbxadd %r10, %r9, -256(,%rbp,2) -# INTEL: cmpnbxadd qword ptr [2*rbp - 256], r9, r10 +# ATT: cmpaexadd %r10, %r9, -256(,%rbp,2) +# INTEL: cmpaexadd qword ptr [2*rbp - 256], r9, r10 0xc4,0x62,0xa9,0xe3,0x0c,0x6d,0x00,0xff,0xff,0xff -# ATT: cmpnbxadd %r10, %r9, 1016(%rcx) -# INTEL: cmpnbxadd qword ptr [rcx + 1016], r9, r10 +# ATT: cmpaexadd %r10, %r9, 1016(%rcx) +# INTEL: cmpaexadd qword ptr [rcx + 1016], r9, r10 0xc4,0x62,0xa9,0xe3,0x89,0xf8,0x03,0x00,0x00 -# ATT: cmpnbxadd %r10, %r9, -1024(%rdx) -# INTEL: cmpnbxadd qword ptr [rdx - 1024], r9, r10 +# ATT: cmpaexadd %r10, %r9, -1024(%rdx) +# INTEL: cmpaexadd qword ptr [rdx - 1024], r9, r10 0xc4,0x62,0xa9,0xe3,0x8a,0x00,0xfc,0xff,0xff -# ATT: cmpnlexadd %eax, %ecx, 268435456(%rbp,%r14,8) -# INTEL: cmpnlexadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax +# ATT: cmpgxadd %eax, %ecx, 268435456(%rbp,%r14,8) +# INTEL: cmpgxadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax 0xc4,0xa2,0x79,0xef,0x8c,0xf5,0x00,0x00,0x00,0x10 -# ATT: cmpnlexadd %eax, %ecx, 291(%r8,%rax,4) -# INTEL: cmpnlexadd dword ptr [r8 + 4*rax + 291], ecx, eax +# ATT: cmpgxadd %eax, %ecx, 291(%r8,%rax,4) +# INTEL: cmpgxadd dword ptr [r8 + 4*rax + 291], ecx, eax 0xc4,0xc2,0x79,0xef,0x8c,0x80,0x23,0x01,0x00,0x00 -# ATT: cmpnlexadd %eax, %ecx, (%rip) -# INTEL: cmpnlexadd dword ptr [rip], ecx, eax +# ATT: cmpgxadd %eax, %ecx, (%rip) +# INTEL: cmpgxadd dword ptr [rip], ecx, eax 0xc4,0xe2,0x79,0xef,0x0d,0x00,0x00,0x00,0x00 -# ATT: cmpnlexadd %eax, %ecx, -128(,%rbp,2) -# INTEL: cmpnlexadd dword ptr [2*rbp - 128], ecx, eax +# ATT: cmpgxadd %eax, 
%ecx, -128(,%rbp,2) +# INTEL: cmpgxadd dword ptr [2*rbp - 128], ecx, eax 0xc4,0xe2,0x79,0xef,0x0c,0x6d,0x80,0xff,0xff,0xff -# ATT: cmpnlexadd %eax, %ecx, 508(%rcx) -# INTEL: cmpnlexadd dword ptr [rcx + 508], ecx, eax +# ATT: cmpgxadd %eax, %ecx, 508(%rcx) +# INTEL: cmpgxadd dword ptr [rcx + 508], ecx, eax 0xc4,0xe2,0x79,0xef,0x89,0xfc,0x01,0x00,0x00 -# ATT: cmpnlexadd %eax, %ecx, -512(%rdx) -# INTEL: cmpnlexadd dword ptr [rdx - 512], ecx, eax +# ATT: cmpgxadd %eax, %ecx, -512(%rdx) +# INTEL: cmpgxadd dword ptr [rdx - 512], ecx, eax 0xc4,0xe2,0x79,0xef,0x8a,0x00,0xfe,0xff,0xff -# ATT: cmpnlexadd %r10, %r9, 268435456(%rbp,%r14,8) -# INTEL: cmpnlexadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 +# ATT: cmpgxadd %r10, %r9, 268435456(%rbp,%r14,8) +# INTEL: cmpgxadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 0xc4,0x22,0xa9,0xef,0x8c,0xf5,0x00,0x00,0x00,0x10 -# ATT: cmpnlexadd %r10, %r9, 291(%r8,%rax,4) -# INTEL: cmpnlexadd qword ptr [r8 + 4*rax + 291], r9, r10 +# ATT: cmpgxadd %r10, %r9, 291(%r8,%rax,4) +# INTEL: cmpgxadd qword ptr [r8 + 4*rax + 291], r9, r10 0xc4,0x42,0xa9,0xef,0x8c,0x80,0x23,0x01,0x00,0x00 -# ATT: cmpnlexadd %r10, %r9, (%rip) -# INTEL: cmpnlexadd qword ptr [rip], r9, r10 +# ATT: cmpgxadd %r10, %r9, (%rip) +# INTEL: cmpgxadd qword ptr [rip], r9, r10 0xc4,0x62,0xa9,0xef,0x0d,0x00,0x00,0x00,0x00 -# ATT: cmpnlexadd %r10, %r9, -256(,%rbp,2) -# INTEL: cmpnlexadd qword ptr [2*rbp - 256], r9, r10 +# ATT: cmpgxadd %r10, %r9, -256(,%rbp,2) +# INTEL: cmpgxadd qword ptr [2*rbp - 256], r9, r10 0xc4,0x62,0xa9,0xef,0x0c,0x6d,0x00,0xff,0xff,0xff -# ATT: cmpnlexadd %r10, %r9, 1016(%rcx) -# INTEL: cmpnlexadd qword ptr [rcx + 1016], r9, r10 +# ATT: cmpgxadd %r10, %r9, 1016(%rcx) +# INTEL: cmpgxadd qword ptr [rcx + 1016], r9, r10 0xc4,0x62,0xa9,0xef,0x89,0xf8,0x03,0x00,0x00 -# ATT: cmpnlexadd %r10, %r9, -1024(%rdx) -# INTEL: cmpnlexadd qword ptr [rdx - 1024], r9, r10 +# ATT: cmpgxadd %r10, %r9, -1024(%rdx) +# INTEL: cmpgxadd qword ptr [rdx - 1024], r9, r10 
0xc4,0x62,0xa9,0xef,0x8a,0x00,0xfc,0xff,0xff -# ATT: cmpnlxadd %eax, %ecx, 268435456(%rbp,%r14,8) -# INTEL: cmpnlxadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax +# ATT: cmpgexadd %eax, %ecx, 268435456(%rbp,%r14,8) +# INTEL: cmpgexadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax 0xc4,0xa2,0x79,0xed,0x8c,0xf5,0x00,0x00,0x00,0x10 -# ATT: cmpnlxadd %eax, %ecx, 291(%r8,%rax,4) -# INTEL: cmpnlxadd dword ptr [r8 + 4*rax + 291], ecx, eax +# ATT: cmpgexadd %eax, %ecx, 291(%r8,%rax,4) +# INTEL: cmpgexadd dword ptr [r8 + 4*rax + 291], ecx, eax 0xc4,0xc2,0x79,0xed,0x8c,0x80,0x23,0x01,0x00,0x00 -# ATT: cmpnlxadd %eax, %ecx, (%rip) -# INTEL: cmpnlxadd dword ptr [rip], ecx, eax +# ATT: cmpgexadd %eax, %ecx, (%rip) +# INTEL: cmpgexadd dword ptr [rip], ecx, eax 0xc4,0xe2,0x79,0xed,0x0d,0x00,0x00,0x00,0x00 -# ATT: cmpnlxadd %eax, %ecx, -128(,%rbp,2) -# INTEL: cmpnlxadd dword ptr [2*rbp - 128], ecx, eax +# ATT: cmpgexadd %eax, %ecx, -128(,%rbp,2) +# INTEL: cmpgexadd dword ptr [2*rbp - 128], ecx, eax 0xc4,0xe2,0x79,0xed,0x0c,0x6d,0x80,0xff,0xff,0xff -# ATT: cmpnlxadd %eax, %ecx, 508(%rcx) -# INTEL: cmpnlxadd dword ptr [rcx + 508], ecx, eax +# ATT: cmpgexadd %eax, %ecx, 508(%rcx) +# INTEL: cmpgexadd dword ptr [rcx + 508], ecx, eax 0xc4,0xe2,0x79,0xed,0x89,0xfc,0x01,0x00,0x00 -# ATT: cmpnlxadd %eax, %ecx, -512(%rdx) -# INTEL: cmpnlxadd dword ptr [rdx - 512], ecx, eax +# ATT: cmpgexadd %eax, %ecx, -512(%rdx) +# INTEL: cmpgexadd dword ptr [rdx - 512], ecx, eax 0xc4,0xe2,0x79,0xed,0x8a,0x00,0xfe,0xff,0xff -# ATT: cmpnlxadd %r10, %r9, 268435456(%rbp,%r14,8) -# INTEL: cmpnlxadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 +# ATT: cmpgexadd %r10, %r9, 268435456(%rbp,%r14,8) +# INTEL: cmpgexadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 0xc4,0x22,0xa9,0xed,0x8c,0xf5,0x00,0x00,0x00,0x10 -# ATT: cmpnlxadd %r10, %r9, 291(%r8,%rax,4) -# INTEL: cmpnlxadd qword ptr [r8 + 4*rax + 291], r9, r10 +# ATT: cmpgexadd %r10, %r9, 291(%r8,%rax,4) +# INTEL: cmpgexadd qword ptr [r8 + 4*rax + 291], r9, r10 
0xc4,0x42,0xa9,0xed,0x8c,0x80,0x23,0x01,0x00,0x00 -# ATT: cmpnlxadd %r10, %r9, (%rip) -# INTEL: cmpnlxadd qword ptr [rip], r9, r10 +# ATT: cmpgexadd %r10, %r9, (%rip) +# INTEL: cmpgexadd qword ptr [rip], r9, r10 0xc4,0x62,0xa9,0xed,0x0d,0x00,0x00,0x00,0x00 -# ATT: cmpnlxadd %r10, %r9, -256(,%rbp,2) -# INTEL: cmpnlxadd qword ptr [2*rbp - 256], r9, r10 +# ATT: cmpgexadd %r10, %r9, -256(,%rbp,2) +# INTEL: cmpgexadd qword ptr [2*rbp - 256], r9, r10 0xc4,0x62,0xa9,0xed,0x0c,0x6d,0x00,0xff,0xff,0xff -# ATT: cmpnlxadd %r10, %r9, 1016(%rcx) -# INTEL: cmpnlxadd qword ptr [rcx + 1016], r9, r10 +# ATT: cmpgexadd %r10, %r9, 1016(%rcx) +# INTEL: cmpgexadd qword ptr [rcx + 1016], r9, r10 0xc4,0x62,0xa9,0xed,0x89,0xf8,0x03,0x00,0x00 -# ATT: cmpnlxadd %r10, %r9, -1024(%rdx) -# INTEL: cmpnlxadd qword ptr [rdx - 1024], r9, r10 +# ATT: cmpgexadd %r10, %r9, -1024(%rdx) +# INTEL: cmpgexadd qword ptr [rdx - 1024], r9, r10 0xc4,0x62,0xa9,0xed,0x8a,0x00,0xfc,0xff,0xff # ATT: cmpnoxadd %eax, %ecx, 268435456(%rbp,%r14,8) @@ -529,52 +529,52 @@ # INTEL: cmpnsxadd qword ptr [rdx - 1024], r9, r10 0xc4,0x62,0xa9,0xe9,0x8a,0x00,0xfc,0xff,0xff -# ATT: cmpnzxadd %eax, %ecx, 268435456(%rbp,%r14,8) -# INTEL: cmpnzxadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax +# ATT: cmpnexadd %eax, %ecx, 268435456(%rbp,%r14,8) +# INTEL: cmpnexadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax 0xc4,0xa2,0x79,0xe5,0x8c,0xf5,0x00,0x00,0x00,0x10 -# ATT: cmpnzxadd %eax, %ecx, 291(%r8,%rax,4) -# INTEL: cmpnzxadd dword ptr [r8 + 4*rax + 291], ecx, eax +# ATT: cmpnexadd %eax, %ecx, 291(%r8,%rax,4) +# INTEL: cmpnexadd dword ptr [r8 + 4*rax + 291], ecx, eax 0xc4,0xc2,0x79,0xe5,0x8c,0x80,0x23,0x01,0x00,0x00 -# ATT: cmpnzxadd %eax, %ecx, (%rip) -# INTEL: cmpnzxadd dword ptr [rip], ecx, eax +# ATT: cmpnexadd %eax, %ecx, (%rip) +# INTEL: cmpnexadd dword ptr [rip], ecx, eax 0xc4,0xe2,0x79,0xe5,0x0d,0x00,0x00,0x00,0x00 -# ATT: cmpnzxadd %eax, %ecx, -128(,%rbp,2) -# INTEL: cmpnzxadd dword ptr [2*rbp - 128], ecx, eax +# ATT: 
cmpnexadd %eax, %ecx, -128(,%rbp,2) +# INTEL: cmpnexadd dword ptr [2*rbp - 128], ecx, eax 0xc4,0xe2,0x79,0xe5,0x0c,0x6d,0x80,0xff,0xff,0xff -# ATT: cmpnzxadd %eax, %ecx, 508(%rcx) -# INTEL: cmpnzxadd dword ptr [rcx + 508], ecx, eax +# ATT: cmpnexadd %eax, %ecx, 508(%rcx) +# INTEL: cmpnexadd dword ptr [rcx + 508], ecx, eax 0xc4,0xe2,0x79,0xe5,0x89,0xfc,0x01,0x00,0x00 -# ATT: cmpnzxadd %eax, %ecx, -512(%rdx) -# INTEL: cmpnzxadd dword ptr [rdx - 512], ecx, eax +# ATT: cmpnexadd %eax, %ecx, -512(%rdx) +# INTEL: cmpnexadd dword ptr [rdx - 512], ecx, eax 0xc4,0xe2,0x79,0xe5,0x8a,0x00,0xfe,0xff,0xff -# ATT: cmpnzxadd %r10, %r9, 268435456(%rbp,%r14,8) -# INTEL: cmpnzxadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 +# ATT: cmpnexadd %r10, %r9, 268435456(%rbp,%r14,8) +# INTEL: cmpnexadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 0xc4,0x22,0xa9,0xe5,0x8c,0xf5,0x00,0x00,0x00,0x10 -# ATT: cmpnzxadd %r10, %r9, 291(%r8,%rax,4) -# INTEL: cmpnzxadd qword ptr [r8 + 4*rax + 291], r9, r10 +# ATT: cmpnexadd %r10, %r9, 291(%r8,%rax,4) +# INTEL: cmpnexadd qword ptr [r8 + 4*rax + 291], r9, r10 0xc4,0x42,0xa9,0xe5,0x8c,0x80,0x23,0x01,0x00,0x00 -# ATT: cmpnzxadd %r10, %r9, (%rip) -# INTEL: cmpnzxadd qword ptr [rip], r9, r10 +# ATT: cmpnexadd %r10, %r9, (%rip) +# INTEL: cmpnexadd qword ptr [rip], r9, r10 0xc4,0x62,0xa9,0xe5,0x0d,0x00,0x00,0x00,0x00 -# ATT: cmpnzxadd %r10, %r9, -256(,%rbp,2) -# INTEL: cmpnzxadd qword ptr [2*rbp - 256], r9, r10 +# ATT: cmpnexadd %r10, %r9, -256(,%rbp,2) +# INTEL: cmpnexadd qword ptr [2*rbp - 256], r9, r10 0xc4,0x62,0xa9,0xe5,0x0c,0x6d,0x00,0xff,0xff,0xff -# ATT: cmpnzxadd %r10, %r9, 1016(%rcx) -# INTEL: cmpnzxadd qword ptr [rcx + 1016], r9, r10 +# ATT: cmpnexadd %r10, %r9, 1016(%rcx) +# INTEL: cmpnexadd qword ptr [rcx + 1016], r9, r10 0xc4,0x62,0xa9,0xe5,0x89,0xf8,0x03,0x00,0x00 -# ATT: cmpnzxadd %r10, %r9, -1024(%rdx) -# INTEL: cmpnzxadd qword ptr [rdx - 1024], r9, r10 +# ATT: cmpnexadd %r10, %r9, -1024(%rdx) +# INTEL: cmpnexadd qword ptr [rdx - 1024], r9, 
r10 0xc4,0x62,0xa9,0xe5,0x8a,0x00,0xfc,0xff,0xff # ATT: cmpoxadd %eax, %ecx, 268435456(%rbp,%r14,8) @@ -721,52 +721,52 @@ # INTEL: cmpsxadd qword ptr [rdx - 1024], r9, r10 0xc4,0x62,0xa9,0xe8,0x8a,0x00,0xfc,0xff,0xff -# ATT: cmpzxadd %eax, %ecx, 268435456(%rbp,%r14,8) -# INTEL: cmpzxadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax +# ATT: cmpexadd %eax, %ecx, 268435456(%rbp,%r14,8) +# INTEL: cmpexadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax 0xc4,0xa2,0x79,0xe4,0x8c,0xf5,0x00,0x00,0x00,0x10 -# ATT: cmpzxadd %eax, %ecx, 291(%r8,%rax,4) -# INTEL: cmpzxadd dword ptr [r8 + 4*rax + 291], ecx, eax +# ATT: cmpexadd %eax, %ecx, 291(%r8,%rax,4) +# INTEL: cmpexadd dword ptr [r8 + 4*rax + 291], ecx, eax 0xc4,0xc2,0x79,0xe4,0x8c,0x80,0x23,0x01,0x00,0x00 -# ATT: cmpzxadd %eax, %ecx, (%rip) -# INTEL: cmpzxadd dword ptr [rip], ecx, eax +# ATT: cmpexadd %eax, %ecx, (%rip) +# INTEL: cmpexadd dword ptr [rip], ecx, eax 0xc4,0xe2,0x79,0xe4,0x0d,0x00,0x00,0x00,0x00 -# ATT: cmpzxadd %eax, %ecx, -128(,%rbp,2) -# INTEL: cmpzxadd dword ptr [2*rbp - 128], ecx, eax +# ATT: cmpexadd %eax, %ecx, -128(,%rbp,2) +# INTEL: cmpexadd dword ptr [2*rbp - 128], ecx, eax 0xc4,0xe2,0x79,0xe4,0x0c,0x6d,0x80,0xff,0xff,0xff -# ATT: cmpzxadd %eax, %ecx, 508(%rcx) -# INTEL: cmpzxadd dword ptr [rcx + 508], ecx, eax +# ATT: cmpexadd %eax, %ecx, 508(%rcx) +# INTEL: cmpexadd dword ptr [rcx + 508], ecx, eax 0xc4,0xe2,0x79,0xe4,0x89,0xfc,0x01,0x00,0x00 -# ATT: cmpzxadd %eax, %ecx, -512(%rdx) -# INTEL: cmpzxadd dword ptr [rdx - 512], ecx, eax +# ATT: cmpexadd %eax, %ecx, -512(%rdx) +# INTEL: cmpexadd dword ptr [rdx - 512], ecx, eax 0xc4,0xe2,0x79,0xe4,0x8a,0x00,0xfe,0xff,0xff -# ATT: cmpzxadd %r10, %r9, 268435456(%rbp,%r14,8) -# INTEL: cmpzxadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 +# ATT: cmpexadd %r10, %r9, 268435456(%rbp,%r14,8) +# INTEL: cmpexadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 0xc4,0x22,0xa9,0xe4,0x8c,0xf5,0x00,0x00,0x00,0x10 -# ATT: cmpzxadd %r10, %r9, 291(%r8,%rax,4) -# INTEL: 
cmpzxadd qword ptr [r8 + 4*rax + 291], r9, r10 +# ATT: cmpexadd %r10, %r9, 291(%r8,%rax,4) +# INTEL: cmpexadd qword ptr [r8 + 4*rax + 291], r9, r10 0xc4,0x42,0xa9,0xe4,0x8c,0x80,0x23,0x01,0x00,0x00 -# ATT: cmpzxadd %r10, %r9, (%rip) -# INTEL: cmpzxadd qword ptr [rip], r9, r10 +# ATT: cmpexadd %r10, %r9, (%rip) +# INTEL: cmpexadd qword ptr [rip], r9, r10 0xc4,0x62,0xa9,0xe4,0x0d,0x00,0x00,0x00,0x00 -# ATT: cmpzxadd %r10, %r9, -256(,%rbp,2) -# INTEL: cmpzxadd qword ptr [2*rbp - 256], r9, r10 +# ATT: cmpexadd %r10, %r9, -256(,%rbp,2) +# INTEL: cmpexadd qword ptr [2*rbp - 256], r9, r10 0xc4,0x62,0xa9,0xe4,0x0c,0x6d,0x00,0xff,0xff,0xff -# ATT: cmpzxadd %r10, %r9, 1016(%rcx) -# INTEL: cmpzxadd qword ptr [rcx + 1016], r9, r10 +# ATT: cmpexadd %r10, %r9, 1016(%rcx) +# INTEL: cmpexadd qword ptr [rcx + 1016], r9, r10 0xc4,0x62,0xa9,0xe4,0x89,0xf8,0x03,0x00,0x00 -# ATT: cmpzxadd %r10, %r9, -1024(%rdx) -# INTEL: cmpzxadd qword ptr [rdx - 1024], r9, r10 +# ATT: cmpexadd %r10, %r9, -1024(%rdx) +# INTEL: cmpexadd qword ptr [rdx - 1024], r9, r10 0xc4,0x62,0xa9,0xe4,0x8a,0x00,0xfc,0xff,0xff # ATT: cmpbexadd %ecx, %r8d, (%rip) diff --git a/llvm/test/MC/X86/apx/cmpccxadd-att.s b/llvm/test/MC/X86/apx/cmpccxadd-att.s index d6ade869ca1d26..544871274a41d1 100644 --- a/llvm/test/MC/X86/apx/cmpccxadd-att.s +++ b/llvm/test/MC/X86/apx/cmpccxadd-att.s @@ -3,21 +3,21 @@ # ERROR-COUNT-60: error: # ERROR-NOT: error: -# CHECK: {evex} cmpnbexadd %ecx, %edx, 123(%eax,%ebx,4) +# CHECK: {evex} cmpaxadd %ecx, %edx, 123(%eax,%ebx,4) # CHECK: encoding: [0x67,0x62,0xf2,0x75,0x08,0xe7,0x54,0x98,0x7b] - {evex} cmpnbexadd %ecx, %edx, 123(%eax,%ebx,4) + {evex} cmpaxadd %ecx, %edx, 123(%eax,%ebx,4) -# CHECK: {evex} cmpnbexadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: {evex} cmpaxadd %r9, %r15, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe7,0x7c,0x98,0x7b] - {evex} cmpnbexadd %r9, %r15, 123(%rax,%rbx,4) + {evex} cmpaxadd %r9, %r15, 123(%rax,%rbx,4) -# CHECK: cmpnbexadd %r18d, %r22d, 291(%r28,%r29,4) 
+# CHECK: cmpaxadd %r18d, %r22d, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe7,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpnbexadd %r18d, %r22d, 291(%r28,%r29,4) + cmpaxadd %r18d, %r22d, 291(%r28,%r29,4) -# CHECK: cmpnbexadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: cmpaxadd %r19, %r23, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe7,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpnbexadd %r19, %r23, 291(%r28,%r29,4) + cmpaxadd %r19, %r23, 291(%r28,%r29,4) # CHECK: {evex} cmpbexadd %ecx, %edx, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe6,0x54,0x98,0x7b] @@ -51,53 +51,53 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe2,0xbc,0xac,0x23,0x01,0x00,0x00] cmpbxadd %r19, %r23, 291(%r28,%r29,4) -# CHECK: {evex} cmpzxadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: {evex} cmpexadd %ecx, %edx, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe4,0x54,0x98,0x7b] - {evex} cmpzxadd %ecx, %edx, 123(%rax,%rbx,4) + {evex} cmpexadd %ecx, %edx, 123(%rax,%rbx,4) -# CHECK: {evex} cmpzxadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: {evex} cmpexadd %r9, %r15, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe4,0x7c,0x98,0x7b] - {evex} cmpzxadd %r9, %r15, 123(%rax,%rbx,4) + {evex} cmpexadd %r9, %r15, 123(%rax,%rbx,4) -# CHECK: cmpzxadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: cmpexadd %r18d, %r22d, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe4,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpzxadd %r18d, %r22d, 291(%r28,%r29,4) + cmpexadd %r18d, %r22d, 291(%r28,%r29,4) -# CHECK: cmpzxadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: cmpexadd %r19, %r23, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe4,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpzxadd %r19, %r23, 291(%r28,%r29,4) + cmpexadd %r19, %r23, 291(%r28,%r29,4) -# CHECK: {evex} cmpnlxadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: {evex} cmpgexadd %ecx, %edx, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xed,0x54,0x98,0x7b] - {evex} cmpnlxadd %ecx, %edx, 123(%rax,%rbx,4) + {evex} cmpgexadd 
%ecx, %edx, 123(%rax,%rbx,4) -# CHECK: {evex} cmpnlxadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: {evex} cmpgexadd %r9, %r15, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0x72,0xb5,0x08,0xed,0x7c,0x98,0x7b] - {evex} cmpnlxadd %r9, %r15, 123(%rax,%rbx,4) + {evex} cmpgexadd %r9, %r15, 123(%rax,%rbx,4) -# CHECK: cmpnlxadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: cmpgexadd %r18d, %r22d, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xed,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpnlxadd %r18d, %r22d, 291(%r28,%r29,4) + cmpgexadd %r18d, %r22d, 291(%r28,%r29,4) -# CHECK: cmpnlxadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: cmpgexadd %r19, %r23, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xed,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpnlxadd %r19, %r23, 291(%r28,%r29,4) + cmpgexadd %r19, %r23, 291(%r28,%r29,4) -# CHECK: {evex} cmpnlexadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: {evex} cmpgxadd %ecx, %edx, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xef,0x54,0x98,0x7b] - {evex} cmpnlexadd %ecx, %edx, 123(%rax,%rbx,4) + {evex} cmpgxadd %ecx, %edx, 123(%rax,%rbx,4) -# CHECK: {evex} cmpnlexadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: {evex} cmpgxadd %r9, %r15, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0x72,0xb5,0x08,0xef,0x7c,0x98,0x7b] - {evex} cmpnlexadd %r9, %r15, 123(%rax,%rbx,4) + {evex} cmpgxadd %r9, %r15, 123(%rax,%rbx,4) -# CHECK: cmpnlexadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: cmpgxadd %r18d, %r22d, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xef,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpnlexadd %r18d, %r22d, 291(%r28,%r29,4) + cmpgxadd %r18d, %r22d, 291(%r28,%r29,4) -# CHECK: cmpnlexadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: cmpgxadd %r19, %r23, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xef,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpnlexadd %r19, %r23, 291(%r28,%r29,4) + cmpgxadd %r19, %r23, 291(%r28,%r29,4) # CHECK: {evex} cmplexadd %ecx, %edx, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xee,0x54,0x98,0x7b] @@ -131,21 
+131,21 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xec,0xbc,0xac,0x23,0x01,0x00,0x00] cmplxadd %r19, %r23, 291(%r28,%r29,4) -# CHECK: {evex} cmpnzxadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: {evex} cmpnexadd %ecx, %edx, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe5,0x54,0x98,0x7b] - {evex} cmpnzxadd %ecx, %edx, 123(%rax,%rbx,4) + {evex} cmpnexadd %ecx, %edx, 123(%rax,%rbx,4) -# CHECK: {evex} cmpnzxadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: {evex} cmpnexadd %r9, %r15, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe5,0x7c,0x98,0x7b] - {evex} cmpnzxadd %r9, %r15, 123(%rax,%rbx,4) + {evex} cmpnexadd %r9, %r15, 123(%rax,%rbx,4) -# CHECK: cmpnzxadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: cmpnexadd %r18d, %r22d, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe5,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpnzxadd %r18d, %r22d, 291(%r28,%r29,4) + cmpnexadd %r18d, %r22d, 291(%r28,%r29,4) -# CHECK: cmpnzxadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: cmpnexadd %r19, %r23, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe5,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpnzxadd %r19, %r23, 291(%r28,%r29,4) + cmpnexadd %r19, %r23, 291(%r28,%r29,4) # CHECK: {evex} cmpnoxadd %ecx, %edx, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe1,0x54,0x98,0x7b] diff --git a/llvm/test/MC/X86/apx/cmpccxadd-intel.s b/llvm/test/MC/X86/apx/cmpccxadd-intel.s index 4c44968fbf91ce..cace33e59d6a74 100644 --- a/llvm/test/MC/X86/apx/cmpccxadd-intel.s +++ b/llvm/test/MC/X86/apx/cmpccxadd-intel.s @@ -1,20 +1,20 @@ # RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s -# CHECK: {evex} cmpnbexadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: {evex} cmpaxadd dword ptr [rax + 4*rbx + 123], edx, ecx # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe7,0x54,0x98,0x7b] - {evex} cmpnbexadd dword ptr [rax + 4*rbx + 123], edx, ecx + {evex} cmpaxadd dword ptr [rax + 4*rbx + 123], edx, ecx -# CHECK: {evex} cmpnbexadd qword ptr 
[rax + 4*rbx + 123], r15, r9 +# CHECK: {evex} cmpaxadd qword ptr [rax + 4*rbx + 123], r15, r9 # CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe7,0x7c,0x98,0x7b] - {evex} cmpnbexadd qword ptr [rax + 4*rbx + 123], r15, r9 + {evex} cmpaxadd qword ptr [rax + 4*rbx + 123], r15, r9 -# CHECK: cmpnbexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: cmpaxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe7,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpnbexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + cmpaxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d -# CHECK: cmpnbexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: cmpaxadd qword ptr [r28 + 4*r29 + 291], r23, r19 # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe7,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpnbexadd qword ptr [r28 + 4*r29 + 291], r23, r19 + cmpaxadd qword ptr [r28 + 4*r29 + 291], r23, r19 # CHECK: {evex} cmpbexadd dword ptr [rax + 4*rbx + 123], edx, ecx # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe6,0x54,0x98,0x7b] @@ -48,53 +48,53 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe2,0xbc,0xac,0x23,0x01,0x00,0x00] cmpbxadd qword ptr [r28 + 4*r29 + 291], r23, r19 -# CHECK: {evex} cmpzxadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: {evex} cmpexadd dword ptr [rax + 4*rbx + 123], edx, ecx # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe4,0x54,0x98,0x7b] - {evex} cmpzxadd dword ptr [rax + 4*rbx + 123], edx, ecx + {evex} cmpexadd dword ptr [rax + 4*rbx + 123], edx, ecx -# CHECK: {evex} cmpzxadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: {evex} cmpexadd qword ptr [rax + 4*rbx + 123], r15, r9 # CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe4,0x7c,0x98,0x7b] - {evex} cmpzxadd qword ptr [rax + 4*rbx + 123], r15, r9 + {evex} cmpexadd qword ptr [rax + 4*rbx + 123], r15, r9 -# CHECK: cmpzxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: cmpexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe4,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpzxadd dword ptr [r28 + 4*r29 + 
291], r22d, r18d + cmpexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d -# CHECK: cmpzxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: cmpexadd qword ptr [r28 + 4*r29 + 291], r23, r19 # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe4,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpzxadd qword ptr [r28 + 4*r29 + 291], r23, r19 + cmpexadd qword ptr [r28 + 4*r29 + 291], r23, r19 -# CHECK: {evex} cmpnlxadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: {evex} cmpgexadd dword ptr [rax + 4*rbx + 123], edx, ecx # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xed,0x54,0x98,0x7b] - {evex} cmpnlxadd dword ptr [rax + 4*rbx + 123], edx, ecx + {evex} cmpgexadd dword ptr [rax + 4*rbx + 123], edx, ecx -# CHECK: {evex} cmpnlxadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: {evex} cmpgexadd qword ptr [rax + 4*rbx + 123], r15, r9 # CHECK: encoding: [0x62,0x72,0xb5,0x08,0xed,0x7c,0x98,0x7b] - {evex} cmpnlxadd qword ptr [rax + 4*rbx + 123], r15, r9 + {evex} cmpgexadd qword ptr [rax + 4*rbx + 123], r15, r9 -# CHECK: cmpnlxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: cmpgexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xed,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpnlxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + cmpgexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d -# CHECK: cmpnlxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: cmpgexadd qword ptr [r28 + 4*r29 + 291], r23, r19 # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xed,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpnlxadd qword ptr [r28 + 4*r29 + 291], r23, r19 + cmpgexadd qword ptr [r28 + 4*r29 + 291], r23, r19 -# CHECK: {evex} cmpnlexadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: {evex} cmpgxadd dword ptr [rax + 4*rbx + 123], edx, ecx # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xef,0x54,0x98,0x7b] - {evex} cmpnlexadd dword ptr [rax + 4*rbx + 123], edx, ecx + {evex} cmpgxadd dword ptr [rax + 4*rbx + 123], edx, ecx -# CHECK: {evex} cmpnlexadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: {evex} 
cmpgxadd qword ptr [rax + 4*rbx + 123], r15, r9 # CHECK: encoding: [0x62,0x72,0xb5,0x08,0xef,0x7c,0x98,0x7b] - {evex} cmpnlexadd qword ptr [rax + 4*rbx + 123], r15, r9 + {evex} cmpgxadd qword ptr [rax + 4*rbx + 123], r15, r9 -# CHECK: cmpnlexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: cmpgxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xef,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpnlexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + cmpgxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d -# CHECK: cmpnlexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: cmpgxadd qword ptr [r28 + 4*r29 + 291], r23, r19 # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xef,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpnlexadd qword ptr [r28 + 4*r29 + 291], r23, r19 + cmpgxadd qword ptr [r28 + 4*r29 + 291], r23, r19 # CHECK: {evex} cmplexadd dword ptr [rax + 4*rbx + 123], edx, ecx # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xee,0x54,0x98,0x7b] @@ -128,21 +128,21 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xec,0xbc,0xac,0x23,0x01,0x00,0x00] cmplxadd qword ptr [r28 + 4*r29 + 291], r23, r19 -# CHECK: {evex} cmpnzxadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: {evex} cmpnexadd dword ptr [rax + 4*rbx + 123], edx, ecx # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe5,0x54,0x98,0x7b] - {evex} cmpnzxadd dword ptr [rax + 4*rbx + 123], edx, ecx + {evex} cmpnexadd dword ptr [rax + 4*rbx + 123], edx, ecx -# CHECK: {evex} cmpnzxadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: {evex} cmpnexadd qword ptr [rax + 4*rbx + 123], r15, r9 # CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe5,0x7c,0x98,0x7b] - {evex} cmpnzxadd qword ptr [rax + 4*rbx + 123], r15, r9 + {evex} cmpnexadd qword ptr [rax + 4*rbx + 123], r15, r9 -# CHECK: cmpnzxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: cmpnexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe5,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpnzxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + cmpnexadd 
dword ptr [r28 + 4*r29 + 291], r22d, r18d -# CHECK: cmpnzxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: cmpnexadd qword ptr [r28 + 4*r29 + 291], r23, r19 # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe5,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpnzxadd qword ptr [r28 + 4*r29 + 291], r23, r19 + cmpnexadd qword ptr [r28 + 4*r29 + 291], r23, r19 # CHECK: {evex} cmpnoxadd dword ptr [rax + 4*rbx + 123], edx, ecx # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe1,0x54,0x98,0x7b] diff --git a/llvm/test/MC/X86/cmpccxadd-att-alias.s b/llvm/test/MC/X86/cmpccxadd-att-alias.s index dcc0f105d7abc1..46c6588740b9cd 100644 --- a/llvm/test/MC/X86/cmpccxadd-att-alias.s +++ b/llvm/test/MC/X86/cmpccxadd-att-alias.s @@ -1,28 +1,28 @@ // RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s -// CHECK: cmpnbxadd %eax, %ecx, (%rip) +// CHECK: cmpaexadd %eax, %ecx, (%rip) // CHECK: encoding: [0xc4,0xe2,0x79,0xe3,0x0d,0x00,0x00,0x00,0x00] - cmpaexadd %eax, %ecx, (%rip) + cmpnbxadd %eax, %ecx, (%rip) -// CHECK: cmpzxadd %eax, %ecx, (%rip) +// CHECK: cmpexadd %eax, %ecx, (%rip) // CHECK: encoding: [0xc4,0xe2,0x79,0xe4,0x0d,0x00,0x00,0x00,0x00] - cmpexadd %eax, %ecx, (%rip) + cmpzxadd %eax, %ecx, (%rip) -// CHECK: cmpnzxadd %eax, %ecx, (%rip) +// CHECK: cmpnexadd %eax, %ecx, (%rip) // CHECK: encoding: [0xc4,0xe2,0x79,0xe5,0x0d,0x00,0x00,0x00,0x00] - cmpnexadd %eax, %ecx, (%rip) + cmpnzxadd %eax, %ecx, (%rip) -// CHECK: cmpnbexadd %eax, %ecx, (%rip) +// CHECK: cmpaxadd %eax, %ecx, (%rip) // CHECK: encoding: [0xc4,0xe2,0x79,0xe7,0x0d,0x00,0x00,0x00,0x00] - cmpaxadd %eax, %ecx, (%rip) + cmpnbexadd %eax, %ecx, (%rip) -// CHECK: cmpnlxadd %eax, %ecx, (%rip) +// CHECK: cmpgexadd %eax, %ecx, (%rip) // CHECK: encoding: [0xc4,0xe2,0x79,0xed,0x0d,0x00,0x00,0x00,0x00] - cmpgexadd %eax, %ecx, (%rip) + cmpnlxadd %eax, %ecx, (%rip) -// CHECK: cmpnlexadd %eax, %ecx, (%rip) +// CHECK: cmpgxadd %eax, %ecx, (%rip) // CHECK: encoding: [0xc4,0xe2,0x79,0xef,0x0d,0x00,0x00,0x00,0x00] - cmpgxadd %eax, %ecx, (%rip) + 
cmpnlexadd %eax, %ecx, (%rip) // CHECK: cmpbxadd %eax, %ecx, (%rip) // CHECK: encoding: [0xc4,0xe2,0x79,0xe2,0x0d,0x00,0x00,0x00,0x00] @@ -32,7 +32,7 @@ // CHECK: encoding: [0xc4,0xe2,0x79,0xe2,0x0d,0x00,0x00,0x00,0x00] cmpnaexadd %eax, %ecx, (%rip) -// CHECK: cmpnbxadd %eax, %ecx, (%rip) +// CHECK: cmpaexadd %eax, %ecx, (%rip) // CHECK: encoding: [0xc4,0xe2,0x79,0xe3,0x0d,0x00,0x00,0x00,0x00] cmpncxadd %eax, %ecx, (%rip) diff --git a/llvm/test/MC/X86/cmpccxadd-att.s b/llvm/test/MC/X86/cmpccxadd-att.s index c79cc55a15b81d..a7c9df91ab0c8e 100644 --- a/llvm/test/MC/X86/cmpccxadd-att.s +++ b/llvm/test/MC/X86/cmpccxadd-att.s @@ -196,197 +196,197 @@ // CHECK: encoding: [0xc4,0x62,0xa9,0xec,0x8a,0x00,0xfc,0xff,0xff] cmplxadd %r10, %r9, -1024(%rdx) -// CHECK: cmpnbexadd %eax, %ecx, 268435456(%rbp,%r14,8) +// CHECK: cmpaxadd %eax, %ecx, 268435456(%rbp,%r14,8) // CHECK: encoding: [0xc4,0xa2,0x79,0xe7,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpnbexadd %eax, %ecx, 268435456(%rbp,%r14,8) + cmpaxadd %eax, %ecx, 268435456(%rbp,%r14,8) -// CHECK: cmpnbexadd %eax, %ecx, 291(%r8,%rax,4) +// CHECK: cmpaxadd %eax, %ecx, 291(%r8,%rax,4) // CHECK: encoding: [0xc4,0xc2,0x79,0xe7,0x8c,0x80,0x23,0x01,0x00,0x00] - cmpnbexadd %eax, %ecx, 291(%r8,%rax,4) + cmpaxadd %eax, %ecx, 291(%r8,%rax,4) -// CHECK: cmpnbexadd %eax, %ecx, (%rip) +// CHECK: cmpaxadd %eax, %ecx, (%rip) // CHECK: encoding: [0xc4,0xe2,0x79,0xe7,0x0d,0x00,0x00,0x00,0x00] - cmpnbexadd %eax, %ecx, (%rip) + cmpaxadd %eax, %ecx, (%rip) -// CHECK: cmpnbexadd %eax, %ecx, -128(,%rbp,2) +// CHECK: cmpaxadd %eax, %ecx, -128(,%rbp,2) // CHECK: encoding: [0xc4,0xe2,0x79,0xe7,0x0c,0x6d,0x80,0xff,0xff,0xff] - cmpnbexadd %eax, %ecx, -128(,%rbp,2) + cmpaxadd %eax, %ecx, -128(,%rbp,2) -// CHECK: cmpnbexadd %eax, %ecx, 508(%rcx) +// CHECK: cmpaxadd %eax, %ecx, 508(%rcx) // CHECK: encoding: [0xc4,0xe2,0x79,0xe7,0x89,0xfc,0x01,0x00,0x00] - cmpnbexadd %eax, %ecx, 508(%rcx) + cmpaxadd %eax, %ecx, 508(%rcx) -// CHECK: cmpnbexadd %eax, %ecx, -512(%rdx) 
+// CHECK: cmpaxadd %eax, %ecx, -512(%rdx) // CHECK: encoding: [0xc4,0xe2,0x79,0xe7,0x8a,0x00,0xfe,0xff,0xff] - cmpnbexadd %eax, %ecx, -512(%rdx) + cmpaxadd %eax, %ecx, -512(%rdx) -// CHECK: cmpnbexadd %r10, %r9, 268435456(%rbp,%r14,8) +// CHECK: cmpaxadd %r10, %r9, 268435456(%rbp,%r14,8) // CHECK: encoding: [0xc4,0x22,0xa9,0xe7,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpnbexadd %r10, %r9, 268435456(%rbp,%r14,8) + cmpaxadd %r10, %r9, 268435456(%rbp,%r14,8) -// CHECK: cmpnbexadd %r10, %r9, 291(%r8,%rax,4) +// CHECK: cmpaxadd %r10, %r9, 291(%r8,%rax,4) // CHECK: encoding: [0xc4,0x42,0xa9,0xe7,0x8c,0x80,0x23,0x01,0x00,0x00] - cmpnbexadd %r10, %r9, 291(%r8,%rax,4) + cmpaxadd %r10, %r9, 291(%r8,%rax,4) -// CHECK: cmpnbexadd %r10, %r9, (%rip) +// CHECK: cmpaxadd %r10, %r9, (%rip) // CHECK: encoding: [0xc4,0x62,0xa9,0xe7,0x0d,0x00,0x00,0x00,0x00] - cmpnbexadd %r10, %r9, (%rip) + cmpaxadd %r10, %r9, (%rip) -// CHECK: cmpnbexadd %r10, %r9, -256(,%rbp,2) +// CHECK: cmpaxadd %r10, %r9, -256(,%rbp,2) // CHECK: encoding: [0xc4,0x62,0xa9,0xe7,0x0c,0x6d,0x00,0xff,0xff,0xff] - cmpnbexadd %r10, %r9, -256(,%rbp,2) + cmpaxadd %r10, %r9, -256(,%rbp,2) -// CHECK: cmpnbexadd %r10, %r9, 1016(%rcx) +// CHECK: cmpaxadd %r10, %r9, 1016(%rcx) // CHECK: encoding: [0xc4,0x62,0xa9,0xe7,0x89,0xf8,0x03,0x00,0x00] - cmpnbexadd %r10, %r9, 1016(%rcx) + cmpaxadd %r10, %r9, 1016(%rcx) -// CHECK: cmpnbexadd %r10, %r9, -1024(%rdx) +// CHECK: cmpaxadd %r10, %r9, -1024(%rdx) // CHECK: encoding: [0xc4,0x62,0xa9,0xe7,0x8a,0x00,0xfc,0xff,0xff] - cmpnbexadd %r10, %r9, -1024(%rdx) + cmpaxadd %r10, %r9, -1024(%rdx) -// CHECK: cmpnbxadd %eax, %ecx, 268435456(%rbp,%r14,8) +// CHECK: cmpaexadd %eax, %ecx, 268435456(%rbp,%r14,8) // CHECK: encoding: [0xc4,0xa2,0x79,0xe3,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpnbxadd %eax, %ecx, 268435456(%rbp,%r14,8) + cmpaexadd %eax, %ecx, 268435456(%rbp,%r14,8) -// CHECK: cmpnbxadd %eax, %ecx, 291(%r8,%rax,4) +// CHECK: cmpaexadd %eax, %ecx, 291(%r8,%rax,4) // CHECK: encoding: 
[0xc4,0xc2,0x79,0xe3,0x8c,0x80,0x23,0x01,0x00,0x00] - cmpnbxadd %eax, %ecx, 291(%r8,%rax,4) + cmpaexadd %eax, %ecx, 291(%r8,%rax,4) -// CHECK: cmpnbxadd %eax, %ecx, (%rip) +// CHECK: cmpaexadd %eax, %ecx, (%rip) // CHECK: encoding: [0xc4,0xe2,0x79,0xe3,0x0d,0x00,0x00,0x00,0x00] - cmpnbxadd %eax, %ecx, (%rip) + cmpaexadd %eax, %ecx, (%rip) -// CHECK: cmpnbxadd %eax, %ecx, -128(,%rbp,2) +// CHECK: cmpaexadd %eax, %ecx, -128(,%rbp,2) // CHECK: encoding: [0xc4,0xe2,0x79,0xe3,0x0c,0x6d,0x80,0xff,0xff,0xff] - cmpnbxadd %eax, %ecx, -128(,%rbp,2) + cmpaexadd %eax, %ecx, -128(,%rbp,2) -// CHECK: cmpnbxadd %eax, %ecx, 508(%rcx) +// CHECK: cmpaexadd %eax, %ecx, 508(%rcx) // CHECK: encoding: [0xc4,0xe2,0x79,0xe3,0x89,0xfc,0x01,0x00,0x00] - cmpnbxadd %eax, %ecx, 508(%rcx) + cmpaexadd %eax, %ecx, 508(%rcx) -// CHECK: cmpnbxadd %eax, %ecx, -512(%rdx) +// CHECK: cmpaexadd %eax, %ecx, -512(%rdx) // CHECK: encoding: [0xc4,0xe2,0x79,0xe3,0x8a,0x00,0xfe,0xff,0xff] - cmpnbxadd %eax, %ecx, -512(%rdx) + cmpaexadd %eax, %ecx, -512(%rdx) -// CHECK: cmpnbxadd %r10, %r9, 268435456(%rbp,%r14,8) +// CHECK: cmpaexadd %r10, %r9, 268435456(%rbp,%r14,8) // CHECK: encoding: [0xc4,0x22,0xa9,0xe3,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpnbxadd %r10, %r9, 268435456(%rbp,%r14,8) + cmpaexadd %r10, %r9, 268435456(%rbp,%r14,8) -// CHECK: cmpnbxadd %r10, %r9, 291(%r8,%rax,4) +// CHECK: cmpaexadd %r10, %r9, 291(%r8,%rax,4) // CHECK: encoding: [0xc4,0x42,0xa9,0xe3,0x8c,0x80,0x23,0x01,0x00,0x00] - cmpnbxadd %r10, %r9, 291(%r8,%rax,4) + cmpaexadd %r10, %r9, 291(%r8,%rax,4) -// CHECK: cmpnbxadd %r10, %r9, (%rip) +// CHECK: cmpaexadd %r10, %r9, (%rip) // CHECK: encoding: [0xc4,0x62,0xa9,0xe3,0x0d,0x00,0x00,0x00,0x00] - cmpnbxadd %r10, %r9, (%rip) + cmpaexadd %r10, %r9, (%rip) -// CHECK: cmpnbxadd %r10, %r9, -256(,%rbp,2) +// CHECK: cmpaexadd %r10, %r9, -256(,%rbp,2) // CHECK: encoding: [0xc4,0x62,0xa9,0xe3,0x0c,0x6d,0x00,0xff,0xff,0xff] - cmpnbxadd %r10, %r9, -256(,%rbp,2) + cmpaexadd %r10, %r9, -256(,%rbp,2) -// 
CHECK: cmpnbxadd %r10, %r9, 1016(%rcx) +// CHECK: cmpaexadd %r10, %r9, 1016(%rcx) // CHECK: encoding: [0xc4,0x62,0xa9,0xe3,0x89,0xf8,0x03,0x00,0x00] - cmpnbxadd %r10, %r9, 1016(%rcx) + cmpaexadd %r10, %r9, 1016(%rcx) -// CHECK: cmpnbxadd %r10, %r9, -1024(%rdx) +// CHECK: cmpaexadd %r10, %r9, -1024(%rdx) // CHECK: encoding: [0xc4,0x62,0xa9,0xe3,0x8a,0x00,0xfc,0xff,0xff] - cmpnbxadd %r10, %r9, -1024(%rdx) + cmpaexadd %r10, %r9, -1024(%rdx) -// CHECK: cmpnlexadd %eax, %ecx, 268435456(%rbp,%r14,8) +// CHECK: cmpgxadd %eax, %ecx, 268435456(%rbp,%r14,8) // CHECK: encoding: [0xc4,0xa2,0x79,0xef,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpnlexadd %eax, %ecx, 268435456(%rbp,%r14,8) + cmpgxadd %eax, %ecx, 268435456(%rbp,%r14,8) -// CHECK: cmpnlexadd %eax, %ecx, 291(%r8,%rax,4) +// CHECK: cmpgxadd %eax, %ecx, 291(%r8,%rax,4) // CHECK: encoding: [0xc4,0xc2,0x79,0xef,0x8c,0x80,0x23,0x01,0x00,0x00] - cmpnlexadd %eax, %ecx, 291(%r8,%rax,4) + cmpgxadd %eax, %ecx, 291(%r8,%rax,4) -// CHECK: cmpnlexadd %eax, %ecx, (%rip) +// CHECK: cmpgxadd %eax, %ecx, (%rip) // CHECK: encoding: [0xc4,0xe2,0x79,0xef,0x0d,0x00,0x00,0x00,0x00] - cmpnlexadd %eax, %ecx, (%rip) + cmpgxadd %eax, %ecx, (%rip) -// CHECK: cmpnlexadd %eax, %ecx, -128(,%rbp,2) +// CHECK: cmpgxadd %eax, %ecx, -128(,%rbp,2) // CHECK: encoding: [0xc4,0xe2,0x79,0xef,0x0c,0x6d,0x80,0xff,0xff,0xff] - cmpnlexadd %eax, %ecx, -128(,%rbp,2) + cmpgxadd %eax, %ecx, -128(,%rbp,2) -// CHECK: cmpnlexadd %eax, %ecx, 508(%rcx) +// CHECK: cmpgxadd %eax, %ecx, 508(%rcx) // CHECK: encoding: [0xc4,0xe2,0x79,0xef,0x89,0xfc,0x01,0x00,0x00] - cmpnlexadd %eax, %ecx, 508(%rcx) + cmpgxadd %eax, %ecx, 508(%rcx) -// CHECK: cmpnlexadd %eax, %ecx, -512(%rdx) +// CHECK: cmpgxadd %eax, %ecx, -512(%rdx) // CHECK: encoding: [0xc4,0xe2,0x79,0xef,0x8a,0x00,0xfe,0xff,0xff] - cmpnlexadd %eax, %ecx, -512(%rdx) + cmpgxadd %eax, %ecx, -512(%rdx) -// CHECK: cmpnlexadd %r10, %r9, 268435456(%rbp,%r14,8) +// CHECK: cmpgxadd %r10, %r9, 268435456(%rbp,%r14,8) // CHECK: encoding: 
[0xc4,0x22,0xa9,0xef,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpnlexadd %r10, %r9, 268435456(%rbp,%r14,8) + cmpgxadd %r10, %r9, 268435456(%rbp,%r14,8) -// CHECK: cmpnlexadd %r10, %r9, 291(%r8,%rax,4) +// CHECK: cmpgxadd %r10, %r9, 291(%r8,%rax,4) // CHECK: encoding: [0xc4,0x42,0xa9,0xef,0x8c,0x80,0x23,0x01,0x00,0x00] - cmpnlexadd %r10, %r9, 291(%r8,%rax,4) + cmpgxadd %r10, %r9, 291(%r8,%rax,4) -// CHECK: cmpnlexadd %r10, %r9, (%rip) +// CHECK: cmpgxadd %r10, %r9, (%rip) // CHECK: encoding: [0xc4,0x62,0xa9,0xef,0x0d,0x00,0x00,0x00,0x00] - cmpnlexadd %r10, %r9, (%rip) + cmpgxadd %r10, %r9, (%rip) -// CHECK: cmpnlexadd %r10, %r9, -256(,%rbp,2) +// CHECK: cmpgxadd %r10, %r9, -256(,%rbp,2) // CHECK: encoding: [0xc4,0x62,0xa9,0xef,0x0c,0x6d,0x00,0xff,0xff,0xff] - cmpnlexadd %r10, %r9, -256(,%rbp,2) + cmpgxadd %r10, %r9, -256(,%rbp,2) -// CHECK: cmpnlexadd %r10, %r9, 1016(%rcx) +// CHECK: cmpgxadd %r10, %r9, 1016(%rcx) // CHECK: encoding: [0xc4,0x62,0xa9,0xef,0x89,0xf8,0x03,0x00,0x00] - cmpnlexadd %r10, %r9, 1016(%rcx) + cmpgxadd %r10, %r9, 1016(%rcx) -// CHECK: cmpnlexadd %r10, %r9, -1024(%rdx) +// CHECK: cmpgxadd %r10, %r9, -1024(%rdx) // CHECK: encoding: [0xc4,0x62,0xa9,0xef,0x8a,0x00,0xfc,0xff,0xff] - cmpnlexadd %r10, %r9, -1024(%rdx) + cmpgxadd %r10, %r9, -1024(%rdx) -// CHECK: cmpnlxadd %eax, %ecx, 268435456(%rbp,%r14,8) +// CHECK: cmpgexadd %eax, %ecx, 268435456(%rbp,%r14,8) // CHECK: encoding: [0xc4,0xa2,0x79,0xed,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpnlxadd %eax, %ecx, 268435456(%rbp,%r14,8) + cmpgexadd %eax, %ecx, 268435456(%rbp,%r14,8) -// CHECK: cmpnlxadd %eax, %ecx, 291(%r8,%rax,4) +// CHECK: cmpgexadd %eax, %ecx, 291(%r8,%rax,4) // CHECK: encoding: [0xc4,0xc2,0x79,0xed,0x8c,0x80,0x23,0x01,0x00,0x00] - cmpnlxadd %eax, %ecx, 291(%r8,%rax,4) + cmpgexadd %eax, %ecx, 291(%r8,%rax,4) -// CHECK: cmpnlxadd %eax, %ecx, (%rip) +// CHECK: cmpgexadd %eax, %ecx, (%rip) // CHECK: encoding: [0xc4,0xe2,0x79,0xed,0x0d,0x00,0x00,0x00,0x00] - cmpnlxadd %eax, %ecx, (%rip) + cmpgexadd 
%eax, %ecx, (%rip) -// CHECK: cmpnlxadd %eax, %ecx, -128(,%rbp,2) +// CHECK: cmpgexadd %eax, %ecx, -128(,%rbp,2) // CHECK: encoding: [0xc4,0xe2,0x79,0xed,0x0c,0x6d,0x80,0xff,0xff,0xff] - cmpnlxadd %eax, %ecx, -128(,%rbp,2) + cmpgexadd %eax, %ecx, -128(,%rbp,2) -// CHECK: cmpnlxadd %eax, %ecx, 508(%rcx) +// CHECK: cmpgexadd %eax, %ecx, 508(%rcx) // CHECK: encoding: [0xc4,0xe2,0x79,0xed,0x89,0xfc,0x01,0x00,0x00] - cmpnlxadd %eax, %ecx, 508(%rcx) + cmpgexadd %eax, %ecx, 508(%rcx) -// CHECK: cmpnlxadd %eax, %ecx, -512(%rdx) +// CHECK: cmpgexadd %eax, %ecx, -512(%rdx) // CHECK: encoding: [0xc4,0xe2,0x79,0xed,0x8a,0x00,0xfe,0xff,0xff] - cmpnlxadd %eax, %ecx, -512(%rdx) + cmpgexadd %eax, %ecx, -512(%rdx) -// CHECK: cmpnlxadd %r10, %r9, 268435456(%rbp,%r14,8) +// CHECK: cmpgexadd %r10, %r9, 268435456(%rbp,%r14,8) // CHECK: encoding: [0xc4,0x22,0xa9,0xed,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpnlxadd %r10, %r9, 268435456(%rbp,%r14,8) + cmpgexadd %r10, %r9, 268435456(%rbp,%r14,8) -// CHECK: cmpnlxadd %r10, %r9, 291(%r8,%rax,4) +// CHECK: cmpgexadd %r10, %r9, 291(%r8,%rax,4) // CHECK: encoding: [0xc4,0x42,0xa9,0xed,0x8c,0x80,0x23,0x01,0x00,0x00] - cmpnlxadd %r10, %r9, 291(%r8,%rax,4) + cmpgexadd %r10, %r9, 291(%r8,%rax,4) -// CHECK: cmpnlxadd %r10, %r9, (%rip) +// CHECK: cmpgexadd %r10, %r9, (%rip) // CHECK: encoding: [0xc4,0x62,0xa9,0xed,0x0d,0x00,0x00,0x00,0x00] - cmpnlxadd %r10, %r9, (%rip) + cmpgexadd %r10, %r9, (%rip) -// CHECK: cmpnlxadd %r10, %r9, -256(,%rbp,2) +// CHECK: cmpgexadd %r10, %r9, -256(,%rbp,2) // CHECK: encoding: [0xc4,0x62,0xa9,0xed,0x0c,0x6d,0x00,0xff,0xff,0xff] - cmpnlxadd %r10, %r9, -256(,%rbp,2) + cmpgexadd %r10, %r9, -256(,%rbp,2) -// CHECK: cmpnlxadd %r10, %r9, 1016(%rcx) +// CHECK: cmpgexadd %r10, %r9, 1016(%rcx) // CHECK: encoding: [0xc4,0x62,0xa9,0xed,0x89,0xf8,0x03,0x00,0x00] - cmpnlxadd %r10, %r9, 1016(%rcx) + cmpgexadd %r10, %r9, 1016(%rcx) -// CHECK: cmpnlxadd %r10, %r9, -1024(%rdx) +// CHECK: cmpgexadd %r10, %r9, -1024(%rdx) // CHECK: encoding: 
[0xc4,0x62,0xa9,0xed,0x8a,0x00,0xfc,0xff,0xff] - cmpnlxadd %r10, %r9, -1024(%rdx) + cmpgexadd %r10, %r9, -1024(%rdx) // CHECK: cmpnoxadd %eax, %ecx, 268435456(%rbp,%r14,8) // CHECK: encoding: [0xc4,0xa2,0x79,0xe1,0x8c,0xf5,0x00,0x00,0x00,0x10] @@ -532,53 +532,53 @@ // CHECK: encoding: [0xc4,0x62,0xa9,0xe9,0x8a,0x00,0xfc,0xff,0xff] cmpnsxadd %r10, %r9, -1024(%rdx) -// CHECK: cmpnzxadd %eax, %ecx, 268435456(%rbp,%r14,8) +// CHECK: cmpnexadd %eax, %ecx, 268435456(%rbp,%r14,8) // CHECK: encoding: [0xc4,0xa2,0x79,0xe5,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpnzxadd %eax, %ecx, 268435456(%rbp,%r14,8) + cmpnexadd %eax, %ecx, 268435456(%rbp,%r14,8) -// CHECK: cmpnzxadd %eax, %ecx, 291(%r8,%rax,4) +// CHECK: cmpnexadd %eax, %ecx, 291(%r8,%rax,4) // CHECK: encoding: [0xc4,0xc2,0x79,0xe5,0x8c,0x80,0x23,0x01,0x00,0x00] - cmpnzxadd %eax, %ecx, 291(%r8,%rax,4) + cmpnexadd %eax, %ecx, 291(%r8,%rax,4) -// CHECK: cmpnzxadd %eax, %ecx, (%rip) +// CHECK: cmpnexadd %eax, %ecx, (%rip) // CHECK: encoding: [0xc4,0xe2,0x79,0xe5,0x0d,0x00,0x00,0x00,0x00] - cmpnzxadd %eax, %ecx, (%rip) + cmpnexadd %eax, %ecx, (%rip) -// CHECK: cmpnzxadd %eax, %ecx, -128(,%rbp,2) +// CHECK: cmpnexadd %eax, %ecx, -128(,%rbp,2) // CHECK: encoding: [0xc4,0xe2,0x79,0xe5,0x0c,0x6d,0x80,0xff,0xff,0xff] - cmpnzxadd %eax, %ecx, -128(,%rbp,2) + cmpnexadd %eax, %ecx, -128(,%rbp,2) -// CHECK: cmpnzxadd %eax, %ecx, 508(%rcx) +// CHECK: cmpnexadd %eax, %ecx, 508(%rcx) // CHECK: encoding: [0xc4,0xe2,0x79,0xe5,0x89,0xfc,0x01,0x00,0x00] - cmpnzxadd %eax, %ecx, 508(%rcx) + cmpnexadd %eax, %ecx, 508(%rcx) -// CHECK: cmpnzxadd %eax, %ecx, -512(%rdx) +// CHECK: cmpnexadd %eax, %ecx, -512(%rdx) // CHECK: encoding: [0xc4,0xe2,0x79,0xe5,0x8a,0x00,0xfe,0xff,0xff] - cmpnzxadd %eax, %ecx, -512(%rdx) + cmpnexadd %eax, %ecx, -512(%rdx) -// CHECK: cmpnzxadd %r10, %r9, 268435456(%rbp,%r14,8) +// CHECK: cmpnexadd %r10, %r9, 268435456(%rbp,%r14,8) // CHECK: encoding: [0xc4,0x22,0xa9,0xe5,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpnzxadd %r10, %r9, 
268435456(%rbp,%r14,8) + cmpnexadd %r10, %r9, 268435456(%rbp,%r14,8) -// CHECK: cmpnzxadd %r10, %r9, 291(%r8,%rax,4) +// CHECK: cmpnexadd %r10, %r9, 291(%r8,%rax,4) // CHECK: encoding: [0xc4,0x42,0xa9,0xe5,0x8c,0x80,0x23,0x01,0x00,0x00] - cmpnzxadd %r10, %r9, 291(%r8,%rax,4) + cmpnexadd %r10, %r9, 291(%r8,%rax,4) -// CHECK: cmpnzxadd %r10, %r9, (%rip) +// CHECK: cmpnexadd %r10, %r9, (%rip) // CHECK: encoding: [0xc4,0x62,0xa9,0xe5,0x0d,0x00,0x00,0x00,0x00] - cmpnzxadd %r10, %r9, (%rip) + cmpnexadd %r10, %r9, (%rip) -// CHECK: cmpnzxadd %r10, %r9, -256(,%rbp,2) +// CHECK: cmpnexadd %r10, %r9, -256(,%rbp,2) // CHECK: encoding: [0xc4,0x62,0xa9,0xe5,0x0c,0x6d,0x00,0xff,0xff,0xff] - cmpnzxadd %r10, %r9, -256(,%rbp,2) + cmpnexadd %r10, %r9, -256(,%rbp,2) -// CHECK: cmpnzxadd %r10, %r9, 1016(%rcx) +// CHECK: cmpnexadd %r10, %r9, 1016(%rcx) // CHECK: encoding: [0xc4,0x62,0xa9,0xe5,0x89,0xf8,0x03,0x00,0x00] - cmpnzxadd %r10, %r9, 1016(%rcx) + cmpnexadd %r10, %r9, 1016(%rcx) -// CHECK: cmpnzxadd %r10, %r9, -1024(%rdx) +// CHECK: cmpnexadd %r10, %r9, -1024(%rdx) // CHECK: encoding: [0xc4,0x62,0xa9,0xe5,0x8a,0x00,0xfc,0xff,0xff] - cmpnzxadd %r10, %r9, -1024(%rdx) + cmpnexadd %r10, %r9, -1024(%rdx) // CHECK: cmpoxadd %eax, %ecx, 268435456(%rbp,%r14,8) // CHECK: encoding: [0xc4,0xa2,0x79,0xe0,0x8c,0xf5,0x00,0x00,0x00,0x10] @@ -724,53 +724,53 @@ // CHECK: encoding: [0xc4,0x62,0xa9,0xe8,0x8a,0x00,0xfc,0xff,0xff] cmpsxadd %r10, %r9, -1024(%rdx) -// CHECK: cmpzxadd %eax, %ecx, 268435456(%rbp,%r14,8) +// CHECK: cmpexadd %eax, %ecx, 268435456(%rbp,%r14,8) // CHECK: encoding: [0xc4,0xa2,0x79,0xe4,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpzxadd %eax, %ecx, 268435456(%rbp,%r14,8) + cmpexadd %eax, %ecx, 268435456(%rbp,%r14,8) -// CHECK: cmpzxadd %eax, %ecx, 291(%r8,%rax,4) +// CHECK: cmpexadd %eax, %ecx, 291(%r8,%rax,4) // CHECK: encoding: [0xc4,0xc2,0x79,0xe4,0x8c,0x80,0x23,0x01,0x00,0x00] - cmpzxadd %eax, %ecx, 291(%r8,%rax,4) + cmpexadd %eax, %ecx, 291(%r8,%rax,4) -// CHECK: cmpzxadd %eax, 
%ecx, (%rip) +// CHECK: cmpexadd %eax, %ecx, (%rip) // CHECK: encoding: [0xc4,0xe2,0x79,0xe4,0x0d,0x00,0x00,0x00,0x00] - cmpzxadd %eax, %ecx, (%rip) + cmpexadd %eax, %ecx, (%rip) -// CHECK: cmpzxadd %eax, %ecx, -128(,%rbp,2) +// CHECK: cmpexadd %eax, %ecx, -128(,%rbp,2) // CHECK: encoding: [0xc4,0xe2,0x79,0xe4,0x0c,0x6d,0x80,0xff,0xff,0xff] - cmpzxadd %eax, %ecx, -128(,%rbp,2) + cmpexadd %eax, %ecx, -128(,%rbp,2) -// CHECK: cmpzxadd %eax, %ecx, 508(%rcx) +// CHECK: cmpexadd %eax, %ecx, 508(%rcx) // CHECK: encoding: [0xc4,0xe2,0x79,0xe4,0x89,0xfc,0x01,0x00,0x00] - cmpzxadd %eax, %ecx, 508(%rcx) + cmpexadd %eax, %ecx, 508(%rcx) -// CHECK: cmpzxadd %eax, %ecx, -512(%rdx) +// CHECK: cmpexadd %eax, %ecx, -512(%rdx) // CHECK: encoding: [0xc4,0xe2,0x79,0xe4,0x8a,0x00,0xfe,0xff,0xff] - cmpzxadd %eax, %ecx, -512(%rdx) + cmpexadd %eax, %ecx, -512(%rdx) -// CHECK: cmpzxadd %r10, %r9, 268435456(%rbp,%r14,8) +// CHECK: cmpexadd %r10, %r9, 268435456(%rbp,%r14,8) // CHECK: encoding: [0xc4,0x22,0xa9,0xe4,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpzxadd %r10, %r9, 268435456(%rbp,%r14,8) + cmpexadd %r10, %r9, 268435456(%rbp,%r14,8) -// CHECK: cmpzxadd %r10, %r9, 291(%r8,%rax,4) +// CHECK: cmpexadd %r10, %r9, 291(%r8,%rax,4) // CHECK: encoding: [0xc4,0x42,0xa9,0xe4,0x8c,0x80,0x23,0x01,0x00,0x00] - cmpzxadd %r10, %r9, 291(%r8,%rax,4) + cmpexadd %r10, %r9, 291(%r8,%rax,4) -// CHECK: cmpzxadd %r10, %r9, (%rip) +// CHECK: cmpexadd %r10, %r9, (%rip) // CHECK: encoding: [0xc4,0x62,0xa9,0xe4,0x0d,0x00,0x00,0x00,0x00] - cmpzxadd %r10, %r9, (%rip) + cmpexadd %r10, %r9, (%rip) -// CHECK: cmpzxadd %r10, %r9, -256(,%rbp,2) +// CHECK: cmpexadd %r10, %r9, -256(,%rbp,2) // CHECK: encoding: [0xc4,0x62,0xa9,0xe4,0x0c,0x6d,0x00,0xff,0xff,0xff] - cmpzxadd %r10, %r9, -256(,%rbp,2) + cmpexadd %r10, %r9, -256(,%rbp,2) -// CHECK: cmpzxadd %r10, %r9, 1016(%rcx) +// CHECK: cmpexadd %r10, %r9, 1016(%rcx) // CHECK: encoding: [0xc4,0x62,0xa9,0xe4,0x89,0xf8,0x03,0x00,0x00] - cmpzxadd %r10, %r9, 1016(%rcx) + cmpexadd 
%r10, %r9, 1016(%rcx) -// CHECK: cmpzxadd %r10, %r9, -1024(%rdx) +// CHECK: cmpexadd %r10, %r9, -1024(%rdx) // CHECK: encoding: [0xc4,0x62,0xa9,0xe4,0x8a,0x00,0xfc,0xff,0xff] - cmpzxadd %r10, %r9, -1024(%rdx) + cmpexadd %r10, %r9, -1024(%rdx) // CHECK: cmpbexadd %ecx, %r8d, (%rip) // CHECK: encoding: [0xc4,0x62,0x71,0xe6,0x05,0x00,0x00,0x00,0x00] diff --git a/llvm/test/MC/X86/cmpccxadd-intel-alias.s b/llvm/test/MC/X86/cmpccxadd-intel-alias.s index f5c7a6b6a2e0a5..6228d7fc67231d 100644 --- a/llvm/test/MC/X86/cmpccxadd-intel-alias.s +++ b/llvm/test/MC/X86/cmpccxadd-intel-alias.s @@ -1,28 +1,28 @@ // RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s -// CHECK: cmpnbxadd dword ptr [rip], ecx, eax +// CHECK: cmpaexadd dword ptr [rip], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xe3,0x0d,0x00,0x00,0x00,0x00] - cmpaexadd dword ptr [rip], ecx, eax + cmpnbxadd dword ptr [rip], ecx, eax -// CHECK: cmpzxadd dword ptr [rip], ecx, eax +// CHECK: cmpexadd dword ptr [rip], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xe4,0x0d,0x00,0x00,0x00,0x00] - cmpexadd dword ptr [rip], ecx, eax + cmpzxadd dword ptr [rip], ecx, eax -// CHECK: cmpnzxadd dword ptr [rip], ecx, eax +// CHECK: cmpnexadd dword ptr [rip], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xe5,0x0d,0x00,0x00,0x00,0x00] - cmpnexadd dword ptr [rip], ecx, eax + cmpnzxadd dword ptr [rip], ecx, eax -// CHECK: cmpnbexadd dword ptr [rip], ecx, eax +// CHECK: cmpaxadd dword ptr [rip], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xe7,0x0d,0x00,0x00,0x00,0x00] - cmpaxadd dword ptr [rip], ecx, eax + cmpnbexadd dword ptr [rip], ecx, eax -// CHECK: cmpnlxadd dword ptr [rip], ecx, eax +// CHECK: cmpgexadd dword ptr [rip], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xed,0x0d,0x00,0x00,0x00,0x00] - cmpgexadd dword ptr [rip], ecx, eax + cmpnlxadd dword ptr [rip], ecx, eax -// CHECK: cmpnlexadd dword ptr [rip], ecx, eax +// CHECK: cmpgxadd dword ptr [rip], ecx, eax // CHECK: 
encoding: [0xc4,0xe2,0x79,0xef,0x0d,0x00,0x00,0x00,0x00] - cmpgxadd dword ptr [rip], ecx, eax + cmpnlexadd dword ptr [rip], ecx, eax // CHECK: cmpbxadd dword ptr [rip], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xe2,0x0d,0x00,0x00,0x00,0x00] @@ -32,7 +32,7 @@ // CHECK: encoding: [0xc4,0xe2,0x79,0xe2,0x0d,0x00,0x00,0x00,0x00] cmpnaexadd dword ptr [rip], ecx, eax -// CHECK: cmpnbxadd dword ptr [rip], ecx, eax +// CHECK: cmpaexadd dword ptr [rip], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xe3,0x0d,0x00,0x00,0x00,0x00] cmpncxadd dword ptr [rip], ecx, eax diff --git a/llvm/test/MC/X86/cmpccxadd-intel.s b/llvm/test/MC/X86/cmpccxadd-intel.s index c03873e34decea..af7c6c3b61c949 100644 --- a/llvm/test/MC/X86/cmpccxadd-intel.s +++ b/llvm/test/MC/X86/cmpccxadd-intel.s @@ -192,197 +192,197 @@ // CHECK: encoding: [0xc4,0x62,0xa9,0xec,0x8a,0x00,0xfc,0xff,0xff] cmplxadd qword ptr [rdx - 1024], r9, r10 -// CHECK: cmpnbexadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax +// CHECK: cmpaxadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax // CHECK: encoding: [0xc4,0xa2,0x79,0xe7,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpnbexadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax + cmpaxadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax -// CHECK: cmpnbexadd dword ptr [r8 + 4*rax + 291], ecx, eax +// CHECK: cmpaxadd dword ptr [r8 + 4*rax + 291], ecx, eax // CHECK: encoding: [0xc4,0xc2,0x79,0xe7,0x8c,0x80,0x23,0x01,0x00,0x00] - cmpnbexadd dword ptr [r8 + 4*rax + 291], ecx, eax + cmpaxadd dword ptr [r8 + 4*rax + 291], ecx, eax -// CHECK: cmpnbexadd dword ptr [rip], ecx, eax +// CHECK: cmpaxadd dword ptr [rip], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xe7,0x0d,0x00,0x00,0x00,0x00] - cmpnbexadd dword ptr [rip], ecx, eax + cmpaxadd dword ptr [rip], ecx, eax -// CHECK: cmpnbexadd dword ptr [2*rbp - 128], ecx, eax +// CHECK: cmpaxadd dword ptr [2*rbp - 128], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xe7,0x0c,0x6d,0x80,0xff,0xff,0xff] - cmpnbexadd dword ptr [2*rbp - 128], ecx, eax + cmpaxadd 
dword ptr [2*rbp - 128], ecx, eax -// CHECK: cmpnbexadd dword ptr [rcx + 508], ecx, eax +// CHECK: cmpaxadd dword ptr [rcx + 508], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xe7,0x89,0xfc,0x01,0x00,0x00] - cmpnbexadd dword ptr [rcx + 508], ecx, eax + cmpaxadd dword ptr [rcx + 508], ecx, eax -// CHECK: cmpnbexadd dword ptr [rdx - 512], ecx, eax +// CHECK: cmpaxadd dword ptr [rdx - 512], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xe7,0x8a,0x00,0xfe,0xff,0xff] - cmpnbexadd dword ptr [rdx - 512], ecx, eax + cmpaxadd dword ptr [rdx - 512], ecx, eax -// CHECK: cmpnbexadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 +// CHECK: cmpaxadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 // CHECK: encoding: [0xc4,0x22,0xa9,0xe7,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpnbexadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 + cmpaxadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 -// CHECK: cmpnbexadd qword ptr [r8 + 4*rax + 291], r9, r10 +// CHECK: cmpaxadd qword ptr [r8 + 4*rax + 291], r9, r10 // CHECK: encoding: [0xc4,0x42,0xa9,0xe7,0x8c,0x80,0x23,0x01,0x00,0x00] - cmpnbexadd qword ptr [r8 + 4*rax + 291], r9, r10 + cmpaxadd qword ptr [r8 + 4*rax + 291], r9, r10 -// CHECK: cmpnbexadd qword ptr [rip], r9, r10 +// CHECK: cmpaxadd qword ptr [rip], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xe7,0x0d,0x00,0x00,0x00,0x00] - cmpnbexadd qword ptr [rip], r9, r10 + cmpaxadd qword ptr [rip], r9, r10 -// CHECK: cmpnbexadd qword ptr [2*rbp - 256], r9, r10 +// CHECK: cmpaxadd qword ptr [2*rbp - 256], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xe7,0x0c,0x6d,0x00,0xff,0xff,0xff] - cmpnbexadd qword ptr [2*rbp - 256], r9, r10 + cmpaxadd qword ptr [2*rbp - 256], r9, r10 -// CHECK: cmpnbexadd qword ptr [rcx + 1016], r9, r10 +// CHECK: cmpaxadd qword ptr [rcx + 1016], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xe7,0x89,0xf8,0x03,0x00,0x00] - cmpnbexadd qword ptr [rcx + 1016], r9, r10 + cmpaxadd qword ptr [rcx + 1016], r9, r10 -// CHECK: cmpnbexadd qword ptr [rdx - 1024], r9, r10 +// CHECK: 
cmpaxadd qword ptr [rdx - 1024], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xe7,0x8a,0x00,0xfc,0xff,0xff] - cmpnbexadd qword ptr [rdx - 1024], r9, r10 + cmpaxadd qword ptr [rdx - 1024], r9, r10 -// CHECK: cmpnbxadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax +// CHECK: cmpaexadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax // CHECK: encoding: [0xc4,0xa2,0x79,0xe3,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpnbxadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax + cmpaexadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax -// CHECK: cmpnbxadd dword ptr [r8 + 4*rax + 291], ecx, eax +// CHECK: cmpaexadd dword ptr [r8 + 4*rax + 291], ecx, eax // CHECK: encoding: [0xc4,0xc2,0x79,0xe3,0x8c,0x80,0x23,0x01,0x00,0x00] - cmpnbxadd dword ptr [r8 + 4*rax + 291], ecx, eax + cmpaexadd dword ptr [r8 + 4*rax + 291], ecx, eax -// CHECK: cmpnbxadd dword ptr [rip], ecx, eax +// CHECK: cmpaexadd dword ptr [rip], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xe3,0x0d,0x00,0x00,0x00,0x00] - cmpnbxadd dword ptr [rip], ecx, eax + cmpaexadd dword ptr [rip], ecx, eax -// CHECK: cmpnbxadd dword ptr [2*rbp - 128], ecx, eax +// CHECK: cmpaexadd dword ptr [2*rbp - 128], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xe3,0x0c,0x6d,0x80,0xff,0xff,0xff] - cmpnbxadd dword ptr [2*rbp - 128], ecx, eax + cmpaexadd dword ptr [2*rbp - 128], ecx, eax -// CHECK: cmpnbxadd dword ptr [rcx + 508], ecx, eax +// CHECK: cmpaexadd dword ptr [rcx + 508], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xe3,0x89,0xfc,0x01,0x00,0x00] - cmpnbxadd dword ptr [rcx + 508], ecx, eax + cmpaexadd dword ptr [rcx + 508], ecx, eax -// CHECK: cmpnbxadd dword ptr [rdx - 512], ecx, eax +// CHECK: cmpaexadd dword ptr [rdx - 512], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xe3,0x8a,0x00,0xfe,0xff,0xff] - cmpnbxadd dword ptr [rdx - 512], ecx, eax + cmpaexadd dword ptr [rdx - 512], ecx, eax -// CHECK: cmpnbxadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 +// CHECK: cmpaexadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 // CHECK: encoding: 
[0xc4,0x22,0xa9,0xe3,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpnbxadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 + cmpaexadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 -// CHECK: cmpnbxadd qword ptr [r8 + 4*rax + 291], r9, r10 +// CHECK: cmpaexadd qword ptr [r8 + 4*rax + 291], r9, r10 // CHECK: encoding: [0xc4,0x42,0xa9,0xe3,0x8c,0x80,0x23,0x01,0x00,0x00] - cmpnbxadd qword ptr [r8 + 4*rax + 291], r9, r10 + cmpaexadd qword ptr [r8 + 4*rax + 291], r9, r10 -// CHECK: cmpnbxadd qword ptr [rip], r9, r10 +// CHECK: cmpaexadd qword ptr [rip], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xe3,0x0d,0x00,0x00,0x00,0x00] - cmpnbxadd qword ptr [rip], r9, r10 + cmpaexadd qword ptr [rip], r9, r10 -// CHECK: cmpnbxadd qword ptr [2*rbp - 256], r9, r10 +// CHECK: cmpaexadd qword ptr [2*rbp - 256], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xe3,0x0c,0x6d,0x00,0xff,0xff,0xff] - cmpnbxadd qword ptr [2*rbp - 256], r9, r10 + cmpaexadd qword ptr [2*rbp - 256], r9, r10 -// CHECK: cmpnbxadd qword ptr [rcx + 1016], r9, r10 +// CHECK: cmpaexadd qword ptr [rcx + 1016], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xe3,0x89,0xf8,0x03,0x00,0x00] - cmpnbxadd qword ptr [rcx + 1016], r9, r10 + cmpaexadd qword ptr [rcx + 1016], r9, r10 -// CHECK: cmpnbxadd qword ptr [rdx - 1024], r9, r10 +// CHECK: cmpaexadd qword ptr [rdx - 1024], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xe3,0x8a,0x00,0xfc,0xff,0xff] - cmpnbxadd qword ptr [rdx - 1024], r9, r10 + cmpaexadd qword ptr [rdx - 1024], r9, r10 -// CHECK: cmpnlexadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax +// CHECK: cmpgxadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax // CHECK: encoding: [0xc4,0xa2,0x79,0xef,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpnlexadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax + cmpgxadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax -// CHECK: cmpnlexadd dword ptr [r8 + 4*rax + 291], ecx, eax +// CHECK: cmpgxadd dword ptr [r8 + 4*rax + 291], ecx, eax // CHECK: encoding: [0xc4,0xc2,0x79,0xef,0x8c,0x80,0x23,0x01,0x00,0x00] - 
cmpnlexadd dword ptr [r8 + 4*rax + 291], ecx, eax + cmpgxadd dword ptr [r8 + 4*rax + 291], ecx, eax -// CHECK: cmpnlexadd dword ptr [rip], ecx, eax +// CHECK: cmpgxadd dword ptr [rip], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xef,0x0d,0x00,0x00,0x00,0x00] - cmpnlexadd dword ptr [rip], ecx, eax + cmpgxadd dword ptr [rip], ecx, eax -// CHECK: cmpnlexadd dword ptr [2*rbp - 128], ecx, eax +// CHECK: cmpgxadd dword ptr [2*rbp - 128], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xef,0x0c,0x6d,0x80,0xff,0xff,0xff] - cmpnlexadd dword ptr [2*rbp - 128], ecx, eax + cmpgxadd dword ptr [2*rbp - 128], ecx, eax -// CHECK: cmpnlexadd dword ptr [rcx + 508], ecx, eax +// CHECK: cmpgxadd dword ptr [rcx + 508], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xef,0x89,0xfc,0x01,0x00,0x00] - cmpnlexadd dword ptr [rcx + 508], ecx, eax + cmpgxadd dword ptr [rcx + 508], ecx, eax -// CHECK: cmpnlexadd dword ptr [rdx - 512], ecx, eax +// CHECK: cmpgxadd dword ptr [rdx - 512], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xef,0x8a,0x00,0xfe,0xff,0xff] - cmpnlexadd dword ptr [rdx - 512], ecx, eax + cmpgxadd dword ptr [rdx - 512], ecx, eax -// CHECK: cmpnlexadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 +// CHECK: cmpgxadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 // CHECK: encoding: [0xc4,0x22,0xa9,0xef,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpnlexadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 + cmpgxadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 -// CHECK: cmpnlexadd qword ptr [r8 + 4*rax + 291], r9, r10 +// CHECK: cmpgxadd qword ptr [r8 + 4*rax + 291], r9, r10 // CHECK: encoding: [0xc4,0x42,0xa9,0xef,0x8c,0x80,0x23,0x01,0x00,0x00] - cmpnlexadd qword ptr [r8 + 4*rax + 291], r9, r10 + cmpgxadd qword ptr [r8 + 4*rax + 291], r9, r10 -// CHECK: cmpnlexadd qword ptr [rip], r9, r10 +// CHECK: cmpgxadd qword ptr [rip], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xef,0x0d,0x00,0x00,0x00,0x00] - cmpnlexadd qword ptr [rip], r9, r10 + cmpgxadd qword ptr [rip], r9, r10 -// CHECK: cmpnlexadd 
qword ptr [2*rbp - 256], r9, r10 +// CHECK: cmpgxadd qword ptr [2*rbp - 256], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xef,0x0c,0x6d,0x00,0xff,0xff,0xff] - cmpnlexadd qword ptr [2*rbp - 256], r9, r10 + cmpgxadd qword ptr [2*rbp - 256], r9, r10 -// CHECK: cmpnlexadd qword ptr [rcx + 1016], r9, r10 +// CHECK: cmpgxadd qword ptr [rcx + 1016], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xef,0x89,0xf8,0x03,0x00,0x00] - cmpnlexadd qword ptr [rcx + 1016], r9, r10 + cmpgxadd qword ptr [rcx + 1016], r9, r10 -// CHECK: cmpnlexadd qword ptr [rdx - 1024], r9, r10 +// CHECK: cmpgxadd qword ptr [rdx - 1024], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xef,0x8a,0x00,0xfc,0xff,0xff] - cmpnlexadd qword ptr [rdx - 1024], r9, r10 + cmpgxadd qword ptr [rdx - 1024], r9, r10 -// CHECK: cmpnlxadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax +// CHECK: cmpgexadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax // CHECK: encoding: [0xc4,0xa2,0x79,0xed,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpnlxadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax + cmpgexadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax -// CHECK: cmpnlxadd dword ptr [r8 + 4*rax + 291], ecx, eax +// CHECK: cmpgexadd dword ptr [r8 + 4*rax + 291], ecx, eax // CHECK: encoding: [0xc4,0xc2,0x79,0xed,0x8c,0x80,0x23,0x01,0x00,0x00] - cmpnlxadd dword ptr [r8 + 4*rax + 291], ecx, eax + cmpgexadd dword ptr [r8 + 4*rax + 291], ecx, eax -// CHECK: cmpnlxadd dword ptr [rip], ecx, eax +// CHECK: cmpgexadd dword ptr [rip], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xed,0x0d,0x00,0x00,0x00,0x00] - cmpnlxadd dword ptr [rip], ecx, eax + cmpgexadd dword ptr [rip], ecx, eax -// CHECK: cmpnlxadd dword ptr [2*rbp - 128], ecx, eax +// CHECK: cmpgexadd dword ptr [2*rbp - 128], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xed,0x0c,0x6d,0x80,0xff,0xff,0xff] - cmpnlxadd dword ptr [2*rbp - 128], ecx, eax + cmpgexadd dword ptr [2*rbp - 128], ecx, eax -// CHECK: cmpnlxadd dword ptr [rcx + 508], ecx, eax +// CHECK: cmpgexadd dword ptr [rcx + 508], ecx, 
eax // CHECK: encoding: [0xc4,0xe2,0x79,0xed,0x89,0xfc,0x01,0x00,0x00] - cmpnlxadd dword ptr [rcx + 508], ecx, eax + cmpgexadd dword ptr [rcx + 508], ecx, eax -// CHECK: cmpnlxadd dword ptr [rdx - 512], ecx, eax +// CHECK: cmpgexadd dword ptr [rdx - 512], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xed,0x8a,0x00,0xfe,0xff,0xff] - cmpnlxadd dword ptr [rdx - 512], ecx, eax + cmpgexadd dword ptr [rdx - 512], ecx, eax -// CHECK: cmpnlxadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 +// CHECK: cmpgexadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 // CHECK: encoding: [0xc4,0x22,0xa9,0xed,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpnlxadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 + cmpgexadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 -// CHECK: cmpnlxadd qword ptr [r8 + 4*rax + 291], r9, r10 +// CHECK: cmpgexadd qword ptr [r8 + 4*rax + 291], r9, r10 // CHECK: encoding: [0xc4,0x42,0xa9,0xed,0x8c,0x80,0x23,0x01,0x00,0x00] - cmpnlxadd qword ptr [r8 + 4*rax + 291], r9, r10 + cmpgexadd qword ptr [r8 + 4*rax + 291], r9, r10 -// CHECK: cmpnlxadd qword ptr [rip], r9, r10 +// CHECK: cmpgexadd qword ptr [rip], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xed,0x0d,0x00,0x00,0x00,0x00] - cmpnlxadd qword ptr [rip], r9, r10 + cmpgexadd qword ptr [rip], r9, r10 -// CHECK: cmpnlxadd qword ptr [2*rbp - 256], r9, r10 +// CHECK: cmpgexadd qword ptr [2*rbp - 256], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xed,0x0c,0x6d,0x00,0xff,0xff,0xff] - cmpnlxadd qword ptr [2*rbp - 256], r9, r10 + cmpgexadd qword ptr [2*rbp - 256], r9, r10 -// CHECK: cmpnlxadd qword ptr [rcx + 1016], r9, r10 +// CHECK: cmpgexadd qword ptr [rcx + 1016], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xed,0x89,0xf8,0x03,0x00,0x00] - cmpnlxadd qword ptr [rcx + 1016], r9, r10 + cmpgexadd qword ptr [rcx + 1016], r9, r10 -// CHECK: cmpnlxadd qword ptr [rdx - 1024], r9, r10 +// CHECK: cmpgexadd qword ptr [rdx - 1024], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xed,0x8a,0x00,0xfc,0xff,0xff] - cmpnlxadd qword ptr [rdx - 
1024], r9, r10 + cmpgexadd qword ptr [rdx - 1024], r9, r10 // CHECK: cmpnoxadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax // CHECK: encoding: [0xc4,0xa2,0x79,0xe1,0x8c,0xf5,0x00,0x00,0x00,0x10] @@ -528,53 +528,53 @@ // CHECK: encoding: [0xc4,0x62,0xa9,0xe9,0x8a,0x00,0xfc,0xff,0xff] cmpnsxadd qword ptr [rdx - 1024], r9, r10 -// CHECK: cmpnzxadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax +// CHECK: cmpnexadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax // CHECK: encoding: [0xc4,0xa2,0x79,0xe5,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpnzxadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax + cmpnexadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax -// CHECK: cmpnzxadd dword ptr [r8 + 4*rax + 291], ecx, eax +// CHECK: cmpnexadd dword ptr [r8 + 4*rax + 291], ecx, eax // CHECK: encoding: [0xc4,0xc2,0x79,0xe5,0x8c,0x80,0x23,0x01,0x00,0x00] - cmpnzxadd dword ptr [r8 + 4*rax + 291], ecx, eax + cmpnexadd dword ptr [r8 + 4*rax + 291], ecx, eax -// CHECK: cmpnzxadd dword ptr [rip], ecx, eax +// CHECK: cmpnexadd dword ptr [rip], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xe5,0x0d,0x00,0x00,0x00,0x00] - cmpnzxadd dword ptr [rip], ecx, eax + cmpnexadd dword ptr [rip], ecx, eax -// CHECK: cmpnzxadd dword ptr [2*rbp - 128], ecx, eax +// CHECK: cmpnexadd dword ptr [2*rbp - 128], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xe5,0x0c,0x6d,0x80,0xff,0xff,0xff] - cmpnzxadd dword ptr [2*rbp - 128], ecx, eax + cmpnexadd dword ptr [2*rbp - 128], ecx, eax -// CHECK: cmpnzxadd dword ptr [rcx + 508], ecx, eax +// CHECK: cmpnexadd dword ptr [rcx + 508], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xe5,0x89,0xfc,0x01,0x00,0x00] - cmpnzxadd dword ptr [rcx + 508], ecx, eax + cmpnexadd dword ptr [rcx + 508], ecx, eax -// CHECK: cmpnzxadd dword ptr [rdx - 512], ecx, eax +// CHECK: cmpnexadd dword ptr [rdx - 512], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xe5,0x8a,0x00,0xfe,0xff,0xff] - cmpnzxadd dword ptr [rdx - 512], ecx, eax + cmpnexadd dword ptr [rdx - 512], ecx, eax -// CHECK: cmpnzxadd 
qword ptr [rbp + 8*r14 + 268435456], r9, r10 +// CHECK: cmpnexadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 // CHECK: encoding: [0xc4,0x22,0xa9,0xe5,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpnzxadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 + cmpnexadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 -// CHECK: cmpnzxadd qword ptr [r8 + 4*rax + 291], r9, r10 +// CHECK: cmpnexadd qword ptr [r8 + 4*rax + 291], r9, r10 // CHECK: encoding: [0xc4,0x42,0xa9,0xe5,0x8c,0x80,0x23,0x01,0x00,0x00] - cmpnzxadd qword ptr [r8 + 4*rax + 291], r9, r10 + cmpnexadd qword ptr [r8 + 4*rax + 291], r9, r10 -// CHECK: cmpnzxadd qword ptr [rip], r9, r10 +// CHECK: cmpnexadd qword ptr [rip], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xe5,0x0d,0x00,0x00,0x00,0x00] - cmpnzxadd qword ptr [rip], r9, r10 + cmpnexadd qword ptr [rip], r9, r10 -// CHECK: cmpnzxadd qword ptr [2*rbp - 256], r9, r10 +// CHECK: cmpnexadd qword ptr [2*rbp - 256], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xe5,0x0c,0x6d,0x00,0xff,0xff,0xff] - cmpnzxadd qword ptr [2*rbp - 256], r9, r10 + cmpnexadd qword ptr [2*rbp - 256], r9, r10 -// CHECK: cmpnzxadd qword ptr [rcx + 1016], r9, r10 +// CHECK: cmpnexadd qword ptr [rcx + 1016], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xe5,0x89,0xf8,0x03,0x00,0x00] - cmpnzxadd qword ptr [rcx + 1016], r9, r10 + cmpnexadd qword ptr [rcx + 1016], r9, r10 -// CHECK: cmpnzxadd qword ptr [rdx - 1024], r9, r10 +// CHECK: cmpnexadd qword ptr [rdx - 1024], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xe5,0x8a,0x00,0xfc,0xff,0xff] - cmpnzxadd qword ptr [rdx - 1024], r9, r10 + cmpnexadd qword ptr [rdx - 1024], r9, r10 // CHECK: cmpoxadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax // CHECK: encoding: [0xc4,0xa2,0x79,0xe0,0x8c,0xf5,0x00,0x00,0x00,0x10] @@ -720,53 +720,53 @@ // CHECK: encoding: [0xc4,0x62,0xa9,0xe8,0x8a,0x00,0xfc,0xff,0xff] cmpsxadd qword ptr [rdx - 1024], r9, r10 -// CHECK: cmpzxadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax +// CHECK: cmpexadd dword ptr [rbp + 8*r14 + 
268435456], ecx, eax // CHECK: encoding: [0xc4,0xa2,0x79,0xe4,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpzxadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax + cmpexadd dword ptr [rbp + 8*r14 + 268435456], ecx, eax -// CHECK: cmpzxadd dword ptr [r8 + 4*rax + 291], ecx, eax +// CHECK: cmpexadd dword ptr [r8 + 4*rax + 291], ecx, eax // CHECK: encoding: [0xc4,0xc2,0x79,0xe4,0x8c,0x80,0x23,0x01,0x00,0x00] - cmpzxadd dword ptr [r8 + 4*rax + 291], ecx, eax + cmpexadd dword ptr [r8 + 4*rax + 291], ecx, eax -// CHECK: cmpzxadd dword ptr [rip], ecx, eax +// CHECK: cmpexadd dword ptr [rip], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xe4,0x0d,0x00,0x00,0x00,0x00] - cmpzxadd dword ptr [rip], ecx, eax + cmpexadd dword ptr [rip], ecx, eax -// CHECK: cmpzxadd dword ptr [2*rbp - 128], ecx, eax +// CHECK: cmpexadd dword ptr [2*rbp - 128], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xe4,0x0c,0x6d,0x80,0xff,0xff,0xff] - cmpzxadd dword ptr [2*rbp - 128], ecx, eax + cmpexadd dword ptr [2*rbp - 128], ecx, eax -// CHECK: cmpzxadd dword ptr [rcx + 508], ecx, eax +// CHECK: cmpexadd dword ptr [rcx + 508], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xe4,0x89,0xfc,0x01,0x00,0x00] - cmpzxadd dword ptr [rcx + 508], ecx, eax + cmpexadd dword ptr [rcx + 508], ecx, eax -// CHECK: cmpzxadd dword ptr [rdx - 512], ecx, eax +// CHECK: cmpexadd dword ptr [rdx - 512], ecx, eax // CHECK: encoding: [0xc4,0xe2,0x79,0xe4,0x8a,0x00,0xfe,0xff,0xff] - cmpzxadd dword ptr [rdx - 512], ecx, eax + cmpexadd dword ptr [rdx - 512], ecx, eax -// CHECK: cmpzxadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 +// CHECK: cmpexadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 // CHECK: encoding: [0xc4,0x22,0xa9,0xe4,0x8c,0xf5,0x00,0x00,0x00,0x10] - cmpzxadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 + cmpexadd qword ptr [rbp + 8*r14 + 268435456], r9, r10 -// CHECK: cmpzxadd qword ptr [r8 + 4*rax + 291], r9, r10 +// CHECK: cmpexadd qword ptr [r8 + 4*rax + 291], r9, r10 // CHECK: encoding: 
[0xc4,0x42,0xa9,0xe4,0x8c,0x80,0x23,0x01,0x00,0x00] - cmpzxadd qword ptr [r8 + 4*rax + 291], r9, r10 + cmpexadd qword ptr [r8 + 4*rax + 291], r9, r10 -// CHECK: cmpzxadd qword ptr [rip], r9, r10 +// CHECK: cmpexadd qword ptr [rip], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xe4,0x0d,0x00,0x00,0x00,0x00] - cmpzxadd qword ptr [rip], r9, r10 + cmpexadd qword ptr [rip], r9, r10 -// CHECK: cmpzxadd qword ptr [2*rbp - 256], r9, r10 +// CHECK: cmpexadd qword ptr [2*rbp - 256], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xe4,0x0c,0x6d,0x00,0xff,0xff,0xff] - cmpzxadd qword ptr [2*rbp - 256], r9, r10 + cmpexadd qword ptr [2*rbp - 256], r9, r10 -// CHECK: cmpzxadd qword ptr [rcx + 1016], r9, r10 +// CHECK: cmpexadd qword ptr [rcx + 1016], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xe4,0x89,0xf8,0x03,0x00,0x00] - cmpzxadd qword ptr [rcx + 1016], r9, r10 + cmpexadd qword ptr [rcx + 1016], r9, r10 -// CHECK: cmpzxadd qword ptr [rdx - 1024], r9, r10 +// CHECK: cmpexadd qword ptr [rdx - 1024], r9, r10 // CHECK: encoding: [0xc4,0x62,0xa9,0xe4,0x8a,0x00,0xfc,0xff,0xff] - cmpzxadd qword ptr [rdx - 1024], r9, r10 + cmpexadd qword ptr [rdx - 1024], r9, r10 // CHECK: cmpbexadd dword ptr [rip], r8d, ecx // CHECK: encoding: [0xc4,0x62,0x71,0xe6,0x05,0x00,0x00,0x00,0x00] diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll index a8b54ac33d9042..e8b4e752d3a28c 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll @@ -294,4 +294,59 @@ define float @no_unsafe(ptr %addr, float %val) { ret float %res } +define float @flat_atomicrmw_fadd_f32__align32(ptr %addr, float %val) { +; GFX908-LABEL: @flat_atomicrmw_fadd_f32__align32( +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr [[ADDR:%.*]], align 32 +; GFX908-NEXT: br label 
[[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ADDR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 32 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: @flat_atomicrmw_fadd_f32__align32( +; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[ADDR:%.*]]) +; GFX90A-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] +; GFX90A: atomicrmw.shared: +; GFX90A-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(3) +; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], float [[VAL:%.*]] seq_cst, align 32, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX90A: atomicrmw.check.private: +; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[ADDR]]) +; GFX90A-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX90A: atomicrmw.private: +; GFX90A-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(5) +; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load float, ptr addrspace(5) [[TMP3]], align 32 +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VAL]] +; GFX90A-NEXT: store float [[NEW]], ptr addrspace(5) [[TMP3]], align 32 +; GFX90A-NEXT: br label 
[[ATOMICRMW_PHI]] +; GFX90A: atomicrmw.global: +; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1) +; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VAL]] seq_cst, align 32, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] +; GFX90A: atomicrmw.phi: +; GFX90A-NEXT: [[RES:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[RES]] +; +; GFX940-LABEL: @flat_atomicrmw_fadd_f32__align32( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[ADDR:%.*]], float [[VAL:%.*]] seq_cst, align 32, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX1100-LABEL: @flat_atomicrmw_fadd_f32__align32( +; GFX1100-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[ADDR:%.*]], float [[VAL:%.*]] seq_cst, align 32, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX1100-NEXT: ret float [[RES]] +; + %res = atomicrmw fadd ptr %addr, float %val seq_cst, align 32, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + !0 = !{} diff --git a/llvm/test/Transforms/InstCombine/icmp-add.ll b/llvm/test/Transforms/InstCombine/icmp-add.ll index baa6f3d51a40ef..2ceb44b89eb9e9 100644 --- a/llvm/test/Transforms/InstCombine/icmp-add.ll +++ b/llvm/test/Transforms/InstCombine/icmp-add.ll @@ -3102,3 +3102,84 @@ define i1 @uge_add_C2_pow2_C_neg(i8 %x) { } declare void @llvm.assume(i1) + +; Change an unsigned predicate to signed in icmp (add x, C1), C2 +define i1 @icmp_add_constant_with_constant_ult_to_slt(i32 range(i32 -4, 10) %x) { +; CHECK-LABEL: @icmp_add_constant_with_constant_ult_to_slt( +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 8 +; CHECK-NEXT: ret i1 [[CMP]] 
+; + %add = add nsw i32 %x, 5 + %cmp = icmp ult i32 %add, 13 + ret i1 %cmp +} + +define i1 @icmp_add_constant_with_constant_ugt_to_sgt(i32 range(i32 -4, 10) %x) { +; CHECK-LABEL: @icmp_add_constant_with_constant_ugt_to_sgt( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X:%.*]], 2 +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = add nsw i32 %x, 10 + %cmp = icmp ugt i32 %add, 12 + ret i1 %cmp +} + +; Negative test: x + C1 may be negative +define i1 @icmp_add_constant_with_constant_ult_to_slt_neg1(i32 range(i32 -5, 10) %x) { +; CHECK-LABEL: @icmp_add_constant_with_constant_ult_to_slt_neg1( +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[X:%.*]], 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD]], 20 +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = add nsw i32 %x, 4 + %cmp = icmp ult i32 %add, 20 + ret i1 %cmp +} + +; Negative test: missing nsw flag +define i1 @icmp_add_constant_with_constant_ult_to_slt_neg2(i8 range(i8 -4, 120) %x) { +; CHECK-LABEL: @icmp_add_constant_with_constant_ult_to_slt_neg2( +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[X:%.*]], 15 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[ADD]], 20 +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = add i8 %x, 15 + %cmp = icmp ult i8 %add, 20 + ret i1 %cmp +} + +; Negative test: C2 is negative +define i1 @icmp_add_constant_with_constant_ult_to_slt_neg3(i32 range(i32 -4, 10) %x) { +; CHECK-LABEL: @icmp_add_constant_with_constant_ult_to_slt_neg3( +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[X:%.*]], 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD]], -6 +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = add nsw i32 %x, 4 + %cmp = icmp ult i32 %add, -6 + ret i1 %cmp +} + +; Negative test: C2 - C1 is negative +define i1 @icmp_add_constant_with_constant_ult_to_slt_neg4(i32 range(i32 -4, 10) %x) { +; CHECK-LABEL: @icmp_add_constant_with_constant_ult_to_slt_neg4( +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[X:%.*]], 5 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD]], 2 +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = add nsw i32 %x, 5 + %cmp = icmp ult i32 %add, 2 + ret 
i1 %cmp +} + +; Same as before, but infer the range of ucmp +define i1 @icmp_of_ucmp_plus_const_with_const(i32 %x, i32 %y) { +; CHECK-LABEL: @icmp_of_ucmp_plus_const_with_const( +; CHECK-NEXT: [[CMP2:%.*]] = icmp ule i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP2]] +; + %cmp1 = call i8 @llvm.ucmp(i32 %x, i32 %y) + %add = add i8 %cmp1, 1 + %cmp2 = icmp ult i8 %add, 2 + ret i1 %cmp2 +} diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll index 61ed955ea13e4e..6fc52ab3f26e03 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -1359,7 +1359,7 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) { ; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -4 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 6 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 10 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775804 diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll index 55bbf54d1f39d7..35ece2fe6eacd0 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll @@ -1,4 +1,3 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt %s -passes=loop-vectorize -hoist-runtime-checks=false -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" @@ -320,3 +319,61 @@ inner.exit: outer.exit: ret void } + +define void @use_diff_checks_when_retrying_with_rt_checks(i64 %off, ptr %dst, 
ptr %src) { +; CHECK-LABEL: @use_diff_checks_when_retrying_with_rt_checks( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.memcheck +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 %off, 3 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr %dst, i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 8000 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr %dst, i64 [[TMP1]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr %dst, i64 8000 +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr %src, i64 8000 +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr %src, i64 8 +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr %src, i64 8008 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr %dst, [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND06:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND17:%.*]] = icmp ult ptr %src, [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT8:%.*]] = and i1 [[BOUND06]], [[BOUND17]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT8]] +; CHECK-NEXT: [[BOUND09:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP5]] +; CHECK-NEXT: [[BOUND110:%.*]] = icmp ult ptr [[SCEVGEP4]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] +; CHECK-NEXT: [[CONFLICT_RDX12:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT11]] +; CHECK-NEXT: [[BOUND013:%.*]] = icmp ult ptr %dst, [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND114:%.*]] = icmp ult ptr %src, [[SCEVGEP2]] +; CHECK-NEXT: [[FOUND_CONFLICT15:%.*]] = and i1 [[BOUND013]], [[BOUND114]] +; CHECK-NEXT: [[CONFLICT_RDX16:%.*]] = or i1 [[CONFLICT_RDX12]], [[FOUND_CONFLICT15]] +; CHECK-NEXT: [[BOUND017:%.*]] = icmp ult ptr %dst, [[SCEVGEP5]] +; CHECK-NEXT: [[BOUND118:%.*]] = icmp ult ptr [[SCEVGEP4]], [[SCEVGEP2]] +; CHECK-NEXT: 
[[FOUND_CONFLICT19:%.*]] = and i1 [[BOUND017]], [[BOUND118]] +; CHECK-NEXT: [[CONFLICT_RDX20:%.*]] = or i1 [[CONFLICT_RDX16]], [[FOUND_CONFLICT19]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX20]], label %scalar.ph, label %vector.ph +; CHECK: vector.ph: +; CHECK-NEXT: br label %vector.body +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv.off = add i64 %off, %iv + %gep.src = getelementptr i64, ptr %src, i64 %iv + %l.0 = load i64, ptr %gep.src, align 8 + %gep.dst.off = getelementptr i64, ptr %dst, i64 %iv.off + store i64 %l.0, ptr %gep.dst.off, align 8 + %gep.src.8 = getelementptr i8, ptr %gep.src, i64 8 + %l.1 = load i64, ptr %gep.src.8, align 8 + %gep.dst.iv = getelementptr i64, ptr %dst, i64 %iv + store i64 %l.1, ptr %gep.dst.iv, align 8 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll b/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll index 56c7c7519f6945..a70f94e1521f0d 100644 --- a/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll +++ b/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4 ; RUN: opt -passes=pgo-instr-gen -profile-context-root=an_entrypoint \ ; RUN: -S < %s | FileCheck --check-prefix=INSTRUMENT %s -; RUN: opt -passes=pgo-instr-gen,ctx-instr-lower -profile-context-root=an_entrypoint \ +; RUN: opt -passes=pgo-instr-gen,assign-guid,ctx-instr-lower -profile-context-root=an_entrypoint \ ; RUN: -profile-context-root=another_entrypoint_no_callees \ ; RUN: -S < %s | FileCheck --check-prefix=LOWERING %s @@ -46,7 +46,7 @@ define void @foo(i32 %a, ptr %fct) { ; INSTRUMENT-NEXT: ret void ; ; LOWERING-LABEL: define void @foo( -; LOWERING-SAME: i32 [[A:%.*]], ptr [[FCT:%.*]]) { +; LOWERING-SAME: i32 [[A:%.*]], ptr [[FCT:%.*]]) !guid 
[[META0:![0-9]+]] { ; LOWERING-NEXT: [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @foo, i64 6699318081062747564, i32 2, i32 2) ; LOWERING-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 ; LOWERING-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], 1 @@ -104,7 +104,7 @@ define void @an_entrypoint(i32 %a) { ; INSTRUMENT-NEXT: ret void ; ; LOWERING-LABEL: define void @an_entrypoint( -; LOWERING-SAME: i32 [[A:%.*]]) { +; LOWERING-SAME: i32 [[A:%.*]]) !guid [[META1:![0-9]+]] { ; LOWERING-NEXT: [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_start_context(ptr @an_entrypoint_ctx_root, i64 4909520559318251808, i32 2, i32 1) ; LOWERING-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 ; LOWERING-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], 1 @@ -154,7 +154,7 @@ define void @another_entrypoint_no_callees(i32 %a) { ; INSTRUMENT-NEXT: ret void ; ; LOWERING-LABEL: define void @another_entrypoint_no_callees( -; LOWERING-SAME: i32 [[A:%.*]]) { +; LOWERING-SAME: i32 [[A:%.*]]) !guid [[META2:![0-9]+]] { ; LOWERING-NEXT: [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_start_context(ptr @another_entrypoint_no_callees_ctx_root, i64 -6371873725078000974, i32 2, i32 0) ; LOWERING-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 ; LOWERING-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], -2 @@ -188,7 +188,7 @@ define void @simple(i32 %a) { ; INSTRUMENT-NEXT: ret void ; ; LOWERING-LABEL: define void @simple( -; LOWERING-SAME: i32 [[A:%.*]]) { +; LOWERING-SAME: i32 [[A:%.*]]) !guid [[META3:![0-9]+]] { ; LOWERING-NEXT: [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @simple, i64 -3006003237940970099, i32 1, i32 0) ; LOWERING-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 ; LOWERING-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], -2 @@ -212,7 +212,7 @@ define i32 @no_callsites(i32 %a) { ; INSTRUMENT-NEXT: ret i32 0 ; ; LOWERING-LABEL: define i32 @no_callsites( -; LOWERING-SAME: i32 [[A:%.*]]) { +; LOWERING-SAME: i32 [[A:%.*]]) !guid [[META4:![0-9]+]] { ; LOWERING-NEXT: [[TMP1:%.*]] = call ptr 
@__llvm_ctx_profile_get_context(ptr @no_callsites, i64 5679753335911435902, i32 2, i32 0) ; LOWERING-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 ; LOWERING-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], -2 @@ -243,7 +243,8 @@ define void @no_counters() { ; INSTRUMENT-NEXT: call void @bar() ; INSTRUMENT-NEXT: ret void ; -; LOWERING-LABEL: define void @no_counters() { +; LOWERING-LABEL: define void @no_counters( +; LOWERING-SAME: ) !guid [[META5:![0-9]+]] { ; LOWERING-NEXT: [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @no_counters, i64 5458232184388660970, i32 1, i32 1) ; LOWERING-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 ; LOWERING-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], 1 @@ -263,8 +264,15 @@ define void @no_counters() { ret void } ;. -; INSTRUMENT: attributes #[[ATTR0:[0-9]+]] = { nounwind } -;. ; LOWERING: attributes #[[ATTR0:[0-9]+]] = { nounwind } ; LOWERING: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. +; INSTRUMENT: attributes #[[ATTR0:[0-9]+]] = { nounwind } +;. +; LOWERING: [[META0]] = !{i64 6699318081062747564} +; LOWERING: [[META1]] = !{i64 4909520559318251808} +; LOWERING: [[META2]] = !{i64 -6371873725078000974} +; LOWERING: [[META3]] = !{i64 -3006003237940970099} +; LOWERING: [[META4]] = !{i64 5679753335911435902} +; LOWERING: [[META5]] = !{i64 5458232184388660970} +;. diff --git a/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll b/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll index 18ac2f92aa39d4..cb8ab78dc0f414 100644 --- a/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll +++ b/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll @@ -12,7 +12,7 @@ declare void @bar() ;. 
define void @foo(i32 %a, ptr %fct) { ; CHECK-LABEL: define void @foo( -; CHECK-SAME: i32 [[A:%.*]], ptr [[FCT:%.*]]) local_unnamed_addr { +; CHECK-SAME: i32 [[A:%.*]], ptr [[FCT:%.*]]) local_unnamed_addr !guid [[META0:![0-9]+]] { ; CHECK-NEXT: call void @llvm.instrprof.increment(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 0) ; CHECK-NEXT: [[T:%.*]] = icmp eq i32 [[A]], 0 ; CHECK-NEXT: br i1 [[T]], label %[[YES:.*]], label %[[NO:.*]] @@ -42,3 +42,5 @@ exit: ;. ; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind } ;. +; CHECK: [[META0]] = !{i64 6699318081062747564} +;. diff --git a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml index 9b37b49b3d49d2..cff5019f8e6ee4 100644 --- a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml +++ b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml @@ -32,15 +32,21 @@ # RUN: FileCheck %s --check-prefix=AVAIL --input-file %t3.txt # RUN: FileCheck %s --check-prefix=UNAVAIL --input-file %t3.txt # -# CHECK: << Total TLI yes SDK no: 12 +# CHECK: << Total TLI yes SDK no: 18 # CHECK: >> Total TLI no SDK yes: 0 # CHECK: == Total TLI yes SDK yes: 248 # # WRONG_DETAIL: << TLI yes SDK no : '_ZdaPv' aka operator delete[](void*) # WRONG_DETAIL: >> TLI no SDK yes: '_ZdaPvj' aka operator delete[](void*, unsigned int) -# WRONG_DETAIL-COUNT-8: << TLI yes SDK no : '_Zn{{.*}}__hot_cold_t +# WRONG_DETAIL-COUNT-8: << TLI yes SDK no : {{.*}}__hot_cold_t # WRONG_DETAIL-COUNT-4: << TLI yes SDK no : '__size_returning_new{{.*}} -# WRONG_SUMMARY: << Total TLI yes SDK no: 13{{$}} +# WRONG_DETAIL: << TLI yes SDK no : 'fmaximum_num' +# WRONG_DETAIL: << TLI yes SDK no : 'fmaximum_numf' +# WRONG_DETAIL: << TLI yes SDK no : 'fmaximum_numl' +# WRONG_DETAIL: << TLI yes SDK no : 'fminimum_num' +# WRONG_DETAIL: << TLI yes SDK no : 'fminimum_numf' +# WRONG_DETAIL: << TLI yes SDK no : 'fminimum_numl' +# WRONG_SUMMARY: << Total TLI yes SDK no: 19{{$}} # WRONG_SUMMARY: >> Total TLI no SDK yes: 1{{$}} # 
WRONG_SUMMARY: == Total TLI yes SDK yes: 247 # @@ -48,8 +54,8 @@ ## the exact count first; the two directives should add up to that. ## Yes, this means additions to TLI will fail this test, but the argument ## to -COUNT can't be an expression. -# AVAIL: TLI knows 493 symbols, 260 available -# AVAIL-COUNT-260: {{^}} available +# AVAIL: TLI knows 499 symbols, 266 available +# AVAIL-COUNT-266: {{^}} available # AVAIL-NOT: {{^}} available # UNAVAIL-COUNT-233: not available # UNAVAIL-NOT: not available diff --git a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp index 68bf8e670771ee..ff7dec5bee31df 100644 --- a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp +++ b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp @@ -212,6 +212,12 @@ TEST_F(TargetLibraryInfoTest, ValidProto) { "declare double @fmin(double, double)\n" "declare float @fminf(float, float)\n" "declare x86_fp80 @fminl(x86_fp80, x86_fp80)\n" + "declare double @fmaximum_num(double, double)\n" + "declare float @fmaximum_numf(float, float)\n" + "declare x86_fp80 @fmaximum_numl(x86_fp80, x86_fp80)\n" + "declare double @fminimum_num(double, double)\n" + "declare float @fminimum_numf(float, float)\n" + "declare x86_fp80 @fminimum_numl(x86_fp80, x86_fp80)\n" "declare double @fmod(double, double)\n" "declare float @fmodf(float, float)\n" "declare x86_fp80 @fmodl(x86_fp80, x86_fp80)\n" diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp index 074247e6e7d184..b9414be98623af 100644 --- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp +++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp @@ -238,9 +238,11 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) { SDLoc DL; auto Int32VT = EVT::getIntegerVT(Context, 32); auto Int64VT = EVT::getIntegerVT(Context, 64); + auto FloatVT = EVT::getFloatingPointVT(32); SDValue Op0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, 
Int32VT); SDValue Op1 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Int64VT); + SDValue Op2 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, FloatVT); SDValue ZExt = DAG->getNode(ISD::ZERO_EXTEND, DL, Int64VT, Op0); SDValue SExt = DAG->getNode(ISD::SIGN_EXTEND, DL, Int64VT, Op0); @@ -252,6 +254,9 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) { SDValue VScale = DAG->getVScale(DL, Int32VT, APInt::getMaxValue(32)); + SDValue FPToSI = DAG->getNode(ISD::FP_TO_SINT, DL, FloatVT, Op2); + SDValue FPToUI = DAG->getNode(ISD::FP_TO_UINT, DL, FloatVT, Op2); + using namespace SDPatternMatch; EXPECT_TRUE(sd_match(ZExt, m_UnaryOp(ISD::ZERO_EXTEND, m_Value()))); EXPECT_TRUE(sd_match(SExt, m_SExt(m_Value()))); @@ -263,6 +268,11 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) { EXPECT_FALSE(sd_match(Sub, m_Neg(m_Value()))); EXPECT_FALSE(sd_match(Neg, m_Not(m_Value()))); EXPECT_TRUE(sd_match(VScale, m_VScale(m_Value()))); + + EXPECT_TRUE(sd_match(FPToUI, m_FPToUI(m_Value()))); + EXPECT_TRUE(sd_match(FPToSI, m_FPToSI(m_Value()))); + EXPECT_FALSE(sd_match(FPToUI, m_FPToSI(m_Value()))); + EXPECT_FALSE(sd_match(FPToSI, m_FPToUI(m_Value()))); } TEST_F(SelectionDAGPatternMatchTest, matchConstants) { diff --git a/llvm/unittests/IR/DataLayoutTest.cpp b/llvm/unittests/IR/DataLayoutTest.cpp index 113bb578f6bc3b..dcb2e614f4c40d 100644 --- a/llvm/unittests/IR/DataLayoutTest.cpp +++ b/llvm/unittests/IR/DataLayoutTest.cpp @@ -19,6 +19,111 @@ using namespace llvm; namespace { +// TODO: Split into multiple TESTs. 
+TEST(DataLayoutTest, ParseErrors) { + EXPECT_THAT_EXPECTED( + DataLayout::parse("^"), + FailedWithMessage("Unknown specifier in datalayout string")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("m:v"), + FailedWithMessage("Unknown mangling in datalayout string")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("n0"), + FailedWithMessage("Zero width native integer type in datalayout string")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("p16777216:64:64:64"), + FailedWithMessage("Invalid address space, must be a 24-bit integer")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("a1:64"), + FailedWithMessage("Sized aggregate specification in datalayout string")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("a:"), + FailedWithMessage("Trailing separator in datalayout string")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("p:48:52"), + FailedWithMessage("number of bits must be a byte width multiple")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("e-p"), + FailedWithMessage( + "Missing size specification for pointer in datalayout string")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("e-p:64"), + FailedWithMessage( + "Missing alignment specification for pointer in datalayout string")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("m"), + FailedWithMessage("Expected mangling specifier in datalayout string")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("m."), + FailedWithMessage("Unexpected trailing characters after mangling " + "specifier in datalayout string")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("f"), + FailedWithMessage( + "Missing alignment specification in datalayout string")); + EXPECT_THAT_EXPECTED( + DataLayout::parse(":32"), + FailedWithMessage( + "Expected token before separator in datalayout string")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("i64:64:16"), + FailedWithMessage( + "Preferred alignment cannot be less than the ABI alignment")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("i64:16:16777216"), + FailedWithMessage( + "Invalid preferred 
alignment, must be a 16bit integer")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("i64:16777216:16777216"), + FailedWithMessage("Invalid ABI alignment, must be a 16bit integer")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("i16777216:16:16"), + FailedWithMessage("Invalid bit width, must be a 24-bit integer")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("p:32:32:16"), + FailedWithMessage( + "Preferred alignment cannot be less than the ABI alignment")); + EXPECT_THAT_EXPECTED(DataLayout::parse("p:0:32:32"), + FailedWithMessage("Invalid pointer size of 0 bytes")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("p:64:24:64"), + FailedWithMessage("Pointer ABI alignment must be a power of 2")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("p:64:64:24"), + FailedWithMessage("Pointer preferred alignment must be a power of 2")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("p:64:64:64:128"), + FailedWithMessage("Index width cannot be larger than pointer width")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("v128:0:128"), + FailedWithMessage( + "ABI alignment specification must be >0 for non-aggregate types")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("i32:24:32"), + FailedWithMessage("Invalid ABI alignment, must be a power of 2")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("i32:32:24"), + FailedWithMessage("Invalid preferred alignment, must be a power of 2")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("A16777216"), + FailedWithMessage("Invalid address space, must be a 24-bit integer")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("G16777216"), + FailedWithMessage("Invalid address space, must be a 24-bit integer")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("P16777216"), + FailedWithMessage("Invalid address space, must be a 24-bit integer")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("Fi24"), + FailedWithMessage("Alignment is neither 0 nor a power of 2")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("i8:16"), + FailedWithMessage("Invalid ABI alignment, i8 must be 
naturally aligned")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("S24"), + FailedWithMessage("Alignment is neither 0 nor a power of 2")); +} + TEST(DataLayoutTest, CopyAssignmentInvalidatesStructLayout) { DataLayout DL1 = cantFail(DataLayout::parse("p:32:32")); DataLayout DL2 = cantFail(DataLayout::parse("p:64:64")); diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index caf306922847ed..1e4679ed6e802e 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -1569,6 +1569,65 @@ define void @foo(ptr %ptr, <2 x ptr> %ptrs) { EXPECT_EQ(NewGEP2->getNextNode(), nullptr); } +TEST_F(SandboxIRTest, Flags) { + parseIR(C, R"IR( +define void @foo(i32 %arg, float %farg) { + %add = add i32 %arg, %arg + %fadd = fadd float %farg, %farg + %udiv = udiv i32 %arg, %arg + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + BasicBlock *LLVMBB = &*LLVMF.begin(); + auto LLVMIt = LLVMBB->begin(); + auto *LLVMAdd = &*LLVMIt++; + auto *LLVMFAdd = &*LLVMIt++; + auto *LLVMUDiv = &*LLVMIt++; + + sandboxir::Context Ctx(C); + auto &F = *Ctx.createFunction(&LLVMF); + auto *BB = &*F.begin(); + auto It = BB->begin(); + auto *Add = &*It++; + auto *FAdd = &*It++; + auto *UDiv = &*It++; + +#define CHECK_FLAG(I, LLVMI, GETTER, SETTER) \ + { \ + EXPECT_EQ(I->GETTER(), LLVMI->GETTER()); \ + bool NewFlagVal = !I->GETTER(); \ + I->SETTER(NewFlagVal); \ + EXPECT_EQ(I->GETTER(), NewFlagVal); \ + EXPECT_EQ(I->GETTER(), LLVMI->GETTER()); \ + } + + CHECK_FLAG(Add, LLVMAdd, hasNoUnsignedWrap, setHasNoUnsignedWrap); + CHECK_FLAG(Add, LLVMAdd, hasNoSignedWrap, setHasNoSignedWrap); + CHECK_FLAG(FAdd, LLVMFAdd, isFast, setFast); + CHECK_FLAG(FAdd, LLVMFAdd, hasAllowReassoc, setHasAllowReassoc); + CHECK_FLAG(UDiv, LLVMUDiv, isExact, setIsExact); + CHECK_FLAG(FAdd, LLVMFAdd, hasNoNaNs, setHasNoNaNs); + CHECK_FLAG(FAdd, LLVMFAdd, hasNoInfs, setHasNoInfs); + CHECK_FLAG(FAdd, LLVMFAdd, hasNoSignedZeros, 
setHasNoSignedZeros); + CHECK_FLAG(FAdd, LLVMFAdd, hasAllowReciprocal, setHasAllowReciprocal); + CHECK_FLAG(FAdd, LLVMFAdd, hasAllowContract, setHasAllowContract); + CHECK_FLAG(FAdd, LLVMFAdd, hasApproxFunc, setHasApproxFunc); + + // Check getFastMathFlags(), copyFastMathFlags(). + FAdd->setFastMathFlags(FastMathFlags::getFast()); + EXPECT_FALSE(FAdd->getFastMathFlags() != LLVMFAdd->getFastMathFlags()); + FastMathFlags OrigFMF = FAdd->getFastMathFlags(); + FastMathFlags NewFMF; + NewFMF.setAllowReassoc(true); + EXPECT_TRUE(NewFMF != OrigFMF); + FAdd->setFastMathFlags(NewFMF); + EXPECT_FALSE(FAdd->getFastMathFlags() != OrigFMF); + FAdd->copyFastMathFlags(NewFMF); + EXPECT_FALSE(FAdd->getFastMathFlags() != NewFMF); + EXPECT_FALSE(FAdd->getFastMathFlags() != LLVMFAdd->getFastMathFlags()); +} + TEST_F(SandboxIRTest, AtomicCmpXchgInst) { parseIR(C, R"IR( define void @foo(ptr %ptr, i8 %cmp, i8 %new) { diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp index ba1d16de7d2120..c1f23c95cbfaed 100644 --- a/llvm/unittests/SandboxIR/TrackerTest.cpp +++ b/llvm/unittests/SandboxIR/TrackerTest.cpp @@ -968,3 +968,64 @@ define void @foo(ptr %arg0, i8 %val) { Ctx.revert(); EXPECT_FALSE(Store->isVolatile()); } + +TEST_F(TrackerTest, Flags) { + parseIR(C, R"IR( +define void @foo(i32 %arg, float %farg) { + %add = add i32 %arg, %arg + %fadd = fadd float %farg, %farg + %udiv = udiv i32 %arg, %arg + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto &F = *Ctx.createFunction(&LLVMF); + auto *BB = &*F.begin(); + auto It = BB->begin(); + auto *Add = &*It++; + auto *FAdd = &*It++; + auto *UDiv = &*It++; + +#define CHECK_FLAG(I, GETTER, SETTER) \ + { \ + Ctx.save(); \ + bool OrigFlag = I->GETTER(); \ + bool NewFlag = !OrigFlag; \ + I->SETTER(NewFlag); \ + EXPECT_EQ(I->GETTER(), NewFlag); \ + Ctx.revert(); \ + EXPECT_EQ(I->GETTER(), OrigFlag); \ + } + + CHECK_FLAG(Add, hasNoUnsignedWrap, 
setHasNoUnsignedWrap); + CHECK_FLAG(Add, hasNoSignedWrap, setHasNoSignedWrap); + CHECK_FLAG(FAdd, isFast, setFast); + CHECK_FLAG(FAdd, hasAllowReassoc, setHasAllowReassoc); + CHECK_FLAG(UDiv, isExact, setIsExact); + CHECK_FLAG(FAdd, hasNoNaNs, setHasNoNaNs); + CHECK_FLAG(FAdd, hasNoInfs, setHasNoInfs); + CHECK_FLAG(FAdd, hasNoSignedZeros, setHasNoSignedZeros); + CHECK_FLAG(FAdd, hasAllowReciprocal, setHasAllowReciprocal); + CHECK_FLAG(FAdd, hasAllowContract, setHasAllowContract); + CHECK_FLAG(FAdd, hasApproxFunc, setHasApproxFunc); + + // Check setFastMathFlags(). + FastMathFlags OrigFMF = FAdd->getFastMathFlags(); + FastMathFlags NewFMF; + NewFMF.setAllowReassoc(true); + EXPECT_TRUE(NewFMF != OrigFMF); + + Ctx.save(); + FAdd->setFastMathFlags(NewFMF); + EXPECT_FALSE(FAdd->getFastMathFlags() != NewFMF); + Ctx.revert(); + EXPECT_FALSE(FAdd->getFastMathFlags() != OrigFMF); + + // Check copyFastMathFlags(). + Ctx.save(); + FAdd->copyFastMathFlags(NewFMF); + EXPECT_FALSE(FAdd->getFastMathFlags() != NewFMF); + Ctx.revert(); + EXPECT_FALSE(FAdd->getFastMathFlags() != OrigFMF); +} diff --git a/llvm/utils/git/linkify b/llvm/utils/git/linkify new file mode 100755 index 00000000000000..9fcadd758492cf --- /dev/null +++ b/llvm/utils/git/linkify @@ -0,0 +1,20 @@ +#!/bin/sh + +# This script linkifies (i.e. makes clickable in the terminal) text that appears +# to be a pull request or issue reference (e.g. #12345 or PR12345) or a +# 40-character commit hash (e.g. abc123). You can configure git to automatically +# send the output of commands that pipe their output through a pager, such as +# `git log` and `git show`, through this script by running this command from +# within your LLVM checkout: +# +# git config core.pager 'llvm/utils/git/linkify | pager' +# +# The pager command is run from the root of the repository even if the git +# command is run from a subdirectory, so the relative path should always work. +# +# It requires OSC 8 support in the terminal. 
For a list of compatible terminals, +# see https://github.com/Alhadis/OSC8-Adoption + +sed \ + -e 's,\(#\|\bPR\)\([0-9]\+\),\x1b]8;;https://github.com/llvm/llvm-project/issues/\2\x1b\\\0\x1b]8;;\x1b\\,gi' \ + -e 's,[0-9a-f]\{40\},\x1b]8;;https://github.com/llvm/llvm-project/commit/\0\x1b\\\0\x1b]8;;\x1b\\,g' diff --git a/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn index b8e2cc744dc013..1708af8612bc28 100644 --- a/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn @@ -94,7 +94,6 @@ static_library("AST") { "InheritViz.cpp", "Interp/ByteCodeEmitter.cpp", "Interp/Compiler.cpp", - "Interp/CompilerComplex.cpp", "Interp/Context.cpp", "Interp/Descriptor.cpp", "Interp/Disasm.cpp", diff --git a/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn index 34b7822f4f400f..3b640ae41b9f62 100644 --- a/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn @@ -75,7 +75,6 @@ static_library("Checkers") { "MacOSKeychainAPIChecker.cpp", "MacOSXAPIChecker.cpp", "MallocChecker.cpp", - "MallocOverflowSecurityChecker.cpp", "MallocSizeofChecker.cpp", "MismatchedIteratorChecker.cpp", "MmapWriteExecChecker.cpp", diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index da7fa86fd39173..cc903f9e3a1520 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -767,6 +767,10 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper): # echo-appending to a file. # FIXME: Standardize on the builtin echo implementation. We can use a # temporary file to sidestep blocking pipe write issues. + + # Ensure args[0] is hashable. 
+ args[0] = expand_glob(args[0], cmd_shenv.cwd)[0] + inproc_builtin = inproc_builtins.get(args[0], None) if inproc_builtin and (args[0] != "echo" or len(cmd.commands) == 1): # env calling an in-process builtin is useless, so we take the safe diff --git a/llvm/utils/lit/tests/Inputs/shtest-shell/check_path.py b/llvm/utils/lit/tests/Inputs/check_path.py similarity index 100% rename from llvm/utils/lit/tests/Inputs/shtest-shell/check_path.py rename to llvm/utils/lit/tests/Inputs/check_path.py diff --git a/llvm/utils/lit/tests/Inputs/shtest-shell/cat-error-0.txt b/llvm/utils/lit/tests/Inputs/shtest-cat/cat-error-0.txt similarity index 100% rename from llvm/utils/lit/tests/Inputs/shtest-shell/cat-error-0.txt rename to llvm/utils/lit/tests/Inputs/shtest-cat/cat-error-0.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-shell/cat-error-1.txt b/llvm/utils/lit/tests/Inputs/shtest-cat/cat-error-1.txt similarity index 100% rename from llvm/utils/lit/tests/Inputs/shtest-shell/cat-error-1.txt rename to llvm/utils/lit/tests/Inputs/shtest-cat/cat-error-1.txt diff --git a/llvm/utils/lit/tests/Inputs/shtest-cat/cat.txt b/llvm/utils/lit/tests/Inputs/shtest-cat/cat.txt new file mode 100644 index 00000000000000..7375a7497e5bec --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-cat/cat.txt @@ -0,0 +1,83 @@ +## Test cat command with a single file. +# +# RUN: rm -rf %T/testCat +# RUN: mkdir -p %T/testCat +# RUN: echo "abcdefgh" > %T/testCat/temp.write +# RUN: cat %T/testCat/temp.write > %T/testCat/tempcat.write +# RUN: %{python} %S/../check_path.py file %T/testCat/tempcat.write > %T/testCat/path.out +# RUN: FileCheck --check-prefix=FILE-EXISTS < %T/testCat/path.out %s +# RUN: FileCheck --check-prefix=CAT-OUTPUT < %T/testCat/tempcat.write %s +# FILE-EXISTS: True +# CAT-OUTPUT: abcdefgh +# +## Test cat command with multiple files. 
+# +# RUN: rm -rf %T/testCat +# RUN: mkdir -p %T/testCat +# RUN: echo "abcdefgh" > %T/testCat/temp1.write +# RUN: echo "efghijkl" > %T/testCat/temp2.write +# RUN: echo "mnopqrst" > %T/testCat/temp3.write +# RUN: cat %T/testCat/temp1.write %T/testCat/temp2.write %T/testCat/temp3.write > %T/testCat/tempmulticat.write +# RUN: %{python} %S/../check_path.py file %T/testCat/tempmulticat.write > %T/testCat/path.out +# RUN: FileCheck --check-prefix=MULTI-FILE-EXISTS < %T/testCat/path.out %s +# RUN: FileCheck --check-prefix=MULTI-CAT-OUTPUT < %T/testCat/tempmulticat.write %s +# MULTI-FILE-EXISTS: True +# MULTI-CAT-OUTPUT: abcdefgh +# MULTI-CAT-OUTPUT-NEXT: efghijkl +# MULTI-CAT-OUTPUT-NEXT: mnopqrst +# +## Test cat command with multiple files and piped output to FileCheck. +# +# RUN: rm -rf %T/testCat +# RUN: mkdir -p %T/testCat +# RUN: echo "abcdefgh" > %T/testCat/temp1.write +# RUN: echo "efghijkl" > %T/testCat/temp2.write +# RUN: cat %T/testCat/temp1.write %T/testCat/temp2.write | FileCheck --check-prefix=PIPED-CAT-OUTPUT %s +# PIPED-CAT-OUTPUT: abcdefgh +# PIPED-CAT-OUTPUT-NEXT: efghijkl +# +## Test cat command with multiple files and glob expressions. 
+# +# RUN: rm -rf %T/testCat +# RUN: mkdir -p %T/testCat +# RUN: echo "cvbnm" > %T/testCat/temp1.write +# RUN: echo "qwerty" > %T/testCat/temp2.write +# RUN: cat %T/testCat/*.write | FileCheck --check-prefix=GLOB-CAT-OUTPUT %s +# GLOB-CAT-OUTPUT: cvbnm +# GLOB-CAT-OUTPUT-NEXT: qwerty +# +## Test cat command with -v option +# +# RUN: cat -v %S/cat_nonprinting.bin | FileCheck --check-prefix=NP-CAT-OUTPUT %s +# NP-CAT-OUTPUT: ^@^A^B^C^D^E^F^G ^H +# NP-CAT-OUTPUT-NEXT: ^K^L^M^N^O^P^Q^R^S +# NP-CAT-OUTPUT-NEXT: ^T^U^V^W^X^Y^Z^[^\^]^^^_ !"#$%&' +# NP-CAT-OUTPUT-NEXT: ()*+,-./0123456789:; +# NP-CAT-OUTPUT-NEXT: <=>?@ABCDEFGHIJKLMNO +# NP-CAT-OUTPUT-NEXT: PQRSTUVWXYZ[\]^_`abc +# NP-CAT-OUTPUT-NEXT: defghijklmnopqrstuvw +# NP-CAT-OUTPUT-NEXT: xyz{|}~^?M-^@M-^AM-^BM-^CM-^DM-^EM-^FM-^GM-^HM-^IM-^JM-^K +# NP-CAT-OUTPUT-NEXT: M-^LM-^MM-^NM-^OM-^PM-^QM-^RM-^SM-^TM-^UM-^VM-^WM-^XM-^YM-^ZM-^[M-^\M-^]M-^^M-^_ +# NP-CAT-OUTPUT-NEXT: M- M-!M-"M-#M-$M-%M-&M-'M-(M-)M-*M-+M-,M--M-.M-/M-0M-1M-2M-3 +# NP-CAT-OUTPUT-NEXT: M-4M-5M-6M-7M-8M-9M-:M-;M-M-?M-@M-AM-BM-CM-DM-EM-FM-G +# NP-CAT-OUTPUT-NEXT: M-HM-IM-JM-KM-LM-MM-NM-OM-PM-QM-RM-SM-TM-UM-VM-WM-XM-YM-ZM-[ +# NP-CAT-OUTPUT-NEXT: M-\M-]M-^M-_M-`M-aM-bM-cM-dM-eM-fM-gM-hM-iM-jM-kM-lM-mM-nM-o +# NP-CAT-OUTPUT-NEXT: M-pM-qM-rM-sM-tM-uM-vM-wM-xM-yM-zM-{M-|M-}M-~M-^? 
+# +## Test cat command with -show-nonprinting option +# +# RUN: cat --show-nonprinting %S/cat_nonprinting.bin | FileCheck --check-prefix=NPLONG-CAT-OUTPUT %s +# NPLONG-CAT-OUTPUT: ^@^A^B^C^D^E^F^G ^H +# NPLONG-CAT-OUTPUT-NEXT: ^K^L^M^N^O^P^Q^R^S +# NPLONG-CAT-OUTPUT-NEXT: ^T^U^V^W^X^Y^Z^[^\^]^^^_ !"#$%&' +# NPLONG-CAT-OUTPUT-NEXT: ()*+,-./0123456789:; +# NPLONG-CAT-OUTPUT-NEXT: <=>?@ABCDEFGHIJKLMNO +# NPLONG-CAT-OUTPUT-NEXT: PQRSTUVWXYZ[\]^_`abc +# NPLONG-CAT-OUTPUT-NEXT: defghijklmnopqrstuvw +# NPLONG-CAT-OUTPUT-NEXT: xyz{|}~^?M-^@M-^AM-^BM-^CM-^DM-^EM-^FM-^GM-^HM-^IM-^JM-^K +# NPLONG-CAT-OUTPUT-NEXT: M-^LM-^MM-^NM-^OM-^PM-^QM-^RM-^SM-^TM-^UM-^VM-^WM-^XM-^YM-^ZM-^[M-^\M-^]M-^^M-^_ +# NPLONG-CAT-OUTPUT-NEXT: M- M-!M-"M-#M-$M-%M-&M-'M-(M-)M-*M-+M-,M--M-.M-/M-0M-1M-2M-3 +# NPLONG-CAT-OUTPUT-NEXT: M-4M-5M-6M-7M-8M-9M-:M-;M-M-?M-@M-AM-BM-CM-DM-EM-FM-G +# NPLONG-CAT-OUTPUT-NEXT: M-HM-IM-JM-KM-LM-MM-NM-OM-PM-QM-RM-SM-TM-UM-VM-WM-XM-YM-ZM-[ +# NPLONG-CAT-OUTPUT-NEXT: M-\M-]M-^M-_M-`M-aM-bM-cM-dM-eM-fM-gM-hM-iM-jM-kM-lM-mM-nM-o +# NPLONG-CAT-OUTPUT-NEXT: M-pM-qM-rM-sM-tM-uM-vM-wM-xM-yM-zM-{M-|M-}M-~M-^? 
diff --git a/llvm/utils/lit/tests/Inputs/shtest-shell/cat_nonprinting.bin b/llvm/utils/lit/tests/Inputs/shtest-cat/cat_nonprinting.bin similarity index 100% rename from llvm/utils/lit/tests/Inputs/shtest-shell/cat_nonprinting.bin rename to llvm/utils/lit/tests/Inputs/shtest-cat/cat_nonprinting.bin diff --git a/llvm/utils/lit/tests/Inputs/shtest-cat/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-cat/lit.cfg new file mode 100644 index 00000000000000..8f197946e28b5c --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-cat/lit.cfg @@ -0,0 +1,8 @@ +import lit.formats + +config.name = "shtest-cat" +config.suffixes = [".txt"] +config.test_format = lit.formats.ShTest() +config.test_source_root = None +config.test_exec_root = None +config.substitutions.append(("%{python}", '"%s"' % (sys.executable))) diff --git a/llvm/utils/lit/tests/Inputs/shtest-shell/valid-shell.txt b/llvm/utils/lit/tests/Inputs/shtest-shell/valid-shell.txt index 7267b9b9ef5aba..75ce8b7733ad7d 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-shell/valid-shell.txt +++ b/llvm/utils/lit/tests/Inputs/shtest-shell/valid-shell.txt @@ -2,13 +2,13 @@ # Check force remove commands success whether the file does or doesn't exist. # # RUN: rm -f %t.write -# RUN: %{python} %S/check_path.py file %t.write > %t.out +# RUN: %{python} %S/../check_path.py file %t.write > %t.out # RUN: FileCheck --check-prefix=REMOVE-FILE < %t.out %s # RUN: echo "create a temp file" > %t.write -# RUN: %{python} %S/check_path.py file %t.write > %t.out +# RUN: %{python} %S/../check_path.py file %t.write > %t.out # RUN: FileCheck --check-prefix=FILE-EXIST < %t.out %s # RUN: rm -f %t.write -# RUN: %{python} %S/check_path.py file %t.write > %t.out +# RUN: %{python} %S/../check_path.py file %t.write > %t.out # RUN: FileCheck --check-prefix=REMOVE-FILE < %t.out %s # # REMOVE-FILE: False @@ -19,14 +19,14 @@ # # Check the mkdir command with -p option. 
# RUN: rm -f -r %T/test -# RUN: %{python} %S/check_path.py dir %T/test > %t.out +# RUN: %{python} %S/../check_path.py dir %T/test > %t.out # RUN: FileCheck --check-prefix=REMOVE-PARENT-DIR < %t.out %s # RUN: mkdir -p %T/test -# RUN: %{python} %S/check_path.py dir %T/test > %t.out +# RUN: %{python} %S/../check_path.py dir %T/test > %t.out # RUN: FileCheck --check-prefix=MAKE-PARENT-DIR < %t.out %s # RUN: rm -f %T/test || true # RUN: rm -f -r %T/test -# RUN: %{python} %S/check_path.py dir %T/test > %t.out +# RUN: %{python} %S/../check_path.py dir %T/test > %t.out # RUN: FileCheck --check-prefix=REMOVE-PARENT-DIR < %t.out %s # # MAKE-PARENT-DIR: True @@ -36,13 +36,13 @@ # # RUN: rm -rf %T/test1 # RUN: mkdir %T/test1 -# RUN: %{python} %S/check_path.py dir %T/test1 > %t.out +# RUN: %{python} %S/../check_path.py dir %T/test1 > %t.out # RUN: FileCheck --check-prefix=MAKE-DIR < %t.out %s # RUN: cd %T/test1 && mkdir foo -# RUN: %{python} %S/check_path.py dir %T/test1 > %t.out +# RUN: %{python} %S/../check_path.py dir %T/test1 > %t.out # RUN: FileCheck --check-prefix=MAKE-DIR < %t.out %s # RUN: cd %T && rm -rf %T/test1 -# RUN: %{python} %S/check_path.py dir %T/test1 > %t.out +# RUN: %{python} %S/../check_path.py dir %T/test1 > %t.out # RUN: FileCheck --check-prefix=REMOVE-DIR < %t.out %s # # MAKE-DIR: True @@ -52,16 +52,16 @@ # # RUN: rm -rf %T/test # RUN: mkdir -p %T/test/test1 %T/test/test2 -# RUN: %{python} %S/check_path.py dir %T/test %T/test/test1 %T/test/test2 > %t.out +# RUN: %{python} %S/../check_path.py dir %T/test %T/test/test1 %T/test/test2 > %t.out # RUN: FileCheck --check-prefix=DIRS-EXIST < %t.out %s # RUN: mkdir %T/test || true # RUN: echo "create a temp file" > %T/test/temp.write # RUN: echo "create a temp1 file" > %T/test/test1/temp1.write # RUN: echo "create a temp2 file" > %T/test/test2/temp2.write -# RUN: %{python} %S/check_path.py file %T/test/temp.write %T/test/test1/temp1.write %T/test/test2/temp2.write> %t.out +# RUN: %{python} %S/../check_path.py 
file %T/test/temp.write %T/test/test1/temp1.write %T/test/test2/temp2.write> %t.out # RUN: FileCheck --check-prefix=FILES-EXIST < %t.out %s # RUN: rm -r -f %T/* -# RUN: %{python} %S/check_path.py dir %T/test > %t.out +# RUN: %{python} %S/../check_path.py dir %T/test > %t.out # RUN: FileCheck --check-prefix=REMOVE-ALL < %t.out %s # # DIRS-EXIST: True @@ -85,87 +85,3 @@ # RUN: cd %T/dir1 && echo "hello" > temp1.txt # RUN: cd %T/dir2 && echo "hello" > temp2.txt # RUN: diff temp2.txt ../dir1/temp1.txt -# -# Check cat command with single file. -# -# RUN: rm -rf %T/testCat -# RUN: mkdir -p %T/testCat -# RUN: echo "abcdefgh" > %T/testCat/temp.write -# RUN: cat %T/testCat/temp.write > %T/testCat/tempcat.write -# RUN: %{python} %S/check_path.py file %T/testCat/tempcat.write > %T/testCat/path.out -# RUN: FileCheck --check-prefix=FILE-EXISTS < %T/testCat/path.out %s -# RUN: FileCheck --check-prefix=CAT-OUTPUT < %T/testCat/tempcat.write %s -# FILE-EXISTS: True -# CAT-OUTPUT: abcdefgh -# -# Check cat command with multiple files. -# -# RUN: rm -rf %T/testCat -# RUN: mkdir -p %T/testCat -# RUN: echo "abcdefgh" > %T/testCat/temp1.write -# RUN: echo "efghijkl" > %T/testCat/temp2.write -# RUN: echo "mnopqrst" > %T/testCat/temp3.write -# RUN: cat %T/testCat/temp1.write %T/testCat/temp2.write %T/testCat/temp3.write > %T/testCat/tempmulticat.write -# RUN: %{python} %S/check_path.py file %T/testCat/tempmulticat.write > %T/testCat/path.out -# RUN: FileCheck --check-prefix=MULTI-FILE-EXISTS < %T/testCat/path.out %s -# RUN: FileCheck --check-prefix=MULTI-CAT-OUTPUT < %T/testCat/tempmulticat.write %s -# MULTI-FILE-EXISTS: True -# MULTI-CAT-OUTPUT: abcdefgh -# MULTI-CAT-OUTPUT-NEXT: efghijkl -# MULTI-CAT-OUTPUT-NEXT: mnopqrst -# -# Check cat command with multiple files and piped output to FileCheck. 
-# -# RUN: rm -rf %T/testCat -# RUN: mkdir -p %T/testCat -# RUN: echo "abcdefgh" > %T/testCat/temp1.write -# RUN: echo "efghijkl" > %T/testCat/temp2.write -# RUN: cat %T/testCat/temp1.write %T/testCat/temp2.write | FileCheck --check-prefix=PIPED-CAT-OUTPUT %s -# PIPED-CAT-OUTPUT: abcdefgh -# PIPED-CAT-OUTPUT-NEXT: efghijkl -# -# Check cat command with multiple files and glob expressions. -# -# RUN: rm -rf %T/testCat -# RUN: mkdir -p %T/testCat -# RUN: echo "cvbnm" > %T/testCat/temp1.write -# RUN: echo "qwerty" > %T/testCat/temp2.write -# RUN: cat %T/testCat/*.write | FileCheck --check-prefix=GLOB-CAT-OUTPUT %s -# GLOB-CAT-OUTPUT: cvbnm -# GLOB-CAT-OUTPUT-NEXT: qwerty -# -# Check cat command with -v option -# -# RUN: cat -v %S/cat_nonprinting.bin | FileCheck --check-prefix=NP-CAT-OUTPUT %s -# NP-CAT-OUTPUT: ^@^A^B^C^D^E^F^G ^H -# NP-CAT-OUTPUT-NEXT: ^K^L^M^N^O^P^Q^R^S -# NP-CAT-OUTPUT-NEXT: ^T^U^V^W^X^Y^Z^[^\^]^^^_ !"#$%&' -# NP-CAT-OUTPUT-NEXT: ()*+,-./0123456789:; -# NP-CAT-OUTPUT-NEXT: <=>?@ABCDEFGHIJKLMNO -# NP-CAT-OUTPUT-NEXT: PQRSTUVWXYZ[\]^_`abc -# NP-CAT-OUTPUT-NEXT: defghijklmnopqrstuvw -# NP-CAT-OUTPUT-NEXT: xyz{|}~^?M-^@M-^AM-^BM-^CM-^DM-^EM-^FM-^GM-^HM-^IM-^JM-^K -# NP-CAT-OUTPUT-NEXT: M-^LM-^MM-^NM-^OM-^PM-^QM-^RM-^SM-^TM-^UM-^VM-^WM-^XM-^YM-^ZM-^[M-^\M-^]M-^^M-^_ -# NP-CAT-OUTPUT-NEXT: M- M-!M-"M-#M-$M-%M-&M-'M-(M-)M-*M-+M-,M--M-.M-/M-0M-1M-2M-3 -# NP-CAT-OUTPUT-NEXT: M-4M-5M-6M-7M-8M-9M-:M-;M-M-?M-@M-AM-BM-CM-DM-EM-FM-G -# NP-CAT-OUTPUT-NEXT: M-HM-IM-JM-KM-LM-MM-NM-OM-PM-QM-RM-SM-TM-UM-VM-WM-XM-YM-ZM-[ -# NP-CAT-OUTPUT-NEXT: M-\M-]M-^M-_M-`M-aM-bM-cM-dM-eM-fM-gM-hM-iM-jM-kM-lM-mM-nM-o -# NP-CAT-OUTPUT-NEXT: M-pM-qM-rM-sM-tM-uM-vM-wM-xM-yM-zM-{M-|M-}M-~M-^? 
-# -# Check cat command with -show-nonprinting option -# -# RUN: cat --show-nonprinting %S/cat_nonprinting.bin | FileCheck --check-prefix=NPLONG-CAT-OUTPUT %s -# NPLONG-CAT-OUTPUT: ^@^A^B^C^D^E^F^G ^H -# NPLONG-CAT-OUTPUT-NEXT: ^K^L^M^N^O^P^Q^R^S -# NPLONG-CAT-OUTPUT-NEXT: ^T^U^V^W^X^Y^Z^[^\^]^^^_ !"#$%&' -# NPLONG-CAT-OUTPUT-NEXT: ()*+,-./0123456789:; -# NPLONG-CAT-OUTPUT-NEXT: <=>?@ABCDEFGHIJKLMNO -# NPLONG-CAT-OUTPUT-NEXT: PQRSTUVWXYZ[\]^_`abc -# NPLONG-CAT-OUTPUT-NEXT: defghijklmnopqrstuvw -# NPLONG-CAT-OUTPUT-NEXT: xyz{|}~^?M-^@M-^AM-^BM-^CM-^DM-^EM-^FM-^GM-^HM-^IM-^JM-^K -# NPLONG-CAT-OUTPUT-NEXT: M-^LM-^MM-^NM-^OM-^PM-^QM-^RM-^SM-^TM-^UM-^VM-^WM-^XM-^YM-^ZM-^[M-^\M-^]M-^^M-^_ -# NPLONG-CAT-OUTPUT-NEXT: M- M-!M-"M-#M-$M-%M-&M-'M-(M-)M-*M-+M-,M--M-.M-/M-0M-1M-2M-3 -# NPLONG-CAT-OUTPUT-NEXT: M-4M-5M-6M-7M-8M-9M-:M-;M-M-?M-@M-AM-BM-CM-DM-EM-FM-G -# NPLONG-CAT-OUTPUT-NEXT: M-HM-IM-JM-KM-LM-MM-NM-OM-PM-QM-RM-SM-TM-UM-VM-WM-XM-YM-ZM-[ -# NPLONG-CAT-OUTPUT-NEXT: M-\M-]M-^M-_M-`M-aM-bM-cM-dM-eM-fM-gM-hM-iM-jM-kM-lM-mM-nM-o -# NPLONG-CAT-OUTPUT-NEXT: M-pM-qM-rM-sM-tM-uM-vM-wM-xM-yM-zM-{M-|M-}M-~M-^? diff --git a/llvm/utils/lit/tests/shtest-cat.py b/llvm/utils/lit/tests/shtest-cat.py new file mode 100644 index 00000000000000..5efe25c41684a1 --- /dev/null +++ b/llvm/utils/lit/tests/shtest-cat.py @@ -0,0 +1,23 @@ +## Test the cat command. +# +# RUN: not %{lit} -a -v %{inputs}/shtest-cat \ +# RUN: | FileCheck -match-full-lines %s +# END. 
+ +# CHECK: FAIL: shtest-cat :: cat-error-0.txt ({{[^)]*}}) +# CHECK: cat -b temp1.txt +# CHECK: # .---command stderr{{-*}} +# CHECK-NEXT: # | Unsupported: 'cat': option -b not recognized +# CHECK: # error: command failed with exit status: 1 + +# CHECK: FAIL: shtest-cat :: cat-error-1.txt ({{[^)]*}}) +# CHECK: cat temp1.txt +# CHECK: # .---command stderr{{-*}} +# CHECK-NEXT: # | [Errno 2] No such file or directory: 'temp1.txt' +# CHECK: # error: command failed with exit status: 1 + +# CHECK: PASS: shtest-cat :: cat.txt ({{[^)]*}}) + +# CHECK: Total Discovered Tests: 3 +# CHECK-NEXT: Passed: 1 {{\([0-9]*\.[0-9]*%\)}} +# CHECK-NEXT: Failed: 2 {{\([0-9]*\.[0-9]*%\)}} diff --git a/llvm/utils/lit/tests/shtest-shell.py b/llvm/utils/lit/tests/shtest-shell.py index 86851194880620..8f2b865f333a57 100644 --- a/llvm/utils/lit/tests/shtest-shell.py +++ b/llvm/utils/lit/tests/shtest-shell.py @@ -18,22 +18,6 @@ # CHECK: -- Testing: -# CHECK: FAIL: shtest-shell :: cat-error-0.txt -# CHECK: *** TEST 'shtest-shell :: cat-error-0.txt' FAILED *** -# CHECK: cat -b temp1.txt -# CHECK: # .---command stderr{{-*}} -# CHECK: # | Unsupported: 'cat': option -b not recognized -# CHECK: # error: command failed with exit status: 1 -# CHECK: *** - -# CHECK: FAIL: shtest-shell :: cat-error-1.txt -# CHECK: *** TEST 'shtest-shell :: cat-error-1.txt' FAILED *** -# CHECK: cat temp1.txt -# CHECK: # .---command stderr{{-*}} -# CHECK: # | [Errno 2] No such file or directory: 'temp1.txt' -# CHECK: # error: command failed with exit status: 1 -# CHECK: *** - # CHECK: FAIL: shtest-shell :: colon-error.txt # CHECK: *** TEST 'shtest-shell :: colon-error.txt' FAILED *** # CHECK: : @@ -651,4 +635,4 @@ # CHECK: PASS: shtest-shell :: valid-shell.txt # CHECK: Unresolved Tests (1) -# CHECK: Failed Tests (38) +# CHECK: Failed Tests (36) diff --git a/mlir/examples/toy/Ch6/toyc.cpp b/mlir/examples/toy/Ch6/toyc.cpp index c244b31434ba98..dccab91944fe1d 100644 --- a/mlir/examples/toy/Ch6/toyc.cpp +++ 
b/mlir/examples/toy/Ch6/toyc.cpp @@ -12,6 +12,7 @@ #include "mlir/Dialect/Func/Extensions/AllExtensions.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h" #include "toy/AST.h" #include "toy/Dialect.h" #include "toy/Lexer.h" @@ -299,6 +300,7 @@ int main(int argc, char **argv) { // If we aren't dumping the AST, then we are compiling with/to MLIR. mlir::DialectRegistry registry; mlir::func::registerAllExtensions(registry); + mlir::LLVM::registerInlinerInterface(registry); mlir::MLIRContext context(registry); // Load our Dialect in this MLIR Context. diff --git a/mlir/examples/toy/Ch7/toyc.cpp b/mlir/examples/toy/Ch7/toyc.cpp index fea56796adf10e..dd862656b9db88 100644 --- a/mlir/examples/toy/Ch7/toyc.cpp +++ b/mlir/examples/toy/Ch7/toyc.cpp @@ -12,6 +12,7 @@ #include "mlir/Dialect/Func/Extensions/AllExtensions.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h" #include "toy/AST.h" #include "toy/Dialect.h" #include "toy/Lexer.h" @@ -300,6 +301,7 @@ int main(int argc, char **argv) { // If we aren't dumping the AST, then we are compiling with/to MLIR. mlir::DialectRegistry registry; mlir::func::registerAllExtensions(registry); + mlir::LLVM::registerInlinerInterface(registry); mlir::MLIRContext context(registry); // Load our Dialect in this MLIR Context. diff --git a/mlir/include/mlir/Analysis/SliceWalk.h b/mlir/include/mlir/Analysis/SliceWalk.h new file mode 100644 index 00000000000000..481c5690c533ba --- /dev/null +++ b/mlir/include/mlir/Analysis/SliceWalk.h @@ -0,0 +1,98 @@ +//===- SliceWalk.h - Helpers for performing IR slice walks ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_ANALYSIS_SLICEWALK_H +#define MLIR_ANALYSIS_SLICEWALK_H + +#include "mlir/IR/ValueRange.h" + +namespace mlir { + +/// A class to signal how to proceed with the walk of the backward slice: +/// - Interrupt: Stops the walk. +/// - AdvanceTo: Continues the walk to user-specified values. +/// - Skip: Continues the walk, but skips the predecessors of the current value. +class WalkContinuation { +public: + enum class WalkAction { + /// Stops the walk. + Interrupt, + /// Continues the walk to user-specified values. + AdvanceTo, + /// Continues the walk, but skips the predecessors of the current value. + Skip + }; + + WalkContinuation(WalkAction action, mlir::ValueRange nextValues) + : action(action), nextValues(nextValues) {} + + /// Allows diagnostics to interrupt the walk. + explicit WalkContinuation(mlir::Diagnostic &&) + : action(WalkAction::Interrupt) {} + + /// Allows diagnostics to interrupt the walk. + explicit WalkContinuation(mlir::InFlightDiagnostic &&) + : action(WalkAction::Interrupt) {} + + /// Creates a continuation that interrupts the walk. + static WalkContinuation interrupt() { + return WalkContinuation(WalkAction::Interrupt, {}); + } + + /// Creates a continuation that adds the user-specified `nextValues` to the + /// work list and advances the walk. + static WalkContinuation advanceTo(mlir::ValueRange nextValues) { + return WalkContinuation(WalkAction::AdvanceTo, nextValues); + } + + /// Creates a continuation that advances the walk without adding any + /// predecessor values to the work list. + static WalkContinuation skip() { + return WalkContinuation(WalkAction::Skip, {}); + } + + /// Returns true if the walk was interrupted. + bool wasInterrupted() const { return action == WalkAction::Interrupt; } + + /// Returns true if the walk was skipped. 
+ bool wasSkipped() const { return action == WalkAction::Skip; } + + /// Returns true if the walk was advanced to user-specified values. + bool wasAdvancedTo() const { return action == WalkAction::AdvanceTo; } + + /// Returns the next values to continue the walk with. + mlir::ArrayRef getNextValues() const { return nextValues; } + +private: + WalkAction action; + /// The next values to continue the walk with. + mlir::SmallVector nextValues; +}; + +/// A callback that is invoked for each value encountered during the walk of the +/// slice. The callback takes the current value, and returns the walk +/// continuation, which determines if the walk should proceed and if yes, with +/// which values. +using WalkCallback = mlir::function_ref; + +/// Walks the slice starting from the `rootValues` using a depth-first +/// traversal. The walk calls the provided `walkCallback` for each value +/// encountered in the slice and uses the returned walk continuation to +/// determine how to proceed. +WalkContinuation walkSlice(mlir::ValueRange rootValues, + WalkCallback walkCallback); + +/// Computes a vector of all control predecessors of `value`. Relies on +/// RegionBranchOpInterface and BranchOpInterface to determine predecessors. +/// Returns nullopt if `value` has no predecessors or when the relevant +/// operations are missing the interface implementations. 
std::optional<SmallVector<Value>> getControlFlowPredecessors(Value value);
// //===----------------------------------------------------------------------===// -#ifndef DIALECT_LLVMIR_IR_LLVMINLINING_H -#define DIALECT_LLVMIR_IR_LLVMINLINING_H +#ifndef MLIR_DIALECT_LLVMIR_TRANSFORMS_INLINERINTERFACEIMPL_H +#define MLIR_DIALECT_LLVMIR_TRANSFORMS_INLINERINTERFACEIMPL_H namespace mlir { -namespace LLVM { - -class LLVMDialect; +class DialectRegistry; -namespace detail { +namespace LLVM { /// Register the `LLVMInlinerInterface` implementation of /// `DialectInlinerInterface` with the LLVM dialect. -void addLLVMInlinerInterface(LLVMDialect *dialect); - -} // namespace detail +void registerInlinerInterface(DialectRegistry ®istry); } // namespace LLVM } // namespace mlir -#endif // DIALECT_LLVMIR_IR_LLVMINLINING_H +#endif // MLIR_DIALECT_LLVMIR_TRANSFORMS_INLINERINTERFACEIMPL_H diff --git a/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h index 7f983b8b3cfd06..ae695e0326ca1a 100644 --- a/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h @@ -67,6 +67,10 @@ void populateDropRedundantInsertSliceRankExpansionPatterns( /// `tensor.collapse_shape` into other ops. void populateReassociativeReshapeFoldingPatterns(RewritePatternSet &patterns); +/// Populates `patterns` with patterns that bubble up `tensor.expand_shape` +/// through `tensor.collapse_shape` ops. +void populateBubbleUpExpandShapePatterns(RewritePatternSet &patterns); + /// Populates `patterns` with patterns that fold tensor.empty with its /// consumers. 
/// diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td index 1cad0dd3042bee..9be7f8ef9ef6b7 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td @@ -903,6 +903,8 @@ def Tosa_TableOp : Tosa_InferShapedTypeOp<"table"> { let assemblyFormat = [{ $input `,` $table attr-dict `:` `(` type($input) `,` type($table) `)` `->` type($output) }]; + + let hasVerifier = 1; } //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/IR/Matchers.h b/mlir/include/mlir/IR/Matchers.h index f6417f62d09e8c..6fa5a47109d20d 100644 --- a/mlir/include/mlir/IR/Matchers.h +++ b/mlir/include/mlir/IR/Matchers.h @@ -18,6 +18,7 @@ #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/OpDefinition.h" +#include "mlir/Interfaces/InferIntRangeInterface.h" namespace mlir { @@ -100,6 +101,39 @@ struct constant_op_binder { } }; +/// A matcher that matches operations that implement the +/// `InferIntRangeInterface` interface, and binds the inferred range. +struct infer_int_range_op_binder { + IntegerValueRange *bind_value; + + explicit infer_int_range_op_binder(IntegerValueRange *bind_value) + : bind_value(bind_value) {} + + bool match(Operation *op) { + auto inferIntRangeOp = dyn_cast(op); + if (!inferIntRangeOp) + return false; + + // Set the range of all integer operands to the maximal range. + SmallVector argRanges = + llvm::map_to_vector(op->getOperands(), IntegerValueRange::getMaxRange); + + // Infer the result result range if possible. 
+ bool matched = false; + auto setResultRanges = [&](Value value, + const IntegerValueRange &argRanges) { + if (argRanges.isUninitialized()) + return; + if (value != op->getResult(0)) + return; + *bind_value = argRanges; + matched = true; + }; + inferIntRangeOp.inferResultRangesFromOptional(argRanges, setResultRanges); + return matched; + } +}; + /// The matcher that matches operations that have the specified attribute /// name, and binds the attribute value. template @@ -219,6 +253,31 @@ struct constant_int_predicate_matcher { } }; +/// A matcher that matches a given a constant scalar / vector splat / tensor +/// splat integer value or a constant integer range that fulfills a predicate. +struct constant_int_range_predicate_matcher { + bool (*predicate)(const ConstantIntRanges &); + + bool match(Attribute attr) { + APInt value; + return constant_int_value_binder(&value).match(attr) && + predicate(ConstantIntRanges::constant(value)); + } + + bool match(Operation *op) { + // Try to match a constant integer value first. + APInt value; + if (constant_int_value_binder(&value).match(op)) + return predicate(ConstantIntRanges::constant(value)); + + // Otherwise, try to match an operation that implements the + // `InferIntRangeInterface` interface. + IntegerValueRange range; + return infer_int_range_op_binder(&range).match(op) && + predicate(range.getValue()); + } +}; + /// The matcher that matches a certain kind of op. template struct op_matcher { @@ -385,6 +444,31 @@ inline detail::constant_int_predicate_matcher m_NonZero() { return {[](const APInt &value) { return 0 != value; }}; } +/// Matches a constant scalar / vector splat / tensor splat integer or a +/// unsigned integer range that does not contain zero. Note that this matcher +/// interprets the target value as an unsigned integer. 
+inline detail::constant_int_range_predicate_matcher m_IntRangeWithoutZeroU() { + return {[](const ConstantIntRanges &range) { return range.umin().ugt(0); }}; +} + +/// Matches a constant scalar / vector splat / tensor splat integer or a +/// signed integer range that does not contain zero. Note that this matcher +/// interprets the target value as a signed integer. +inline detail::constant_int_range_predicate_matcher m_IntRangeWithoutZeroS() { + return {[](const ConstantIntRanges &range) { + return range.smin().sgt(0) || range.smax().slt(0); + }}; +} + +/// Matches a constant scalar / vector splat / tensor splat integer or a +/// signed integer range that does not contain minus one. Note +/// that this matcher interprets the target value as a signed integer. +inline detail::constant_int_range_predicate_matcher m_IntRangeWithoutNegOneS() { + return {[](const ConstantIntRanges &range) { + return range.smin().sgt(-1) || range.smax().slt(-1); + }}; +} + /// Matches a constant scalar / vector splat / tensor splat integer one. 
inline detail::constant_int_predicate_matcher m_One() { return {[](const APInt &value) { return 1 == value; }}; diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h index 01f28c5d21b37d..ab81832cdbee55 100644 --- a/mlir/include/mlir/InitAllDialects.h +++ b/mlir/include/mlir/InitAllDialects.h @@ -43,6 +43,7 @@ #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Dialect/LLVMIR/ROCDLDialect.h" +#include "mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Transforms/AllInterfaces.h" #include "mlir/Dialect/Linalg/Transforms/RuntimeOpVerification.h" @@ -164,6 +165,7 @@ inline void registerAllDialects(DialectRegistry ®istry) { cf::registerBufferizableOpInterfaceExternalModels(registry); cf::registerBufferDeallocationOpInterfaceExternalModels(registry); gpu::registerBufferDeallocationOpInterfaceExternalModels(registry); + LLVM::registerInlinerInterface(registry); linalg::registerAllDialectInterfaceImplementations(registry); linalg::registerRuntimeVerifiableOpInterfaceExternalModels(registry); memref::registerAllocationOpInterfaceExternalModels(registry); diff --git a/mlir/lib/Analysis/CMakeLists.txt b/mlir/lib/Analysis/CMakeLists.txt index 38d8415d81c72d..609cb34309829e 100644 --- a/mlir/lib/Analysis/CMakeLists.txt +++ b/mlir/lib/Analysis/CMakeLists.txt @@ -29,6 +29,7 @@ add_mlir_library(MLIRAnalysis Liveness.cpp CFGLoopInfo.cpp SliceAnalysis.cpp + SliceWalk.cpp TopologicalSortUtils.cpp AliasAnalysis/LocalAliasAnalysis.cpp diff --git a/mlir/lib/Analysis/SliceWalk.cpp b/mlir/lib/Analysis/SliceWalk.cpp new file mode 100644 index 00000000000000..9d770639dc53ca --- /dev/null +++ b/mlir/lib/Analysis/SliceWalk.cpp @@ -0,0 +1,139 @@ +#include "mlir/Analysis/SliceWalk.h" +#include "mlir/Interfaces/ControlFlowInterfaces.h" + +using namespace mlir; + +WalkContinuation mlir::walkSlice(ValueRange rootValues, + WalkCallback 
walkCallback) { + // Search the backward slice starting from the root values. + SmallVector workList = rootValues; + llvm::SmallDenseSet seenValues; + while (!workList.empty()) { + // Search the backward slice of the current value. + Value current = workList.pop_back_val(); + + // Skip the current value if it has already been seen. + if (!seenValues.insert(current).second) + continue; + + // Call the walk callback with the current value. + WalkContinuation continuation = walkCallback(current); + if (continuation.wasInterrupted()) + return continuation; + if (continuation.wasSkipped()) + continue; + + assert(continuation.wasAdvancedTo()); + // Add the next values to the work list if the walk should continue. + workList.append(continuation.getNextValues().begin(), + continuation.getNextValues().end()); + } + + return WalkContinuation::skip(); +} + +/// Returns the operands from all predecessor regions that match `operandNumber` +/// for the `successor` region within `regionOp`. +static SmallVector +getRegionPredecessorOperands(RegionBranchOpInterface regionOp, + RegionSuccessor successor, + unsigned operandNumber) { + SmallVector predecessorOperands; + + // Returns true if `successors` contains `successor`. + auto isContained = [](ArrayRef successors, + RegionSuccessor successor) { + auto *it = llvm::find_if(successors, [&successor](RegionSuccessor curr) { + return curr.getSuccessor() == successor.getSuccessor(); + }); + return it != successors.end(); + }; + + // Search the operand ranges on the region operation itself. + SmallVector operandAttributes(regionOp->getNumOperands()); + SmallVector successors; + regionOp.getEntrySuccessorRegions(operandAttributes, successors); + if (isContained(successors, successor)) { + OperandRange operands = regionOp.getEntrySuccessorOperands(successor); + predecessorOperands.push_back(operands[operandNumber]); + } + + // Search the operand ranges on region terminators. 
+ for (Region ®ion : regionOp->getRegions()) { + for (Block &block : region) { + auto terminatorOp = + dyn_cast(block.getTerminator()); + if (!terminatorOp) + continue; + SmallVector operandAttributes(terminatorOp->getNumOperands()); + SmallVector successors; + terminatorOp.getSuccessorRegions(operandAttributes, successors); + if (isContained(successors, successor)) { + OperandRange operands = terminatorOp.getSuccessorOperands(successor); + predecessorOperands.push_back(operands[operandNumber]); + } + } + } + + return predecessorOperands; +} + +/// Returns the predecessor branch operands that match `blockArg`, or nullopt if +/// some of the predecessor terminators do not implement the BranchOpInterface. +static std::optional> +getBlockPredecessorOperands(BlockArgument blockArg) { + Block *block = blockArg.getOwner(); + + // Search the predecessor operands for all predecessor terminators. + SmallVector predecessorOperands; + for (auto it = block->pred_begin(); it != block->pred_end(); ++it) { + Block *predecessor = *it; + auto branchOp = dyn_cast(predecessor->getTerminator()); + if (!branchOp) + return std::nullopt; + SuccessorOperands successorOperands = + branchOp.getSuccessorOperands(it.getSuccessorIndex()); + // Store the predecessor operand if the block argument matches an operand + // and is not produced by the terminator. + if (Value operand = successorOperands[blockArg.getArgNumber()]) + predecessorOperands.push_back(operand); + } + + return predecessorOperands; +} + +std::optional> +mlir::getControlFlowPredecessors(Value value) { + SmallVector result; + if (OpResult opResult = dyn_cast(value)) { + auto regionOp = dyn_cast(opResult.getOwner()); + // If the interface is not implemented, there are no control flow + // predecessors to work with. + if (!regionOp) + return std::nullopt; + // Add the control flow predecessor operands to the work list. 
+ RegionSuccessor region(regionOp->getResults()); + SmallVector predecessorOperands = getRegionPredecessorOperands( + regionOp, region, opResult.getResultNumber()); + return predecessorOperands; + } + + auto blockArg = cast(value); + Block *block = blockArg.getOwner(); + // Search the region predecessor operands for structured control flow. + if (block->isEntryBlock()) { + if (auto regionBranchOp = + dyn_cast(block->getParentOp())) { + RegionSuccessor region(blockArg.getParentRegion()); + SmallVector predecessorOperands = getRegionPredecessorOperands( + regionBranchOp, region, blockArg.getArgNumber()); + return predecessorOperands; + } + // If the interface is not implemented, there are no control flow + // predecessors to work with. + return std::nullopt; + } + + // Search the block predecessor operands for unstructured control flow. + return getBlockPredecessorOperands(blockArg); +} diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp index 641b7d7e2d13be..254f54d9e459e1 100644 --- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp +++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp @@ -595,10 +595,17 @@ OpFoldResult arith::DivUIOp::fold(FoldAdaptor adaptor) { return div0 ? Attribute() : result; } -Speculation::Speculatability arith::DivUIOp::getSpeculatability() { +/// Returns whether an unsigned division by `divisor` is speculatable. +static Speculation::Speculatability getDivUISpeculatability(Value divisor) { // X / 0 => UB - return matchPattern(getRhs(), m_NonZero()) ? 
Speculation::Speculatable - : Speculation::NotSpeculatable; + if (matchPattern(divisor, m_IntRangeWithoutZeroU())) + return Speculation::Speculatable; + + return Speculation::NotSpeculatable; +} + +Speculation::Speculatability arith::DivUIOp::getSpeculatability() { + return getDivUISpeculatability(getRhs()); } //===----------------------------------------------------------------------===// @@ -624,16 +631,21 @@ OpFoldResult arith::DivSIOp::fold(FoldAdaptor adaptor) { return overflowOrDiv0 ? Attribute() : result; } -Speculation::Speculatability arith::DivSIOp::getSpeculatability() { - bool mayHaveUB = true; - - APInt constRHS; +/// Returns whether a signed division by `divisor` is speculatable. This +/// function conservatively assumes that all signed division by -1 are not +/// speculatable. +static Speculation::Speculatability getDivSISpeculatability(Value divisor) { // X / 0 => UB // INT_MIN / -1 => UB - if (matchPattern(getRhs(), m_ConstantInt(&constRHS))) - mayHaveUB = constRHS.isAllOnes() || constRHS.isZero(); + if (matchPattern(divisor, m_IntRangeWithoutZeroS()) && + matchPattern(divisor, m_IntRangeWithoutNegOneS())) + return Speculation::Speculatable; - return mayHaveUB ? Speculation::NotSpeculatable : Speculation::Speculatable; + return Speculation::NotSpeculatable; +} + +Speculation::Speculatability arith::DivSIOp::getSpeculatability() { + return getDivSISpeculatability(getRhs()); } //===----------------------------------------------------------------------===// @@ -675,9 +687,7 @@ OpFoldResult arith::CeilDivUIOp::fold(FoldAdaptor adaptor) { } Speculation::Speculatability arith::CeilDivUIOp::getSpeculatability() { - // X / 0 => UB - return matchPattern(getRhs(), m_NonZero()) ? 
Speculation::Speculatable - : Speculation::NotSpeculatable; + return getDivUISpeculatability(getRhs()); } //===----------------------------------------------------------------------===// @@ -746,15 +756,7 @@ OpFoldResult arith::CeilDivSIOp::fold(FoldAdaptor adaptor) { } Speculation::Speculatability arith::CeilDivSIOp::getSpeculatability() { - bool mayHaveUB = true; - - APInt constRHS; - // X / 0 => UB - // INT_MIN / -1 => UB - if (matchPattern(getRhs(), m_ConstantInt(&constRHS))) - mayHaveUB = constRHS.isAllOnes() || constRHS.isZero(); - - return mayHaveUB ? Speculation::NotSpeculatable : Speculation::Speculatable; + return getDivSISpeculatability(getRhs()); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/LLVMIR/CMakeLists.txt b/mlir/lib/Dialect/LLVMIR/CMakeLists.txt index 392065b859ee54..fce24b556036f3 100644 --- a/mlir/lib/Dialect/LLVMIR/CMakeLists.txt +++ b/mlir/lib/Dialect/LLVMIR/CMakeLists.txt @@ -4,7 +4,6 @@ add_mlir_dialect_library(MLIRLLVMDialect IR/FunctionCallUtils.cpp IR/LLVMAttrs.cpp IR/LLVMDialect.cpp - IR/LLVMInlining.cpp IR/LLVMInterfaces.cpp IR/LLVMMemorySlot.cpp IR/LLVMTypes.cpp diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index 07262bb8e1bacb..92f3984e5e6db6 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "LLVMInlining.h" #include "TypeDetail.h" #include "mlir/Dialect/LLVMIR/LLVMAttrs.h" #include "mlir/Dialect/LLVMIR/LLVMInterfaces.h" @@ -24,6 +23,7 @@ #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Matchers.h" #include "mlir/Interfaces/FunctionImplementation.h" +#include "mlir/Transforms/InliningUtils.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/TypeSwitch.h" @@ -3252,7 +3252,7 @@ void 
LLVMDialect::initialize() { // clang-format off addInterfaces(); // clang-format on - detail::addLLVMInlinerInterface(this); + declarePromisedInterface(); } #define GET_OP_CLASSES diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt b/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt index 728885fcbeaf38..d4ff0955c5d0e2 100644 --- a/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt @@ -3,6 +3,7 @@ add_mlir_dialect_library(MLIRLLVMIRTransforms DIExpressionLegalization.cpp DIExpressionRewriter.cpp DIScopeForLLVMFuncOp.cpp + InlinerInterfaceImpl.cpp LegalizeForExport.cpp OptimizeForNVVM.cpp RequestCWrappers.cpp diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp similarity index 92% rename from mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp rename to mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp index 137c1962b100af..504f63b48c9433 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp +++ b/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp @@ -1,4 +1,4 @@ -//===- LLVMInlining.cpp - LLVM inlining interface and logic -----*- C++ -*-===// +//===- InlinerInterfaceImpl.cpp - Inlining for LLVM the dialect -----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -11,7 +11,8 @@ // //===----------------------------------------------------------------------===// -#include "LLVMInlining.h" +#include "mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h" +#include "mlir/Analysis/SliceWalk.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/IR/Matchers.h" #include "mlir/Interfaces/DataLayoutInterfaces.h" @@ -221,86 +222,45 @@ static ArrayAttr concatArrayAttr(ArrayAttr lhs, ArrayAttr rhs) { return ArrayAttr::get(lhs.getContext(), result); } -/// Attempts to return the underlying pointer value that `pointerValue` is based -/// on. This traverses down the chain of operations to the last operation -/// producing the base pointer and returns it. If it encounters an operation it -/// cannot further traverse through, returns the operation's result. -static Value getUnderlyingObject(Value pointerValue) { - while (true) { - if (auto gepOp = pointerValue.getDefiningOp()) { - pointerValue = gepOp.getBase(); - continue; - } - - if (auto addrCast = pointerValue.getDefiningOp()) { - pointerValue = addrCast.getOperand(); - continue; - } - - break; - } - - return pointerValue; -} - /// Attempts to return the set of all underlying pointer values that /// `pointerValue` is based on. This function traverses through select -/// operations and block arguments unlike getUnderlyingObject. -static SmallVector getUnderlyingObjectSet(Value pointerValue) { +/// operations and block arguments. +static FailureOr> +getUnderlyingObjectSet(Value pointerValue) { SmallVector result; - - SmallVector workList{pointerValue}; - // Avoid dataflow loops. 
- SmallPtrSet seen; - do { - Value current = workList.pop_back_val(); - current = getUnderlyingObject(current); - - if (!seen.insert(current).second) - continue; - - if (auto selectOp = current.getDefiningOp()) { - workList.push_back(selectOp.getTrueValue()); - workList.push_back(selectOp.getFalseValue()); - continue; + WalkContinuation walkResult = walkSlice(pointerValue, [&](Value val) { + if (auto gepOp = val.getDefiningOp()) + return WalkContinuation::advanceTo(gepOp.getBase()); + + if (auto addrCast = val.getDefiningOp()) + return WalkContinuation::advanceTo(addrCast.getOperand()); + + // TODO: Add a SelectLikeOpInterface and use it in the slicing utility. + if (auto selectOp = val.getDefiningOp()) + return WalkContinuation::advanceTo( + {selectOp.getTrueValue(), selectOp.getFalseValue()}); + + // Attempt to advance to control flow predecessors. + std::optional> controlFlowPredecessors = + getControlFlowPredecessors(val); + if (controlFlowPredecessors) + return WalkContinuation::advanceTo(*controlFlowPredecessors); + + // For all non-control flow results, consider `val` an underlying object. + if (isa(val)) { + result.push_back(val); + return WalkContinuation::skip(); } - if (auto blockArg = dyn_cast(current)) { - Block *parentBlock = blockArg.getParentBlock(); - - // Attempt to find all block argument operands for every predecessor. - // If any operand to the block argument wasn't found in a predecessor, - // conservatively add the block argument to the result set. 
- SmallVector operands; - bool anyUnknown = false; - for (auto iter = parentBlock->pred_begin(); - iter != parentBlock->pred_end(); iter++) { - auto branch = dyn_cast((*iter)->getTerminator()); - if (!branch) { - result.push_back(blockArg); - anyUnknown = true; - break; - } - - Value operand = branch.getSuccessorOperands( - iter.getSuccessorIndex())[blockArg.getArgNumber()]; - if (!operand) { - result.push_back(blockArg); - anyUnknown = true; - break; - } - - operands.push_back(operand); - } - - if (!anyUnknown) - llvm::append_range(workList, operands); - - continue; - } + // If this place is reached, `val` is a block argument that is not + // understood. Therefore, we conservatively interrupt. + // Note: Dealing with function arguments is not necessary, as the slice + // would have to go through an SSACopyOp first. + return WalkContinuation::interrupt(); + }); - result.push_back(current); - } while (!workList.empty()); + if (walkResult.wasInterrupted()) + return failure(); return result; } @@ -363,9 +323,14 @@ static void createNewAliasScopesFromNoAliasParameter( // Find the set of underlying pointers that this pointer is based on. 
SmallPtrSet basedOnPointers; - for (Value pointer : pointerArgs) - llvm::copy(getUnderlyingObjectSet(pointer), + for (Value pointer : pointerArgs) { + FailureOr> underlyingObjectSet = + getUnderlyingObjectSet(pointer); + if (failed(underlyingObjectSet)) + return; + llvm::copy(*underlyingObjectSet, std::inserter(basedOnPointers, basedOnPointers.begin())); + } bool aliasesOtherKnownObject = false; // Go through the based on pointers and check that they are either: @@ -850,6 +815,8 @@ struct LLVMInlinerInterface : public DialectInlinerInterface { } // end anonymous namespace -void LLVM::detail::addLLVMInlinerInterface(LLVM::LLVMDialect *dialect) { - dialect->addInterfaces(); +void mlir::LLVM::registerInlinerInterface(DialectRegistry ®istry) { + registry.addExtension(+[](MLIRContext *ctx, LLVM::LLVMDialect *dialect) { + dialect->addInterfaces(); + }); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp index e73df61c964341..9f1b6fdc55df3b 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp @@ -17,6 +17,7 @@ #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Transforms/Transforms.h" #include "mlir/Dialect/SparseTensor/IR/SparseTensor.h" +#include "mlir/Dialect/Tensor/Transforms/Transforms.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Matchers.h" @@ -2144,6 +2145,7 @@ struct LinalgElementwiseOpFusionPass // Add elementwise op fusion patterns. populateElementwiseOpsFusionPatterns(patterns, defaultControlFn); populateFoldReshapeOpsByExpansionPatterns(patterns, defaultControlFn); + tensor::populateBubbleUpExpandShapePatterns(patterns); // General canonicalization patterns. 
affine::AffineApplyOp::getCanonicalizationPatterns(patterns, context); diff --git a/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp b/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp index 88d56a8fbec749..a45b79194a7580 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp @@ -234,6 +234,46 @@ struct ConvertMemRefAssumeAlignment final } }; +//===----------------------------------------------------------------------===// +// ConvertMemRefCopy +//===----------------------------------------------------------------------===// + +struct ConvertMemRefCopy final : OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(memref::CopyOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto maybeRankedSource = dyn_cast(op.getSource().getType()); + auto maybeRankedDest = dyn_cast(op.getTarget().getType()); + if (maybeRankedSource && maybeRankedDest && + maybeRankedSource.getLayout() != maybeRankedDest.getLayout()) + return rewriter.notifyMatchFailure( + op, llvm::formatv("memref.copy emulation with distinct layouts ({0} " + "and {1}) is currently unimplemented", + maybeRankedSource.getLayout(), + maybeRankedDest.getLayout())); + rewriter.replaceOpWithNewOp(op, adaptor.getSource(), + adaptor.getTarget()); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// ConvertMemRefDealloc +//===----------------------------------------------------------------------===// + +struct ConvertMemRefDealloc final : OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(memref::DeallocOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + rewriter.replaceOpWithNewOp(op, adaptor.getMemref()); + return success(); + } +}; + 
//===----------------------------------------------------------------------===// // ConvertMemRefLoad //===----------------------------------------------------------------------===// @@ -300,6 +340,30 @@ struct ConvertMemRefLoad final : OpConversionPattern { } }; +//===----------------------------------------------------------------------===// +// ConvertMemRefMemorySpaceCast +//===----------------------------------------------------------------------===// + +struct ConvertMemRefMemorySpaceCast final + : OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(memref::MemorySpaceCastOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Type newTy = getTypeConverter()->convertType(op.getDest().getType()); + if (!newTy) { + return rewriter.notifyMatchFailure( + op->getLoc(), llvm::formatv("failed to convert memref type: {0}", + op.getDest().getType())); + } + + rewriter.replaceOpWithNewOp(op, newTy, + adaptor.getSource()); + return success(); + } +}; + //===----------------------------------------------------------------------===// // ConvertMemRefReinterpretCast //===----------------------------------------------------------------------===// @@ -490,6 +554,28 @@ struct ConvertMemRefCollapseShape final } }; +/// Emulating a `memref.expand_shape` becomes a no-op after emulation given +/// that we flatten memrefs to a single dimension as part of the emulation and +/// the expansion would just have been undone. 
+struct ConvertMemRefExpandShape final + : OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(memref::ExpandShapeOp expandShapeOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Value srcVal = adaptor.getSrc(); + auto newTy = dyn_cast(srcVal.getType()); + if (!newTy) + return failure(); + + if (newTy.getRank() != 1) + return failure(); + + rewriter.replaceOp(expandShapeOp, srcVal); + return success(); + } +}; } // end anonymous namespace //===----------------------------------------------------------------------===// @@ -502,9 +588,10 @@ void memref::populateMemRefNarrowTypeEmulationPatterns( // Populate `memref.*` conversion patterns. patterns.add, - ConvertMemRefAllocation, - ConvertMemRefCollapseShape, ConvertMemRefLoad, - ConvertMemrefStore, ConvertMemRefAssumeAlignment, + ConvertMemRefAllocation, ConvertMemRefCopy, + ConvertMemRefDealloc, ConvertMemRefCollapseShape, + ConvertMemRefExpandShape, ConvertMemRefLoad, ConvertMemrefStore, + ConvertMemRefAssumeAlignment, ConvertMemRefMemorySpaceCast, ConvertMemRefSubview, ConvertMemRefReinterpretCast>( typeConverter, patterns.getContext()); memref::populateResolveExtractStridedMetadataPatterns(patterns); diff --git a/mlir/lib/Dialect/MemRef/Transforms/ExpandStridedMetadata.cpp b/mlir/lib/Dialect/MemRef/Transforms/ExpandStridedMetadata.cpp index 585c5b73814219..a2049ba4a4924d 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/ExpandStridedMetadata.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/ExpandStridedMetadata.cpp @@ -726,6 +726,41 @@ struct ExtractStridedMetadataOpCollapseShapeFolder } }; +/// Pattern to replace `extract_strided_metadata(expand_shape)` +/// with the results of computing the sizes and strides on the expanded shape +/// and dividing up dimensions into static and dynamic parts as needed. 
+struct ExtractStridedMetadataOpExpandShapeFolder + : OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(memref::ExtractStridedMetadataOp op, + PatternRewriter &rewriter) const override { + auto expandShapeOp = op.getSource().getDefiningOp(); + if (!expandShapeOp) + return failure(); + + FailureOr stridedMetadata = + resolveReshapeStridedMetadata( + rewriter, expandShapeOp, getExpandedSizes, getExpandedStrides); + if (failed(stridedMetadata)) { + return rewriter.notifyMatchFailure( + op, "failed to resolve metadata in terms of source expand_shape op"); + } + + Location loc = expandShapeOp.getLoc(); + SmallVector results; + results.push_back(stridedMetadata->basePtr); + results.push_back(getValueOrCreateConstantIndexOp(rewriter, loc, + stridedMetadata->offset)); + results.append( + getValueOrCreateConstantIndexOp(rewriter, loc, stridedMetadata->sizes)); + results.append(getValueOrCreateConstantIndexOp(rewriter, loc, + stridedMetadata->strides)); + rewriter.replaceOp(op, results); + return success(); + } +}; + /// Replace `base, offset, sizes, strides = /// extract_strided_metadata(allocLikeOp)` /// @@ -1060,6 +1095,54 @@ class ExtractStridedMetadataOpCastFolder } }; + +/// Replace `base, offset, sizes, strides = extract_strided_metadata( +/// memory_space_cast(src) to dstTy)` +/// with +/// ``` +/// oldBase, offset, sizes, strides = extract_strided_metadata(src) +/// destBaseTy = type(oldBase) with memory space from destTy +/// base = memory_space_cast(oldBase) to destBaseTy +/// ``` +/// +/// In other words, propagate metadata extraction across memory space casts.
+class ExtractStridedMetadataOpMemorySpaceCastFolder + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult + matchAndRewrite(memref::ExtractStridedMetadataOp extractStridedMetadataOp, + PatternRewriter &rewriter) const override { + Location loc = extractStridedMetadataOp.getLoc(); + Value source = extractStridedMetadataOp.getSource(); + auto memSpaceCastOp = source.getDefiningOp(); + if (!memSpaceCastOp) + return failure(); + auto newExtractStridedMetadata = + rewriter.create( + loc, memSpaceCastOp.getSource()); + SmallVector results(newExtractStridedMetadata.getResults()); + // As with most other strided metadata rewrite patterns, don't introduce + // a use of the base pointer where none existed. This needs to happen here, + // as opposed to in later dead-code elimination, because these patterns are + // sometimes used during dialect conversion (see EmulateNarrowType, for + // example), so adding spurious usages would cause a pre-legalization value + // to be live that would be dead had this pattern not run.
+ if (!extractStridedMetadataOp.getBaseBuffer().use_empty()) { + auto baseBuffer = results[0]; + auto baseBufferType = cast(baseBuffer.getType()); + MemRefType::Builder newTypeBuilder(baseBufferType); + newTypeBuilder.setMemorySpace( + memSpaceCastOp.getResult().getType().getMemorySpace()); + results[0] = rewriter.create( + loc, Type{newTypeBuilder}, baseBuffer); + } else { + results[0] = nullptr; + } + rewriter.replaceOp(extractStridedMetadataOp, results); + return success(); + } +}; + /// Replace `base, offset = /// extract_strided_metadata(extract_strided_metadata(src)#0)` /// With @@ -1099,11 +1182,13 @@ void memref::populateExpandStridedMetadataPatterns( ExtractStridedMetadataOpAllocFolder, ExtractStridedMetadataOpAllocFolder, ExtractStridedMetadataOpCollapseShapeFolder, + ExtractStridedMetadataOpExpandShapeFolder, ExtractStridedMetadataOpGetGlobalFolder, RewriteExtractAlignedPointerAsIndexOfViewLikeOp, ExtractStridedMetadataOpReinterpretCastFolder, ExtractStridedMetadataOpSubviewFolder, ExtractStridedMetadataOpCastFolder, + ExtractStridedMetadataOpMemorySpaceCastFolder, ExtractStridedMetadataOpExtractStridedMetadataFolder>( patterns.getContext()); } @@ -1113,11 +1198,13 @@ void memref::populateResolveExtractStridedMetadataPatterns( patterns.add, ExtractStridedMetadataOpAllocFolder, ExtractStridedMetadataOpCollapseShapeFolder, + ExtractStridedMetadataOpExpandShapeFolder, ExtractStridedMetadataOpGetGlobalFolder, ExtractStridedMetadataOpSubviewFolder, RewriteExtractAlignedPointerAsIndexOfViewLikeOp, ExtractStridedMetadataOpReinterpretCastFolder, ExtractStridedMetadataOpCastFolder, + ExtractStridedMetadataOpMemorySpaceCastFolder, ExtractStridedMetadataOpExtractStridedMetadataFolder>( patterns.getContext()); } diff --git a/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp index be0d71866a095e..5edd7a02bc42b1 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp +++ 
b/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp @@ -140,6 +140,76 @@ struct FoldPaddingExpandIntoInsert : public OpRewritePattern { return success(); } }; + +/// Pattern to bubble up a tensor.expand_shape op through a producer +/// tensor.collapse_shape op that has non-intersecting reassociations. +struct BubbleUpExpandThroughParallelCollapse + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(tensor::ExpandShapeOp expandOp, + PatternRewriter &rewriter) const override { + auto collapseOp = + expandOp.getSrc().getDefiningOp(); + if (!collapseOp) + return failure(); + auto expandReInds = expandOp.getReassociationIndices(); + auto collapseReInds = collapseOp.getReassociationIndices(); + + // Reshapes are parallel to each other if none of the reassociation indices + // have greater than 1 index for both reshapes. + for (auto [expandReassociation, collapseReassociation] : + llvm::zip_equal(expandReInds, collapseReInds)) { + if (collapseReassociation.size() != 1 && expandReassociation.size() != 1) + return failure(); + } + + // Compute new reassociation indices and expanded/collapsed shapes.
+ SmallVector newExpandReInds, newCollapseReInds; + Location loc = expandOp->getLoc(); + SmallVector collapseSizes = + tensor::getMixedSizes(rewriter, loc, collapseOp.getSrc()); + SmallVector expandSizes(getMixedValues( + expandOp.getStaticOutputShape(), expandOp.getOutputShape(), rewriter)); + SmallVector newExpandSizes; + int64_t index = 0, expandIndex = 0, collapseIndex = 0; + for (auto [idx, collapseReassociation] : llvm::enumerate(collapseReInds)) { + if (collapseReassociation.size() != 1) { + ReassociationIndices newCollapseReassociation; + for (size_t i = 0; i < collapseReassociation.size(); ++i) { + newCollapseReassociation.push_back(index); + newExpandReInds.push_back({index++}); + newExpandSizes.push_back(collapseSizes[collapseIndex++]); + } + newCollapseReInds.push_back(newCollapseReassociation); + expandIndex++; + continue; + } + ReassociationIndices newExpandReassociation; + auto expandReassociation = expandReInds[idx]; + for (size_t i = 0; i < expandReassociation.size(); ++i) { + newExpandReassociation.push_back(index); + newCollapseReInds.push_back({index++}); + newExpandSizes.push_back(expandSizes[expandIndex++]); + } + newExpandReInds.push_back(newExpandReassociation); + collapseIndex++; + } + + // Swap reshape order. 
+ SmallVector dynamicSizes; + SmallVector staticSizes; + dispatchIndexOpFoldResults(newExpandSizes, dynamicSizes, staticSizes); + auto expandResultType = expandOp.getResultType().clone(staticSizes); + auto newExpand = rewriter.create( + loc, expandResultType, collapseOp.getSrc(), newExpandReInds, + newExpandSizes); + rewriter.replaceOpWithNewOp( + expandOp, newExpand.getResult(), newCollapseReInds); + return success(); + } +}; + } // namespace void mlir::tensor::populateReassociativeReshapeFoldingPatterns( @@ -152,3 +222,8 @@ void mlir::tensor::populateReassociativeReshapeFoldingPatterns( FoldPaddingExpandIntoInsert>( patterns.getContext()); } + +void mlir::tensor::populateBubbleUpExpandShapePatterns( + RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); +} diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index 750261f3860479..13c26ca4900d42 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -962,6 +962,29 @@ LogicalResult tosa::TableOp::inferReturnTypeComponents( return success(); } +LogicalResult tosa::TableOp::verify() { + TensorType inputType = getInput().getType(); + TensorType outputType = getOutput().getType(); + + if (inputType.hasRank() && outputType.hasRank() && + inputType.getRank() != outputType.getRank()) + return emitOpError() + << "expected input tensor rank to equal result tensor rank"; + + auto inputDims = inputType.getShape(); + auto outputDims = outputType.getShape(); + for (auto it : llvm::enumerate(llvm::zip(inputDims, outputDims))) { + int64_t dim = it.index(); + auto [inputDim, outputDim] = it.value(); + if (!ShapedType::isDynamic(outputDim) && outputDim != inputDim) { + return emitOpError() << "dim(result, " << dim << ") = " << outputDim + << " doesn't match dim(input, " << dim + << ") = " << inputDim; + } + } + return success(); +} + LogicalResult tosa::TileOp::inferReturnTypeComponents( MLIRContext *context, ::std::optional location, 
TileOp::Adaptor adaptor, diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp index 7285ad65fb549e..2289fd1ff1364e 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp @@ -1689,6 +1689,9 @@ struct WarpOpScfForOp : public OpRewritePattern { } }); + if (llvm::is_contained(distTypes, Type{})) + return failure(); + SmallVector newRetIndices; WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( rewriter, warpOp, escapingValues.getArrayRef(), distTypes, diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp index 7f59a378e03512..ccbaa3e9759975 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp @@ -979,15 +979,18 @@ struct ReorderElementwiseOpsOnBroadcast final if (!llvm::isa(op->getResults()[0].getType())) return failure(); if (!OpTrait::hasElementwiseMappableTraits(op)) + return rewriter.notifyMatchFailure( + op, "Op doesn't have ElementwiseMappableTraits"); + if (op->getNumOperands() == 0) return failure(); - if (op->getNumOperands() == 0 || - op->getResults()[0].getType() != op->getOperand(0).getType()) { - return failure(); - } - // Avoid operations that only accept vector types, since broadcast - // source might be scalar types. 
+ if (op->getResults()[0].getType() != op->getOperand(0).getType()) + return rewriter.notifyMatchFailure(op, + "result and operand type mismatch"); if (isa(op)) { - return failure(); + return rewriter.notifyMatchFailure( + op, + "Op only accepts vector types - not supported as broadcast source " + "might be a scalar"); } // Get the type of the lhs operand diff --git a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp index 2a146f5efed307..c1ee6507763566 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp @@ -128,6 +128,23 @@ class ROCDLDialectLLVMIRTranslationInterface attrValueStream << "1," << value.getInt(); llvmFunc->addFnAttr("amdgpu-flat-work-group-size", llvmAttrValue); } + if (dialect->getWavesPerEuAttrHelper().getName() == attribute.getName()) { + auto func = dyn_cast(op); + if (!func) + return op->emitOpError(Twine(attribute.getName()) + + " is only supported on `llvm.func` operations"); + auto value = dyn_cast(attribute.getValue()); + if (!value) + return op->emitOpError(Twine(attribute.getName()) + + " must be an integer"); + + llvm::Function *llvmFunc = + moduleTranslation.lookupFunction(func.getName()); + llvm::SmallString<8> llvmAttrValue; + llvm::raw_svector_ostream attrValueStream(llvmAttrValue); + attrValueStream << value.getInt(); + llvmFunc->addFnAttr("amdgpu-waves-per-eu", llvmAttrValue); + } if (dialect->getFlatWorkGroupSizeAttrHelper().getName() == attribute.getName()) { auto func = dyn_cast(op); @@ -160,6 +177,21 @@ class ROCDLDialectLLVMIRTranslationInterface llvmFunc->addFnAttr("uniform-work-group-size", value.getValue() ? 
"true" : "false"); } + if (dialect->getUnsafeFpAtomicsAttrHelper().getName() == + attribute.getName()) { + auto func = dyn_cast(op); + if (!func) + return op->emitOpError(Twine(attribute.getName()) + + " is only supported on `llvm.func` operations"); + auto value = dyn_cast(attribute.getValue()); + if (!value) + return op->emitOpError(Twine(attribute.getName()) + + " must be a boolean"); + llvm::Function *llvmFunc = + moduleTranslation.lookupFunction(func.getName()); + llvmFunc->addFnAttr("amdgpu-unsafe-fp-atomics", + value.getValue() ? "true" : "false"); + } // Set reqd_work_group_size metadata if (dialect->getReqdWorkGroupSizeAttrHelper().getName() == attribute.getName()) { diff --git a/mlir/test/Conversion/ArithToSPIRV/arith-to-spirv-le-specific.mlir b/mlir/test/Conversion/ArithToSPIRV/arith-to-spirv-le-specific.mlir index 7233a8bfffa9db..47be1be30577d8 100644 --- a/mlir/test/Conversion/ArithToSPIRV/arith-to-spirv-le-specific.mlir +++ b/mlir/test/Conversion/ArithToSPIRV/arith-to-spirv-le-specific.mlir @@ -10,7 +10,7 @@ // //===----------------------------------------------------------------------===// -// XFAIL: target=s390x-{{.*}} +// XFAIL: target={{(s390x|sparc.*)-.*}} module attributes { spirv.target_env = #spirv.target_env< diff --git a/mlir/test/Dialect/LLVMIR/inlining-alias-scopes.mlir b/mlir/test/Dialect/LLVMIR/inlining-alias-scopes.mlir index 0b8b60e963bb01..a91b991c5ed2b9 100644 --- a/mlir/test/Dialect/LLVMIR/inlining-alias-scopes.mlir +++ b/mlir/test/Dialect/LLVMIR/inlining-alias-scopes.mlir @@ -296,6 +296,60 @@ llvm.func @bar(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr) { llvm.func @random() -> i1 +llvm.func @region_branch(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}) { + %0 = llvm.mlir.constant(5 : i64) : i32 + test.region_if %arg0: !llvm.ptr -> !llvm.ptr then { + ^bb0(%arg2: !llvm.ptr): + test.region_if_yield %arg0 : !llvm.ptr + } else { + ^bb0(%arg2: !llvm.ptr): + test.region_if_yield %arg0 : !llvm.ptr + } join { + 
^bb0(%arg2: !llvm.ptr): + llvm.store %0, %arg2 : i32, !llvm.ptr + test.region_if_yield %arg0 : !llvm.ptr + } + llvm.return +} + +// CHECK-LABEL: llvm.func @region_branch_inlining +// CHECK: llvm.store +// CHECK-SAME: alias_scopes = [#[[$ARG0_SCOPE]]] +// CHECK-SAME: noalias_scopes = [#[[$ARG1_SCOPE]]] +llvm.func @region_branch_inlining(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr) { + llvm.call @region_branch(%arg0, %arg2) : (!llvm.ptr, !llvm.ptr) -> () + llvm.return +} + +// ----- + +llvm.func @missing_region_branch(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}) { + %0 = llvm.mlir.constant(5 : i64) : i32 + "test.one_region_op"() ({ + ^bb0(%arg2: !llvm.ptr): + llvm.store %0, %arg2 : i32, !llvm.ptr + "test.terminator"() : () -> () + }) : () -> () + llvm.return +} + +// CHECK-LABEL: llvm.func @missing_region_branch_inlining +// CHECK: llvm.store +// CHECK-NOT: alias_scopes +// CHECK-NOT: noalias_scopes +llvm.func @missing_region_branch_inlining(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr) { + llvm.call @missing_region_branch(%arg0, %arg2) : (!llvm.ptr, !llvm.ptr) -> () + llvm.return +} + +// ----- + +// CHECK-DAG: #[[DOMAIN:.*]] = #llvm.alias_scope_domain<{{.*}}> +// CHECK-DAG: #[[$ARG0_SCOPE:.*]] = #llvm.alias_scope +// CHECK-DAG: #[[$ARG1_SCOPE:.*]] = #llvm.alias_scope + +llvm.func @random() -> i1 + llvm.func @block_arg(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}) { %0 = llvm.mlir.constant(5 : i64) : i32 %1 = llvm.mlir.constant(1 : i64) : i64 diff --git a/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir b/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir index a67237b5e4dd19..540da239fced08 100644 --- a/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir +++ b/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir @@ -6,11 +6,13 @@ func.func @memref_i8() -> i8 { %c3 = arith.constant 3 : index %m = memref.alloc() : memref<4xi8, 1> %v = memref.load %m[%c3] : memref<4xi8, 1> + memref.dealloc %m : memref<4xi8, 1> 
return %v : i8 } // CHECK-LABEL: func @memref_i8() // CHECK: %[[M:.+]] = memref.alloc() : memref<4xi8, 1> // CHECK-NEXT: %[[V:.+]] = memref.load %[[M]][%{{.+}}] : memref<4xi8, 1> +// CHECK-NEXT: memref.dealloc %[[M]] // CHECK-NEXT: return %[[V]] // CHECK32-LABEL: func @memref_i8() @@ -21,6 +23,7 @@ func.func @memref_i8() -> i8 { // CHECK32: %[[CAST:.+]] = arith.index_cast %[[C24]] : index to i32 // CHECK32: %[[SHIFTRT:.+]] = arith.shrsi %[[V]], %[[CAST]] // CHECK32: %[[TRUNC:.+]] = arith.trunci %[[SHIFTRT]] : i32 to i8 +// CHECK32-NEXT: memref.dealloc %[[M]] // CHECK32-NEXT: return %[[TRUNC]] // ----- @@ -485,3 +488,68 @@ func.func @memref_collapse_shape_i4(%idx0 : index, %idx1 : index) -> i4 { // CHECK32-NOT: memref.collapse_shape // CHECK32: memref.load %[[ALLOC]][%{{.*}}] : memref<4096xi32> +// ----- + +func.func @memref_expand_shape_i4(%idx0 : index, %idx1 : index, %idx2 : index) -> i4 { + %arr = memref.alloc() : memref<256x128xi4> + %expand = memref.expand_shape %arr[[0, 1], [2]] output_shape [32, 8, 128] : memref<256x128xi4> into memref<32x8x128xi4> + %1 = memref.load %expand[%idx0, %idx1, %idx2] : memref<32x8x128xi4> + return %1 : i4 +} + +// CHECK-LABEL: func.func @memref_expand_shape_i4( +// CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<16384xi8> +// CHECK-NOT: memref.expand_shape +// CHECK: memref.load %[[ALLOC]][%{{.*}}] : memref<16384xi8> + +// CHECK32-LABEL: func.func @memref_expand_shape_i4( +// CHECK32: %[[ALLOC:.*]] = memref.alloc() : memref<4096xi32> +// CHECK32-NOT: memref.expand_shape +// CHECK32: memref.load %[[ALLOC]][%{{.*}}] : memref<4096xi32> + +// ----- + +func.func @memref_memory_space_cast_i4(%arg0: memref<32x128xi4, 1>) -> memref<32x128xi4> { + %cast = memref.memory_space_cast %arg0 : memref<32x128xi4, 1> to memref<32x128xi4> + return %cast : memref<32x128xi4> +} + +// CHECK-LABEL: func.func @memref_memory_space_cast_i4( +// CHECK-SAME: %[[ARG0:.*]]: memref<2048xi8, 1> +// CHECK: %[[CAST:.*]] = memref.memory_space_cast %[[ARG0]] : 
memref<2048xi8, 1> to memref<2048xi8> +// CHECK: return %[[CAST]] + +// CHECK32-LABEL: func.func @memref_memory_space_cast_i4( +// CHECK32-SAME: %[[ARG0:.*]]: memref<512xi32, 1> +// CHECK32: %[[CAST:.*]] = memref.memory_space_cast %[[ARG0]] : memref<512xi32, 1> to memref<512xi32> +// CHECK32: return %[[CAST]] + +// ----- + +func.func @memref_copy_i4(%arg0: memref<32x128xi4, 1>, %arg1: memref<32x128xi4>) { + memref.copy %arg0, %arg1 : memref<32x128xi4, 1> to memref<32x128xi4> + return +} + +// CHECK-LABEL: func.func @memref_copy_i4( +// CHECK-SAME: %[[ARG0:.*]]: memref<2048xi8, 1>, %[[ARG1:.*]]: memref<2048xi8> +// CHECK: memref.copy %[[ARG0]], %[[ARG1]] +// CHECK: return + +// CHECK32-LABEL: func.func @memref_copy_i4( +// CHECK32-SAME: %[[ARG0:.*]]: memref<512xi32, 1>, %[[ARG1:.*]]: memref<512xi32> +// CHECK32: memref.copy %[[ARG0]], %[[ARG1]] +// CHECK32: return + +// ----- + +!colMajor = memref<8x8xi4, strided<[1, 8]>> +func.func @copy_distinct_layouts(%idx : index) -> i4 { + %c0 = arith.constant 0 : index + %arr = memref.alloc() : memref<8x8xi4> + %arr2 = memref.alloc() : !colMajor + // expected-error @+1 {{failed to legalize operation 'memref.copy' that was explicitly marked illegal}} + memref.copy %arr, %arr2 : memref<8x8xi4> to !colMajor + %ld = memref.load %arr2[%c0, %c0] : !colMajor + return %ld : i4 +} diff --git a/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir b/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir index d884ade3195329..8aac802ba10ae9 100644 --- a/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir +++ b/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir @@ -1553,3 +1553,41 @@ func.func @extract_strided_metadata_of_collapse_shape(%base: memref<5x4xf32>) // CHECK-DAG: %[[STEP:.*]] = arith.constant 1 : index // CHECK: %[[BASE:.*]], %{{.*}}, %{{.*}}, %{{.*}} = memref.extract_strided_metadata // CHECK: return %[[BASE]], %[[OFFSET]], %[[SIZE]], %[[STEP]] : memref, index, index, index + +// ----- + +func.func 
@extract_strided_metadata_of_memory_space_cast(%base: memref<20xf32>) + -> (memref, index, index, index) { + + %memory_space_cast = memref.memory_space_cast %base : memref<20xf32> to memref<20xf32, 1> + + %base_buffer, %offset, %size, %stride = memref.extract_strided_metadata %memory_space_cast : + memref<20xf32, 1> -> memref, index, index, index + + return %base_buffer, %offset, %size, %stride : + memref, index, index, index +} + +// CHECK-LABEL: func @extract_strided_metadata_of_memory_space_cast +// CHECK-DAG: %[[OFFSET:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[SIZE:.*]] = arith.constant 20 : index +// CHECK-DAG: %[[STEP:.*]] = arith.constant 1 : index +// CHECK: %[[BASE:.*]], %{{.*}}, %{{.*}}, %{{.*}} = memref.extract_strided_metadata +// CHECK: %[[CAST:.*]] = memref.memory_space_cast %[[BASE]] +// CHECK: return %[[CAST]], %[[OFFSET]], %[[SIZE]], %[[STEP]] : memref, index, index, index + +// ----- + +func.func @extract_strided_metadata_of_memory_space_cast_no_base(%base: memref<20xf32>) + -> (index, index, index) { + + %memory_space_cast = memref.memory_space_cast %base : memref<20xf32> to memref<20xf32, 1> + + %base_buffer, %offset, %size, %stride = memref.extract_strided_metadata %memory_space_cast : + memref<20xf32, 1> -> memref, index, index, index + + return %offset, %size, %stride : index, index, index +} + +// CHECK-LABEL: func @extract_strided_metadata_of_memory_space_cast_no_base +// CHECK-NOT: memref.memory_space_cast diff --git a/mlir/test/Dialect/Tensor/bubble-reshapes.mlir b/mlir/test/Dialect/Tensor/bubble-reshapes.mlir new file mode 100644 index 00000000000000..cf6b12852bcd39 --- /dev/null +++ b/mlir/test/Dialect/Tensor/bubble-reshapes.mlir @@ -0,0 +1,47 @@ +// RUN: mlir-opt -split-input-file -test-tensor-transform-patterns=test-expand-shape-bubbling %s | FileCheck %s + +func.func @bubble_parallel_reshapes(%arg0: tensor, %s0: index, %s1: index, %s2: index, %s3: index) -> tensor { + %collapse = tensor.collapse_shape %arg0 [[0], [1, 2], [3]] 
: tensor into tensor + %expand = tensor.expand_shape %collapse [[0], [1], [2, 3]] + output_shape [%s0, %s1, %s2, %s3] : tensor into tensor + return %expand : tensor +} +// CHECK: func @bubble_parallel_reshapes +// CHECK-SAME: %[[ARG0:.+]]: tensor +// CHECK-SAME: %[[S0:.+]]: index, %[[S1:.+]]: index, %[[S2:.+]]: index, %[[S3:.+]]: index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[DIM1:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor +// CHECK-DAG: %[[DIM2:.+]] = tensor.dim %[[ARG0]], %[[C2]] : tensor +// CHECK: %[[EXPAND:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0], [1], [2], [3, 4]] +// CHECK-SAME: output_shape [%[[S0]], %[[DIM1]], %[[DIM2]], %[[S2]], %[[S3]]] : tensor into tensor +// CHECK: %[[COLLAPSE:.+]] = tensor.collapse_shape %[[EXPAND]] {{\[}}[0], [1, 2], [3], [4]] : tensor into tensor +// CHECK: return %[[COLLAPSE]] + +// ----- + +func.func @no_bubble_full_intersecting_reshapes(%arg0: tensor, %s0: index, %s1: index, %s2: index, %s3: index) -> tensor { + %collapse = tensor.collapse_shape %arg0 [[0], [1, 2], [3]] : tensor into tensor + %expand = tensor.expand_shape %collapse [[0], [1, 2], [3]] + output_shape [%s0, %s1, %s2, %s3] : tensor into tensor + return %expand : tensor +} +// CHECK: func @no_bubble_full_intersecting_reshapes +// CHECK-SAME: %[[ARG0:.+]]: tensor +// CHECK: %[[COLLAPSE:.+]] = tensor.collapse_shape %[[ARG0]] {{\[}}[0], [1, 2], [3]] +// CHECK: %[[EXPAND:.+]] = tensor.expand_shape %[[COLLAPSE]] {{\[}}[0], [1, 2], [3]] +// CHECK: return %[[EXPAND]] + +// ----- + +func.func @no_bubble_partial_intersecting_reshapes(%arg0: tensor, %s0: index, %s1: index, %s2: index, %s3: index) -> tensor { + %collapse = tensor.collapse_shape %arg0 [[0, 1, 2], [3]] : tensor into tensor + %expand = tensor.expand_shape %collapse [[0, 1], [2, 3]] + output_shape [%s0, %s1, %s2, %s3] : tensor into tensor + return %expand : tensor +} +// CHECK: func @no_bubble_partial_intersecting_reshapes 
+// CHECK-SAME: %[[ARG0:.+]]: tensor +// CHECK: %[[COLLAPSE:.+]] = tensor.collapse_shape %[[ARG0]] {{\[}}[0, 1, 2], [3]] +// CHECK: %[[EXPAND:.+]] = tensor.expand_shape %[[COLLAPSE]] {{\[}}[0, 1], [2, 3]] +// CHECK: return %[[EXPAND]] diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir index fd7554ca30ff41..3ac473d3a9d8c0 100644 --- a/mlir/test/Dialect/Tosa/invalid.mlir +++ b/mlir/test/Dialect/Tosa/invalid.mlir @@ -562,3 +562,30 @@ func.func @test_large_constant_permutation() { %3 = tosa.transpose %2, %1 : (tensor, tensor<2xi32>) -> tensor return } + +// ----- + +// CHECK-LABEL: test_table_rank0_table +func.func @test_table_rank0_table(%arg0: tensor<64xi16>, %arg1: tensor) { + // expected-error@+1 {{'tosa.table' op operand #1 must be 1-d tensor, but got 'tensor'}} + %0 = tosa.table %arg0, %arg1 : (tensor<64xi16>, tensor) -> tensor<64xi16> + return +} + +// ----- + +// CHECK-LABEL: test_table_io_rank_mismatch +func.func @test_table_io_rank_mismatch(%arg0: tensor<64xi16>, %arg1: tensor<6xi16>) { + // expected-error@+1 {{'tosa.table' op expected input tensor rank to equal result tensor rank}} + %0 = tosa.table %arg0, %arg1 : (tensor<64xi16>, tensor<6xi16>) -> tensor<64x?xi16> + return +} + +// ----- + +// CHECK-LABEL: test_table_io_shape_mismatch +func.func @test_table_io_shape_mismatch(%arg0: tensor, %arg1: tensor<6xi16>) { + // expected-error@+1 {{'tosa.table' op dim(result, 1) = 15 doesn't match dim(input, 1) = 16}} + %0 = tosa.table %arg0, %arg1 : (tensor, tensor<6xi16>) -> tensor + return +} diff --git a/mlir/test/Dialect/Vector/sink-vector-broadcast.mlir b/mlir/test/Dialect/Vector/sink-vector-broadcast.mlir index e7863a9e8b7b78..dd2e98831a708a 100644 --- a/mlir/test/Dialect/Vector/sink-vector-broadcast.mlir +++ b/mlir/test/Dialect/Vector/sink-vector-broadcast.mlir @@ -1,18 +1,35 @@ // RUN: mlir-opt %s -test-sink-vector-broadcast -split-input-file | FileCheck %s 
+//----------------------------------------------------------------------------- +// [Pattern: ReorderElementwiseOpsOnBroadcast] +//----------------------------------------------------------------------------- + // CHECK-LABEL: func.func @broadcast_scalar_with_bcast( // CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index) -> vector<1x4xindex> { // CHECK: %[[ADD:.*]] = arith.addi %[[ARG_0]], %[[ARG_1]] : index // CHECK: %[[BCAST:.*]] = vector.broadcast %[[ADD]] : index to vector<1x4xindex> // CHECK: return %[[BCAST]] : vector<1x4xindex> -func.func @broadcast_scalar_with_bcast( %arg1: index, %arg2: index) -> vector<1x4xindex> { +func.func @broadcast_scalar_with_bcast(%arg1: index, %arg2: index) -> vector<1x4xindex> { %0 = vector.broadcast %arg1 : index to vector<1x4xindex> %1 = vector.broadcast %arg2 : index to vector<1x4xindex> %2 = arith.addi %0, %1 : vector<1x4xindex> return %2 : vector<1x4xindex> } +// CHECK-LABEL: func.func @broadcast_scalar_with_bcast_scalable( +// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index) -> vector<1x[4]xindex> { +// CHECK: %[[ADD:.*]] = arith.addi %[[ARG_0]], %[[ARG_1]] : index +// CHECK: %[[BCAST:.*]] = vector.broadcast %[[ADD]] : index to vector<1x[4]xindex> +// CHECK: return %[[BCAST]] : vector<1x[4]xindex> + +func.func @broadcast_scalar_with_bcast_scalable(%arg1: index, %arg2: index) -> vector<1x[4]xindex> { + %0 = vector.broadcast %arg1 : index to vector<1x[4]xindex> + %1 = vector.broadcast %arg2 : index to vector<1x[4]xindex> + %2 = arith.addi %0, %1 : vector<1x[4]xindex> + return %2 : vector<1x[4]xindex> +} + // ----- // CHECK-LABEL: func.func @broadcast_scalar_with_bcast_and_splat( @@ -21,13 +38,26 @@ func.func @broadcast_scalar_with_bcast( %arg1: index, %arg2: index) -> vector<1x // CHECK: %[[ADD:.*]] = arith.addi %[[ARG1]], %[[ARG2]] : index // CHECK: %[[BCAST:.*]] = vector.broadcast %[[ADD]] : index to vector<1x4xindex> // CHECK: return %[[BCAST]] : vector<1x4xindex> -func.func 
@broadcast_scalar_with_bcast_and_splat( %arg1: index, %arg2: index) -> vector<1x4xindex> { +func.func @broadcast_scalar_with_bcast_and_splat(%arg1: index, %arg2: index) -> vector<1x4xindex> { %0 = vector.splat %arg1 : vector<1x4xindex> %1 = vector.broadcast %arg2 : index to vector<1x4xindex> %2 = arith.addi %0, %1 : vector<1x4xindex> return %2 : vector<1x4xindex> } +// CHECK-LABEL: func.func @broadcast_scalar_with_bcast_and_splat_scalable( +// CHECK-SAME: %[[ARG1:.*]]: index, +// CHECK-SAME: %[[ARG2:.*]]: index) -> vector<1x[4]xindex> { +// CHECK: %[[ADD:.*]] = arith.addi %[[ARG1]], %[[ARG2]] : index +// CHECK: %[[BCAST:.*]] = vector.broadcast %[[ADD]] : index to vector<1x[4]xindex> +// CHECK: return %[[BCAST]] : vector<1x[4]xindex> +func.func @broadcast_scalar_with_bcast_and_splat_scalable(%arg1: index, %arg2: index) -> vector<1x[4]xindex> { + %0 = vector.splat %arg1 : vector<1x[4]xindex> + %1 = vector.broadcast %arg2 : index to vector<1x[4]xindex> + %2 = arith.addi %0, %1 : vector<1x[4]xindex> + return %2 : vector<1x[4]xindex> +} + // ----- // CHECK-LABEL: func.func @broadcast_vector( @@ -37,13 +67,27 @@ func.func @broadcast_scalar_with_bcast_and_splat( %arg1: index, %arg2: index) -> // CHECK: %[[BCAST:.*]] = vector.broadcast %[[ADDF]] : vector<4xf32> to vector<3x4xf32> // CHECK: return %[[BCAST]] : vector<3x4xf32> -func.func @broadcast_vector( %arg1: vector<4xf32>, %arg2: vector<4xf32>) -> vector<3x4xf32> { +func.func @broadcast_vector(%arg1: vector<4xf32>, %arg2: vector<4xf32>) -> vector<3x4xf32> { %arg1_bcast = vector.broadcast %arg1 : vector<4xf32> to vector<3x4xf32> %arg2_bcast = vector.broadcast %arg2 : vector<4xf32> to vector<3x4xf32> %2 = arith.addf %arg1_bcast, %arg2_bcast : vector<3x4xf32> return %2 : vector<3x4xf32> } +// CHECK-LABEL: func.func @broadcast_vector_scalable( +// CHECK-SAME: %[[ARG_0:.*]]: vector<[4]xf32>, +// CHECK-SAME: %[[ARG_1:.*]]: vector<[4]xf32>) -> vector<3x[4]xf32> { +// CHECK: %[[ADDF:.*]] = arith.addf %[[ARG_0]], %[[ARG_1]] : 
vector<[4]xf32> +// CHECK: %[[BCAST:.*]] = vector.broadcast %[[ADDF]] : vector<[4]xf32> to vector<3x[4]xf32> +// CHECK: return %[[BCAST]] : vector<3x[4]xf32> + +func.func @broadcast_vector_scalable(%arg1: vector<[4]xf32>, %arg2: vector<[4]xf32>) -> vector<3x[4]xf32> { + %arg1_bcast = vector.broadcast %arg1 : vector<[4]xf32> to vector<3x[4]xf32> + %arg2_bcast = vector.broadcast %arg2 : vector<[4]xf32> to vector<3x[4]xf32> + %2 = arith.addf %arg1_bcast, %arg2_bcast : vector<3x[4]xf32> + return %2 : vector<3x[4]xf32> +} + // ----- // CHECK-LABEL: func.func @broadcast_scalar_and_vec( @@ -53,13 +97,27 @@ func.func @broadcast_vector( %arg1: vector<4xf32>, %arg2: vector<4xf32>) -> vect // CHECK: %[[BCAST:.*]] = vector.broadcast %[[ARG2]] : vector<4xindex> to vector<1x4xindex> // CHECK: %[[ADD:.*]] = arith.addi %[[SPLAT]], %[[BCAST]] : vector<1x4xindex> // CHECK: return %[[ADD]] : vector<1x4xindex> -func.func @broadcast_scalar_and_vec( %arg1: index, %arg2: vector<4xindex>) -> vector<1x4xindex> { +func.func @broadcast_scalar_and_vec(%arg1: index, %arg2: vector<4xindex>) -> vector<1x4xindex> { %0 = vector.splat %arg1 : vector<1x4xindex> %1 = vector.broadcast %arg2 : vector<4xindex> to vector<1x4xindex> %2 = arith.addi %0, %1 : vector<1x4xindex> return %2 : vector<1x4xindex> } +// CHECK-LABEL: func.func @broadcast_scalar_and_vec_scalable( +// CHECK-SAME: %[[ARG1:.*]]: index, +// CHECK-SAME: %[[ARG2:.*]]: vector<[4]xindex>) -> vector<1x[4]xindex> { +// CHECK: %[[SPLAT:.*]] = vector.splat %[[ARG1]] : vector<1x[4]xindex> +// CHECK: %[[BCAST:.*]] = vector.broadcast %[[ARG2]] : vector<[4]xindex> to vector<1x[4]xindex> +// CHECK: %[[ADD:.*]] = arith.addi %[[SPLAT]], %[[BCAST]] : vector<1x[4]xindex> +// CHECK: return %[[ADD]] : vector<1x[4]xindex> +func.func @broadcast_scalar_and_vec_scalable(%arg1: index, %arg2: vector<[4]xindex>) -> vector<1x[4]xindex> { + %0 = vector.splat %arg1 : vector<1x[4]xindex> + %1 = vector.broadcast %arg2 : vector<[4]xindex> to vector<1x[4]xindex> + %2 = 
arith.addi %0, %1 : vector<1x[4]xindex> + return %2 : vector<1x[4]xindex> +} + // ----- // CHECK-LABEL: func.func @broadcast_vector_and_scalar( @@ -69,12 +127,25 @@ func.func @broadcast_scalar_and_vec( %arg1: index, %arg2: vector<4xindex>) -> ve // CHECK: %[[ADD:.*]] = arith.addi %[[BCAST]], %[[ARG_1]] : vector<4xi32> // CHECK: return %[[ADD]] : vector<4xi32> -func.func @broadcast_vector_and_scalar( %arg1: i32, %arg2: vector<4xi32>) -> vector<4xi32> { +func.func @broadcast_vector_and_scalar(%arg1: i32, %arg2: vector<4xi32>) -> vector<4xi32> { %arg1_bcast = vector.broadcast %arg1 : i32 to vector<4xi32> %2 = arith.addi %arg1_bcast, %arg2 : vector<4xi32> return %2 : vector<4xi32> } +// CHECK-LABEL: func.func @broadcast_vector_and_scalar_scalable( +// CHECK-SAME: %[[ARG_0:.*]]: i32, +// CHECK-SAME: %[[ARG_1:.*]]: vector<[4]xi32>) -> vector<[4]xi32> { +// CHECK: %[[BCAST:.*]] = vector.broadcast %[[ARG_0]] : i32 to vector<[4]xi32> +// CHECK: %[[ADD:.*]] = arith.addi %[[BCAST]], %[[ARG_1]] : vector<[4]xi32> +// CHECK: return %[[ADD]] : vector<[4]xi32> + +func.func @broadcast_vector_and_scalar_scalable(%arg1: i32, %arg2: vector<[4]xi32>) -> vector<[4]xi32> { + %arg1_bcast = vector.broadcast %arg1 : i32 to vector<[4]xi32> + %2 = arith.addi %arg1_bcast, %arg2 : vector<[4]xi32> + return %2 : vector<[4]xi32> +} + // ----- #matmat_accesses = [ @@ -87,12 +158,12 @@ func.func @broadcast_vector_and_scalar( %arg1: i32, %arg2: vector<4xi32>) -> vec iterator_types = ["parallel", "parallel", "reduction"] } -// CHECK-LABEL: func.func @broadcast_not_elementwise() -> vector<2x2xf32> { -// CHECK-DAG: %[[VAL_0:.*]] = arith.constant dense<1.000000e+00> : vector<2x2xf32> -// CHECK-DAG: %[[VAL_1:.*]] = arith.constant dense<2.000000e+00> : vector<2x2xf32> -// CHECK-DAG: %[[VAL_2:.*]] = arith.constant dense<3.000000e+00> : vector<2x2xf32> -// CHECK: %[[VAL_3:.*]] = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} 
%[[VAL_0]], %[[VAL_1]], %[[VAL_2]] : vector<2x2xf32>, vector<2x2xf32> into vector<2x2xf32> -func.func @broadcast_not_elementwise() -> vector<2x2xf32> { +// CHECK-LABEL: func.func @negative_not_elementwise +// CHECK-DAG: %[[F1:.*]] = arith.constant dense<1.000000e+00> : vector<2x2xf32> +// CHECK-DAG: %[[F2:.*]] = arith.constant dense<2.000000e+00> : vector<2x2xf32> +// CHECK-DAG: %[[F3:.*]] = arith.constant dense<3.000000e+00> : vector<2x2xf32> +// CHECK: %[[RES:.*]] = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} %[[F1]], %[[F2]], %[[F3]] : vector<2x2xf32>, vector<2x2xf32> into vector<2x2xf32> +func.func @negative_not_elementwise() -> vector<2x2xf32> { %f1 = arith.constant 1.0: f32 %f2 = arith.constant 2.0: f32 %f3 = arith.constant 3.0: f32 @@ -100,27 +171,39 @@ func.func @broadcast_not_elementwise() -> vector<2x2xf32> { %A = vector.broadcast %f1 : f32 to vector<2x2xf32> %B = vector.broadcast %f2 : f32 to vector<2x2xf32> %C = vector.broadcast %f3 : f32 to vector<2x2xf32> - %mm1 = vector.contract #matmat_trait %A, %B, %C + %res = vector.contract #matmat_trait %A, %B, %C : vector<2x2xf32>, vector<2x2xf32> into vector<2x2xf32> - return %mm1 : vector<2x2xf32> + return %res : vector<2x2xf32> } -// CHECK-LABEL: func.func @dont_sink_cmp( +// ----- + +// The source and the result for arith.cmp have different types - not supported + +// CHECK-LABEL: func.func @negative_source_and_result_mismatch // CHECK: %[[BROADCAST:.+]] = vector.broadcast // CHECK: %[[RETURN:.+]] = arith.cmpf uno, %[[BROADCAST]], %[[BROADCAST]] // CHECK: return %[[RETURN]] -func.func @dont_sink_cmp(%arg0 : f32, %arg1 : vector<1xf32>) -> vector<1xi1> { +func.func @negative_source_and_result_mismatch(%arg0 : f32, %arg1 : vector<1xf32>) -> vector<1xi1> { %0 = vector.broadcast %arg0 : f32 to vector<1xf32> %1 = arith.cmpf uno, %0, %0 : vector<1xf32> return %1 : vector<1xi1> } -// CHECK-LABEL: func.func @dont_sink_fma( +// 
----- + +// vector.fma only supports vectors - currently it's not possible to replace this with e.g.: +// %scalar_res = vector.fma %scalar_1, %scalar2 +// %vec_res = vector.broadcast %scalar_res +// +// TODO: It should be possible to support this case + +// CHECK-LABEL: func.func @negative_op_only_supports_vectors // CHECK: %[[BROADCAST:.+]] = vector.broadcast // CHECK: %[[RESULT:.+]] = vector.fma %[[BROADCAST]] // CHECK: return %[[RESULT]] -func.func @dont_sink_fma(%arg0 : f32) -> vector<1xf32> { +func.func @negative_op_only_supports_vectors(%arg0 : f32) -> vector<1xf32> { %0 = vector.broadcast %arg0 : f32 to vector<1xf32> %1 = vector.fma %0, %0, %0 : vector<1xf32> return %1 : vector<1xf32> diff --git a/mlir/test/Dialect/Vector/vector-reduce-to-contract.mlir b/mlir/test/Dialect/Vector/vector-reduce-to-contract.mlir index 23a44b7c03f8f4..c0dbea81df892a 100644 --- a/mlir/test/Dialect/Vector/vector-reduce-to-contract.mlir +++ b/mlir/test/Dialect/Vector/vector-reduce-to-contract.mlir @@ -246,8 +246,12 @@ func.func @contract_broadcast_would_have_no_reduction_dim_pair(%arg0 : vector<1x //===----------------------------------------------------------------------===// +// [Pattern: ReorderCastOpsOnBroadcast] +// // Reorder casting ops and vector ops. The casting ops have almost identical // pattern, so only arith.extsi op is tested. 
+// +// TODO: Potential duplication with sink-vector-broadcast.mlir //===----------------------------------------------------------------------===// // ----- @@ -272,6 +276,11 @@ func.func @broadcast_scalar_extsi(%a : i8) -> vector<2x4xi32> { // ----- +//===----------------------------------------------------------------------===// +// [Pattern: ReorderElementwiseOpsOnTranspose] +// +// TODO: Potential duplication with sink-vector-broadcast.mlir +//===----------------------------------------------------------------------===// func.func @transpose_extsi(%a : vector<4x2xi8>) -> vector<2x4xi32> { // CHECK: %[[EXT:.+]] = arith.extsi %{{.+}} : vector<4x2xi8> to vector<4x2xi32> // CHECK: vector.transpose %[[EXT]], [1, 0] : vector<4x2xi32> to vector<2x4xi32> @@ -282,6 +291,7 @@ func.func @transpose_extsi(%a : vector<4x2xi8>) -> vector<2x4xi32> { //===----------------------------------------------------------------------===// // Reorder elementwise ops and vector ops. +// TODO: Potential duplication with sink-vector-broadcast.mlir //===----------------------------------------------------------------------===// // ----- diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir index bf90c4a6ebb3c2..0544cef3e38281 100644 --- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir +++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir @@ -620,6 +620,40 @@ func.func @vector_reduction(%laneid: index) -> (f32) { // ----- +// CHECK-PROP-LABEL: func @warp_distribute( +// CHECK-PROP-SAME: %[[ID:[a-zA-Z0-9]+]] +// CHECK-PROP-SAME: %[[SRC:[a-zA-Z0-9]+]] +// CHECK-PROP-SAME: %[[DEST:[a-zA-Z0-9]+]] +// CHECK-PROP: vector.warp_execute_on_lane_0(%[[ID]])[32] +// CHECK-PROP-NEXT: "some_def"() : () -> vector<4096xf32> +// CHECK-PROP-NEXT: %{{.*}} = vector.reduction +// CHECK-PROP: %[[DEF:.*]] = arith.divf %{{.*}}, %{{.*}} : vector<1xf32> +// CHECK-PROP-NOT: vector.warp_execute_on_lane_0 +// CHECK-PROP: scf.for +// 
CHECK-PROP: %{{.*}} = arith.subf %{{.*}}, %[[DEF]] : vector<1xf32> +func.func @warp_distribute(%arg0: index, %src: memref<128xf32>, %dest: memref<128xf32>){ + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %f0 = arith.constant 0.000000e+00 : f32 + vector.warp_execute_on_lane_0(%arg0)[32]{ + %cst_1 = arith.constant dense<2.621440e+05> : vector<1xf32> + %0 = "some_def"() : () -> (vector<4096xf32>) + %1 = vector.reduction , %0, %cst : vector<4096xf32> into f32 + %2 = vector.broadcast %1 : f32 to vector<1xf32> + %3 = arith.divf %2, %cst_1 : vector<1xf32> + scf.for %arg1 = %c0 to %c128 step %c1 { + %4 = vector.transfer_read %src[%arg1], %f0 {in_bounds = [true]} : memref<128xf32>, vector<1xf32> + %5 = arith.subf %4, %3 : vector<1xf32> + vector.transfer_write %5, %dest[%arg1] : vector<1xf32>, memref<128xf32> + } + } + return +} + +// ----- + func.func @vector_reduction(%laneid: index, %m0: memref<4x2x32xf32>, %m1: memref) { %c0 = arith.constant 0: index %f0 = arith.constant 0.0: f32 diff --git a/mlir/test/IR/elements-attr-interface.mlir b/mlir/test/IR/elements-attr-interface.mlir index 5234c81bd841e3..79283f1aae99a8 100644 --- a/mlir/test/IR/elements-attr-interface.mlir +++ b/mlir/test/IR/elements-attr-interface.mlir @@ -1,7 +1,7 @@ // RUN: mlir-opt %s -test-elements-attr-interface -verify-diagnostics // Parsing external resources does not work on big-endian platforms currently -// XFAIL: target=s390x-{{.*}} +// XFAIL: target={{(s390x|sparc.*)-.*}} // This test contains various `ElementsAttr` attributes, and tests the support // for iterating the values of these attributes using various native C++ types. 
diff --git a/mlir/test/Target/LLVMIR/llvmir-le-specific.mlir b/mlir/test/Target/LLVMIR/llvmir-le-specific.mlir index f8d082082117cb..98145bc35cba77 100644 --- a/mlir/test/Target/LLVMIR/llvmir-le-specific.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-le-specific.mlir @@ -1,7 +1,7 @@ // RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s // Decoding the attribute does not work on big-endian platforms currently -// XFAIL: target=s390x-{{.*}} +// XFAIL: target={{(s390x|sparc.*)-.*}} // CHECK{LITERAL}: @dense_resource_tensor_constant = internal constant [5 x float] [float 0x3FCA034080000000, float 0xBFD0466300000000, float 0xBFD75DDF80000000, float 0xBFDE074F40000000, float 0x3FDDD3A1C0000000] llvm.mlir.global internal constant @dense_resource_tensor_constant(dense_resource : tensor<5xf32>) : !llvm.array<5 x f32> @@ -24,4 +24,4 @@ llvm.mlir.global internal constant @dense_resource_multidim_vector_constant(dens dense_resource_test_2x2xf32: "0x0800000054A3B53ED6C0B33E55D1A2BDE5D2BB3E" } } -#-} \ No newline at end of file +#-} diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 78c3987fab648e..64bcb5bdb255db 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -62,6 +62,20 @@ llvm.func @kernel_func_no_uniform_work_groups() attributes {rocdl.kernel, rocdl. 
llvm.return } +llvm.func @kernel_func_waves_per_eu() + attributes {rocdl.kernel, rocdl.waves_per_eu = 2 : i32} { + // CHECK-LABEL: amdgpu_kernel void @kernel_func_waves_per_eu() + // CHECK: #[[$KERNEL_WAVES_PER_EU_ATTR:[0-9]+]] + llvm.return +} + +llvm.func @kernel_func_unsafe_fp_atomics() + attributes {rocdl.kernel, rocdl.unsafe_fp_atomics = true} { + // CHECK-LABEL: amdgpu_kernel void @kernel_func_unsafe_fp_atomics() + // CHECK: #[[$KERNEL_UNSAFE_FP_ATOMICS_ATTR:[0-9]+]] + llvm.return +} + llvm.func @rocdl.lane_id() -> i32 { // CHECK: [[mbcntlo:%.+]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) // CHECK-NEXT: call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[mbcntlo]]) @@ -521,3 +535,5 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 { // CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128" // CHECK-DAG: attributes #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="false" } // CHECK-DAG: ![[$REQD_WORK_GROUP_SIZE]] = !{i32 16, i32 4, i32 2} +// CHECK-DAG: attributes #[[$KERNEL_WAVES_PER_EU_ATTR]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" "uniform-work-group-size"="true" } +// CHECK-DAG: attributes #[[$KERNEL_UNSAFE_FP_ATOMICS_ATTR]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-unsafe-fp-atomics"="true" "uniform-work-group-size"="true" } diff --git a/mlir/test/Transforms/loop-invariant-code-motion.mlir b/mlir/test/Transforms/loop-invariant-code-motion.mlir index dcc314f36ae0a8..47a49465e8a7cd 100644 --- a/mlir/test/Transforms/loop-invariant-code-motion.mlir +++ b/mlir/test/Transforms/loop-invariant-code-motion.mlir @@ -923,6 +923,120 @@ func.func @speculate_ceildivsi_const( return } +func.func @no_speculate_divui_range( +// CHECK-LABEL: @no_speculate_divui_range( + %num: i8, %lb: index, %ub: index, %step: index) { + %denom = test.with_bounds {smax = 127 : i8, smin = -128 : i8, umax = 255 : i8, umin = 0 : i8} : i8 + 
scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK: arith.divui + %val = arith.divui %num, %denom : i8 + } + + return +} + +func.func @no_speculate_divsi_range( +// CHECK-LABEL: @no_speculate_divsi_range( + %num: i8, %lb: index, %ub: index, %step: index) { + %denom0 = test.with_bounds {smax = -1: i8, smin = -128 : i8, umax = 255 : i8, umin = 0 : i8} : i8 + %denom1 = test.with_bounds {smax = 127 : i8, smin = 0 : i8, umax = 255 : i8, umin = 0 : i8} : i8 + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK-COUNT-2: arith.divsi + %val0 = arith.divsi %num, %denom0 : i8 + %val1 = arith.divsi %num, %denom1 : i8 + } + + return +} + +func.func @no_speculate_ceildivui_range( +// CHECK-LABEL: @no_speculate_ceildivui_range( + %num: i8, %lb: index, %ub: index, %step: index) { + %denom = test.with_bounds {smax = 127 : i8, smin = -128 : i8, umax = 255 : i8, umin = 0 : i8} : i8 + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK: arith.ceildivui + %val = arith.ceildivui %num, %denom : i8 + } + + return +} + +func.func @no_speculate_ceildivsi_range( +// CHECK-LABEL: @no_speculate_ceildivsi_range( + %num: i8, %lb: index, %ub: index, %step: index) { + %denom0 = test.with_bounds {smax = -1 : i8, smin = -128 : i8, umax = 255 : i8, umin = 0 : i8} : i8 + %denom1 = test.with_bounds {smax = 127 : i8, smin = 0 : i8, umax = 255 : i8, umin = 0 : i8} : i8 + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK-COUNT-2: arith.ceildivsi + %val0 = arith.ceildivsi %num, %denom0 : i8 + %val1 = arith.ceildivsi %num, %denom1 : i8 + } + + return +} + +func.func @speculate_divui_range( +// CHECK-LABEL: @speculate_divui_range( + %num: i8, %lb: index, %ub: index, %step: index) { + %denom = test.with_bounds {smax = 127 : i8, smin = -128 : i8, umax = 255 : i8, umin = 1 : i8} : i8 + scf.for %i = %lb to %ub step %step { +// CHECK: arith.divui +// CHECK: scf.for + %val = arith.divui %num, %denom : i8 + } + + return +} + +func.func 
@speculate_divsi_range( +// CHECK-LABEL: @speculate_divsi_range( + %num: i8, %lb: index, %ub: index, %step: index) { + %denom0 = test.with_bounds {smax = 127 : i8, smin = 1 : i8, umax = 255 : i8, umin = 0 : i8} : i8 + %denom1 = test.with_bounds {smax = -2 : i8, smin = -128 : i8, umax = 255 : i8, umin = 0 : i8} : i8 + scf.for %i = %lb to %ub step %step { +// CHECK-COUNT-2: arith.divsi +// CHECK: scf.for + %val0 = arith.divsi %num, %denom0 : i8 + %val1 = arith.divsi %num, %denom1 : i8 + + } + + return +} + +func.func @speculate_ceildivui_range( +// CHECK-LABEL: @speculate_ceildivui_range( + %num: i8, %lb: index, %ub: index, %step: index) { + %denom = test.with_bounds {smax = 127 : i8, smin = -128 : i8, umax = 255 : i8, umin = 1 : i8} : i8 + scf.for %i = %lb to %ub step %step { +// CHECK: arith.ceildivui +// CHECK: scf.for + %val = arith.ceildivui %num, %denom : i8 + } + + return +} + +func.func @speculate_ceildivsi_range( +// CHECK-LABEL: @speculate_ceildivsi_range( + %num: i8, %lb: index, %ub: index, %step: index) { + %denom0 = test.with_bounds {smax = 127 : i8, smin = 1 : i8, umax = 255 : i8, umin = 0 : i8} : i8 + %denom1 = test.with_bounds {smax = -2 : i8, smin = -128 : i8, umax = 255 : i8, umin = 0 : i8} : i8 + scf.for %i = %lb to %ub step %step { +// CHECK-COUNT-2: arith.ceildivsi +// CHECK: scf.for + %val0 = arith.ceildivsi %num, %denom0 : i8 + %val1 = arith.ceildivsi %num, %denom1 : i8 + + } + + return +} + // ----- func.func @speculate_static_pack_and_unpack(%source: tensor<128x256xf32>, diff --git a/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp b/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp index ae4f77f5873e2b..34de600132f5de 100644 --- a/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp +++ b/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp @@ -72,6 +72,11 @@ struct TestTensorTransforms llvm::cl::desc("Test folding of expand_shape/collapse_shape"), llvm::cl::init(false)}; + Option testBubbleUpExpandShapePatterns{ + *this, 
"test-expand-shape-bubbling", + llvm::cl::desc("Test folding of expand_shape/collapse_shape"), + llvm::cl::init(false)}; + Option testFoldIntoPackAndUnpack{ *this, "test-fold-into-pack-and-unpack", llvm::cl::desc("Test folding ops into tensor.pack and tensor.unpack"), @@ -102,6 +107,12 @@ static void applyReassociativeReshapeFoldingPatterns(Operation *rootOp) { (void)applyPatternsAndFoldGreedily(rootOp, std::move(patterns)); } +static void applyBubbleUpExpandShapePatterns(Operation *rootOp) { + RewritePatternSet patterns(rootOp->getContext()); + tensor::populateBubbleUpExpandShapePatterns(patterns); + (void)applyPatternsAndFoldGreedily(rootOp, std::move(patterns)); +} + static void applyFoldIntoPackAndUnpackPatterns(Operation *rootOp) { RewritePatternSet patterns(rootOp->getContext()); tensor::populateFoldIntoPackAndUnpackPatterns(patterns); @@ -386,6 +397,8 @@ void TestTensorTransforms::runOnOperation() { applyDropRedundantInsertSliceRankExpansionPatterns(rootOp); if (testReassociativeReshapeFolding) applyReassociativeReshapeFoldingPatterns(rootOp); + if (testBubbleUpExpandShapePatterns) + applyBubbleUpExpandShapePatterns(rootOp); if (testFoldIntoPackAndUnpack) applyFoldIntoPackAndUnpackPatterns(rootOp); if (testRewriteExtractSliceWithTiledCollapseShape) { diff --git a/mlir/utils/vscode/package.json b/mlir/utils/vscode/package.json index bd550e2b6e61e2..6d0f6f5c88adb8 100644 --- a/mlir/utils/vscode/package.json +++ b/mlir/utils/vscode/package.json @@ -2,7 +2,7 @@ "name": "vscode-mlir", "displayName": "MLIR", "description": "MLIR Language Extension", - "version": "0.0.11", + "version": "0.0.12", "publisher": "llvm-vs-code-extensions", "homepage": "https://mlir.llvm.org/", "icon": "icon.png", @@ -47,7 +47,7 @@ "@types/vscode": "~1.67.0", "@vscode/vsce": "^2.19.0", "clang-format": "^1.8.0", - "typescript": "^4.6.4", + "typescript": "^4.9.5", "vscode-test": "^1.3.0" }, "repository": { @@ -155,6 +155,11 @@ "type": "string", "description": "The file path of the 
mlir-lsp-server executable." }, + "mlir.mlir_additional_server_args": { + "scope": "resource", + "type": "array", + "description": "A list of additional arguments for mlir-lsp-server executable. E.g. --log=verbose." + }, "mlir.pdll_server_path": { "scope": "resource", "type": "string", @@ -165,6 +170,11 @@ "type": "array", "description": "A list of `pdll_compile_commands.yml` database files containing information about .pdll files processed by the server." }, + "mlir.pdll_additional_server_args": { + "scope": "resource", + "type": "array", + "description": "A list of additional arguments for pdll-lsp-server executable. E.g. --log=verbose." + }, "mlir.tablegen_server_path": { "scope": "resource", "type": "string", @@ -175,6 +185,11 @@ "type": "array", "description": "A list of `tablegen_compile_commands.yml` database files containing information about .td files processed by the server." }, + "mlir.tablegen_additional_server_args": { + "scope": "resource", + "type": "array", + "description": "A list of additional arguments for tblgen-lsp-server executable. E.g. --log=verbose." + }, "mlir.onSettingsChanged": { "type": "string", "default": "prompt", diff --git a/mlir/utils/vscode/src/mlirContext.ts b/mlir/utils/vscode/src/mlirContext.ts index c7b6de6322d27f..e12aa92522d083 100644 --- a/mlir/utils/vscode/src/mlirContext.ts +++ b/mlir/utils/vscode/src/mlirContext.ts @@ -176,6 +176,7 @@ export class MLIRContext implements vscode.Disposable { let configsToWatch: string[] = []; let filepathsToWatch: string[] = []; let additionalServerArgs: string[] = []; + additionalServerArgs = config.get(languageName + "_additional_server_args", null, []); // Initialize additional configurations for this server. 
if (languageName === 'pdll') { diff --git a/offload/DeviceRTL/include/Allocator.h b/offload/DeviceRTL/include/Allocator.h index a28eb0fb2977ea..23e0106c80a2c8 100644 --- a/offload/DeviceRTL/include/Allocator.h +++ b/offload/DeviceRTL/include/Allocator.h @@ -39,6 +39,11 @@ void free(void *Ptr); } // namespace ompx +extern "C" { +[[gnu::weak]] void *malloc(size_t Size); +[[gnu::weak]] void free(void *Ptr); +} + #pragma omp end declare target #endif diff --git a/offload/DeviceRTL/include/Types.h b/offload/DeviceRTL/include/Types.h index 2e12d9da0353b7..cd8f925a392a80 100644 --- a/offload/DeviceRTL/include/Types.h +++ b/offload/DeviceRTL/include/Types.h @@ -188,7 +188,7 @@ typedef enum omp_allocator_handle_t { omp_cgroup_mem_alloc = 6, omp_pteam_mem_alloc = 7, omp_thread_mem_alloc = 8, - KMP_ALLOCATOR_MAX_HANDLE = ~(0U) + KMP_ALLOCATOR_MAX_HANDLE = ~(0LU) } omp_allocator_handle_t; #define __PRAGMA(STR) _Pragma(#STR) diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp index c24af9442d16e3..ce4a221bdb37dd 100644 --- a/offload/DeviceRTL/src/Misc.cpp +++ b/offload/DeviceRTL/src/Misc.cpp @@ -9,6 +9,7 @@ // //===----------------------------------------------------------------------===// +#include "Allocator.h" #include "Configuration.h" #include "Types.h" @@ -128,6 +129,33 @@ double omp_get_wtime(void) { return ompx::impl::getWTime(); } void *__llvm_omp_indirect_call_lookup(void *HstPtr) { return ompx::impl::indirectCallLookup(HstPtr); } + +void *omp_alloc(size_t size, omp_allocator_handle_t allocator) { + switch (allocator) { + case omp_default_mem_alloc: + case omp_large_cap_mem_alloc: + case omp_const_mem_alloc: + case omp_high_bw_mem_alloc: + case omp_low_lat_mem_alloc: + return malloc(size); + default: + return nullptr; + } +} + +void omp_free(void *ptr, omp_allocator_handle_t allocator) { + switch (allocator) { + case omp_default_mem_alloc: + case omp_large_cap_mem_alloc: + case omp_const_mem_alloc: + case omp_high_bw_mem_alloc: + case 
omp_low_lat_mem_alloc: + free(ptr); + case omp_null_allocator: + default: + return; + } +} } ///} diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp index a1e4fa2449d9a2..f43f2cedb431d0 100644 --- a/offload/DeviceRTL/src/State.cpp +++ b/offload/DeviceRTL/src/State.cpp @@ -53,12 +53,12 @@ namespace { extern "C" { #ifdef __AMDGPU__ -[[gnu::weak]] void *malloc(uint64_t Size) { return allocator::alloc(Size); } +[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); } [[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); } #else -[[gnu::weak, gnu::leaf]] void *malloc(uint64_t Size); +[[gnu::weak, gnu::leaf]] void *malloc(size_t Size); [[gnu::weak, gnu::leaf]] void free(void *Ptr); #endif diff --git a/offload/test/api/omp_device_alloc.c b/offload/test/api/omp_device_alloc.c new file mode 100644 index 00000000000000..368c6cfe42949b --- /dev/null +++ b/offload/test/api/omp_device_alloc.c @@ -0,0 +1,20 @@ +// RUN: %libomptarget-compile-run-and-check-generic + +#include +#include +#include + +int main() { +#pragma omp target teams num_teams(4) +#pragma omp parallel + { + int *ptr = (int *)omp_alloc(sizeof(int), omp_default_mem_alloc); + assert(ptr && "Ptr is (null)!"); + *ptr = 1; + assert(*ptr == 1 && "Ptr is not 1"); + omp_free(ptr, omp_default_mem_alloc); + } + + // CHECK: PASS + printf("PASS\n"); +} diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst index ed002c8cf0f807..951c651f42f29d 100644 --- a/openmp/docs/design/Runtimes.rst +++ b/openmp/docs/design/Runtimes.rst @@ -1496,6 +1496,14 @@ clause. Examples for both are given below. $ clang++ -fopenmp --offload-arch=gfx90a -O3 shared.c $ env ./shared +.. _libomptarget_device_allocator: + +Device Allocation +^^^^^^^^^^^^^^^^^ + +The device runtime supports basic runtime allocation via the ``omp_alloc`` +function. Currently, this allocates global memory for all default traits. 
Access +modifiers are currently not supported and return a null pointer. .. _libomptarget_device_debugging: diff --git a/utils/bazel/llvm-project-overlay/llvm/utils/lit/tests/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/utils/lit/tests/BUILD.bazel index b9e0a2e153ac1e..13f6f815d39950 100644 --- a/utils/bazel/llvm-project-overlay/llvm/utils/lit/tests/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/utils/lit/tests/BUILD.bazel @@ -33,5 +33,8 @@ expand_template( "//llvm:not", ] + glob(["Inputs/**"]), ) - for src in glob(["*/*.py"]) + for src in glob( + ["*/*.py"], + exclude = ["Inputs/**"], + ) ] diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 51fd6003300c05..5a7109a75088ee 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -342,6 +342,7 @@ cc_library( "include/mlir/IR/PDLPatternMatch.h.inc", "include/mlir/Interfaces/CallInterfaces.h", "include/mlir/Interfaces/DataLayoutInterfaces.h", + "include/mlir/Interfaces/InferIntRangeInterface.h", "include/mlir/Interfaces/SideEffectInterfaces.h", ], hdrs = glob([ @@ -362,6 +363,7 @@ cc_library( ":BytecodeOpInterfaceIncGen", ":CallOpInterfacesIncGen", ":DataLayoutInterfacesIncGen", + ":InferIntRangeInterfaceIncGen", ":OpAsmInterfaceIncGen", ":RegionKindInterfaceIncGen", ":SideEffectInterfacesIncGen", @@ -5422,7 +5424,10 @@ cc_library( hdrs = glob(["include/mlir/Dialect/LLVMIR/Transforms/*.h"]), includes = ["include"], deps = [ + ":DataLayoutInterfaces", + ":Analysis", ":FuncDialect", + ":InliningUtils", ":IR", ":LLVMDialect", ":LLVMPassIncGen",