IR: Converts base IR operations to store OpSize sizes
NFC

Finally converts the IR operations themselves to store the OpSize for
the IR operation size and element sizes.
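For context, the heart of the change is that IROp_Header now carries typed `IR::OpSize` fields instead of raw `uint8_t` byte counts. A minimal sketch of the enum and the conversion helpers the hunks below rely on, assuming byte-valued enumerators (names match the diff; the exact values and full enumerator list live in FEXCore's IR headers):

#include <cstdint>

namespace IR {
// Byte-denominated size enum (illustrative subset; values assumed).
enum class OpSize : uint8_t {
  iUnsized = 0,
  i8Bit = 1,
  i16Bit = 2,
  i32Bit = 4,
  i64Bit = 8,
  i128Bit = 16,
};

// Conversion helpers as used throughout the hunks below.
constexpr uint8_t OpSizeToSize(OpSize Size) {
  return static_cast<uint8_t>(Size); // typed size -> byte count
}
constexpr OpSize SizeToOpSize(uint8_t Size) {
  return static_cast<OpSize>(Size); // byte count -> typed size
}
constexpr uint16_t OpSizeAsBits(OpSize Size) {
  return static_cast<uint16_t>(OpSizeToSize(Size) * 8u); // e.g. for ubfm widths
}
constexpr uint8_t NumElements(OpSize RegisterSize, OpSize ElementSize) {
  return OpSizeToSize(RegisterSize) / OpSizeToSize(ElementSize); // vector lanes
}
} // namespace IR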

This also finally, FINALLY, converts the remaining `_Constant` helper
to stop taking its size in bits; it now takes an OpSize like all the
other IR op handlers. That thing was so confusing and now it's gone.
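As a concrete before/after, taken from the shift-immediate handlers in this diff (here `Size` is `GetSrcBitSize(Op)`, a bit count, so the old call mixed units):

// Before: the constant's size was given in bits, unlike every other helper.
Ref Src = _Constant(Size, Shift);

// After: the size is the typed, byte-denominated IR::OpSize of the source.
Ref Src = _Constant(OpSizeFromSrc(Op), Shift);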
Sonicadvance1 committed Oct 29, 2024
1 parent 493b952 commit 82f936c
Showing 13 changed files with 275 additions and 239 deletions.
20 changes: 10 additions & 10 deletions FEXCore/Scripts/json_ir_generator.py
@@ -323,8 +323,8 @@ def print_ir_structs(defines):
output_file.write("struct __attribute__((packed)) IROp_Header {\n")
output_file.write("\tvoid* Data[0];\n")
output_file.write("\tIROps Op;\n\n")
output_file.write("\tuint8_t Size;\n")
output_file.write("\tuint8_t ElementSize;\n")
output_file.write("\tIR::OpSize Size;\n")
output_file.write("\tIR::OpSize ElementSize;\n")

output_file.write("\ttemplate<typename T>\n")
output_file.write("\tT const* C() const { return reinterpret_cast<T const*>(Data); }\n")
@@ -630,20 +630,20 @@ def print_ir_allocator_helpers():
output_file.write("\t\treturn IRPair<T>{Op, CreateNode(&Op->Header)};\n")
output_file.write("\t}\n\n")

output_file.write("\tuint8_t GetOpSize(const OrderedNode *Op) const {\n")
output_file.write("\tIR::OpSize GetOpSize(const OrderedNode *Op) const {\n")
output_file.write("\t\tauto HeaderOp = Op->Header.Value.GetNode(DualListData.DataBegin());\n")
output_file.write("\t\treturn HeaderOp->Size;\n")
output_file.write("\t\treturn IR::SizeToOpSize(HeaderOp->Size);\n")
output_file.write("\t}\n\n")

output_file.write("\tuint8_t GetOpElementSize(const OrderedNode *Op) const {\n")
output_file.write("\tIR::OpSize GetOpElementSize(const OrderedNode *Op) const {\n")
output_file.write("\t\tauto HeaderOp = Op->Header.Value.GetNode(DualListData.DataBegin());\n")
output_file.write("\t\treturn HeaderOp->ElementSize;\n")
output_file.write("\t\treturn IR::SizeToOpSize(HeaderOp->ElementSize);\n")
output_file.write("\t}\n\n")

output_file.write("\tuint8_t GetOpElements(const OrderedNode *Op) const {\n")
output_file.write("\t\tauto HeaderOp = Op->Header.Value.GetNode(DualListData.DataBegin());\n")
output_file.write("\t\tLOGMAN_THROW_A_FMT(OpHasDest(Op), \"Op {} has no dest\\n\", GetName(HeaderOp->Op));\n")
output_file.write("\t\treturn HeaderOp->Size / HeaderOp->ElementSize;\n")
output_file.write("\t\treturn IR::OpSizeToSize(GetOpSize(Op)) / IR::OpSizeToSize(GetOpElementSize(Op));\n")
output_file.write("\t}\n\n")

output_file.write("\tbool OpHasDest(const OrderedNode *Op) const {\n")
@@ -728,11 +728,11 @@ def print_ir_allocator_helpers():
# We can only infer a size if we have arguments
if op.DestSize == None:
# We need to infer destination size
output_file.write("\t\tuint8_t InferSize = 0;\n")
output_file.write("\t\tIR::OpSize InferSize = OpSize::iUnsized;\n")
if len(op.Arguments) != 0:
for arg in op.Arguments:
if arg.IsSSA:
output_file.write("\t\tuint8_t Size{} = GetOpSize({});\n".format(arg.Name, arg.Name))
output_file.write("\t\tauto Size{} = GetOpSize({});\n".format(arg.Name, arg.Name))
for arg in op.Arguments:
if arg.IsSSA:
output_file.write("\t\tInferSize = std::max(InferSize, Size{});\n".format(arg.Name))
@@ -745,7 +745,7 @@
output_file.write("\t\t_Op.first->Header.Size = {};\n".format(op.DestSize))

if op.NumElements == None:
output_file.write("\t\t_Op.first->Header.ElementSize = _Op.first->Header.Size / ({});\n".format(1))
output_file.write("\t\t_Op.first->Header.ElementSize = _Op.first->Header.Size;\n")
else:
output_file.write("\t\t_Op.first->Header.ElementSize = _Op.first->Header.Size / ({});\n".format(op.NumElements))

27 changes: 15 additions & 12 deletions FEXCore/Source/Interface/Core/JIT/AtomicOps.cpp
@@ -13,7 +13,7 @@ namespace FEXCore::CPU {
#define DEF_OP(x) void Arm64JITCore::Op_##x(IR::IROp_Header const* IROp, IR::NodeID Node)
DEF_OP(CASPair) {
auto Op = IROp->C<IR::IROp_CASPair>();
- LOGMAN_THROW_AA_FMT(IROp->ElementSize == 4 || IROp->ElementSize == 8, "Wrong element size");
+ LOGMAN_THROW_AA_FMT(IROp->ElementSize == IR::OpSize::i32Bit || IROp->ElementSize == IR::OpSize::i64Bit, "Wrong element size");
// Size is the size of each pair element
auto Dst0 = GetReg(Op->OutLo.ID());
auto Dst1 = GetReg(Op->OutHi.ID());
@@ -23,7 +23,7 @@ DEF_OP(CASPair) {
auto Desired1 = GetReg(Op->DesiredHi.ID());
auto MemSrc = GetReg(Op->Addr.ID());

- const auto EmitSize = IROp->ElementSize == 8 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
+ const auto EmitSize = IROp->ElementSize == IR::OpSize::i64Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
if (CTX->HostFeatures.SupportsAtomics) {
// RA has heuristics to try to pair sources, but we need to handle the cases
// where they fail. We do so by moving to temporaries. Note we use 64-bit
@@ -112,9 +112,9 @@ DEF_OP(CAS) {
ARMEmitter::SingleUseForwardLabel LoopExpected;
Bind(&LoopTop);
ldaxr(SubEmitSize, TMP2, MemSrc);
- if (IROp->Size == 1) {
+ if (IROp->Size == IR::OpSize::i8Bit) {
cmp(EmitSize, TMP2, Expected, ARMEmitter::ExtendedType::UXTB, 0);
- } else if (IROp->Size == 2) {
+ } else if (IROp->Size == IR::OpSize::i16Bit) {
cmp(EmitSize, TMP2, Expected, ARMEmitter::ExtendedType::UXTH, 0);
} else {
cmp(EmitSize, TMP2, Expected);
@@ -273,18 +273,21 @@ DEF_OP(AtomicNeg) {

DEF_OP(AtomicSwap) {
auto Op = IROp->C<IR::IROp_AtomicSwap>();
- uint8_t OpSize = IROp->Size;
- LOGMAN_THROW_AA_FMT(OpSize == 8 || OpSize == 4 || OpSize == 2 || OpSize == 1, "Unexpected CAS size");
+ const auto OpSize = IROp->Size;
+ LOGMAN_THROW_AA_FMT(
+   OpSize == IR::OpSize::i64Bit || OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i16Bit || OpSize == IR::OpSize::i8Bit, "Unexpecte"
+   "d CAS "
+   "size");

auto MemSrc = GetReg(Op->Addr.ID());
auto Src = GetReg(Op->Value.ID());

const auto EmitSize = ConvertSize(IROp);
- const auto SubEmitSize = OpSize == 8 ? ARMEmitter::SubRegSize::i64Bit :
-                          OpSize == 4 ? ARMEmitter::SubRegSize::i32Bit :
-                          OpSize == 2 ? ARMEmitter::SubRegSize::i16Bit :
-                          OpSize == 1 ? ARMEmitter::SubRegSize::i8Bit :
-                          ARMEmitter::SubRegSize::i8Bit;
+ const auto SubEmitSize = OpSize == IR::OpSize::i64Bit ? ARMEmitter::SubRegSize::i64Bit :
+                          OpSize == IR::OpSize::i32Bit ? ARMEmitter::SubRegSize::i32Bit :
+                          OpSize == IR::OpSize::i16Bit ? ARMEmitter::SubRegSize::i16Bit :
+                          OpSize == IR::OpSize::i8Bit ? ARMEmitter::SubRegSize::i8Bit :
+                          ARMEmitter::SubRegSize::i8Bit;

if (CTX->HostFeatures.SupportsAtomics) {
ldswpal(SubEmitSize, Src, GetReg(Node), MemSrc);
@@ -294,7 +297,7 @@ DEF_OP(AtomicSwap) {
ldaxr(SubEmitSize, TMP2, MemSrc);
stlxr(SubEmitSize, TMP4, Src, MemSrc);
cbnz(EmitSize, TMP4, &LoopTop);
- ubfm(EmitSize, GetReg(Node), TMP2, 0, OpSize * 8 - 1);
+ ubfm(EmitSize, GetReg(Node), TMP2, 0, IR::OpSizeAsBits(OpSize) - 1);
}
}

4 changes: 2 additions & 2 deletions FEXCore/Source/Interface/Core/JIT/JIT.cpp
@@ -626,8 +626,8 @@ bool Arm64JITCore::IsInlineEntrypointOffset(const IR::OrderedNodeWrapper& WNode,
auto Op = OpHeader->C<IR::IROp_InlineEntrypointOffset>();
if (Value) {
uint64_t Mask = ~0ULL;
- uint8_t OpSize = OpHeader->Size;
- if (OpSize == 4) {
+ const auto Size = OpHeader->Size;
+ if (Size == IR::OpSize::i32Bit) {
Mask = 0xFFFF'FFFFULL;
}
*Value = (Entry + Op->Offset) & Mask;
32 changes: 17 additions & 15 deletions FEXCore/Source/Interface/Core/JIT/JITClass.h
@@ -129,23 +129,25 @@ class Arm64JITCore final : public CPUBackend, public Arm64Emitter {

[[nodiscard]]
ARMEmitter::Size ConvertSize(const IR::IROp_Header* Op) {
- return Op->Size == 8 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
+ return Op->Size == IR::OpSize::i64Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
}

[[nodiscard]]
ARMEmitter::Size ConvertSize48(const IR::IROp_Header* Op) {
- LOGMAN_THROW_AA_FMT(Op->Size == 4 || Op->Size == 8, "Invalid size");
+ LOGMAN_THROW_AA_FMT(Op->Size == IR::OpSize::i32Bit || Op->Size == IR::OpSize::i64Bit, "Invalid size");
return ConvertSize(Op);
}

[[nodiscard]]
- ARMEmitter::SubRegSize ConvertSubRegSize16(uint8_t ElementSize) {
-   LOGMAN_THROW_AA_FMT(ElementSize == 1 || ElementSize == 2 || ElementSize == 4 || ElementSize == 8 || ElementSize == 16, "Invalid size");
-   return ElementSize == 1 ? ARMEmitter::SubRegSize::i8Bit :
-          ElementSize == 2 ? ARMEmitter::SubRegSize::i16Bit :
-          ElementSize == 4 ? ARMEmitter::SubRegSize::i32Bit :
-          ElementSize == 8 ? ARMEmitter::SubRegSize::i64Bit :
-          ARMEmitter::SubRegSize::i128Bit;
+ ARMEmitter::SubRegSize ConvertSubRegSize16(IR::OpSize ElementSize) {
+   LOGMAN_THROW_AA_FMT(ElementSize == IR::OpSize::i8Bit || ElementSize == IR::OpSize::i16Bit || ElementSize == IR::OpSize::i32Bit ||
+                       ElementSize == IR::OpSize::i64Bit || ElementSize == IR::OpSize::i128Bit,
+                       "Invalid size");
+   return ElementSize == IR::OpSize::i8Bit ? ARMEmitter::SubRegSize::i8Bit :
+          ElementSize == IR::OpSize::i16Bit ? ARMEmitter::SubRegSize::i16Bit :
+          ElementSize == IR::OpSize::i32Bit ? ARMEmitter::SubRegSize::i32Bit :
+          ElementSize == IR::OpSize::i64Bit ? ARMEmitter::SubRegSize::i64Bit :
+          ARMEmitter::SubRegSize::i128Bit;
}

[[nodiscard]]
@@ -154,8 +156,8 @@ class Arm64JITCore final : public CPUBackend, public Arm64Emitter {
}

[[nodiscard]]
- ARMEmitter::SubRegSize ConvertSubRegSize8(uint8_t ElementSize) {
-   LOGMAN_THROW_AA_FMT(ElementSize != 16, "Invalid size");
+ ARMEmitter::SubRegSize ConvertSubRegSize8(IR::OpSize ElementSize) {
+   LOGMAN_THROW_AA_FMT(ElementSize != IR::OpSize::i128Bit, "Invalid size");
return ConvertSubRegSize16(ElementSize);
}

@@ -166,13 +168,13 @@ class Arm64JITCore final : public CPUBackend, public Arm64Emitter {

[[nodiscard]]
ARMEmitter::SubRegSize ConvertSubRegSize4(const IR::IROp_Header* Op) {
- LOGMAN_THROW_AA_FMT(Op->ElementSize != 8, "Invalid size");
+ LOGMAN_THROW_AA_FMT(Op->ElementSize != IR::OpSize::i64Bit, "Invalid size");
return ConvertSubRegSize8(Op);
}

[[nodiscard]]
ARMEmitter::SubRegSize ConvertSubRegSize248(const IR::IROp_Header* Op) {
- LOGMAN_THROW_AA_FMT(Op->ElementSize != 1, "Invalid size");
+ LOGMAN_THROW_AA_FMT(Op->ElementSize != IR::OpSize::i8Bit, "Invalid size");
return ConvertSubRegSize8(Op);
}

@@ -183,13 +185,13 @@ class Arm64JITCore final : public CPUBackend, public Arm64Emitter {

[[nodiscard]]
ARMEmitter::VectorRegSizePair ConvertSubRegSizePair8(const IR::IROp_Header* Op) {
- LOGMAN_THROW_AA_FMT(Op->ElementSize != 16, "Invalid size");
+ LOGMAN_THROW_AA_FMT(Op->ElementSize != IR::OpSize::i128Bit, "Invalid size");
return ConvertSubRegSizePair16(Op);
}

[[nodiscard]]
ARMEmitter::VectorRegSizePair ConvertSubRegSizePair248(const IR::IROp_Header* Op) {
- LOGMAN_THROW_AA_FMT(Op->ElementSize != 1, "Invalid size");
+ LOGMAN_THROW_AA_FMT(Op->ElementSize != IR::OpSize::i8Bit, "Invalid size");
return ConvertSubRegSizePair8(Op);
}

42 changes: 21 additions & 21 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
@@ -1388,7 +1388,7 @@ void OpDispatchBuilder::SHLImmediateOp(OpcodeArgs, bool SHL1Bit) {
uint64_t Shift = LoadConstantShift(Op, SHL1Bit);
const auto Size = GetSrcBitSize(Op);

- Ref Src = _Constant(Size, Shift);
+ Ref Src = _Constant(OpSizeFromSrc(Op), Shift);
Ref Result = _Lshl(Size == 64 ? OpSize::i64Bit : OpSize::i32Bit, Dest, Src);

CalculateFlags_ShiftLeftImmediate(OpSizeFromSrc(Op), Result, Dest, Shift);
@@ -1411,7 +1411,7 @@

uint64_t Shift = LoadConstantShift(Op, SHR1Bit);

- Ref Src = _Constant(Size, Shift);
+ Ref Src = _Constant(OpSizeFromSrc(Op), Shift);
auto ALUOp = _Lshr(Size == 64 ? OpSize::i64Bit : OpSize::i32Bit, Dest, Src);

CalculateFlags_ShiftRightImmediate(OpSizeFromSrc(Op), ALUOp, Dest, Shift);
@@ -1664,28 +1664,28 @@ void OpDispatchBuilder::BEXTRBMIOp(OpcodeArgs) {
auto* Src1 = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto* Src2 = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});

- const auto Size = GetSrcSize(Op);
+ const auto Size = OpSizeFromSrc(Op);
const auto SrcSize = Size * 8;
const auto MaxSrcBit = SrcSize - 1;
- auto MaxSrcBitOp = _Constant(SrcSize, MaxSrcBit);
+ auto MaxSrcBitOp = _Constant(Size, MaxSrcBit);

// Shift the operand down to the starting bit
auto Start = _Bfe(OpSizeFromSrc(Op), 8, 0, Src2);
- auto Shifted = _Lshr(IR::SizeToOpSize(Size), Src1, Start);
+ auto Shifted = _Lshr(Size, Src1, Start);

// Shifts larger than operand size need to be set to zero.
- auto SanitizedShifted = _Select(IR::COND_ULE, Start, MaxSrcBitOp, Shifted, _Constant(SrcSize, 0));
+ auto SanitizedShifted = _Select(IR::COND_ULE, Start, MaxSrcBitOp, Shifted, _Constant(Size, 0));

// Now handle the length specifier.
- auto Length = _Bfe(OpSizeFromSrc(Op), 8, 8, Src2);
+ auto Length = _Bfe(Size, 8, 8, Src2);

// Now build up the mask
// (1 << Length) - 1 = ~(~0 << Length)
auto AllOnes = _Constant(~0ull);
- auto InvertedMask = _Lshl(IR::SizeToOpSize(Size), AllOnes, Length);
+ auto InvertedMask = _Lshl(Size, AllOnes, Length);

// Now put it all together and make the result.
- auto Masked = _Andn(IR::SizeToOpSize(Size), SanitizedShifted, InvertedMask);
+ auto Masked = _Andn(Size, SanitizedShifted, InvertedMask);

// Sanitize the length. If it is above the max, we don't do the masking.
auto Dest = _Select(IR::COND_ULE, Length, MaxSrcBitOp, Masked, SanitizedShifted);
@@ -1787,7 +1787,7 @@ void OpDispatchBuilder::BMI2Shift(OpcodeArgs) {
}

void OpDispatchBuilder::BZHI(OpcodeArgs) {
- const auto Size = GetSrcSize(Op);
+ const auto Size = OpSizeFromSrc(Op);
const auto OperandSize = Size * 8;

// In 32-bit mode we only look at bottom 32-bit, no 8 or 16-bit BZHI so no
@@ -1799,9 +1799,9 @@
// Clear the high bits specified by the index. A64 only considers bottom bits
// of the shift, so we don't need to mask bottom 8-bits ourselves.
// Out-of-bounds results ignored after.
- auto NegOne = _Constant(OperandSize, -1);
- auto Mask = _Lshl(IR::SizeToOpSize(Size), NegOne, Index);
- auto MaskResult = _Andn(IR::SizeToOpSize(Size), Src, Mask);
+ auto NegOne = _Constant(Size, -1);
+ auto Mask = _Lshl(Size, NegOne, Index);
+ auto MaskResult = _Andn(Size, Src, Mask);

// If the index is above OperandSize, we don't clear anything. BZHI only
// considers the bottom 8-bits, so we really want to know if the bottom 8-bits
@@ -1810,7 +1810,7 @@
// Because we're clobbering flags internally we ignore all carry invert
// shenanigans and use the raw versions here.
_TestNZ(OpSize::i64Bit, Index, _Constant(0xFF & ~(OperandSize - 1)));
- auto Result = _NZCVSelect(IR::SizeToOpSize(Size), {COND_NEQ}, Src, MaskResult);
+ auto Result = _NZCVSelect(Size, {COND_NEQ}, Src, MaskResult);
StoreResult(GPRClass, Op, Result, OpSize::iInvalid);

auto Zero = _InlineConstant(0);
@@ -2065,7 +2065,7 @@ void OpDispatchBuilder::RCROp(OpcodeArgs) {
Ref Res = _Lshr(OpSize, Dest, Src);
auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC);

- auto One = _Constant(Size, 1);
+ auto One = _Constant(OpSizeFromSrc(Op), 1);

// Res |= (Dest << (Size - Shift + 1));
// Expressed as Res | ((Src << (Size - Shift)) << 1) to get correct
@@ -2188,7 +2188,7 @@ void OpDispatchBuilder::RCRSmallerOp(OpcodeArgs) {
if (IsSrcConst) {
SetCFDirect(Tmp, SrcConst - 1, true);
} else {
- auto One = _Constant(Size, 1);
+ auto One = _Constant(OpSizeFromSrc(Op), 1);
auto NewCF = _Lshr(OpSize::i32Bit, Tmp, _Sub(OpSize::i32Bit, Src, One));
SetCFDirect(NewCF, 0, true);
}
@@ -2337,7 +2337,7 @@ void OpDispatchBuilder::RCLSmallerOp(OpcodeArgs) {

auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC);

- Ref Tmp = _Constant(64, 0);
+ Ref Tmp = _Constant(OpSize::i64Bit, 0);

for (size_t i = 0; i < (32 + Size + 1); i += (Size + 1)) {
// Insert incoming value
@@ -3084,7 +3084,7 @@ void OpDispatchBuilder::INCOp(OpcodeArgs) {
Ref Dest;
Ref Result;
const auto Size = GetSrcBitSize(Op);
- auto OneConst = _Constant(Size, 1);
+ auto OneConst = _Constant(OpSizeFromSrc(Op), 1);

const bool IsLocked = DestIsLockedMem(Op);

@@ -3125,7 +3125,7 @@ void OpDispatchBuilder::DECOp(OpcodeArgs) {
Ref Dest;
Ref Result;
const auto Size = GetSrcBitSize(Op);
- auto OneConst = _Constant(Size, 1);
+ auto OneConst = _Constant(OpSizeFromSrc(Op), 1);

const bool IsLocked = DestIsLockedMem(Op);

@@ -3135,7 +3135,7 @@
Ref DestAddress = MakeSegmentAddress(Op, Op->Dest);

// Use Add instead of Sub to avoid a NEG
- Dest = _AtomicFetchAdd(OpSizeFromSrc(Op), _Constant(Size, -1), DestAddress);
+ Dest = _AtomicFetchAdd(OpSizeFromSrc(Op), _Constant(OpSizeFromSrc(Op), -1), DestAddress);
} else {
Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = Size >= 32});
}
@@ -3585,7 +3585,7 @@ void OpDispatchBuilder::POPFOp(OpcodeArgs) {
// Bit 1 is always 1
// Bit 9 is always 1 because we always have interrupts enabled

- Src = _Or(OpSize::i64Bit, Src, _Constant(Size * 8, 0x202));
+ Src = _Or(OpSize::i64Bit, Src, _Constant(Size, 0x202));

SetPackedRFLAG(false, Src);
}
4 changes: 2 additions & 2 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp
@@ -2618,14 +2618,14 @@ OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_VPGatherImpl(OpSize Size, O
// If the address element size if half the size of the Element load size then we need to start fetching half-way through the low register.
AddrAddressing.Low = VSIB.Low;
AddrAddressing.High = VSIB.High;
- IndexElementOffset = OpSize::i128Bit / AddrElementSize / 2;
+ IndexElementOffset = IR::NumElements(OpSize::i128Bit, AddrElementSize) / 2;
} else if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
AddrAddressing.Low = VSIB.High;
AddrAddressing.High = Invalid();
DestReg = Result.Low; ///< Start mixing with the low register.
MaskReg = Mask.Low; ///< Mask starts with the low mask here.
IndexElementOffset = 0;
- DataElementOffset = OpSize::i128Bit / ElementLoadSize / 2;
+ DataElementOffset = IR::NumElements(OpSize::i128Bit, ElementLoadSize) / 2;
}

///< Calculate the high-half.
