diff --git a/FEXCore/Scripts/json_ir_generator.py b/FEXCore/Scripts/json_ir_generator.py index 2555b54908..d6ef2058a6 100755 --- a/FEXCore/Scripts/json_ir_generator.py +++ b/FEXCore/Scripts/json_ir_generator.py @@ -632,12 +632,12 @@ def print_ir_allocator_helpers(): output_file.write("\tIR::OpSize GetOpSize(const OrderedNode *Op) const {\n") output_file.write("\t\tauto HeaderOp = Op->Header.Value.GetNode(DualListData.DataBegin());\n") - output_file.write("\t\treturn IR::SizeToOpSize(HeaderOp->Size);\n") + output_file.write("\t\treturn HeaderOp->Size;\n") output_file.write("\t}\n\n") output_file.write("\tIR::OpSize GetOpElementSize(const OrderedNode *Op) const {\n") output_file.write("\t\tauto HeaderOp = Op->Header.Value.GetNode(DualListData.DataBegin());\n") - output_file.write("\t\treturn IR::SizeToOpSize(HeaderOp->ElementSize);\n") + output_file.write("\t\treturn HeaderOp->ElementSize;\n") output_file.write("\t}\n\n") output_file.write("\tuint8_t GetOpElements(const OrderedNode *Op) const {\n") diff --git a/FEXCore/Source/Interface/Core/Interpreter/Fallbacks/InterpreterFallbacks.cpp b/FEXCore/Source/Interface/Core/Interpreter/Fallbacks/InterpreterFallbacks.cpp index 56ef0662b7..b4223c90a8 100644 --- a/FEXCore/Source/Interface/Core/Interpreter/Fallbacks/InterpreterFallbacks.cpp +++ b/FEXCore/Source/Interface/Core/Interpreter/Fallbacks/InterpreterFallbacks.cpp @@ -79,7 +79,7 @@ void InterpreterOps::FillFallbackIndexPointers(uint64_t* Info) { } bool InterpreterOps::GetFallbackHandler(bool SupportsPreserveAllABI, const IR::IROp_Header* IROp, FallbackInfo* Info) { - uint8_t OpSize = IROp->Size; + const auto OpSize = IROp->Size; switch (IROp->Op) { case IR::OP_F80CVTTO: { auto Op = IROp->C(); @@ -99,11 +99,11 @@ bool InterpreterOps::GetFallbackHandler(bool SupportsPreserveAllABI, const IR::I } case IR::OP_F80CVT: { switch (OpSize) { - case 4: { + case IR::OpSize::i32Bit: { *Info = {FABI_F32_I16_F80, (void*)&FEXCore::CPU::OpHandlers::handle4, Core::OPINDEX_F80CVT_4, SupportsPreserveAllABI}; return true; } - case 8: { + case IR::OpSize::i64Bit: { *Info = {FABI_F64_I16_F80, (void*)&FEXCore::CPU::OpHandlers::handle8, Core::OPINDEX_F80CVT_8, SupportsPreserveAllABI}; return true; } @@ -115,7 +115,7 @@ bool InterpreterOps::GetFallbackHandler(bool SupportsPreserveAllABI, const IR::I auto Op = IROp->C(); switch (OpSize) { - case 2: { + case IR::OpSize::i16Bit: { if (Op->Truncate) { *Info = {FABI_I16_I16_F80, (void*)&FEXCore::CPU::OpHandlers::handle2t, Core::OPINDEX_F80CVTINT_TRUNC2, SupportsPreserveAllABI}; @@ -124,7 +124,7 @@ bool InterpreterOps::GetFallbackHandler(bool SupportsPreserveAllABI, const IR::I } return true; } - case 4: { + case IR::OpSize::i32Bit: { if (Op->Truncate) { *Info = {FABI_I32_I16_F80, (void*)&FEXCore::CPU::OpHandlers::handle4t, Core::OPINDEX_F80CVTINT_TRUNC4, SupportsPreserveAllABI}; @@ -133,7 +133,7 @@ bool InterpreterOps::GetFallbackHandler(bool SupportsPreserveAllABI, const IR::I } return true; } - case 8: { + case IR::OpSize::i64Bit: { if (Op->Truncate) { *Info = {FABI_I64_I16_F80, (void*)&FEXCore::CPU::OpHandlers::handle8t, Core::OPINDEX_F80CVTINT_TRUNC8, SupportsPreserveAllABI}; diff --git a/FEXCore/Source/Interface/Core/JIT/ALUOps.cpp b/FEXCore/Source/Interface/Core/JIT/ALUOps.cpp index 13c90bd499..31ee941048 100644 --- a/FEXCore/Source/Interface/Core/JIT/ALUOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/ALUOps.cpp @@ -54,8 +54,8 @@ DEF_OP(EntrypointOffset) { auto Constant = Entry + Op->Offset; auto Dst = GetReg(Node); uint64_t Mask = ~0ULL; - uint8_t OpSize = 
IROp->Size; - if (OpSize == 4) { + const auto OpSize = IROp->Size; + if (OpSize == IR::OpSize::i32Bit) { Mask = 0xFFFF'FFFFULL; } @@ -92,10 +92,10 @@ DEF_OP(AddNZCV) { uint64_t Const; if (IsInlineConstant(Op->Src2, &Const)) { - LOGMAN_THROW_AA_FMT(IROp->Size >= 4, "Constant not allowed here"); + LOGMAN_THROW_AA_FMT(IROp->Size >= IR::OpSize::i32Bit, "Constant not allowed here"); cmn(EmitSize, Src1, Const); - } else if (IROp->Size < 4) { - unsigned Shift = 32 - (8 * IROp->Size); + } else if (IROp->Size < IR::OpSize::i32Bit) { + unsigned Shift = 32 - IR::OpSizeAsBits(IROp->Size); lsl(ARMEmitter::Size::i32Bit, TMP1, Src1, Shift); cmn(EmitSize, TMP1, GetReg(Op->Src2.ID()), ARMEmitter::ShiftType::LSL, Shift); @@ -165,7 +165,7 @@ DEF_OP(TestNZ) { // Shift the sign bit into place, clearing out the garbage in upper bits. // Adding zero does an effective test, setting NZ according to the result and // zeroing CV. - if (IROp->Size < 4) { + if (IROp->Size < IR::OpSize::i32Bit) { // Cheaper to and+cmn than to lsl+lsl+tst, so do the and ourselves if // needed. if (Op->Src1 != Op->Src2) { @@ -179,7 +179,7 @@ DEF_OP(TestNZ) { Src1 = TMP1; } - unsigned Shift = 32 - (IROp->Size * 8); + unsigned Shift = 32 - IR::OpSizeAsBits(IROp->Size); cmn(EmitSize, ARMEmitter::Reg::zr, Src1, ARMEmitter::ShiftType::LSL, Shift); } else { if (IsInlineConstant(Op->Src2, &Const)) { @@ -193,11 +193,11 @@ DEF_OP(TestNZ) { DEF_OP(TestZ) { auto Op = IROp->C(); - LOGMAN_THROW_AA_FMT(IROp->Size < 4, "TestNZ used at higher sizes"); + LOGMAN_THROW_AA_FMT(IROp->Size < IR::OpSize::i32Bit, "TestNZ used at higher sizes"); const auto EmitSize = ARMEmitter::Size::i32Bit; uint64_t Const; - uint64_t Mask = IROp->Size == 8 ? ~0ULL : ((1ull << (IROp->Size * 8)) - 1); + uint64_t Mask = IROp->Size == IR::OpSize::i64Bit ? ~0ULL : ((1ull << IR::OpSizeAsBits(IROp->Size)) - 1); auto Src1 = GetReg(Op->Src1.ID()); if (IsInlineConstant(Op->Src2, &Const)) { @@ -223,25 +223,25 @@ DEF_OP(SubShift) { DEF_OP(SubNZCV) { auto Op = IROp->C(); - const uint8_t OpSize = IROp->Size; + const auto OpSize = IROp->Size; const auto EmitSize = ConvertSize(IROp); uint64_t Const; if (IsInlineConstant(Op->Src2, &Const)) { - LOGMAN_THROW_AA_FMT(OpSize >= 4, "Constant not allowed here"); + LOGMAN_THROW_AA_FMT(OpSize >= IR::OpSize::i32Bit, "Constant not allowed here"); cmp(EmitSize, GetReg(Op->Src1.ID()), Const); } else { - unsigned Shift = OpSize < 4 ? (32 - (8 * OpSize)) : 0; + unsigned Shift = OpSize < IR::OpSize::i32Bit ? (32 - IR::OpSizeAsBits(OpSize)) : 0; ARMEmitter::Register ShiftedSrc1 = GetZeroableReg(Op->Src1); // Shift to fix flags for <32-bit ops. // Any shift of zero is still zero so optimize out silly zero shifts. 
- if (OpSize < 4 && ShiftedSrc1 != ARMEmitter::Reg::zr) { + if (OpSize < IR::OpSize::i32Bit && ShiftedSrc1 != ARMEmitter::Reg::zr) { lsl(ARMEmitter::Size::i32Bit, TMP1, ShiftedSrc1, Shift); ShiftedSrc1 = TMP1; } - if (OpSize < 4) { + if (OpSize < IR::OpSize::i32Bit) { cmp(EmitSize, ShiftedSrc1, GetReg(Op->Src2.ID()), ARMEmitter::ShiftType::LSL, Shift); } else { cmp(EmitSize, ShiftedSrc1, GetReg(Op->Src2.ID())); @@ -286,10 +286,10 @@ DEF_OP(SetSmallNZV) { auto Op = IROp->C(); LOGMAN_THROW_A_FMT(CTX->HostFeatures.SupportsFlagM, "Unsupported flagm op"); - const uint8_t OpSize = IROp->Size; - LOGMAN_THROW_AA_FMT(OpSize == 1 || OpSize == 2, "Unsupported {} size: {}", __func__, OpSize); + const auto OpSize = IROp->Size; + LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i8Bit || OpSize == IR::OpSize::i16Bit, "Unsupported {} size: {}", __func__, OpSize); - if (OpSize == 1) { + if (OpSize == IR::OpSize::i8Bit) { setf8(GetReg(Op->Src.ID()).W()); } else { setf16(GetReg(Op->Src.ID()).W()); @@ -401,20 +401,20 @@ DEF_OP(Div) { // Each source is OpSize in size // So you can have up to a 128bit divide from x86-64 - const uint8_t OpSize = IROp->Size; + const auto OpSize = IROp->Size; const auto EmitSize = ConvertSize(IROp); const auto Dst = GetReg(Node); auto Src1 = GetReg(Op->Src1.ID()); auto Src2 = GetReg(Op->Src2.ID()); - if (OpSize == 1) { + if (OpSize == IR::OpSize::i8Bit) { sxtb(EmitSize, TMP1, Src1); sxtb(EmitSize, TMP2, Src2); Src1 = TMP1; Src2 = TMP2; - } else if (OpSize == 2) { + } else if (OpSize == IR::OpSize::i16Bit) { sxth(EmitSize, TMP1, Src1); sxth(EmitSize, TMP2, Src2); @@ -430,20 +430,20 @@ DEF_OP(UDiv) { // Each source is OpSize in size // So you can have up to a 128bit divide from x86-64 - const uint8_t OpSize = IROp->Size; + const auto OpSize = IROp->Size; const auto EmitSize = ConvertSize(IROp); const auto Dst = GetReg(Node); auto Src1 = GetReg(Op->Src1.ID()); auto Src2 = GetReg(Op->Src2.ID()); - if (OpSize == 1) { + if (OpSize == IR::OpSize::i8Bit) { uxtb(EmitSize, TMP1, Src1); uxtb(EmitSize, TMP2, Src2); Src1 = TMP1; Src2 = TMP2; - } else if (OpSize == 2) { + } else if (OpSize == IR::OpSize::i16Bit) { uxth(EmitSize, TMP1, Src1); uxth(EmitSize, TMP2, Src2); @@ -458,20 +458,20 @@ DEF_OP(Rem) { auto Op = IROp->C(); // Each source is OpSize in size // So you can have up to a 128bit divide from x86-64 - const uint8_t OpSize = IROp->Size; + const auto OpSize = IROp->Size; const auto EmitSize = ConvertSize(IROp); const auto Dst = GetReg(Node); auto Src1 = GetReg(Op->Src1.ID()); auto Src2 = GetReg(Op->Src2.ID()); - if (OpSize == 1) { + if (OpSize == IR::OpSize::i8Bit) { sxtb(EmitSize, TMP1, Src1); sxtb(EmitSize, TMP2, Src2); Src1 = TMP1; Src2 = TMP2; - } else if (OpSize == 2) { + } else if (OpSize == IR::OpSize::i16Bit) { sxth(EmitSize, TMP1, Src1); sxth(EmitSize, TMP2, Src2); @@ -487,20 +487,20 @@ DEF_OP(URem) { auto Op = IROp->C(); // Each source is OpSize in size // So you can have up to a 128bit divide from x86-64 - const uint8_t OpSize = IROp->Size; + const auto OpSize = IROp->Size; const auto EmitSize = ConvertSize(IROp); const auto Dst = GetReg(Node); auto Src1 = GetReg(Op->Src1.ID()); auto Src2 = GetReg(Op->Src2.ID()); - if (OpSize == 1) { + if (OpSize == IR::OpSize::i8Bit) { uxtb(EmitSize, TMP1, Src1); uxtb(EmitSize, TMP2, Src2); Src1 = TMP1; Src2 = TMP2; - } else if (OpSize == 2) { + } else if (OpSize == IR::OpSize::i16Bit) { uxth(EmitSize, TMP1, Src1); uxth(EmitSize, TMP2, Src2); @@ -514,15 +514,15 @@ DEF_OP(URem) { DEF_OP(MulH) { auto Op = IROp->C(); - const uint8_t OpSize = 
IROp->Size; + const auto OpSize = IROp->Size; - LOGMAN_THROW_AA_FMT(OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize); + LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit, "Unsupported {} size: {}", __func__, OpSize); const auto Dst = GetReg(Node); const auto Src1 = GetReg(Op->Src1.ID()); const auto Src2 = GetReg(Op->Src2.ID()); - if (OpSize == 4) { + if (OpSize == IR::OpSize::i32Bit) { sxtw(TMP1, Src1.W()); sxtw(TMP2, Src2.W()); mul(ARMEmitter::Size::i32Bit, Dst, TMP1, TMP2); @@ -534,15 +534,15 @@ DEF_OP(MulH) { DEF_OP(UMulH) { auto Op = IROp->C(); - const uint8_t OpSize = IROp->Size; + const auto OpSize = IROp->Size; - LOGMAN_THROW_AA_FMT(OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize); + LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit, "Unsupported {} size: {}", __func__, OpSize); const auto Dst = GetReg(Node); const auto Src1 = GetReg(Op->Src1.ID()); const auto Src2 = GetReg(Op->Src2.ID()); - if (OpSize == 4) { + if (OpSize == IR::OpSize::i32Bit) { uxtw(ARMEmitter::Size::i64Bit, TMP1, Src1); uxtw(ARMEmitter::Size::i64Bit, TMP2, Src2); mul(ARMEmitter::Size::i64Bit, Dst, TMP1, TMP2); @@ -593,7 +593,7 @@ DEF_OP(Ornror) { DEF_OP(AndWithFlags) { auto Op = IROp->C(); - const uint8_t OpSize = IROp->Size; + const auto OpSize = IROp->Size; const auto EmitSize = ConvertSize(IROp); uint64_t Const; @@ -601,7 +601,7 @@ DEF_OP(AndWithFlags) { auto Src1 = GetReg(Op->Src1.ID()); // See TestNZ - if (OpSize < 4) { + if (OpSize < IR::OpSize::i32Bit) { if (IsInlineConstant(Op->Src2, &Const)) { and_(EmitSize, Dst, Src1, Const); } else { @@ -614,7 +614,7 @@ DEF_OP(AndWithFlags) { } } - unsigned Shift = 32 - (OpSize * 8); + unsigned Shift = 32 - IR::OpSizeAsBits(OpSize); cmn(EmitSize, ARMEmitter::Reg::zr, Dst, ARMEmitter::ShiftType::LSL, Shift); } else { if (IsInlineConstant(Op->Src2, &Const)) { @@ -648,21 +648,21 @@ DEF_OP(Ashr) { uint64_t Const; if (IsInlineConstant(Op->Src2, &Const)) { - if (OpSize >= 4) { + if (OpSize >= IR::OpSize::i32Bit) { asr(EmitSize, Dst, Src1, (unsigned int)Const); } else { - sbfx(EmitSize, TMP1, Src1, 0, OpSize * 8); + sbfx(EmitSize, TMP1, Src1, 0, IR::OpSizeAsBits(OpSize)); asr(EmitSize, Dst, TMP1, (unsigned int)Const); - ubfx(EmitSize, Dst, Dst, 0, OpSize * 8); + ubfx(EmitSize, Dst, Dst, 0, IR::OpSizeAsBits(OpSize)); } } else { const auto Src2 = GetReg(Op->Src2.ID()); - if (OpSize >= 4) { + if (OpSize >= IR::OpSize::i32Bit) { asrv(EmitSize, Dst, Src1, Src2); } else { - sbfx(EmitSize, TMP1, Src1, 0, OpSize * 8); + sbfx(EmitSize, TMP1, Src1, 0, IR::OpSizeAsBits(OpSize)); asrv(EmitSize, Dst, TMP1, Src2); - ubfx(EmitSize, Dst, Dst, 0, OpSize * 8); + ubfx(EmitSize, Dst, Dst, 0, IR::OpSizeAsBits(OpSize)); } } } @@ -897,7 +897,7 @@ DEF_OP(PDep) { DEF_OP(PExt) { auto Op = IROp->C(); const auto OpSize = IROp->Size; - const auto OpSizeBitsM1 = (OpSize * 8) - 1; + const auto OpSizeBitsM1 = IR::OpSizeAsBits(OpSize) - 1; const auto EmitSize = ConvertSize48(IROp); const auto Input = GetReg(Op->Input.ID()); @@ -952,8 +952,8 @@ DEF_OP(PExt) { DEF_OP(LDiv) { auto Op = IROp->C(); - const uint8_t OpSize = IROp->Size; - const auto EmitSize = OpSize >= 4 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit; + const auto OpSize = IROp->Size; + const auto EmitSize = OpSize >= IR::OpSize::i32Bit ? 
ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit; const auto Dst = GetReg(Node); const auto Upper = GetReg(Op->Upper.ID()); @@ -963,14 +963,14 @@ DEF_OP(LDiv) { // Each source is OpSize in size // So you can have up to a 128bit divide from x86-64 switch (OpSize) { - case 2: { + case IR::OpSize::i16Bit: { uxth(EmitSize, TMP1, Lower); bfi(EmitSize, TMP1, Upper, 16, 16); sxth(EmitSize, TMP2, Divisor); sdiv(EmitSize, Dst, TMP1, TMP2); break; } - case 4: { + case IR::OpSize::i32Bit: { // TODO: 32-bit operation should be guaranteed not to leave garbage in the upper bits. mov(EmitSize, TMP1, Lower); bfi(EmitSize, TMP1, Upper, 32, 32); @@ -978,7 +978,7 @@ DEF_OP(LDiv) { sdiv(EmitSize, Dst, TMP1, TMP2); break; } - case 8: { + case IR::OpSize::i64Bit: { ARMEmitter::SingleUseForwardLabel Only64Bit {}; ARMEmitter::SingleUseForwardLabel LongDIVRet {}; @@ -1022,8 +1022,8 @@ DEF_OP(LDiv) { DEF_OP(LUDiv) { auto Op = IROp->C(); - const uint8_t OpSize = IROp->Size; - const auto EmitSize = OpSize >= 4 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit; + const auto OpSize = IROp->Size; + const auto EmitSize = OpSize >= IR::OpSize::i32Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit; const auto Dst = GetReg(Node); const auto Upper = GetReg(Op->Upper.ID()); @@ -1033,20 +1033,20 @@ DEF_OP(LUDiv) { // Each source is OpSize in size // So you can have up to a 128bit divide from x86-64= switch (OpSize) { - case 2: { + case IR::OpSize::i16Bit: { uxth(EmitSize, TMP1, Lower); bfi(EmitSize, TMP1, Upper, 16, 16); udiv(EmitSize, Dst, TMP1, Divisor); break; } - case 4: { + case IR::OpSize::i32Bit: { // TODO: 32-bit operation should be guaranteed not to leave garbage in the upper bits. mov(EmitSize, TMP1, Lower); bfi(EmitSize, TMP1, Upper, 32, 32); udiv(EmitSize, Dst, TMP1, Divisor); break; } - case 8: { + case IR::OpSize::i64Bit: { ARMEmitter::SingleUseForwardLabel Only64Bit {}; ARMEmitter::SingleUseForwardLabel LongDIVRet {}; @@ -1086,8 +1086,8 @@ DEF_OP(LUDiv) { DEF_OP(LRem) { auto Op = IROp->C(); - const uint8_t OpSize = IROp->Size; - const auto EmitSize = OpSize >= 4 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit; + const auto OpSize = IROp->Size; + const auto EmitSize = OpSize >= IR::OpSize::i32Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit; const auto Dst = GetReg(Node); const auto Upper = GetReg(Op->Upper.ID()); @@ -1097,7 +1097,7 @@ DEF_OP(LRem) { // Each source is OpSize in size // So you can have up to a 128bit divide from x86-64 switch (OpSize) { - case 2: { + case IR::OpSize::i16Bit: { uxth(EmitSize, TMP1, Lower); bfi(EmitSize, TMP1, Upper, 16, 16); sxth(EmitSize, TMP2, Divisor); @@ -1105,7 +1105,7 @@ DEF_OP(LRem) { msub(EmitSize, Dst, TMP3, TMP2, TMP1); break; } - case 4: { + case IR::OpSize::i32Bit: { // TODO: 32-bit operation should be guaranteed not to leave garbage in the upper bits. mov(EmitSize, TMP1, Lower); bfi(EmitSize, TMP1, Upper, 32, 32); @@ -1114,7 +1114,7 @@ DEF_OP(LRem) { msub(EmitSize, Dst, TMP2, TMP3, TMP1); break; } - case 8: { + case IR::OpSize::i64Bit: { ARMEmitter::SingleUseForwardLabel Only64Bit {}; ARMEmitter::SingleUseForwardLabel LongDIVRet {}; @@ -1160,8 +1160,8 @@ DEF_OP(LRem) { DEF_OP(LURem) { auto Op = IROp->C(); - const uint8_t OpSize = IROp->Size; - const auto EmitSize = OpSize >= 4 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit; + const auto OpSize = IROp->Size; + const auto EmitSize = OpSize >= IR::OpSize::i32Bit ? 
ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit; const auto Dst = GetReg(Node); const auto Upper = GetReg(Op->Upper.ID()); @@ -1171,14 +1171,14 @@ DEF_OP(LURem) { // Each source is OpSize in size // So you can have up to a 128bit divide from x86-64 switch (OpSize) { - case 2: { + case IR::OpSize::i16Bit: { uxth(EmitSize, TMP1, Lower); bfi(EmitSize, TMP1, Upper, 16, 16); udiv(EmitSize, TMP2, TMP1, Divisor); msub(EmitSize, Dst, TMP2, Divisor, TMP1); break; } - case 4: { + case IR::OpSize::i32Bit: { // TODO: 32-bit operation should be guaranteed not to leave garbage in the upper bits. mov(EmitSize, TMP1, Lower); bfi(EmitSize, TMP1, Upper, 32, 32); @@ -1186,7 +1186,7 @@ DEF_OP(LURem) { msub(EmitSize, Dst, TMP2, Divisor, TMP1); break; } - case 8: { + case IR::OpSize::i64Bit: { ARMEmitter::SingleUseForwardLabel Only64Bit {}; ARMEmitter::SingleUseForwardLabel LongDIVRet {}; @@ -1238,30 +1238,30 @@ DEF_OP(Not) { DEF_OP(Popcount) { auto Op = IROp->C(); - const uint8_t OpSize = IROp->Size; + const auto OpSize = IROp->Size; const auto Dst = GetReg(Node); const auto Src = GetReg(Op->Src.ID()); switch (OpSize) { - case 0x1: + case IR::OpSize::i8Bit: fmov(ARMEmitter::Size::i32Bit, VTMP1.S(), Src); // only use lowest byte cnt(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D()); break; - case 0x2: + case IR::OpSize::i16Bit: fmov(ARMEmitter::Size::i32Bit, VTMP1.S(), Src); cnt(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D()); // only count two lowest bytes addp(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D(), VTMP1.D()); break; - case 0x4: + case IR::OpSize::i32Bit: fmov(ARMEmitter::Size::i32Bit, VTMP1.S(), Src); cnt(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D()); // fmov has zero extended, unused bytes are zero addv(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D()); break; - case 0x8: + case IR::OpSize::i64Bit: fmov(ARMEmitter::Size::i64Bit, VTMP1.D(), Src); cnt(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D()); // fmov has zero extended, unused bytes are zero @@ -1288,17 +1288,18 @@ DEF_OP(FindLSB) { DEF_OP(FindMSB) { auto Op = IROp->C(); - const uint8_t OpSize = IROp->Size; + const auto OpSize = IROp->Size; - LOGMAN_THROW_AA_FMT(OpSize == 2 || OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize); + LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i16Bit || OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit, + "Unsupported {} size: {}", __func__, OpSize); const auto EmitSize = ConvertSize(IROp); const auto Dst = GetReg(Node); const auto Src = GetReg(Op->Src.ID()); - movz(ARMEmitter::Size::i64Bit, TMP1, OpSize * 8 - 1); + movz(ARMEmitter::Size::i64Bit, TMP1, IR::OpSizeAsBits(OpSize) - 1); - if (OpSize == 2) { + if (OpSize == IR::OpSize::i16Bit) { lsl(EmitSize, Dst, Src, 16); clz(EmitSize, Dst, Dst); } else { @@ -1310,9 +1311,10 @@ DEF_OP(FindMSB) { DEF_OP(FindTrailingZeroes) { auto Op = IROp->C(); - const uint8_t OpSize = IROp->Size; + const auto OpSize = IROp->Size; - LOGMAN_THROW_AA_FMT(OpSize == 2 || OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize); + LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i16Bit || OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit, + "Unsupported {} size: {}", __func__, OpSize); const auto EmitSize = ConvertSize(IROp); const auto Dst = GetReg(Node); @@ -1320,7 +1322,7 @@ DEF_OP(FindTrailingZeroes) { rbit(EmitSize, Dst, Src); - if (OpSize == 2) { + if (OpSize == IR::OpSize::i16Bit) { // This orr does two things. 
First, if the (masked) source is zero, it // reverses to zero in the top so it forces clz to return 16. Second, it // ensures garbage in the upper bits of the source don't affect clz, because @@ -1334,15 +1336,16 @@ DEF_OP(FindTrailingZeroes) { DEF_OP(CountLeadingZeroes) { auto Op = IROp->C(); - const uint8_t OpSize = IROp->Size; + const auto OpSize = IROp->Size; - LOGMAN_THROW_AA_FMT(OpSize == 2 || OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize); + LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i16Bit || OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit, + "Unsupported {} size: {}", __func__, OpSize); const auto EmitSize = ConvertSize(IROp); const auto Dst = GetReg(Node); const auto Src = GetReg(Op->Src.ID()); - if (OpSize == 2) { + if (OpSize == IR::OpSize::i16Bit) { // Expressing as lsl+orr+clz clears away any garbage in the upper bits // (alternatively could do uxth+clz+sub.. equal cost in total). lsl(EmitSize, Dst, Src, 16); @@ -1355,16 +1358,17 @@ DEF_OP(CountLeadingZeroes) { DEF_OP(Rev) { auto Op = IROp->C(); - const uint8_t OpSize = IROp->Size; + const auto OpSize = IROp->Size; - LOGMAN_THROW_AA_FMT(OpSize == 2 || OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize); + LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i16Bit || OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit, + "Unsupported {} size: {}", __func__, OpSize); const auto EmitSize = ConvertSize(IROp); const auto Dst = GetReg(Node); const auto Src = GetReg(Op->Src.ID()); rev(EmitSize, Dst, Src); - if (OpSize == 2) { + if (OpSize == IR::OpSize::i16Bit) { lsr(EmitSize, Dst, Dst, 16); } } @@ -1390,10 +1394,10 @@ DEF_OP(Bfi) { mov(EmitSize, TMP1, SrcDst); bfi(EmitSize, TMP1, Src, Op->lsb, Op->Width); - if (IROp->Size >= 4) { + if (IROp->Size >= IR::OpSize::i32Bit) { mov(EmitSize, Dst, TMP1.R()); } else { - ubfx(EmitSize, Dst, TMP1, 0, IROp->Size * 8); + ubfx(EmitSize, Dst, TMP1, 0, IR::OpSizeAsBits(IROp->Size)); } } } @@ -1424,7 +1428,7 @@ DEF_OP(Bfxil) { DEF_OP(Bfe) { auto Op = IROp->C(); - LOGMAN_THROW_AA_FMT(IROp->Size <= 8, "OpSize is too large for BFE: {}", IROp->Size); + LOGMAN_THROW_AA_FMT(IROp->Size <= IR::OpSize::i64Bit, "OpSize is too large for BFE: {}", IROp->Size); LOGMAN_THROW_AA_FMT(Op->Width != 0, "Invalid BFE width of 0"); const auto EmitSize = ConvertSize(IROp); @@ -1434,7 +1438,7 @@ DEF_OP(Bfe) { if (Op->lsb == 0 && Op->Width == 32) { mov(ARMEmitter::Size::i32Bit, Dst, Src); } else if (Op->lsb == 0 && Op->Width == 64) { - LOGMAN_THROW_AA_FMT(IROp->Size == 8, "Must be 64-bit wide register"); + LOGMAN_THROW_AA_FMT(IROp->Size == IR::OpSize::i64Bit, "Must be 64-bit wide register"); mov(ARMEmitter::Size::i64Bit, Dst, Src); } else { ubfx(EmitSize, Dst, Src, Op->lsb, Op->Width); @@ -1451,7 +1455,7 @@ DEF_OP(Sbfe) { DEF_OP(Select) { auto Op = IROp->C(); - const uint8_t OpSize = IROp->Size; + const auto OpSize = IROp->Size; const auto EmitSize = ConvertSize(IROp); const auto CompareEmitSize = Op->CompareSize == IR::OpSize::i64Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit; @@ -1479,7 +1483,7 @@ DEF_OP(Select) { bool is_const_true = IsInlineConstant(Op->TrueVal, &const_true); bool is_const_false = IsInlineConstant(Op->FalseVal, &const_false); - uint64_t all_ones = OpSize == 8 ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull; + uint64_t all_ones = OpSize == IR::OpSize::i64Bit ? 
0xffff'ffff'ffff'ffffull : 0xffff'ffffull; ARMEmitter::Register Dst = GetReg(Node); @@ -1508,7 +1512,7 @@ DEF_OP(NZCVSelect) { bool is_const_true = IsInlineConstant(Op->TrueVal, &const_true); bool is_const_false = IsInlineConstant(Op->FalseVal, &const_false); - uint64_t all_ones = IROp->Size == 8 ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull; + uint64_t all_ones = IROp->Size == IR::OpSize::i64Bit ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull; ARMEmitter::Register Dst = GetReg(Node); @@ -1547,7 +1551,7 @@ DEF_OP(VExtractToGPR) { constexpr auto AVXRegBitSize = Core::CPUState::XMM_AVX_REG_SIZE * 8; constexpr auto SSERegBitSize = Core::CPUState::XMM_SSE_REG_SIZE * 8; - const auto ElementSizeBits = Op->Header.ElementSize * 8; + const auto ElementSizeBits = IR::OpSizeAsBits(Op->Header.ElementSize); const auto Offset = ElementSizeBits * Op->Index; const auto Is256Bit = Offset >= SSERegBitSize; @@ -1558,10 +1562,10 @@ DEF_OP(VExtractToGPR) { const auto PerformMove = [&](const ARMEmitter::VRegister reg, int index) { switch (OpSize) { - case 1: umov(Dst, Vector, index); break; - case 2: umov(Dst, Vector, index); break; - case 4: umov(Dst, Vector, index); break; - case 8: umov(Dst, Vector, index); break; + case IR::OpSize::i8Bit: umov(Dst, Vector, index); break; + case IR::OpSize::i16Bit: umov(Dst, Vector, index); break; + case IR::OpSize::i32Bit: umov(Dst, Vector, index); break; + case IR::OpSize::i64Bit: umov(Dst, Vector, index); break; default: LOGMAN_MSG_A_FMT("Unhandled ExtractElementSize: {}", OpSize); break; } }; @@ -1586,10 +1590,10 @@ DEF_OP(VExtractToGPR) { // upper half of the vector. const auto SanitizedIndex = [OpSize, Op] { switch (OpSize) { - case 1: return Op->Index - 16; - case 2: return Op->Index - 8; - case 4: return Op->Index - 4; - case 8: return Op->Index - 2; + case IR::OpSize::i8Bit: return Op->Index - 16; + case IR::OpSize::i16Bit: return Op->Index - 8; + case IR::OpSize::i32Bit: return Op->Index - 4; + case IR::OpSize::i64Bit: return Op->Index - 2; default: LOGMAN_MSG_A_FMT("Unhandled OpSize: {}", OpSize); return 0; } }(); diff --git a/FEXCore/Source/Interface/Core/JIT/ConversionOps.cpp b/FEXCore/Source/Interface/Core/JIT/ConversionOps.cpp index 41b25ee6ee..18dbf4e442 100644 --- a/FEXCore/Source/Interface/Core/JIT/ConversionOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/ConversionOps.cpp @@ -15,18 +15,18 @@ DEF_OP(VInsGPR) { const auto DestIdx = Op->DestIdx; const auto ElementSize = Op->Header.ElementSize; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto SubEmitSize = ConvertSubRegSize8(IROp); - const auto ElementsPer128Bit = 16 / ElementSize; + const auto ElementsPer128Bit = IR::NumElements(IR::OpSize::i128Bit, ElementSize); const auto Dst = GetVReg(Node); const auto DestVector = GetVReg(Op->DestVector.ID()); const auto Src = GetReg(Op->Src.ID()); if (HostSupportsSVE256 && Is256Bit) { - const auto ElementSizeBits = ElementSize * 8; + const auto ElementSizeBits = IR::OpSizeAsBits(ElementSize); const auto Offset = ElementSizeBits * DestIdx; const auto SSEBitSize = Core::CPUState::XMM_SSE_REG_SIZE * 8; @@ -90,16 +90,16 @@ DEF_OP(VCastFromGPR) { auto Src = GetReg(Op->Src.ID()); switch (Op->Header.ElementSize) { - case 1: + case IR::OpSize::i8Bit: uxtb(ARMEmitter::Size::i32Bit, TMP1, Src); fmov(ARMEmitter::Size::i32Bit, Dst.S(), TMP1); break; - case 2: + case 
IR::OpSize::i16Bit: uxth(ARMEmitter::Size::i32Bit, TMP1, Src); fmov(ARMEmitter::Size::i32Bit, Dst.S(), TMP1); break; - case 4: fmov(ARMEmitter::Size::i32Bit, Dst.S(), Src); break; - case 8: fmov(ARMEmitter::Size::i64Bit, Dst.D(), Src); break; + case IR::OpSize::i32Bit: fmov(ARMEmitter::Size::i32Bit, Dst.S(), Src); break; + case IR::OpSize::i64Bit: fmov(ARMEmitter::Size::i64Bit, Dst.D(), Src); break; default: LOGMAN_MSG_A_FMT("Unknown castGPR element size: {}", Op->Header.ElementSize); } } @@ -111,7 +111,7 @@ DEF_OP(VDupFromGPR) { const auto Dst = GetVReg(Node); const auto Src = GetReg(Op->Src.ID()); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto SubEmitSize = ConvertSubRegSize8(IROp); @@ -126,7 +126,7 @@ DEF_OP(VDupFromGPR) { DEF_OP(Float_FromGPR_S) { const auto Op = IROp->C(); - const uint16_t ElementSize = Op->Header.ElementSize; + const uint16_t ElementSize = IR::OpSizeToSize(Op->Header.ElementSize); const uint16_t Conv = (ElementSize << 8) | IR::OpSizeToSize(Op->SrcElementSize); auto Dst = GetVReg(Node); @@ -165,7 +165,7 @@ DEF_OP(Float_FromGPR_S) { DEF_OP(Float_FToF) { auto Op = IROp->C(); - const uint16_t Conv = (Op->Header.ElementSize << 8) | IR::OpSizeToSize(Op->SrcElementSize); + const uint16_t Conv = (IR::OpSizeToSize(Op->Header.ElementSize) << 8) | IR::OpSizeToSize(Op->SrcElementSize); auto Dst = GetVReg(Node); auto Src = GetVReg(Op->Scalar.ID()); @@ -205,7 +205,7 @@ DEF_OP(Vector_SToF) { const auto ElementSize = Op->Header.ElementSize; const auto SubEmitSize = ConvertSubRegSize248(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -215,15 +215,15 @@ DEF_OP(Vector_SToF) { scvtf(Dst.Z(), SubEmitSize, Mask.Merging(), Vector.Z(), SubEmitSize); } else { if (OpSize == ElementSize) { - if (ElementSize == 8) { + if (ElementSize == IR::OpSize::i64Bit) { scvtf(ARMEmitter::ScalarRegSize::i64Bit, Dst.D(), Vector.D()); - } else if (ElementSize == 4) { + } else if (ElementSize == IR::OpSize::i32Bit) { scvtf(ARMEmitter::ScalarRegSize::i32Bit, Dst.S(), Vector.S()); } else { scvtf(ARMEmitter::ScalarRegSize::i16Bit, Dst.H(), Vector.H()); } } else { - if (OpSize == 8) { + if (OpSize == IR::OpSize::i64Bit) { scvtf(SubEmitSize, Dst.D(), Vector.D()); } else { scvtf(SubEmitSize, Dst.Q(), Vector.Q()); @@ -238,7 +238,7 @@ DEF_OP(Vector_FToZS) { const auto ElementSize = Op->Header.ElementSize; const auto SubEmitSize = ConvertSubRegSize248(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -248,15 +248,15 @@ DEF_OP(Vector_FToZS) { fcvtzs(Dst.Z(), SubEmitSize, Mask.Merging(), Vector.Z(), SubEmitSize); } else { if (OpSize == ElementSize) { - if (ElementSize == 8) { + if (ElementSize == IR::OpSize::i64Bit) { fcvtzs(ARMEmitter::ScalarRegSize::i64Bit, Dst.D(), Vector.D()); - } else if (ElementSize == 4) { + } else if (ElementSize == IR::OpSize::i32Bit) { 
fcvtzs(ARMEmitter::ScalarRegSize::i32Bit, Dst.S(), Vector.S()); } else { fcvtzs(ARMEmitter::ScalarRegSize::i16Bit, Dst.H(), Vector.H()); } } else { - if (OpSize == 8) { + if (OpSize == IR::OpSize::i64Bit) { fcvtzs(SubEmitSize, Dst.D(), Vector.D()); } else { fcvtzs(SubEmitSize, Dst.Q(), Vector.Q()); @@ -269,7 +269,7 @@ DEF_OP(Vector_FToS) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto SubEmitSize = ConvertSubRegSize248(IROp); @@ -284,7 +284,7 @@ DEF_OP(Vector_FToS) { } else { const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector.ID()); - if (OpSize == 8) { + if (OpSize == IR::OpSize::i64Bit) { frinti(SubEmitSize, Dst.D(), Vector.D()); fcvtzs(SubEmitSize, Dst.D(), Dst.D()); } else { @@ -300,10 +300,10 @@ DEF_OP(Vector_FToF) { const auto ElementSize = Op->Header.ElementSize; const auto SubEmitSize = ConvertSubRegSize248(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); - const auto Conv = (ElementSize << 8) | IR::OpSizeToSize(Op->SrcElementSize); + const auto Conv = (IR::OpSizeToSize(ElementSize) << 8) | IR::OpSizeToSize(Op->SrcElementSize); const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector.ID()); @@ -403,7 +403,7 @@ DEF_OP(Vector_FToI) { const auto ElementSize = Op->Header.ElementSize; const auto SubEmitSize = ConvertSubRegSize248(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -427,15 +427,15 @@ DEF_OP(Vector_FToI) { // frinti having AdvSIMD, AdvSIMD scalar, and an SVE version), // we can't just use a lambda without some seriously ugly casting. // This is fairly self-contained otherwise. 
-#define ROUNDING_FN(name) \ - if (ElementSize == 2) { \ - name(Dst.H(), Vector.H()); \ - } else if (ElementSize == 4) { \ - name(Dst.S(), Vector.S()); \ - } else if (ElementSize == 8) { \ - name(Dst.D(), Vector.D()); \ - } else { \ - FEX_UNREACHABLE; \ +#define ROUNDING_FN(name) \ + if (ElementSize == IR::OpSize::i16Bit) { \ + name(Dst.H(), Vector.H()); \ + } else if (ElementSize == IR::OpSize::i32Bit) { \ + name(Dst.S(), Vector.S()); \ + } else if (ElementSize == IR::OpSize::i64Bit) { \ + name(Dst.D(), Vector.D()); \ + } else { \ + FEX_UNREACHABLE; \ } switch (Op->Round) { @@ -464,7 +464,7 @@ DEF_OP(Vector_F64ToI32) { const auto OpSize = IROp->Size; const auto Round = Op->Round; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); diff --git a/FEXCore/Source/Interface/Core/JIT/EncryptionOps.cpp b/FEXCore/Source/Interface/Core/JIT/EncryptionOps.cpp index 3b56941bef..7d9526cd52 100644 --- a/FEXCore/Source/Interface/Core/JIT/EncryptionOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/EncryptionOps.cpp @@ -24,7 +24,7 @@ DEF_OP(VAESEnc) { const auto State = GetVReg(Op->State.ID()); const auto ZeroReg = GetVReg(Op->ZeroReg.ID()); - LOGMAN_THROW_AA_FMT(OpSize == Core::CPUState::XMM_SSE_REG_SIZE, "Currently only supports 128-bit operations."); + LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i128Bit, "Currently only supports 128-bit operations."); if (Dst == State && Dst != Key) { // Optimal case in which Dst already contains the starting state. @@ -49,7 +49,7 @@ DEF_OP(VAESEncLast) { const auto State = GetVReg(Op->State.ID()); const auto ZeroReg = GetVReg(Op->ZeroReg.ID()); - LOGMAN_THROW_AA_FMT(OpSize == Core::CPUState::XMM_SSE_REG_SIZE, "Currently only supports 128-bit operations."); + LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i128Bit, "Currently only supports 128-bit operations."); if (Dst == State && Dst != Key) { // Optimal case in which Dst already contains the starting state. @@ -72,7 +72,7 @@ DEF_OP(VAESDec) { const auto State = GetVReg(Op->State.ID()); const auto ZeroReg = GetVReg(Op->ZeroReg.ID()); - LOGMAN_THROW_AA_FMT(OpSize == Core::CPUState::XMM_SSE_REG_SIZE, "Currently only supports 128-bit operations."); + LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i128Bit, "Currently only supports 128-bit operations."); if (Dst == State && Dst != Key) { // Optimal case in which Dst already contains the starting state. @@ -97,7 +97,7 @@ DEF_OP(VAESDecLast) { const auto State = GetVReg(Op->State.ID()); const auto ZeroReg = GetVReg(Op->ZeroReg.ID()); - LOGMAN_THROW_AA_FMT(OpSize == Core::CPUState::XMM_SSE_REG_SIZE, "Currently only supports 128-bit operations."); + LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i128Bit, "Currently only supports 128-bit operations."); if (Dst == State && Dst != Key) { // Optimal case in which Dst already contains the starting state. 
@@ -193,7 +193,7 @@ DEF_OP(PCLMUL) { const auto Src1 = GetVReg(Op->Src1.ID()); const auto Src2 = GetVReg(Op->Src2.ID()); - LOGMAN_THROW_AA_FMT(OpSize == Core::CPUState::XMM_SSE_REG_SIZE, "Currently only supports 128-bit operations."); + LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i128Bit, "Currently only supports 128-bit operations."); switch (Op->Selector) { case 0b00000000: pmull(ARMEmitter::SubRegSize::i128Bit, Dst.D(), Src1.D(), Src2.D()); break; diff --git a/FEXCore/Source/Interface/Core/JIT/JITClass.h b/FEXCore/Source/Interface/Core/JIT/JITClass.h index 638452b69e..ba3e24442d 100644 --- a/FEXCore/Source/Interface/Core/JIT/JITClass.h +++ b/FEXCore/Source/Interface/Core/JIT/JITClass.h @@ -228,7 +228,7 @@ class Arm64JITCore final : public CPUBackend, public Arm64Emitter { bool IsGPR(IR::NodeID Node) const; [[nodiscard]] - ARMEmitter::ExtendedMemOperand GenerateMemOperand(uint8_t AccessSize, ARMEmitter::Register Base, IR::OrderedNodeWrapper Offset, + ARMEmitter::ExtendedMemOperand GenerateMemOperand(IR::OpSize AccessSize, ARMEmitter::Register Base, IR::OrderedNodeWrapper Offset, IR::MemOffsetType OffsetType, uint8_t OffsetScale); // NOTE: Will use TMP1 as a way to encode immediates that happen to fall outside @@ -237,7 +237,7 @@ class Arm64JITCore final : public CPUBackend, public Arm64Emitter { // TMP1 is safe to use again once this memory operand is used with its // equivalent loads or stores that this was called for. [[nodiscard]] - ARMEmitter::SVEMemOperand GenerateSVEMemOperand(uint8_t AccessSize, ARMEmitter::Register Base, IR::OrderedNodeWrapper Offset, + ARMEmitter::SVEMemOperand GenerateSVEMemOperand(IR::OpSize AccessSize, ARMEmitter::Register Base, IR::OrderedNodeWrapper Offset, IR::MemOffsetType OffsetType, uint8_t OffsetScale); [[nodiscard]] @@ -318,15 +318,16 @@ class Arm64JITCore final : public CPUBackend, public Arm64Emitter { using ScalarFMAOpCaller = std::function; - void VFScalarFMAOperation(uint8_t OpSize, uint8_t ElementSize, ScalarFMAOpCaller ScalarEmit, ARMEmitter::VRegister Dst, + void VFScalarFMAOperation(IR::OpSize OpSize, IR::OpSize ElementSize, ScalarFMAOpCaller ScalarEmit, ARMEmitter::VRegister Dst, ARMEmitter::VRegister Upper, ARMEmitter::VRegister Vector1, ARMEmitter::VRegister Vector2, ARMEmitter::VRegister Addend); using ScalarBinaryOpCaller = std::function; - void VFScalarOperation(uint8_t OpSize, uint8_t ElementSize, bool ZeroUpperBits, ScalarBinaryOpCaller ScalarEmit, + void VFScalarOperation(IR::OpSize OpSize, IR::OpSize ElementSize, bool ZeroUpperBits, ScalarBinaryOpCaller ScalarEmit, ARMEmitter::VRegister Dst, ARMEmitter::VRegister Vector1, ARMEmitter::VRegister Vector2); using ScalarUnaryOpCaller = std::function SrcVar)>; - void VFScalarUnaryOperation(uint8_t OpSize, uint8_t ElementSize, bool ZeroUpperBits, ScalarUnaryOpCaller ScalarEmit, ARMEmitter::VRegister Dst, - ARMEmitter::VRegister Vector1, std::variant Vector2); + void VFScalarUnaryOperation(IR::OpSize OpSize, IR::OpSize ElementSize, bool ZeroUpperBits, ScalarUnaryOpCaller ScalarEmit, + ARMEmitter::VRegister Dst, ARMEmitter::VRegister Vector1, + std::variant Vector2); void Emulate128BitGather(IR::OpSize Size, IR::OpSize ElementSize, ARMEmitter::VRegister Dst, ARMEmitter::VRegister IncomingDst, std::optional BaseAddr, ARMEmitter::VRegister VectorIndexLow, diff --git a/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp b/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp index 244b45aca5..d9ce167ec8 100644 --- a/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp +++ 
b/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp @@ -24,22 +24,22 @@ DEF_OP(LoadContext) { auto Dst = GetReg(Node); switch (OpSize) { - case 1: ldrb(Dst, STATE, Op->Offset); break; - case 2: ldrh(Dst, STATE, Op->Offset); break; - case 4: ldr(Dst.W(), STATE, Op->Offset); break; - case 8: ldr(Dst.X(), STATE, Op->Offset); break; + case IR::OpSize::i8Bit: ldrb(Dst, STATE, Op->Offset); break; + case IR::OpSize::i16Bit: ldrh(Dst, STATE, Op->Offset); break; + case IR::OpSize::i32Bit: ldr(Dst.W(), STATE, Op->Offset); break; + case IR::OpSize::i64Bit: ldr(Dst.X(), STATE, Op->Offset); break; default: LOGMAN_MSG_A_FMT("Unhandled LoadContext size: {}", OpSize); break; } } else { auto Dst = GetVReg(Node); switch (OpSize) { - case 1: ldrb(Dst, STATE, Op->Offset); break; - case 2: ldrh(Dst, STATE, Op->Offset); break; - case 4: ldr(Dst.S(), STATE, Op->Offset); break; - case 8: ldr(Dst.D(), STATE, Op->Offset); break; - case 16: ldr(Dst.Q(), STATE, Op->Offset); break; - case 32: + case IR::OpSize::i8Bit: ldrb(Dst, STATE, Op->Offset); break; + case IR::OpSize::i16Bit: ldrh(Dst, STATE, Op->Offset); break; + case IR::OpSize::i32Bit: ldr(Dst.S(), STATE, Op->Offset); break; + case IR::OpSize::i64Bit: ldr(Dst.D(), STATE, Op->Offset); break; + case IR::OpSize::i128Bit: ldr(Dst.Q(), STATE, Op->Offset); break; + case IR::OpSize::i256Bit: mov(TMP1, Op->Offset); ld1b(Dst.Z(), PRED_TMP_32B.Zeroing(), STATE, TMP1); break; @@ -56,8 +56,8 @@ DEF_OP(LoadContextPair) { const auto Dst2 = GetReg(Op->OutValue2.ID()); switch (IROp->Size) { - case 4: ldp(Dst1.W(), Dst2.W(), STATE, Op->Offset); break; - case 8: ldp(Dst1.X(), Dst2.X(), STATE, Op->Offset); break; + case IR::OpSize::i32Bit: ldp(Dst1.W(), Dst2.W(), STATE, Op->Offset); break; + case IR::OpSize::i64Bit: ldp(Dst1.X(), Dst2.X(), STATE, Op->Offset); break; default: LOGMAN_MSG_A_FMT("Unhandled LoadMemPair size: {}", IROp->Size); break; } } else { @@ -65,9 +65,9 @@ DEF_OP(LoadContextPair) { const auto Dst2 = GetVReg(Op->OutValue2.ID()); switch (IROp->Size) { - case 4: ldp(Dst1.S(), Dst2.S(), STATE, Op->Offset); break; - case 8: ldp(Dst1.D(), Dst2.D(), STATE, Op->Offset); break; - case 16: ldp(Dst1.Q(), Dst2.Q(), STATE, Op->Offset); break; + case IR::OpSize::i32Bit: ldp(Dst1.S(), Dst2.S(), STATE, Op->Offset); break; + case IR::OpSize::i64Bit: ldp(Dst1.D(), Dst2.D(), STATE, Op->Offset); break; + case IR::OpSize::i128Bit: ldp(Dst1.Q(), Dst2.Q(), STATE, Op->Offset); break; default: LOGMAN_MSG_A_FMT("Unhandled LoadMemPair size: {}", IROp->Size); break; } } @@ -81,22 +81,22 @@ DEF_OP(StoreContext) { auto Src = GetZeroableReg(Op->Value); switch (OpSize) { - case 1: strb(Src, STATE, Op->Offset); break; - case 2: strh(Src, STATE, Op->Offset); break; - case 4: str(Src.W(), STATE, Op->Offset); break; - case 8: str(Src.X(), STATE, Op->Offset); break; + case IR::OpSize::i8Bit: strb(Src, STATE, Op->Offset); break; + case IR::OpSize::i16Bit: strh(Src, STATE, Op->Offset); break; + case IR::OpSize::i32Bit: str(Src.W(), STATE, Op->Offset); break; + case IR::OpSize::i64Bit: str(Src.X(), STATE, Op->Offset); break; default: LOGMAN_MSG_A_FMT("Unhandled StoreContext size: {}", OpSize); break; } } else { const auto Src = GetVReg(Op->Value.ID()); switch (OpSize) { - case 1: strb(Src, STATE, Op->Offset); break; - case 2: strh(Src, STATE, Op->Offset); break; - case 4: str(Src.S(), STATE, Op->Offset); break; - case 8: str(Src.D(), STATE, Op->Offset); break; - case 16: str(Src.Q(), STATE, Op->Offset); break; - case 32: + case IR::OpSize::i8Bit: strb(Src, STATE, Op->Offset); break; + case 
IR::OpSize::i16Bit: strh(Src, STATE, Op->Offset); break; + case IR::OpSize::i32Bit: str(Src.S(), STATE, Op->Offset); break; + case IR::OpSize::i64Bit: str(Src.D(), STATE, Op->Offset); break; + case IR::OpSize::i128Bit: str(Src.Q(), STATE, Op->Offset); break; + case IR::OpSize::i256Bit: mov(TMP1, Op->Offset); st1b(Src.Z(), PRED_TMP_32B, STATE, TMP1); break; @@ -114,8 +114,8 @@ DEF_OP(StoreContextPair) { auto Src2 = GetZeroableReg(Op->Value2); switch (OpSize) { - case 4: stp(Src1.W(), Src2.W(), STATE, Op->Offset); break; - case 8: stp(Src1.X(), Src2.X(), STATE, Op->Offset); break; + case IR::OpSize::i32Bit: stp(Src1.W(), Src2.W(), STATE, Op->Offset); break; + case IR::OpSize::i64Bit: stp(Src1.X(), Src2.X(), STATE, Op->Offset); break; default: LOGMAN_MSG_A_FMT("Unhandled StoreContext size: {}", OpSize); break; } } else { @@ -123,9 +123,9 @@ DEF_OP(StoreContextPair) { const auto Src2 = GetVReg(Op->Value2.ID()); switch (OpSize) { - case 4: stp(Src1.S(), Src2.S(), STATE, Op->Offset); break; - case 8: stp(Src1.D(), Src2.D(), STATE, Op->Offset); break; - case 16: stp(Src1.Q(), Src2.Q(), STATE, Op->Offset); break; + case IR::OpSize::i32Bit: stp(Src1.S(), Src2.S(), STATE, Op->Offset); break; + case IR::OpSize::i64Bit: stp(Src1.D(), Src2.D(), STATE, Op->Offset); break; + case IR::OpSize::i128Bit: stp(Src1.Q(), Src2.Q(), STATE, Op->Offset); break; default: LOGMAN_MSG_A_FMT("Unhandled StoreContextPair size: {}", OpSize); break; } } @@ -142,7 +142,7 @@ DEF_OP(LoadRegister) { mov(GetReg(Node).X(), reg.X()); } } else if (Op->Class == IR::FPRClass) { - [[maybe_unused]] const auto regSize = HostSupportsAVX256 ? Core::CPUState::XMM_AVX_REG_SIZE : Core::CPUState::XMM_SSE_REG_SIZE; + [[maybe_unused]] const auto regSize = HostSupportsAVX256 ? IR::OpSize::i256Bit : IR::OpSize::i128Bit; LOGMAN_THROW_A_FMT(Op->Reg < StaticFPRegisters.size(), "out of range reg"); LOGMAN_THROW_A_FMT(IROp->Size == regSize, "expected sized"); @@ -194,7 +194,7 @@ DEF_OP(StoreRegister) { mov(ARMEmitter::Size::i64Bit, reg, Src); } } else if (Op->Class == IR::FPRClass) { - [[maybe_unused]] const auto regSize = HostSupportsAVX256 ? Core::CPUState::XMM_AVX_REG_SIZE : Core::CPUState::XMM_SSE_REG_SIZE; + [[maybe_unused]] const auto regSize = HostSupportsAVX256 ? 
IR::OpSize::i256Bit : IR::OpSize::i128Bit; LOGMAN_THROW_A_FMT(Op->Reg < StaticFPRegisters.size(), "reg out of range"); LOGMAN_THROW_A_FMT(IROp->Size == regSize, "expected sized"); @@ -250,10 +250,10 @@ DEF_OP(LoadContextIndexed) { add(ARMEmitter::Size::i64Bit, TMP1, STATE, Index, ARMEmitter::ShiftType::LSL, FEXCore::ilog2(Op->Stride)); const auto Dst = GetReg(Node); switch (OpSize) { - case 1: ldrb(Dst, TMP1, Op->BaseOffset); break; - case 2: ldrh(Dst, TMP1, Op->BaseOffset); break; - case 4: ldr(Dst.W(), TMP1, Op->BaseOffset); break; - case 8: ldr(Dst.X(), TMP1, Op->BaseOffset); break; + case IR::OpSize::i8Bit: ldrb(Dst, TMP1, Op->BaseOffset); break; + case IR::OpSize::i16Bit: ldrh(Dst, TMP1, Op->BaseOffset); break; + case IR::OpSize::i32Bit: ldr(Dst.W(), TMP1, Op->BaseOffset); break; + case IR::OpSize::i64Bit: ldr(Dst.X(), TMP1, Op->BaseOffset); break; default: LOGMAN_MSG_A_FMT("Unhandled LoadContextIndexed size: {}", OpSize); break; } break; @@ -273,11 +273,11 @@ DEF_OP(LoadContextIndexed) { const auto Dst = GetVReg(Node); switch (OpSize) { - case 1: ldrb(Dst, TMP1, Op->BaseOffset); break; - case 2: ldrh(Dst, TMP1, Op->BaseOffset); break; - case 4: ldr(Dst.S(), TMP1, Op->BaseOffset); break; - case 8: ldr(Dst.D(), TMP1, Op->BaseOffset); break; - case 16: + case IR::OpSize::i8Bit: ldrb(Dst, TMP1, Op->BaseOffset); break; + case IR::OpSize::i16Bit: ldrh(Dst, TMP1, Op->BaseOffset); break; + case IR::OpSize::i32Bit: ldr(Dst.S(), TMP1, Op->BaseOffset); break; + case IR::OpSize::i64Bit: ldr(Dst.D(), TMP1, Op->BaseOffset); break; + case IR::OpSize::i128Bit: if (Op->BaseOffset % 16 == 0) { ldr(Dst.Q(), TMP1, Op->BaseOffset); } else { @@ -285,7 +285,7 @@ DEF_OP(LoadContextIndexed) { ldur(Dst.Q(), TMP1, Op->BaseOffset); } break; - case 32: + case IR::OpSize::i256Bit: mov(TMP2, Op->BaseOffset); ld1b(Dst.Z(), PRED_TMP_32B.Zeroing(), TMP1, TMP2); break; @@ -315,10 +315,10 @@ DEF_OP(StoreContextIndexed) { add(ARMEmitter::Size::i64Bit, TMP1, STATE, Index, ARMEmitter::ShiftType::LSL, FEXCore::ilog2(Op->Stride)); switch (OpSize) { - case 1: strb(Value, TMP1, Op->BaseOffset); break; - case 2: strh(Value, TMP1, Op->BaseOffset); break; - case 4: str(Value.W(), TMP1, Op->BaseOffset); break; - case 8: str(Value.X(), TMP1, Op->BaseOffset); break; + case IR::OpSize::i8Bit: strb(Value, TMP1, Op->BaseOffset); break; + case IR::OpSize::i16Bit: strh(Value, TMP1, Op->BaseOffset); break; + case IR::OpSize::i32Bit: str(Value.W(), TMP1, Op->BaseOffset); break; + case IR::OpSize::i64Bit: str(Value.X(), TMP1, Op->BaseOffset); break; default: LOGMAN_MSG_A_FMT("Unhandled StoreContextIndexed size: {}", OpSize); break; } break; @@ -339,11 +339,11 @@ DEF_OP(StoreContextIndexed) { add(ARMEmitter::Size::i64Bit, TMP1, STATE, Index, ARMEmitter::ShiftType::LSL, FEXCore::ilog2(Op->Stride)); switch (OpSize) { - case 1: strb(Value, TMP1, Op->BaseOffset); break; - case 2: strh(Value, TMP1, Op->BaseOffset); break; - case 4: str(Value.S(), TMP1, Op->BaseOffset); break; - case 8: str(Value.D(), TMP1, Op->BaseOffset); break; - case 16: + case IR::OpSize::i8Bit: strb(Value, TMP1, Op->BaseOffset); break; + case IR::OpSize::i16Bit: strh(Value, TMP1, Op->BaseOffset); break; + case IR::OpSize::i32Bit: str(Value.S(), TMP1, Op->BaseOffset); break; + case IR::OpSize::i64Bit: str(Value.D(), TMP1, Op->BaseOffset); break; + case IR::OpSize::i128Bit: if (Op->BaseOffset % 16 == 0) { str(Value.Q(), TMP1, Op->BaseOffset); } else { @@ -351,7 +351,7 @@ DEF_OP(StoreContextIndexed) { stur(Value.Q(), TMP1, Op->BaseOffset); } break; - case 32: + case 
IR::OpSize::i256Bit: mov(TMP2, Op->BaseOffset); st1b(Value.Z(), PRED_TMP_32B, TMP1, TMP2); break; @@ -366,13 +366,13 @@ DEF_OP(StoreContextIndexed) { DEF_OP(SpillRegister) { const auto Op = IROp->C(); - const uint8_t OpSize = IROp->Size; + const auto OpSize = IROp->Size; const uint32_t SlotOffset = Op->Slot * MaxSpillSlotSize; if (Op->Class == FEXCore::IR::GPRClass) { const auto Src = GetReg(Op->Value.ID()); switch (OpSize) { - case 1: { + case IR::OpSize::i8Bit: { if (SlotOffset > LSByteMaxUnsignedOffset) { LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset); strb(Src, ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0); @@ -381,7 +381,7 @@ DEF_OP(SpillRegister) { } break; } - case 2: { + case IR::OpSize::i16Bit: { if (SlotOffset > LSHalfMaxUnsignedOffset) { LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset); strh(Src, ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0); @@ -390,7 +390,7 @@ DEF_OP(SpillRegister) { } break; } - case 4: { + case IR::OpSize::i32Bit: { if (SlotOffset > LSWordMaxUnsignedOffset) { LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset); str(Src.W(), ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0); @@ -399,7 +399,7 @@ DEF_OP(SpillRegister) { } break; } - case 8: { + case IR::OpSize::i64Bit: { if (SlotOffset > LSDWordMaxUnsignedOffset) { LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset); str(Src.X(), ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0); @@ -414,7 +414,7 @@ DEF_OP(SpillRegister) { const auto Src = GetVReg(Op->Value.ID()); switch (OpSize) { - case 4: { + case IR::OpSize::i32Bit: { if (SlotOffset > LSWordMaxUnsignedOffset) { LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset); str(Src.S(), ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0); @@ -423,7 +423,7 @@ DEF_OP(SpillRegister) { } break; } - case 8: { + case IR::OpSize::i64Bit: { if (SlotOffset > LSDWordMaxUnsignedOffset) { LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset); str(Src.D(), ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0); @@ -432,7 +432,7 @@ DEF_OP(SpillRegister) { } break; } - case 16: { + case IR::OpSize::i128Bit: { if (SlotOffset > LSQWordMaxUnsignedOffset) { LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset); str(Src.Q(), ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0); @@ -441,7 +441,7 @@ DEF_OP(SpillRegister) { } break; } - case 32: { + case IR::OpSize::i256Bit: { mov(TMP3, SlotOffset); st1b(Src.Z(), PRED_TMP_32B, ARMEmitter::Reg::rsp, TMP3); break; @@ -455,13 +455,13 @@ DEF_OP(SpillRegister) { DEF_OP(FillRegister) { const auto Op = IROp->C(); - const uint8_t OpSize = IROp->Size; + const auto OpSize = IROp->Size; const uint32_t SlotOffset = Op->Slot * MaxSpillSlotSize; if (Op->Class == FEXCore::IR::GPRClass) { const auto Dst = GetReg(Node); switch (OpSize) { - case 1: { + case IR::OpSize::i8Bit: { if (SlotOffset > LSByteMaxUnsignedOffset) { LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset); ldrb(Dst, ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0); @@ -470,7 +470,7 @@ DEF_OP(FillRegister) { } break; } - case 2: { + case IR::OpSize::i16Bit: { if (SlotOffset > LSHalfMaxUnsignedOffset) { LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset); ldrh(Dst, ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0); @@ -479,7 +479,7 @@ DEF_OP(FillRegister) { } break; } - case 4: { + case IR::OpSize::i32Bit: { if (SlotOffset > LSWordMaxUnsignedOffset) { 
LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset); ldr(Dst.W(), ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0); @@ -488,7 +488,7 @@ DEF_OP(FillRegister) { } break; } - case 8: { + case IR::OpSize::i64Bit: { if (SlotOffset > LSDWordMaxUnsignedOffset) { LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset); ldr(Dst.X(), ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0); @@ -503,7 +503,7 @@ DEF_OP(FillRegister) { const auto Dst = GetVReg(Node); switch (OpSize) { - case 4: { + case IR::OpSize::i32Bit: { if (SlotOffset > LSWordMaxUnsignedOffset) { LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset); ldr(Dst.S(), ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0); @@ -512,7 +512,7 @@ DEF_OP(FillRegister) { } break; } - case 8: { + case IR::OpSize::i64Bit: { if (SlotOffset > LSDWordMaxUnsignedOffset) { LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset); ldr(Dst.D(), ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0); @@ -521,7 +521,7 @@ DEF_OP(FillRegister) { } break; } - case 16: { + case IR::OpSize::i128Bit: { if (SlotOffset > LSQWordMaxUnsignedOffset) { LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset); ldr(Dst.Q(), ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0); @@ -530,7 +530,7 @@ DEF_OP(FillRegister) { } break; } - case 32: { + case IR::OpSize::i256Bit: { mov(TMP3, SlotOffset); ld1b(Dst.Z(), PRED_TMP_32B.Zeroing(), ARMEmitter::Reg::rsp, TMP3); break; @@ -563,11 +563,11 @@ DEF_OP(LoadDF) { } ARMEmitter::ExtendedMemOperand Arm64JITCore::GenerateMemOperand( - uint8_t AccessSize, ARMEmitter::Register Base, IR::OrderedNodeWrapper Offset, IR::MemOffsetType OffsetType, uint8_t OffsetScale) { + IR::OpSize AccessSize, ARMEmitter::Register Base, IR::OrderedNodeWrapper Offset, IR::MemOffsetType OffsetType, uint8_t OffsetScale) { if (Offset.IsInvalid()) { return ARMEmitter::ExtendedMemOperand(Base.X(), ARMEmitter::IndexType::OFFSET, 0); } else { - if (OffsetScale != 1 && OffsetScale != AccessSize) { + if (OffsetScale != 1 && OffsetScale != IR::OpSizeToSize(AccessSize)) { LOGMAN_MSG_A_FMT("Unhandled GenerateMemOperand OffsetScale: {}", OffsetScale); } uint64_t Const; @@ -590,7 +590,7 @@ ARMEmitter::ExtendedMemOperand Arm64JITCore::GenerateMemOperand( FEX_UNREACHABLE; } -ARMEmitter::SVEMemOperand Arm64JITCore::GenerateSVEMemOperand(uint8_t AccessSize, ARMEmitter::Register Base, IR::OrderedNodeWrapper Offset, +ARMEmitter::SVEMemOperand Arm64JITCore::GenerateSVEMemOperand(IR::OpSize AccessSize, ARMEmitter::Register Base, IR::OrderedNodeWrapper Offset, IR::MemOffsetType OffsetType, [[maybe_unused]] uint8_t OffsetScale) { if (Offset.IsInvalid()) { return ARMEmitter::SVEMemOperand(Base.X(), 0); @@ -652,22 +652,22 @@ DEF_OP(LoadMem) { const auto Dst = GetReg(Node); switch (OpSize) { - case 1: ldrb(Dst, MemSrc); break; - case 2: ldrh(Dst, MemSrc); break; - case 4: ldr(Dst.W(), MemSrc); break; - case 8: ldr(Dst.X(), MemSrc); break; + case IR::OpSize::i8Bit: ldrb(Dst, MemSrc); break; + case IR::OpSize::i16Bit: ldrh(Dst, MemSrc); break; + case IR::OpSize::i32Bit: ldr(Dst.W(), MemSrc); break; + case IR::OpSize::i64Bit: ldr(Dst.X(), MemSrc); break; default: LOGMAN_MSG_A_FMT("Unhandled LoadMem size: {}", OpSize); break; } } else { const auto Dst = GetVReg(Node); switch (OpSize) { - case 1: ldrb(Dst, MemSrc); break; - case 2: ldrh(Dst, MemSrc); break; - case 4: ldr(Dst.S(), MemSrc); break; - case 8: ldr(Dst.D(), MemSrc); break; - case 16: ldr(Dst.Q(), MemSrc); break; - case 32: { + case 
IR::OpSize::i8Bit: ldrb(Dst, MemSrc); break; + case IR::OpSize::i16Bit: ldrh(Dst, MemSrc); break; + case IR::OpSize::i32Bit: ldr(Dst.S(), MemSrc); break; + case IR::OpSize::i64Bit: ldr(Dst.D(), MemSrc); break; + case IR::OpSize::i128Bit: ldr(Dst.Q(), MemSrc); break; + case IR::OpSize::i256Bit: { LOGMAN_THROW_A_FMT(HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Operand = GenerateSVEMemOperand(OpSize, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale); ld1b(Dst.Z(), PRED_TMP_32B.Zeroing(), Operand); @@ -687,8 +687,8 @@ DEF_OP(LoadMemPair) { const auto Dst2 = GetReg(Op->OutValue2.ID()); switch (IROp->Size) { - case 4: ldp(Dst1.W(), Dst2.W(), Addr, Op->Offset); break; - case 8: ldp(Dst1.X(), Dst2.X(), Addr, Op->Offset); break; + case IR::OpSize::i32Bit: ldp(Dst1.W(), Dst2.W(), Addr, Op->Offset); break; + case IR::OpSize::i64Bit: ldp(Dst1.X(), Dst2.X(), Addr, Op->Offset); break; default: LOGMAN_MSG_A_FMT("Unhandled LoadMemPair size: {}", IROp->Size); break; } } else { @@ -696,9 +696,9 @@ DEF_OP(LoadMemPair) { const auto Dst2 = GetVReg(Op->OutValue2.ID()); switch (IROp->Size) { - case 4: ldp(Dst1.S(), Dst2.S(), Addr, Op->Offset); break; - case 8: ldp(Dst1.D(), Dst2.D(), Addr, Op->Offset); break; - case 16: ldp(Dst1.Q(), Dst2.Q(), Addr, Op->Offset); break; + case IR::OpSize::i32Bit: ldp(Dst1.S(), Dst2.S(), Addr, Op->Offset); break; + case IR::OpSize::i64Bit: ldp(Dst1.D(), Dst2.D(), Addr, Op->Offset); break; + case IR::OpSize::i128Bit: ldp(Dst1.Q(), Dst2.Q(), Addr, Op->Offset); break; default: LOGMAN_MSG_A_FMT("Unhandled LoadMemPair size: {}", IROp->Size); break; } } @@ -723,15 +723,15 @@ DEF_OP(LoadMemTSO) { LOGMAN_THROW_A_FMT(IsInlineConstant(Op->Offset, &Offset), "expected immediate"); } - if (OpSize == 1) { + if (OpSize == IR::OpSize::i8Bit) { // 8bit load is always aligned to natural alignment const auto Dst = GetReg(Node); ldapurb(Dst, MemReg, Offset); } else { switch (OpSize) { - case 2: ldapurh(Dst, MemReg, Offset); break; - case 4: ldapur(Dst.W(), MemReg, Offset); break; - case 8: ldapur(Dst.X(), MemReg, Offset); break; + case IR::OpSize::i16Bit: ldapurh(Dst, MemReg, Offset); break; + case IR::OpSize::i32Bit: ldapur(Dst.W(), MemReg, Offset); break; + case IR::OpSize::i64Bit: ldapur(Dst.X(), MemReg, Offset); break; default: LOGMAN_MSG_A_FMT("Unhandled LoadMemTSO size: {}", OpSize); break; } // Half-barrier once back-patched. @@ -739,14 +739,14 @@ DEF_OP(LoadMemTSO) { } } else if (CTX->HostFeatures.SupportsRCPC && Op->Class == FEXCore::IR::GPRClass) { const auto Dst = GetReg(Node); - if (OpSize == 1) { + if (OpSize == IR::OpSize::i8Bit) { // 8bit load is always aligned to natural alignment ldaprb(Dst.W(), MemReg); } else { switch (OpSize) { - case 2: ldaprh(Dst.W(), MemReg); break; - case 4: ldapr(Dst.W(), MemReg); break; - case 8: ldapr(Dst.X(), MemReg); break; + case IR::OpSize::i16Bit: ldaprh(Dst.W(), MemReg); break; + case IR::OpSize::i32Bit: ldapr(Dst.W(), MemReg); break; + case IR::OpSize::i64Bit: ldapr(Dst.X(), MemReg); break; default: LOGMAN_MSG_A_FMT("Unhandled LoadMemTSO size: {}", OpSize); break; } // Half-barrier once back-patched. 
@@ -754,14 +754,14 @@ DEF_OP(LoadMemTSO) { } } else if (Op->Class == FEXCore::IR::GPRClass) { const auto Dst = GetReg(Node); - if (OpSize == 1) { + if (OpSize == IR::OpSize::i8Bit) { // 8bit load is always aligned to natural alignment ldarb(Dst, MemReg); } else { switch (OpSize) { - case 2: ldarh(Dst, MemReg); break; - case 4: ldar(Dst.W(), MemReg); break; - case 8: ldar(Dst.X(), MemReg); break; + case IR::OpSize::i16Bit: ldarh(Dst, MemReg); break; + case IR::OpSize::i32Bit: ldar(Dst.W(), MemReg); break; + case IR::OpSize::i64Bit: ldar(Dst.X(), MemReg); break; default: LOGMAN_MSG_A_FMT("Unhandled LoadMemTSO size: {}", OpSize); break; } // Half-barrier once back-patched. @@ -771,12 +771,12 @@ DEF_OP(LoadMemTSO) { const auto Dst = GetVReg(Node); const auto MemSrc = GenerateMemOperand(OpSize, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale); switch (OpSize) { - case 1: ldrb(Dst, MemSrc); break; - case 2: ldrh(Dst, MemSrc); break; - case 4: ldr(Dst.S(), MemSrc); break; - case 8: ldr(Dst.D(), MemSrc); break; - case 16: ldr(Dst.Q(), MemSrc); break; - case 32: { + case IR::OpSize::i8Bit: ldrb(Dst, MemSrc); break; + case IR::OpSize::i16Bit: ldrh(Dst, MemSrc); break; + case IR::OpSize::i32Bit: ldr(Dst.S(), MemSrc); break; + case IR::OpSize::i64Bit: ldr(Dst.D(), MemSrc); break; + case IR::OpSize::i128Bit: ldr(Dst.Q(), MemSrc); break; + case IR::OpSize::i256Bit: { LOGMAN_THROW_A_FMT(HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto MemSrc = GenerateSVEMemOperand(OpSize, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale); ld1b(Dst.Z(), PRED_TMP_32B.Zeroing(), MemSrc); @@ -796,7 +796,7 @@ DEF_OP(VLoadVectorMasked) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto SubRegSize = ConvertSubRegSize8(IROp); @@ -814,38 +814,38 @@ DEF_OP(VLoadVectorMasked) { cmplt(SubRegSize, CMPPredicate, GoverningPredicate.Zeroing(), MaskReg.Z(), 0); switch (IROp->ElementSize) { - case 1: { + case IR::OpSize::i8Bit: { ld1b(Dst.Z(), CMPPredicate.Zeroing(), MemSrc); break; } - case 2: { + case IR::OpSize::i16Bit: { ld1h(Dst.Z(), CMPPredicate.Zeroing(), MemSrc); break; } - case 4: { + case IR::OpSize::i32Bit: { ld1w(Dst.Z(), CMPPredicate.Zeroing(), MemSrc); break; } - case 8: { + case IR::OpSize::i64Bit: { ld1d(Dst.Z(), CMPPredicate.Zeroing(), MemSrc); break; } default: break; } } else { - const auto PerformMove = [this](size_t ElementSize, const ARMEmitter::Register Dst, const ARMEmitter::VRegister Vector, int index) { + const auto PerformMove = [this](IR::OpSize ElementSize, const ARMEmitter::Register Dst, const ARMEmitter::VRegister Vector, int index) { switch (ElementSize) { - case 1: umov(Dst, Vector, index); break; - case 2: umov(Dst, Vector, index); break; - case 4: umov(Dst, Vector, index); break; - case 8: umov(Dst, Vector, index); break; + case IR::OpSize::i8Bit: umov(Dst, Vector, index); break; + case IR::OpSize::i16Bit: umov(Dst, Vector, index); break; + case IR::OpSize::i32Bit: umov(Dst, Vector, index); break; + case IR::OpSize::i64Bit: umov(Dst, Vector, index); break; default: LOGMAN_MSG_A_FMT("Unhandled ExtractElementSize: {}", ElementSize); break; } }; // Prepare yourself adventurer. For a masked load without instructions that implement it. 
- LOGMAN_THROW_A_FMT(OpSize == Core::CPUState::XMM_SSE_REG_SIZE, "Only supports 128-bit without SVE256"); - size_t NumElements = IROp->Size / IROp->ElementSize; + LOGMAN_THROW_A_FMT(OpSize == IR::OpSize::i128Bit, "Only supports 128-bit without SVE256"); + size_t NumElements = IR::NumElements(IROp->Size, IROp->ElementSize); // Use VTMP1 as the temporary destination auto TempDst = VTMP1; @@ -854,7 +854,7 @@ DEF_OP(VLoadVectorMasked) { movi(ARMEmitter::SubRegSize::i64Bit, TempDst.Q(), 0); LOGMAN_THROW_A_FMT(Op->Offset.IsInvalid(), "Complex addressing requested and not supported!"); - const uint64_t ElementSizeInBits = IROp->ElementSize * 8; + const uint64_t ElementSizeInBits = IR::OpSizeAsBits(IROp->ElementSize); for (size_t i = 0; i < NumElements; ++i) { // Extract the mask element. PerformMove(IROp->ElementSize, WorkingReg, MaskReg, i); @@ -864,11 +864,11 @@ DEF_OP(VLoadVectorMasked) { tbz(WorkingReg, ElementSizeInBits - 1, &Skip); // Do the gather load for this element into the destination switch (IROp->ElementSize) { - case 1: ld1(TempDst.Q(), i, TempMemReg); break; - case 2: ld1(TempDst.Q(), i, TempMemReg); break; - case 4: ld1(TempDst.Q(), i, TempMemReg); break; - case 8: ld1(TempDst.Q(), i, TempMemReg); break; - case 16: ldr(TempDst.Q(), TempMemReg, 0); break; + case IR::OpSize::i8Bit: ld1(TempDst.Q(), i, TempMemReg); break; + case IR::OpSize::i16Bit: ld1(TempDst.Q(), i, TempMemReg); break; + case IR::OpSize::i32Bit: ld1(TempDst.Q(), i, TempMemReg); break; + case IR::OpSize::i64Bit: ld1(TempDst.Q(), i, TempMemReg); break; + case IR::OpSize::i128Bit: ldr(TempDst.Q(), TempMemReg, 0); break; default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, IROp->ElementSize); return; } @@ -878,7 +878,7 @@ DEF_OP(VLoadVectorMasked) { // Handle register rename to save a move. 
auto WorkingReg = TempMemReg; TempMemReg = TMP2; - add(ARMEmitter::Size::i64Bit, TempMemReg, WorkingReg, IROp->ElementSize); + add(ARMEmitter::Size::i64Bit, TempMemReg, WorkingReg, IR::OpSizeToSize(IROp->ElementSize)); } } @@ -891,7 +891,7 @@ DEF_OP(VStoreVectorMasked) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto SubRegSize = ConvertSubRegSize8(IROp); @@ -908,45 +908,45 @@ DEF_OP(VStoreVectorMasked) { cmplt(SubRegSize, CMPPredicate, GoverningPredicate.Zeroing(), MaskReg.Z(), 0); switch (IROp->ElementSize) { - case 1: { + case IR::OpSize::i8Bit: { st1b(RegData.Z(), CMPPredicate.Zeroing(), MemDst); break; } - case 2: { + case IR::OpSize::i16Bit: { st1h(RegData.Z(), CMPPredicate.Zeroing(), MemDst); break; } - case 4: { + case IR::OpSize::i32Bit: { st1w(RegData.Z(), CMPPredicate.Zeroing(), MemDst); break; } - case 8: { + case IR::OpSize::i64Bit: { st1d(RegData.Z(), CMPPredicate.Zeroing(), MemDst); break; } default: break; } } else { - const auto PerformMove = [this](size_t ElementSize, const ARMEmitter::Register Dst, const ARMEmitter::VRegister Vector, int index) { + const auto PerformMove = [this](IR::OpSize ElementSize, const ARMEmitter::Register Dst, const ARMEmitter::VRegister Vector, int index) { switch (ElementSize) { - case 1: umov(Dst, Vector, index); break; - case 2: umov(Dst, Vector, index); break; - case 4: umov(Dst, Vector, index); break; - case 8: umov(Dst, Vector, index); break; + case IR::OpSize::i8Bit: umov(Dst, Vector, index); break; + case IR::OpSize::i16Bit: umov(Dst, Vector, index); break; + case IR::OpSize::i32Bit: umov(Dst, Vector, index); break; + case IR::OpSize::i64Bit: umov(Dst, Vector, index); break; default: LOGMAN_MSG_A_FMT("Unhandled ExtractElementSize: {}", ElementSize); break; } }; // Prepare yourself adventurer. For a masked store without instructions that implement it. - LOGMAN_THROW_A_FMT(OpSize == Core::CPUState::XMM_SSE_REG_SIZE, "Only supports 128-bit without SVE256"); - size_t NumElements = IROp->Size / IROp->ElementSize; + LOGMAN_THROW_A_FMT(OpSize == IR::OpSize::i128Bit, "Only supports 128-bit without SVE256"); + size_t NumElements = IR::NumElements(IROp->Size, IROp->ElementSize); // Use VTMP1 as the temporary destination auto WorkingReg = TMP1; auto TempMemReg = MemReg; LOGMAN_THROW_A_FMT(Op->Offset.IsInvalid(), "Complex addressing requested and not supported!"); - const uint64_t ElementSizeInBits = IROp->ElementSize * 8; + const uint64_t ElementSizeInBits = IR::OpSizeAsBits(IROp->ElementSize); for (size_t i = 0; i < NumElements; ++i) { // Extract the mask element. 
PerformMove(IROp->ElementSize, WorkingReg, MaskReg, i); @@ -956,11 +956,11 @@ DEF_OP(VStoreVectorMasked) { tbz(WorkingReg, ElementSizeInBits - 1, &Skip); // Do the gather load for this element into the destination switch (IROp->ElementSize) { - case 1: st1(RegData.Q(), i, TempMemReg); break; - case 2: st1(RegData.Q(), i, TempMemReg); break; - case 4: st1(RegData.Q(), i, TempMemReg); break; - case 8: st1(RegData.Q(), i, TempMemReg); break; - case 16: str(RegData.Q(), TempMemReg, 0); break; + case IR::OpSize::i8Bit: st1(RegData.Q(), i, TempMemReg); break; + case IR::OpSize::i16Bit: st1(RegData.Q(), i, TempMemReg); break; + case IR::OpSize::i32Bit: st1(RegData.Q(), i, TempMemReg); break; + case IR::OpSize::i64Bit: st1(RegData.Q(), i, TempMemReg); break; + case IR::OpSize::i128Bit: str(RegData.Q(), TempMemReg, 0); break; default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, IROp->ElementSize); return; } @@ -970,7 +970,7 @@ DEF_OP(VStoreVectorMasked) { // Handle register rename to save a move. auto WorkingReg = TempMemReg; TempMemReg = TMP2; - add(ARMEmitter::Size::i64Bit, TempMemReg, WorkingReg, IROp->ElementSize); + add(ARMEmitter::Size::i64Bit, TempMemReg, WorkingReg, IR::OpSizeToSize(IROp->ElementSize)); } } } @@ -1109,7 +1109,7 @@ DEF_OP(VLoadVectorGatherMasked) { /// - When the behaviour doesn't match then it gets decomposed to ASIMD style masked load. /// - AddrBase also doesn't need to exist /// - If the instruction is using 64-bit vector indexing or 32-bit addresses where the top-bit isn't set then this is valid! - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -1123,8 +1123,7 @@ DEF_OP(VLoadVectorGatherMasked) { ///< If the host supports SVE and the offset scale matches SVE limitations then it can do an SVE style load. const bool SupportsSVELoad = (HostSupportsSVE128 || HostSupportsSVE256) && - (OffsetScale == 1 || OffsetScale == IR::OpSizeToSize(VectorIndexSize)) && - IR::OpSizeToSize(VectorIndexSize) == IROp->ElementSize; + (OffsetScale == 1 || OffsetScale == IR::OpSizeToSize(VectorIndexSize)) && VectorIndexSize == IROp->ElementSize; if (SupportsSVELoad) { uint8_t SVEScale = FEXCore::ilog2(OffsetScale); @@ -1158,19 +1157,19 @@ DEF_OP(VLoadVectorGatherMasked) { } switch (IROp->ElementSize) { - case 1: { + case IR::OpSize::i8Bit: { ld1b(TempDst.Z(), CMPPredicate.Zeroing(), MemDst); break; } - case 2: { + case IR::OpSize::i16Bit: { ld1h(TempDst.Z(), CMPPredicate.Zeroing(), MemDst); break; } - case 4: { + case IR::OpSize::i32Bit: { ld1w(TempDst.Z(), CMPPredicate.Zeroing(), MemDst); break; } - case 8: { + case IR::OpSize::i64Bit: { ld1d(TempDst.Z(), CMPPredicate.Zeroing(), MemDst); break; } @@ -1181,8 +1180,8 @@ DEF_OP(VLoadVectorGatherMasked) { sel(SubRegSize, Dst.Z(), CMPPredicate, TempDst.Z(), IncomingDst.Z()); } else { LOGMAN_THROW_A_FMT(!Is256Bit, "Can't emulate this gather load in the backend! 
Programming error!"); - Emulate128BitGather(IR::SizeToOpSize(IROp->Size), IR::SizeToOpSize(IROp->ElementSize), Dst, IncomingDst, BaseAddr, VectorIndexLow, - VectorIndexHigh, MaskReg, VectorIndexSize, DataElementOffsetStart, IndexElementOffsetStart, OffsetScale); + Emulate128BitGather(IROp->Size, IROp->ElementSize, Dst, IncomingDst, BaseAddr, VectorIndexLow, VectorIndexHigh, MaskReg, + VectorIndexSize, DataElementOffsetStart, IndexElementOffsetStart, OffsetScale); } } @@ -1269,28 +1268,30 @@ DEF_OP(VLoadVectorElement) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; const auto ElementSize = IROp->ElementSize; const auto Dst = GetVReg(Node); const auto DstSrc = GetVReg(Op->DstSrc.ID()); const auto MemReg = GetReg(Op->Addr.ID()); - LOGMAN_THROW_AA_FMT(ElementSize == 1 || ElementSize == 2 || ElementSize == 4 || ElementSize == 8 || ElementSize == 16, "Invalid element " - "size"); + LOGMAN_THROW_AA_FMT(ElementSize == IR::OpSize::i8Bit || ElementSize == IR::OpSize::i16Bit || ElementSize == IR::OpSize::i32Bit || + ElementSize == IR::OpSize::i64Bit || ElementSize == IR::OpSize::i128Bit, + "Invalid element " + "size"); if (Is256Bit) { LOGMAN_MSG_A_FMT("Unsupported 256-bit VLoadVectorElement"); } else { - if (Dst != DstSrc && ElementSize != 16) { + if (Dst != DstSrc && ElementSize != IR::OpSize::i128Bit) { mov(Dst.Q(), DstSrc.Q()); } switch (ElementSize) { - case 1: ld1(Dst.Q(), Op->Index, MemReg); break; - case 2: ld1(Dst.Q(), Op->Index, MemReg); break; - case 4: ld1(Dst.Q(), Op->Index, MemReg); break; - case 8: ld1(Dst.Q(), Op->Index, MemReg); break; - case 16: ldr(Dst.Q(), MemReg); break; + case IR::OpSize::i8Bit: ld1(Dst.Q(), Op->Index, MemReg); break; + case IR::OpSize::i16Bit: ld1(Dst.Q(), Op->Index, MemReg); break; + case IR::OpSize::i32Bit: ld1(Dst.Q(), Op->Index, MemReg); break; + case IR::OpSize::i64Bit: ld1(Dst.Q(), Op->Index, MemReg); break; + case IR::OpSize::i128Bit: ldr(Dst.Q(), MemReg); break; default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, ElementSize); return; } } @@ -1305,14 +1306,16 @@ DEF_OP(VStoreVectorElement) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; const auto ElementSize = IROp->ElementSize; const auto Value = GetVReg(Op->Value.ID()); const auto MemReg = GetReg(Op->Addr.ID()); - LOGMAN_THROW_AA_FMT(ElementSize == 1 || ElementSize == 2 || ElementSize == 4 || ElementSize == 8 || ElementSize == 16, "Invalid element " - "size"); + LOGMAN_THROW_AA_FMT(ElementSize == IR::OpSize::i8Bit || ElementSize == IR::OpSize::i16Bit || ElementSize == IR::OpSize::i32Bit || + ElementSize == IR::OpSize::i64Bit || ElementSize == IR::OpSize::i128Bit, + "Invalid element " + "size"); // Emit a half-barrier if TSO is enabled. 
if (CTX->IsVectorAtomicTSOEnabled()) { @@ -1323,11 +1326,11 @@ DEF_OP(VStoreVectorElement) { LOGMAN_MSG_A_FMT("Unsupported 256-bit {}", __func__); } else { switch (ElementSize) { - case 1: st1(Value.Q(), Op->Index, MemReg); break; - case 2: st1(Value.Q(), Op->Index, MemReg); break; - case 4: st1(Value.Q(), Op->Index, MemReg); break; - case 8: st1(Value.Q(), Op->Index, MemReg); break; - case 16: str(Value.Q(), MemReg); break; + case IR::OpSize::i8Bit: st1(Value.Q(), Op->Index, MemReg); break; + case IR::OpSize::i16Bit: st1(Value.Q(), Op->Index, MemReg); break; + case IR::OpSize::i32Bit: st1(Value.Q(), Op->Index, MemReg); break; + case IR::OpSize::i64Bit: st1(Value.Q(), Op->Index, MemReg); break; + case IR::OpSize::i128Bit: str(Value.Q(), MemReg); break; default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, ElementSize); return; } } @@ -1337,34 +1340,36 @@ DEF_OP(VBroadcastFromMem) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto ElementSize = IROp->ElementSize; const auto Dst = GetVReg(Node); const auto MemReg = GetReg(Op->Address.ID()); - LOGMAN_THROW_AA_FMT(ElementSize == 1 || ElementSize == 2 || ElementSize == 4 || ElementSize == 8 || ElementSize == 16, "Invalid element " - "size"); + LOGMAN_THROW_AA_FMT(ElementSize == IR::OpSize::i8Bit || ElementSize == IR::OpSize::i16Bit || ElementSize == IR::OpSize::i32Bit || + ElementSize == IR::OpSize::i64Bit || ElementSize == IR::OpSize::i128Bit, + "Invalid element " + "size"); if (Is256Bit && HostSupportsSVE256) { const auto GoverningPredicate = PRED_TMP_32B.Zeroing(); switch (ElementSize) { - case 1: ld1rb(ARMEmitter::SubRegSize::i8Bit, Dst.Z(), GoverningPredicate, MemReg); break; - case 2: ld1rh(ARMEmitter::SubRegSize::i16Bit, Dst.Z(), GoverningPredicate, MemReg); break; - case 4: ld1rw(ARMEmitter::SubRegSize::i32Bit, Dst.Z(), GoverningPredicate, MemReg); break; - case 8: ld1rd(Dst.Z(), GoverningPredicate, MemReg); break; - case 16: ld1rqb(Dst.Z(), GoverningPredicate, MemReg); break; + case IR::OpSize::i8Bit: ld1rb(ARMEmitter::SubRegSize::i8Bit, Dst.Z(), GoverningPredicate, MemReg); break; + case IR::OpSize::i16Bit: ld1rh(ARMEmitter::SubRegSize::i16Bit, Dst.Z(), GoverningPredicate, MemReg); break; + case IR::OpSize::i32Bit: ld1rw(ARMEmitter::SubRegSize::i32Bit, Dst.Z(), GoverningPredicate, MemReg); break; + case IR::OpSize::i64Bit: ld1rd(Dst.Z(), GoverningPredicate, MemReg); break; + case IR::OpSize::i128Bit: ld1rqb(Dst.Z(), GoverningPredicate, MemReg); break; default: LOGMAN_MSG_A_FMT("Unhandled VBroadcastFromMem size: {}", ElementSize); return; } } else { switch (ElementSize) { - case 1: ld1r(Dst.Q(), MemReg); break; - case 2: ld1r(Dst.Q(), MemReg); break; - case 4: ld1r(Dst.Q(), MemReg); break; - case 8: ld1r(Dst.Q(), MemReg); break; - case 16: + case IR::OpSize::i8Bit: ld1r(Dst.Q(), MemReg); break; + case IR::OpSize::i16Bit: ld1r(Dst.Q(), MemReg); break; + case IR::OpSize::i32Bit: ld1r(Dst.Q(), MemReg); break; + case IR::OpSize::i64Bit: ld1r(Dst.Q(), MemReg); break; + case IR::OpSize::i128Bit: // Normal load, like ld1rqb with 128-bit regs. ldr(Dst.Q(), MemReg); break; @@ -1392,7 +1397,7 @@ DEF_OP(Push) { // Need to be careful here, incoming source might be reused afterwards. } else { // RA constraints would let this always be true. 
- mov(IROp->Size == 8 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit, Dst, AddrSrc); + mov(IROp->Size == IR::OpSize::i64Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit, Dst, AddrSrc); } } @@ -1436,7 +1441,7 @@ DEF_OP(Push) { } } - sub(IROp->Size == 8 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit, Dst, AddrSrc, ValueSize); + sub(IROp->Size == IR::OpSize::i64Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit, Dst, AddrSrc, ValueSize); } else { switch (ValueSize) { case 1: { @@ -1505,37 +1510,37 @@ DEF_OP(StoreMem) { if (Op->Class == FEXCore::IR::GPRClass) { const auto Src = GetReg(Op->Value.ID()); switch (OpSize) { - case 1: strb(Src, MemSrc); break; - case 2: strh(Src, MemSrc); break; - case 4: str(Src.W(), MemSrc); break; - case 8: str(Src.X(), MemSrc); break; + case IR::OpSize::i8Bit: strb(Src, MemSrc); break; + case IR::OpSize::i16Bit: strh(Src, MemSrc); break; + case IR::OpSize::i32Bit: str(Src.W(), MemSrc); break; + case IR::OpSize::i64Bit: str(Src.X(), MemSrc); break; default: LOGMAN_MSG_A_FMT("Unhandled StoreMem size: {}", OpSize); break; } } else { const auto Src = GetVReg(Op->Value.ID()); switch (OpSize) { - case 1: { + case IR::OpSize::i8Bit: { strb(Src, MemSrc); break; } - case 2: { + case IR::OpSize::i16Bit: { strh(Src, MemSrc); break; } - case 4: { + case IR::OpSize::i32Bit: { str(Src.S(), MemSrc); break; } - case 8: { + case IR::OpSize::i64Bit: { str(Src.D(), MemSrc); break; } - case 16: { + case IR::OpSize::i128Bit: { str(Src.Q(), MemSrc); break; } - case 32: { + case IR::OpSize::i256Bit: { LOGMAN_THROW_A_FMT(HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto MemSrc = GenerateSVEMemOperand(OpSize, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale); st1b(Src.Z(), PRED_TMP_32B, MemSrc); @@ -1555,8 +1560,8 @@ DEF_OP(StoreMemPair) { const auto Src1 = GetReg(Op->Value1.ID()); const auto Src2 = GetReg(Op->Value2.ID()); switch (OpSize) { - case 4: stp(Src1.W(), Src2.W(), Addr, Op->Offset); break; - case 8: stp(Src1.X(), Src2.X(), Addr, Op->Offset); break; + case IR::OpSize::i32Bit: stp(Src1.W(), Src2.W(), Addr, Op->Offset); break; + case IR::OpSize::i64Bit: stp(Src1.X(), Src2.X(), Addr, Op->Offset); break; default: LOGMAN_MSG_A_FMT("Unhandled StoreMem size: {}", OpSize); break; } } else { @@ -1564,9 +1569,9 @@ DEF_OP(StoreMemPair) { const auto Src2 = GetVReg(Op->Value2.ID()); switch (OpSize) { - case 4: stp(Src1.S(), Src2.S(), Addr, Op->Offset); break; - case 8: stp(Src1.D(), Src2.D(), Addr, Op->Offset); break; - case 16: stp(Src1.Q(), Src2.Q(), Addr, Op->Offset); break; + case IR::OpSize::i32Bit: stp(Src1.S(), Src2.S(), Addr, Op->Offset); break; + case IR::OpSize::i64Bit: stp(Src1.D(), Src2.D(), Addr, Op->Offset); break; + case IR::OpSize::i128Bit: stp(Src1.Q(), Src2.Q(), Addr, Op->Offset); break; default: LOGMAN_MSG_A_FMT("Unhandled StoreMemPair size: {}", OpSize); break; } } @@ -1591,32 +1596,32 @@ DEF_OP(StoreMemTSO) { LOGMAN_THROW_A_FMT(IsInlineConstant(Op->Offset, &Offset), "expected immediate"); } - if (OpSize == 1) { + if (OpSize == IR::OpSize::i8Bit) { // 8bit load is always aligned to natural alignment stlurb(Src, MemReg, Offset); } else { // Half-barrier once back-patched. 
nop(); switch (OpSize) { - case 2: stlurh(Src, MemReg, Offset); break; - case 4: stlur(Src.W(), MemReg, Offset); break; - case 8: stlur(Src.X(), MemReg, Offset); break; + case IR::OpSize::i16Bit: stlurh(Src, MemReg, Offset); break; + case IR::OpSize::i32Bit: stlur(Src.W(), MemReg, Offset); break; + case IR::OpSize::i64Bit: stlur(Src.X(), MemReg, Offset); break; default: LOGMAN_MSG_A_FMT("Unhandled StoreMemTSO size: {}", OpSize); break; } } } else if (Op->Class == FEXCore::IR::GPRClass) { const auto Src = GetReg(Op->Value.ID()); - if (OpSize == 1) { + if (OpSize == IR::OpSize::i8Bit) { // 8bit load is always aligned to natural alignment stlrb(Src, MemReg); } else { // Half-barrier once back-patched. nop(); switch (OpSize) { - case 2: stlrh(Src, MemReg); break; - case 4: stlr(Src.W(), MemReg); break; - case 8: stlr(Src.X(), MemReg); break; + case IR::OpSize::i16Bit: stlrh(Src, MemReg); break; + case IR::OpSize::i32Bit: stlr(Src.W(), MemReg); break; + case IR::OpSize::i64Bit: stlr(Src.X(), MemReg); break; default: LOGMAN_MSG_A_FMT("Unhandled StoreMemTSO size: {}", OpSize); break; } } @@ -1628,12 +1633,12 @@ DEF_OP(StoreMemTSO) { const auto Src = GetVReg(Op->Value.ID()); const auto MemSrc = GenerateMemOperand(OpSize, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale); switch (OpSize) { - case 1: strb(Src, MemSrc); break; - case 2: strh(Src, MemSrc); break; - case 4: str(Src.S(), MemSrc); break; - case 8: str(Src.D(), MemSrc); break; - case 16: str(Src.Q(), MemSrc); break; - case 32: { + case IR::OpSize::i8Bit: strb(Src, MemSrc); break; + case IR::OpSize::i16Bit: strh(Src, MemSrc); break; + case IR::OpSize::i32Bit: str(Src.S(), MemSrc); break; + case IR::OpSize::i64Bit: str(Src.D(), MemSrc); break; + case IR::OpSize::i128Bit: str(Src.Q(), MemSrc); break; + case IR::OpSize::i256Bit: { LOGMAN_THROW_A_FMT(HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Operand = GenerateSVEMemOperand(OpSize, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale); st1b(Src.Z(), PRED_TMP_32B, Operand); @@ -2125,66 +2130,66 @@ DEF_OP(ParanoidLoadMemTSO) { (void)IsInlineConstant(Op->Offset, &Offset); } - if (OpSize == 1) { + if (OpSize == IR::OpSize::i8Bit) { // 8bit load is always aligned to natural alignment const auto Dst = GetReg(Node); ldapurb(Dst, MemReg, Offset); } else { switch (OpSize) { - case 2: ldapurh(Dst, MemReg, Offset); break; - case 4: ldapur(Dst.W(), MemReg, Offset); break; - case 8: ldapur(Dst.X(), MemReg, Offset); break; + case IR::OpSize::i16Bit: ldapurh(Dst, MemReg, Offset); break; + case IR::OpSize::i32Bit: ldapur(Dst.W(), MemReg, Offset); break; + case IR::OpSize::i64Bit: ldapur(Dst.X(), MemReg, Offset); break; default: LOGMAN_MSG_A_FMT("Unhandled ParanoidLoadMemTSO size: {}", OpSize); break; } } } else if (CTX->HostFeatures.SupportsRCPC && Op->Class == FEXCore::IR::GPRClass) { const auto Dst = GetReg(Node); - if (OpSize == 1) { + if (OpSize == IR::OpSize::i8Bit) { // 8bit load is always aligned to natural alignment ldaprb(Dst.W(), MemReg); } else { switch (OpSize) { - case 2: ldaprh(Dst.W(), MemReg); break; - case 4: ldapr(Dst.W(), MemReg); break; - case 8: ldapr(Dst.X(), MemReg); break; + case IR::OpSize::i16Bit: ldaprh(Dst.W(), MemReg); break; + case IR::OpSize::i32Bit: ldapr(Dst.W(), MemReg); break; + case IR::OpSize::i64Bit: ldapr(Dst.X(), MemReg); break; default: LOGMAN_MSG_A_FMT("Unhandled ParanoidLoadMemTSO size: {}", OpSize); break; } } } else if (Op->Class == FEXCore::IR::GPRClass) { const auto Dst = GetReg(Node); switch 
(OpSize) { - case 1: ldarb(Dst, MemReg); break; - case 2: ldarh(Dst, MemReg); break; - case 4: ldar(Dst.W(), MemReg); break; - case 8: ldar(Dst.X(), MemReg); break; + case IR::OpSize::i8Bit: ldarb(Dst, MemReg); break; + case IR::OpSize::i16Bit: ldarh(Dst, MemReg); break; + case IR::OpSize::i32Bit: ldar(Dst.W(), MemReg); break; + case IR::OpSize::i64Bit: ldar(Dst.X(), MemReg); break; default: LOGMAN_MSG_A_FMT("Unhandled ParanoidLoadMemTSO size: {}", OpSize); break; } } else { const auto Dst = GetVReg(Node); switch (OpSize) { - case 1: + case IR::OpSize::i8Bit: ldarb(TMP1, MemReg); fmov(ARMEmitter::Size::i32Bit, Dst.S(), TMP1.W()); break; - case 2: + case IR::OpSize::i16Bit: ldarh(TMP1, MemReg); fmov(ARMEmitter::Size::i32Bit, Dst.S(), TMP1.W()); break; - case 4: + case IR::OpSize::i32Bit: ldar(TMP1.W(), MemReg); fmov(ARMEmitter::Size::i32Bit, Dst.S(), TMP1.W()); break; - case 8: + case IR::OpSize::i64Bit: ldar(TMP1, MemReg); fmov(ARMEmitter::Size::i64Bit, Dst.D(), TMP1); break; - case 16: + case IR::OpSize::i128Bit: ldaxp(ARMEmitter::Size::i64Bit, TMP1, TMP2, MemReg); clrex(); ins(ARMEmitter::SubRegSize::i64Bit, Dst, 0, TMP1); ins(ARMEmitter::SubRegSize::i64Bit, Dst, 1, TMP2); break; - case 32: + case IR::OpSize::i256Bit: LOGMAN_THROW_A_FMT(HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); dmb(ARMEmitter::BarrierScope::ISH); ld1b(Dst.Z(), PRED_TMP_32B.Zeroing(), MemReg); @@ -2208,47 +2213,47 @@ DEF_OP(ParanoidStoreMemTSO) { (void)IsInlineConstant(Op->Offset, &Offset); } - if (OpSize == 1) { + if (OpSize == IR::OpSize::i8Bit) { // 8bit load is always aligned to natural alignment stlurb(Src, MemReg, Offset); } else { switch (OpSize) { - case 2: stlurh(Src, MemReg, Offset); break; - case 4: stlur(Src.W(), MemReg, Offset); break; - case 8: stlur(Src.X(), MemReg, Offset); break; + case IR::OpSize::i16Bit: stlurh(Src, MemReg, Offset); break; + case IR::OpSize::i32Bit: stlur(Src.W(), MemReg, Offset); break; + case IR::OpSize::i64Bit: stlur(Src.X(), MemReg, Offset); break; default: LOGMAN_MSG_A_FMT("Unhandled ParanoidStoreMemTSO size: {}", OpSize); break; } } } else if (Op->Class == FEXCore::IR::GPRClass) { const auto Src = GetReg(Op->Value.ID()); switch (OpSize) { - case 1: stlrb(Src, MemReg); break; - case 2: stlrh(Src, MemReg); break; - case 4: stlr(Src.W(), MemReg); break; - case 8: stlr(Src.X(), MemReg); break; + case IR::OpSize::i8Bit: stlrb(Src, MemReg); break; + case IR::OpSize::i16Bit: stlrh(Src, MemReg); break; + case IR::OpSize::i32Bit: stlr(Src.W(), MemReg); break; + case IR::OpSize::i64Bit: stlr(Src.X(), MemReg); break; default: LOGMAN_MSG_A_FMT("Unhandled ParanoidStoreMemTSO size: {}", OpSize); break; } } else { const auto Src = GetVReg(Op->Value.ID()); switch (OpSize) { - case 1: + case IR::OpSize::i8Bit: umov(TMP1, Src, 0); stlrb(TMP1, MemReg); break; - case 2: + case IR::OpSize::i16Bit: umov(TMP1, Src, 0); stlrh(TMP1, MemReg); break; - case 4: + case IR::OpSize::i32Bit: umov(TMP1, Src, 0); stlr(TMP1.W(), MemReg); break; - case 8: + case IR::OpSize::i64Bit: umov(TMP1, Src, 0); stlr(TMP1, MemReg); break; - case 16: { + case IR::OpSize::i128Bit: { // Move vector to GPRs umov(TMP1, Src, 0); umov(TMP2, Src, 1); @@ -2261,7 +2266,7 @@ DEF_OP(ParanoidStoreMemTSO) { cbnz(ARMEmitter::Size::i64Bit, TMP3, &B); // < Overwritten with DMB break; } - case 32: { + case IR::OpSize::i256Bit: { LOGMAN_THROW_A_FMT(HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); dmb(ARMEmitter::BarrierScope::ISH); st1b(Src.Z(), 
PRED_TMP_32B, MemReg, 0); @@ -2351,7 +2356,7 @@ DEF_OP(Prefetch) { const auto MemReg = GetReg(Op->Addr.ID()); // Access size is only ever handled as 8-byte. Even though it is accesssed as a cacheline. - const auto MemSrc = GenerateMemOperand(8, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale); + const auto MemSrc = GenerateMemOperand(IR::OpSize::i64Bit, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale); size_t LUT = (Op->Stream ? 1 : 0) | ((Op->CacheLevel - 1) << 1) | (Op->ForStore ? 1U << 3 : 0); @@ -2388,9 +2393,9 @@ DEF_OP(VStoreNonTemporal) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); - const auto Is128Bit = OpSize == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = OpSize == IR::OpSize::i128Bit; const auto Value = GetVReg(Op->Value.ID()); const auto MemReg = GetReg(Op->Addr.ID()); @@ -2414,7 +2419,7 @@ DEF_OP(VStoreNonTemporalPair) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; - [[maybe_unused]] const auto Is128Bit = OpSize == Core::CPUState::XMM_SSE_REG_SIZE; + [[maybe_unused]] const auto Is128Bit = OpSize == IR::OpSize::i128Bit; LOGMAN_THROW_A_FMT(Is128Bit, "This IR operation only operates at 128-bit wide"); const auto ValueLow = GetVReg(Op->ValueLow.ID()); @@ -2430,9 +2435,9 @@ DEF_OP(VLoadNonTemporal) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); - const auto Is128Bit = OpSize == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = OpSize == IR::OpSize::i128Bit; const auto Dst = GetVReg(Node); const auto MemReg = GetReg(Op->Addr.ID()); diff --git a/FEXCore/Source/Interface/Core/JIT/VectorOps.cpp b/FEXCore/Source/Interface/Core/JIT/VectorOps.cpp index bcdbd9dfc8..b33bc42103 100644 --- a/FEXCore/Source/Interface/Core/JIT/VectorOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/VectorOps.cpp @@ -19,7 +19,7 @@ namespace FEXCore::CPU { const auto OpSize = IROp->Size; \ \ const auto ElementSize = Op->Header.ElementSize; \ - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; \ + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; \ LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); \ const auto SubRegSize = ConvertSubRegSize8(IROp); \ \ @@ -41,7 +41,7 @@ namespace FEXCore::CPU { DEF_OP(FEXOp) { \ const auto Op = IROp->C(); \ const auto OpSize = IROp->Size; \ - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; \ + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; \ LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); \ \ const auto Dst = GetVReg(Node); \ @@ -60,7 +60,7 @@ namespace FEXCore::CPU { const auto Op = IROp->C(); \ const auto OpSize = IROp->Size; \ \ - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; \ + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; \ LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 
256-bit operation", __func__); \ const auto SubRegSize = ConvertSubRegSize8(IROp); \ \ @@ -81,7 +81,7 @@ namespace FEXCore::CPU { const auto OpSize = IROp->Size; \ \ const auto SubRegSize = ConvertSubRegSize8(IROp); \ - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; \ + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; \ LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); \ \ const auto Dst = GetVReg(Node); \ @@ -91,7 +91,7 @@ namespace FEXCore::CPU { if (HostSupportsSVE256 && Is256Bit) { \ ARMOp(SubRegSize, Dst.Z(), VectorLower.Z(), VectorUpper.Z()); \ } else { \ - if (OpSize == 8) { \ + if (OpSize == IR::OpSize::i64Bit) { \ ARMOp(SubRegSize, Dst.D(), VectorLower.D(), VectorUpper.D()); \ } else { \ ARMOp(SubRegSize, Dst.Q(), VectorLower.Q(), VectorUpper.Q()); \ @@ -106,7 +106,7 @@ namespace FEXCore::CPU { \ const auto ElementSize = Op->Header.ElementSize; \ const auto SubRegSize = ConvertSubRegSize248(IROp); \ - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; \ + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; \ LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); \ \ const auto Dst = GetVReg(Node); \ @@ -117,15 +117,15 @@ namespace FEXCore::CPU { } else { \ if (ElementSize == OpSize) { \ switch (ElementSize) { \ - case 2: { \ + case IR::OpSize::i16Bit: { \ ARMOp(Dst.H(), Src.H()); \ break; \ } \ - case 4: { \ + case IR::OpSize::i32Bit: { \ ARMOp(Dst.S(), Src.S()); \ break; \ } \ - case 8: { \ + case IR::OpSize::i64Bit: { \ ARMOp(Dst.D(), Src.D()); \ break; \ } \ @@ -144,7 +144,7 @@ namespace FEXCore::CPU { \ const auto ElementSize = Op->Header.ElementSize; \ const auto SubRegSize = ConvertSubRegSize248(IROp); \ - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; \ + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; \ LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); \ const auto IsScalar = ElementSize == OpSize; \ \ @@ -157,15 +157,15 @@ namespace FEXCore::CPU { } else { \ if (IsScalar) { \ switch (ElementSize) { \ - case 2: { \ + case IR::OpSize::i16Bit: { \ ARMOp(Dst.H(), Vector1.H(), Vector2.H()); \ break; \ } \ - case 4: { \ + case IR::OpSize::i32Bit: { \ ARMOp(Dst.S(), Vector1.S(), Vector2.S()); \ break; \ } \ - case 8: { \ + case IR::OpSize::i64Bit: { \ ARMOp(Dst.D(), Vector1.D(), Vector2.D()); \ break; \ } \ @@ -201,11 +201,11 @@ namespace FEXCore::CPU { \ auto ScalarEmit = \ [this, ElementSize](ARMEmitter::VRegister Dst, ARMEmitter::VRegister Src1, ARMEmitter::VRegister Src2, ARMEmitter::VRegister Src3) { \ - if (ElementSize == 2) { \ + if (ElementSize == IR::OpSize::i16Bit) { \ ARMOp(Dst.H(), Src1.H(), Src2.H(), Src3.H()); \ - } else if (ElementSize == 4) { \ + } else if (ElementSize == IR::OpSize::i32Bit) { \ ARMOp(Dst.S(), Src1.S(), Src2.S(), Src3.S()); \ - } else if (ElementSize == 8) { \ + } else if (ElementSize == IR::OpSize::i64Bit) { \ ARMOp(Dst.D(), Src1.D(), Src2.D(), Src3.D()); \ } \ }; \ @@ -260,15 +260,17 @@ DEF_FMAOP_SCALAR_INSERT(VFMLSScalarInsert, fnmsub) DEF_FMAOP_SCALAR_INSERT(VFNMLAScalarInsert, fmsub) DEF_FMAOP_SCALAR_INSERT(VFNMLSScalarInsert, fnmadd) -void Arm64JITCore::VFScalarFMAOperation(uint8_t OpSize, uint8_t ElementSize, ScalarFMAOpCaller ScalarEmit, ARMEmitter::VRegister Dst, +void 
Arm64JITCore::VFScalarFMAOperation(IR::OpSize OpSize, IR::OpSize ElementSize, ScalarFMAOpCaller ScalarEmit, ARMEmitter::VRegister Dst, ARMEmitter::VRegister Upper, ARMEmitter::VRegister Vector1, ARMEmitter::VRegister Vector2, ARMEmitter::VRegister Addend) { - LOGMAN_THROW_A_FMT(OpSize == Core::CPUState::XMM_SSE_REG_SIZE, "256-bit unsupported", __func__); + LOGMAN_THROW_A_FMT(OpSize == IR::OpSize::i128Bit, "256-bit unsupported", __func__); + + LOGMAN_THROW_AA_FMT(ElementSize == IR::OpSize::i16Bit || ElementSize == IR::OpSize::i32Bit || ElementSize == IR::OpSize::i64Bit, "Invalid" + " size"); + const auto SubRegSize = ARMEmitter::ToVectorSizePair(ElementSize == IR::OpSize::i16Bit ? ARMEmitter::SubRegSize::i16Bit : + ElementSize == IR::OpSize::i32Bit ? ARMEmitter::SubRegSize::i32Bit : + ARMEmitter::SubRegSize::i64Bit); - LOGMAN_THROW_AA_FMT(ElementSize == 2 || ElementSize == 4 || ElementSize == 8, "Invalid size"); - const auto SubRegSize = ARMEmitter::ToVectorSizePair(ElementSize == 2 ? ARMEmitter::SubRegSize::i16Bit : - ElementSize == 4 ? ARMEmitter::SubRegSize::i32Bit : - ARMEmitter::SubRegSize::i64Bit); if (Dst != Upper) { // If destination is not tied, move the upper bits to the destination first. mov(Dst.Q(), Upper.Q()); @@ -289,18 +291,19 @@ void Arm64JITCore::VFScalarFMAOperation(uint8_t OpSize, uint8_t ElementSize, Sca // storing it into Dst. This is a scalar operation, so the only lowest element of each vector is used for the operation. // The result is stored into the destination. The untouched bits of the destination come from Vector1, unless it's a 256 vector // and ZeroUpperBits is true, in which case the upper bits are zero. -void Arm64JITCore::VFScalarOperation(uint8_t OpSize, uint8_t ElementSize, bool ZeroUpperBits, ScalarBinaryOpCaller ScalarEmit, +void Arm64JITCore::VFScalarOperation(IR::OpSize OpSize, IR::OpSize ElementSize, bool ZeroUpperBits, ScalarBinaryOpCaller ScalarEmit, ARMEmitter::VRegister Dst, ARMEmitter::VRegister Vector1, ARMEmitter::VRegister Vector2) { - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); LOGMAN_THROW_A_FMT(Is256Bit || !ZeroUpperBits, "128-bit operation doesn't support ZeroUpperBits in {}", __func__); // Bit of a tricky detail. // The upper bits of the destination comes from Vector1. - LOGMAN_THROW_AA_FMT(ElementSize == 2 || ElementSize == 4 || ElementSize == 8, "Invalid size"); - const auto SubRegSize = ARMEmitter::ToVectorSizePair(ElementSize == 2 ? ARMEmitter::SubRegSize::i16Bit : - ElementSize == 4 ? ARMEmitter::SubRegSize::i32Bit : - ARMEmitter::SubRegSize::i64Bit); + LOGMAN_THROW_AA_FMT(ElementSize == IR::OpSize::i16Bit || ElementSize == IR::OpSize::i32Bit || ElementSize == IR::OpSize::i64Bit, "Invalid" + " size"); + const auto SubRegSize = ARMEmitter::ToVectorSizePair(ElementSize == IR::OpSize::i16Bit ? ARMEmitter::SubRegSize::i16Bit : + ElementSize == IR::OpSize::i32Bit ? ARMEmitter::SubRegSize::i32Bit : + ARMEmitter::SubRegSize::i64Bit); constexpr auto Predicate = ARMEmitter::PReg::p0; @@ -361,17 +364,18 @@ void Arm64JITCore::VFScalarOperation(uint8_t OpSize, uint8_t ElementSize, bool Z // However the result of the scalar operation is inserted into Vector1 and moved to Destination. 
// The untouched bits of the destination come from Vector1, unless it's a 256 vector // and ZeroUpperBits is true, in which case the upper bits are zero. -void Arm64JITCore::VFScalarUnaryOperation(uint8_t OpSize, uint8_t ElementSize, bool ZeroUpperBits, ScalarUnaryOpCaller ScalarEmit, +void Arm64JITCore::VFScalarUnaryOperation(IR::OpSize OpSize, IR::OpSize ElementSize, bool ZeroUpperBits, ScalarUnaryOpCaller ScalarEmit, ARMEmitter::VRegister Dst, ARMEmitter::VRegister Vector1, std::variant Vector2) { - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); LOGMAN_THROW_A_FMT(Is256Bit || !ZeroUpperBits, "128-bit operation doesn't support ZeroUpperBits in {}", __func__); - LOGMAN_THROW_AA_FMT(ElementSize == 2 || ElementSize == 4 || ElementSize == 8, "Invalid size"); - const auto SubRegSize = ARMEmitter::ToVectorSizePair(ElementSize == 2 ? ARMEmitter::SubRegSize::i16Bit : - ElementSize == 4 ? ARMEmitter::SubRegSize::i32Bit : - ARMEmitter::SubRegSize::i64Bit); + LOGMAN_THROW_AA_FMT(ElementSize == IR::OpSize::i16Bit || ElementSize == IR::OpSize::i32Bit || ElementSize == IR::OpSize::i64Bit, "Invalid" + " size"); + const auto SubRegSize = ARMEmitter::ToVectorSizePair(ElementSize == IR::OpSize::i16Bit ? ARMEmitter::SubRegSize::i16Bit : + ElementSize == IR::OpSize::i32Bit ? ARMEmitter::SubRegSize::i32Bit : + ARMEmitter::SubRegSize::i64Bit); constexpr auto Predicate = ARMEmitter::PReg::p0; bool DstOverlapsVector2 = false; @@ -532,7 +536,7 @@ DEF_OP(VFRSqrtScalarInsert) { ScalarEmit, ScalarEmitRPRES, }; - const auto HandlerIndex = ElementSize == 4 && HostSupportsRPRES ? 1 : 0; + const auto HandlerIndex = ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES ? 1 : 0; // Bit of a tricky detail. // The upper bits of the destination comes from the first source. @@ -564,7 +568,7 @@ DEF_OP(VFRecpScalarInsert) { ScalarEmit, ScalarEmitRPRES, }; - const auto HandlerIndex = ElementSize == 4 && HostSupportsRPRES ? 1 : 0; + const auto HandlerIndex = ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES ? 1 : 0; // Bit of a tricky detail. // The upper bits of the destination comes from the first source. 
@@ -578,7 +582,7 @@ DEF_OP(VFRecpScalarInsert) { DEF_OP(VFToFScalarInsert) { const auto Op = IROp->C(); const auto ElementSize = Op->Header.ElementSize; - const uint16_t Conv = (Op->Header.ElementSize << 8) | IR::OpSizeToSize(Op->SrcElementSize); + const uint16_t Conv = (IR::OpSizeToSize(Op->Header.ElementSize) << 8) | IR::OpSizeToSize(Op->SrcElementSize); auto ScalarEmit = [this, Conv](ARMEmitter::VRegister Dst, std::variant SrcVar) { auto Src = *std::get_if(&SrcVar); @@ -626,14 +630,14 @@ DEF_OP(VSToFVectorInsert) { const auto ElementSize = Op->Header.ElementSize; const auto HasTwoElements = Op->HasTwoElements; - LOGMAN_THROW_AA_FMT(ElementSize == 4 || ElementSize == 8, "Invalid size"); + LOGMAN_THROW_AA_FMT(ElementSize == IR::OpSize::i32Bit || ElementSize == IR::OpSize::i64Bit, "Invalid size"); if (HasTwoElements) { - LOGMAN_THROW_AA_FMT(ElementSize == 4, "Can't have two elements for 8-byte size"); + LOGMAN_THROW_AA_FMT(ElementSize == IR::OpSize::i32Bit, "Can't have two elements for 8-byte size"); } auto ScalarEmit = [this, ElementSize, HasTwoElements](ARMEmitter::VRegister Dst, std::variant SrcVar) { auto Src = *std::get_if(&SrcVar); - if (ElementSize == 4) { + if (ElementSize == IR::OpSize::i32Bit) { if (HasTwoElements) { scvtf(ARMEmitter::SubRegSize::i32Bit, Dst.D(), Src.D()); } else { @@ -659,7 +663,7 @@ DEF_OP(VSToFVectorInsert) { } // Dealing with the odd case of this being actually a vector operation rather than scalar. - const auto Is256Bit = IROp->Size == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = IROp->Size == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); constexpr auto Predicate = ARMEmitter::PReg::p0; @@ -681,8 +685,8 @@ DEF_OP(VSToFVectorInsert) { DEF_OP(VSToFGPRInsert) { const auto Op = IROp->C(); - const uint16_t ElementSize = Op->Header.ElementSize; - const uint16_t Conv = (ElementSize << 8) | IR::OpSizeToSize(Op->SrcElementSize); + const auto ElementSize = Op->Header.ElementSize; + const uint16_t Conv = (IR::OpSizeToSize(ElementSize) << 8) | IR::OpSizeToSize(Op->SrcElementSize); auto ScalarEmit = [this, Conv](ARMEmitter::VRegister Dst, std::variant SrcVar) { auto Src = *std::get_if(&SrcVar); @@ -759,7 +763,7 @@ DEF_OP(VFCMPScalarInsert) { const auto SubRegSize = ConvertSubRegSizePair248(IROp); const auto ZeroUpperBits = Op->ZeroUpperBits; - const auto Is256Bit = IROp->Size == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = IROp->Size == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); auto ScalarEmitEQ = [this, SubRegSize](ARMEmitter::VRegister Dst, ARMEmitter::VRegister Src1, ARMEmitter::VRegister Src2) { @@ -917,7 +921,7 @@ DEF_OP(VectorImm) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize8(IROp); @@ -926,7 +930,7 @@ DEF_OP(VectorImm) { if (HostSupportsSVE256 && Is256Bit) { LOGMAN_THROW_A_FMT(Op->ShiftAmount == 0, "SVE VectorImm doesn't support a shift"); - if (ElementSize > 1 && (Op->Immediate & 0x80)) { + if (ElementSize > IR::OpSize::i8Bit && 
(Op->Immediate & 0x80)) { // SVE dup uses sign extension where VectorImm wants zext LoadConstant(ARMEmitter::Size::i64Bit, TMP1, Op->Immediate); dup(SubRegSize, Dst.Z(), TMP1); @@ -934,7 +938,7 @@ DEF_OP(VectorImm) { dup_imm(SubRegSize, Dst.Z(), static_cast(Op->Immediate)); } } else { - if (ElementSize == 8) { + if (ElementSize == IR::OpSize::i64Bit) { // movi with 64bit element size doesn't do what we want here LoadConstant(ARMEmitter::Size::i64Bit, TMP1, static_cast(Op->Immediate) << Op->ShiftAmount); dup(SubRegSize, Dst.Q(), TMP1.R()); @@ -965,11 +969,12 @@ DEF_OP(LoadNamedVectorConstant) { } } // Load the pointer. - auto GenerateMemOperand = [this](uint8_t OpSize, uint32_t NamedConstant, ARMEmitter::Register Base) { + auto GenerateMemOperand = [this](IR::OpSize OpSize, uint32_t NamedConstant, ARMEmitter::Register Base) { const auto ConstantOffset = offsetof(FEXCore::Core::CpuStateFrame, Pointers.Common.NamedVectorConstants[NamedConstant]); if (ConstantOffset <= 255 || // Unscaled 9-bit signed - ((ConstantOffset & (OpSize - 1)) == 0 && FEXCore::DividePow2(ConstantOffset, OpSize) <= 4095)) /* 12-bit unsigned scaled */ { + ((ConstantOffset & (IR::OpSizeToSize(OpSize) - 1)) == 0 && + FEXCore::DividePow2(ConstantOffset, IR::OpSizeToSize(OpSize)) <= 4095)) /* 12-bit unsigned scaled */ { return ARMEmitter::ExtendedMemOperand(Base.X(), ARMEmitter::IndexType::OFFSET, ConstantOffset); } @@ -977,7 +982,7 @@ DEF_OP(LoadNamedVectorConstant) { return ARMEmitter::ExtendedMemOperand(TMP1, ARMEmitter::IndexType::OFFSET, 0); }; - if (OpSize == 32) { + if (OpSize == IR::OpSize::i256Bit) { // Handle SVE 32-byte variant upfront. ldr(TMP1, STATE_PTR(CpuStateFrame, Pointers.Common.NamedVectorConstantPointers[Op->Constant])); ld1b(Dst.Z(), PRED_TMP_32B.Zeroing(), TMP1, 0); @@ -986,11 +991,11 @@ DEF_OP(LoadNamedVectorConstant) { auto MemOperand = GenerateMemOperand(OpSize, Op->Constant, STATE); switch (OpSize) { - case 1: ldrb(Dst, MemOperand); break; - case 2: ldrh(Dst, MemOperand); break; - case 4: ldr(Dst.S(), MemOperand); break; - case 8: ldr(Dst.D(), MemOperand); break; - case 16: ldr(Dst.Q(), MemOperand); break; + case IR::OpSize::i8Bit: ldrb(Dst, MemOperand); break; + case IR::OpSize::i16Bit: ldrh(Dst, MemOperand); break; + case IR::OpSize::i32Bit: ldr(Dst.S(), MemOperand); break; + case IR::OpSize::i64Bit: ldr(Dst.D(), MemOperand); break; + case IR::OpSize::i128Bit: ldr(Dst.Q(), MemOperand); break; default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, OpSize); break; } } @@ -1004,12 +1009,12 @@ DEF_OP(LoadNamedVectorIndexedConstant) { ldr(TMP1, STATE_PTR(CpuStateFrame, Pointers.Common.IndexedNamedVectorConstantPointers[Op->Constant])); switch (OpSize) { - case 1: ldrb(Dst, TMP1, Op->Index); break; - case 2: ldrh(Dst, TMP1, Op->Index); break; - case 4: ldr(Dst.S(), TMP1, Op->Index); break; - case 8: ldr(Dst.D(), TMP1, Op->Index); break; - case 16: ldr(Dst.Q(), TMP1, Op->Index); break; - case 32: { + case IR::OpSize::i8Bit: ldrb(Dst, TMP1, Op->Index); break; + case IR::OpSize::i16Bit: ldrh(Dst, TMP1, Op->Index); break; + case IR::OpSize::i32Bit: ldr(Dst.S(), TMP1, Op->Index); break; + case IR::OpSize::i64Bit: ldr(Dst.D(), TMP1, Op->Index); break; + case IR::OpSize::i128Bit: ldr(Dst.Q(), TMP1, Op->Index); break; + case IR::OpSize::i256Bit: { add(ARMEmitter::Size::i64Bit, TMP1, TMP1, Op->Index); ld1b(Dst.Z(), PRED_TMP_32B.Zeroing(), TMP1, 0); break; @@ -1026,35 +1031,35 @@ DEF_OP(VMov) { const auto Source = GetVReg(Op->Source.ID()); switch (OpSize) { - case 1: { + case IR::OpSize::i8Bit: { 
movi(ARMEmitter::SubRegSize::i64Bit, VTMP1.Q(), 0); ins(ARMEmitter::SubRegSize::i8Bit, VTMP1, 0, Source, 0); mov(Dst.Q(), VTMP1.Q()); break; } - case 2: { + case IR::OpSize::i16Bit: { movi(ARMEmitter::SubRegSize::i64Bit, VTMP1.Q(), 0); ins(ARMEmitter::SubRegSize::i16Bit, VTMP1, 0, Source, 0); mov(Dst.Q(), VTMP1.Q()); break; } - case 4: { + case IR::OpSize::i32Bit: { movi(ARMEmitter::SubRegSize::i64Bit, VTMP1.Q(), 0); ins(ARMEmitter::SubRegSize::i32Bit, VTMP1, 0, Source, 0); mov(Dst.Q(), VTMP1.Q()); break; } - case 8: { + case IR::OpSize::i64Bit: { mov(Dst.D(), Source.D()); break; } - case 16: { + case IR::OpSize::i128Bit: { if (HostSupportsSVE256 || Dst.Idx() != Source.Idx()) { mov(Dst.Q(), Source.Q()); } break; } - case 32: { + case IR::OpSize::i256Bit: { // NOTE: If, in the distant future we support larger moves, or registers // (*cough* AVX-512 *cough*) make sure to change this to treat // 256-bit moves with zero extending behavior instead of doing only @@ -1071,8 +1076,8 @@ DEF_OP(VMov) { DEF_OP(VAddP) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; - const auto IsScalar = OpSize == 8; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto IsScalar = OpSize == IR::OpSize::i64Bit; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto SubRegSize = ConvertSubRegSize8(IROp); @@ -1108,7 +1113,7 @@ DEF_OP(VAddP) { DEF_OP(VFAddV) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto ElementSize = Op->Header.ElementSize; @@ -1117,8 +1122,8 @@ DEF_OP(VFAddV) { const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector.ID()); - LOGMAN_THROW_AA_FMT(OpSize == Core::CPUState::XMM_SSE_REG_SIZE || OpSize == Core::CPUState::XMM_AVX_REG_SIZE, "Only AVX and SSE size " - "supported"); + LOGMAN_THROW_AA_FMT(OpSize == IR::OpSize::i128Bit || OpSize == IR::OpSize::i256Bit, "Only AVX and SSE size " + "supported"); if (HostSupportsSVE256 && Is256Bit) { const auto Pred = PRED_TMP_32B.Merging(); faddv(SubRegSize.Vector, Dst, Pred, Vector.Z()); @@ -1128,7 +1133,7 @@ DEF_OP(VFAddV) { faddv(SubRegSize.Vector, Dst, Pred, Vector.Z()); } else { // ASIMD doesn't support faddv, need to use multiple faddp to match behaviour. 
- if (ElementSize == 4) { + if (ElementSize == IR::OpSize::i32Bit) { faddp(SubRegSize.Vector, Dst.Q(), Vector.Q(), Vector.Q()); faddp(SubRegSize.Scalar, Dst, Dst); } else { @@ -1143,7 +1148,7 @@ DEF_OP(VAddV) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair8(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -1165,7 +1170,7 @@ DEF_OP(VAddV) { addv(SubRegSize.Vector, VTMP1.Q(), VTMP1.Q()); add(SubRegSize.Vector, Dst.Q(), VTMP1.Q(), VTMP2.Q()); } else { - if (ElementSize == 8) { + if (ElementSize == IR::OpSize::i64Bit) { addp(SubRegSize.Scalar, Dst, Vector); } else { addv(SubRegSize.Vector, Dst.Q(), Vector.Q()); @@ -1177,7 +1182,7 @@ DEF_OP(VUMinV) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto SubRegSize = ConvertSubRegSize8(IROp); @@ -1198,7 +1203,7 @@ DEF_OP(VUMaxV) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto SubRegSize = ConvertSubRegSize8(IROp); @@ -1220,7 +1225,7 @@ DEF_OP(VURAvg) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize8(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -1252,7 +1257,7 @@ DEF_OP(VFAddP) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto SubRegSize = ConvertSubRegSize248(IROp); @@ -1289,7 +1294,7 @@ DEF_OP(VFDiv) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize248(IROp); const auto IsScalar = ElementSize == OpSize; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -1318,15 +1323,15 @@ DEF_OP(VFDiv) { } else { if (IsScalar) { switch (ElementSize) { - case 2: { + case IR::OpSize::i16Bit: { fdiv(Dst.H(), Vector1.H(), Vector2.H()); break; } - case 4: { + case IR::OpSize::i32Bit: { fdiv(Dst.S(), Vector1.S(), Vector2.S()); break; } - case 8: { + case IR::OpSize::i64Bit: { fdiv(Dst.D(), Vector1.D(), Vector2.D()); break; } @@ -1345,7 +1350,7 @@ DEF_OP(VFMin) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = 
ConvertSubRegSize248(IROp); const auto IsScalar = ElementSize == OpSize; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -1411,7 +1416,7 @@ DEF_OP(VFMax) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize248(IROp); const auto IsScalar = ElementSize == OpSize; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -1463,7 +1468,7 @@ DEF_OP(VFRecp) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair16(IROp); const auto IsScalar = Op->Header.ElementSize == OpSize; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -1472,7 +1477,7 @@ DEF_OP(VFRecp) { if (HostSupportsSVE256 && Is256Bit) { const auto Pred = PRED_TMP_32B.Merging(); - if (ElementSize == 4 && HostSupportsRPRES) { + if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) { // RPRES gives enough precision for this. frecpe(SubRegSize.Vector, Dst.Z(), Vector.Z()); return; @@ -1483,7 +1488,7 @@ DEF_OP(VFRecp) { mov(Dst.Z(), VTMP1.Z()); } else { if (IsScalar) { - if (ElementSize == 4 && HostSupportsRPRES) { + if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) { // RPRES gives enough precision for this. frecpe(SubRegSize.Scalar, Dst.S(), Vector.S()); return; @@ -1491,24 +1496,24 @@ DEF_OP(VFRecp) { fmov(SubRegSize.Scalar, VTMP1.Q(), 1.0f); switch (ElementSize) { - case 2: { + case IR::OpSize::i16Bit: { fdiv(Dst.H(), VTMP1.H(), Vector.H()); break; } - case 4: { + case IR::OpSize::i32Bit: { fdiv(Dst.S(), VTMP1.S(), Vector.S()); break; } - case 8: { + case IR::OpSize::i64Bit: { fdiv(Dst.D(), VTMP1.D(), Vector.D()); break; } default: break; } } else { - if (ElementSize == 4 && HostSupportsRPRES) { + if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) { // RPRES gives enough precision for this. - if (OpSize == 8) { + if (OpSize == IR::OpSize::i64Bit) { frecpe(SubRegSize.Vector, Dst.D(), Vector.D()); } else { frecpe(SubRegSize.Vector, Dst.Q(), Vector.Q()); @@ -1529,7 +1534,7 @@ DEF_OP(VFRSqrt) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair16(IROp); const auto IsScalar = ElementSize == OpSize; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -1537,7 +1542,7 @@ DEF_OP(VFRSqrt) { if (HostSupportsSVE256 && Is256Bit) { const auto Pred = PRED_TMP_32B.Merging(); - if (ElementSize == 4 && HostSupportsRPRES) { + if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) { // RPRES gives enough precision for this. 
frsqrte(SubRegSize.Vector, Dst.Z(), Vector.Z()); return; @@ -1548,7 +1553,7 @@ DEF_OP(VFRSqrt) { fdiv(SubRegSize.Vector, Dst.Z(), Pred, Dst.Z(), VTMP1.Z()); } else { if (IsScalar) { - if (ElementSize == 4 && HostSupportsRPRES) { + if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) { // RPRES gives enough precision for this. frsqrte(SubRegSize.Scalar, Dst.S(), Vector.S()); return; @@ -1556,17 +1561,17 @@ DEF_OP(VFRSqrt) { fmov(SubRegSize.Scalar, VTMP1.Q(), 1.0); switch (ElementSize) { - case 2: { + case IR::OpSize::i16Bit: { fsqrt(VTMP2.H(), Vector.H()); fdiv(Dst.H(), VTMP1.H(), VTMP2.H()); break; } - case 4: { + case IR::OpSize::i32Bit: { fsqrt(VTMP2.S(), Vector.S()); fdiv(Dst.S(), VTMP1.S(), VTMP2.S()); break; } - case 8: { + case IR::OpSize::i64Bit: { fsqrt(VTMP2.D(), Vector.D()); fdiv(Dst.D(), VTMP1.D(), VTMP2.D()); break; @@ -1574,9 +1579,9 @@ DEF_OP(VFRSqrt) { default: break; } } else { - if (ElementSize == 4 && HostSupportsRPRES) { + if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) { // RPRES gives enough precision for this. - if (OpSize == 8) { + if (OpSize == IR::OpSize::i64Bit) { frsqrte(SubRegSize.Vector, Dst.D(), Vector.D()); } else { frsqrte(SubRegSize.Vector, Dst.Q(), Vector.Q()); @@ -1594,7 +1599,7 @@ DEF_OP(VFRSqrt) { DEF_OP(VNot) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -1613,7 +1618,7 @@ DEF_OP(VUMin) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize16(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -1638,13 +1643,13 @@ DEF_OP(VUMin) { } } else { switch (ElementSize) { - case 1: - case 2: - case 4: { + case IR::OpSize::i8Bit: + case IR::OpSize::i16Bit: + case IR::OpSize::i32Bit: { umin(SubRegSize, Dst.Q(), Vector1.Q(), Vector2.Q()); break; } - case 8: { + case IR::OpSize::i64Bit: { cmhi(SubRegSize, VTMP1.Q(), Vector2.Q(), Vector1.Q()); mov(VTMP2.Q(), Vector1.Q()); bif(VTMP2.Q(), Vector2.Q(), VTMP1.Q()); @@ -1662,7 +1667,7 @@ DEF_OP(VSMin) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize16(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -1687,13 +1692,13 @@ DEF_OP(VSMin) { } } else { switch (ElementSize) { - case 1: - case 2: - case 4: { + case IR::OpSize::i8Bit: + case IR::OpSize::i16Bit: + case IR::OpSize::i32Bit: { smin(SubRegSize, Dst.Q(), Vector1.Q(), Vector2.Q()); break; } - case 8: { + case IR::OpSize::i64Bit: { cmgt(SubRegSize, VTMP1.Q(), Vector1.Q(), Vector2.Q()); mov(VTMP2.Q(), Vector1.Q()); bif(VTMP2.Q(), Vector2.Q(), VTMP1.Q()); @@ -1711,7 +1716,7 @@ DEF_OP(VUMax) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize16(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + 
const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -1736,13 +1741,13 @@ DEF_OP(VUMax) { } } else { switch (ElementSize) { - case 1: - case 2: - case 4: { + case IR::OpSize::i8Bit: + case IR::OpSize::i16Bit: + case IR::OpSize::i32Bit: { umax(SubRegSize, Dst.Q(), Vector1.Q(), Vector2.Q()); break; } - case 8: { + case IR::OpSize::i64Bit: { cmhi(SubRegSize, VTMP1.Q(), Vector2.Q(), Vector1.Q()); mov(VTMP2.Q(), Vector1.Q()); bif(VTMP2.Q(), Vector2.Q(), VTMP1.Q()); @@ -1760,7 +1765,7 @@ DEF_OP(VSMax) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize16(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -1785,13 +1790,13 @@ DEF_OP(VSMax) { } } else { switch (ElementSize) { - case 1: - case 2: - case 4: { + case IR::OpSize::i8Bit: + case IR::OpSize::i16Bit: + case IR::OpSize::i32Bit: { smax(SubRegSize, Dst.Q(), Vector1.Q(), Vector2.Q()); break; } - case 8: { + case IR::OpSize::i64Bit: { cmgt(SubRegSize, VTMP1.Q(), Vector2.Q(), Vector1.Q()); mov(VTMP2.Q(), Vector1.Q()); bif(VTMP2.Q(), Vector2.Q(), VTMP1.Q()); @@ -1806,10 +1811,10 @@ DEF_OP(VSMax) { DEF_OP(VBSL) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); - const auto Is128Bit = OpSize == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = OpSize == IR::OpSize::i128Bit; const auto Dst = GetVReg(Node); const auto VectorFalse = GetVReg(Op->VectorFalse.ID()); @@ -1838,28 +1843,28 @@ DEF_OP(VBSL) { } else { if (VectorMask == Dst) { // Can use BSL without any moves. - if (OpSize == 8) { + if (OpSize == IR::OpSize::i64Bit) { bsl(Dst.D(), VectorTrue.D(), VectorFalse.D()); } else { bsl(Dst.Q(), VectorTrue.Q(), VectorFalse.Q()); } } else if (VectorTrue == Dst) { // Can use BIF without any moves. - if (OpSize == 8) { + if (OpSize == IR::OpSize::i64Bit) { bif(Dst.D(), VectorFalse.D(), VectorMask.D()); } else { bif(Dst.Q(), VectorFalse.Q(), VectorMask.Q()); } } else if (VectorFalse == Dst) { // Can use BIT without any moves. - if (OpSize == 8) { + if (OpSize == IR::OpSize::i64Bit) { bit(Dst.D(), VectorTrue.D(), VectorMask.D()); } else { bit(Dst.Q(), VectorTrue.Q(), VectorMask.Q()); } } else { // Needs moves. 
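// For reference (standard AArch64 ASIMD semantics, not introduced by this change):
// BSL keeps the selector in Dst (Dst = (Dst & True) | (~Dst & False)), BIF inserts its
// first source where the mask bit is 0, and BIT inserts it where the mask bit is 1.
// That is why the aliasing checks above pick whichever variant already has the right
// operand sitting in Dst; here nothing aliases, so seed Dst with the mask and use BSL.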
- if (OpSize == 8) { + if (OpSize == IR::OpSize::i64Bit) { mov(Dst.D(), VectorMask.D()); bsl(Dst.D(), VectorTrue.D(), VectorFalse.D()); } else { @@ -1877,7 +1882,7 @@ DEF_OP(VCMPEQ) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair16(IROp); const auto IsScalar = ElementSize == OpSize; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -1917,7 +1922,7 @@ DEF_OP(VCMPEQZ) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair16(IROp); const auto IsScalar = ElementSize == OpSize; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -1959,7 +1964,7 @@ DEF_OP(VCMPGT) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair16(IROp); const auto IsScalar = ElementSize == OpSize; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -1999,7 +2004,7 @@ DEF_OP(VCMPGTZ) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair16(IROp); const auto IsScalar = ElementSize == OpSize; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -2038,7 +2043,7 @@ DEF_OP(VCMPLTZ) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair16(IROp); const auto IsScalar = ElementSize == OpSize; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -2077,7 +2082,7 @@ DEF_OP(VFCMPEQ) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair248(IROp); const auto IsScalar = ElementSize == OpSize; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -2095,12 +2100,12 @@ DEF_OP(VFCMPEQ) { } else { if (IsScalar) { switch (ElementSize) { - case 2: { + case IR::OpSize::i16Bit: { fcmeq(Dst.H(), Vector1.H(), Vector2.H()); break; } - case 4: - case 8: fcmeq(SubRegSize.Scalar, Dst, Vector1, Vector2); break; + case IR::OpSize::i32Bit: + case IR::OpSize::i64Bit: fcmeq(SubRegSize.Scalar, Dst, Vector1, Vector2); break; default: break; } } else { @@ -2116,7 +2121,7 @@ DEF_OP(VFCMPNEQ) { const auto ElementSize = Op->Header.ElementSize; const 
auto SubRegSize = ConvertSubRegSizePair248(IROp); const auto IsScalar = ElementSize == OpSize; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -2134,12 +2139,12 @@ DEF_OP(VFCMPNEQ) { } else { if (IsScalar) { switch (ElementSize) { - case 2: { + case IR::OpSize::i16Bit: { fcmeq(Dst.H(), Vector1.H(), Vector2.H()); break; } - case 4: - case 8: fcmeq(SubRegSize.Scalar, Dst, Vector1, Vector2); break; + case IR::OpSize::i32Bit: + case IR::OpSize::i64Bit: fcmeq(SubRegSize.Scalar, Dst, Vector1, Vector2); break; default: break; } mvn(ARMEmitter::SubRegSize::i8Bit, Dst.D(), Dst.D()); @@ -2157,7 +2162,7 @@ DEF_OP(VFCMPLT) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair248(IROp); const auto IsScalar = ElementSize == OpSize; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -2175,12 +2180,12 @@ DEF_OP(VFCMPLT) { } else { if (IsScalar) { switch (ElementSize) { - case 2: { + case IR::OpSize::i16Bit: { fcmgt(Dst.H(), Vector2.H(), Vector1.H()); break; } - case 4: - case 8: fcmgt(SubRegSize.Scalar, Dst, Vector2, Vector1); break; + case IR::OpSize::i32Bit: + case IR::OpSize::i64Bit: fcmgt(SubRegSize.Scalar, Dst, Vector2, Vector1); break; default: break; } } else { @@ -2196,7 +2201,7 @@ DEF_OP(VFCMPGT) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair248(IROp); const auto IsScalar = ElementSize == OpSize; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -2214,12 +2219,12 @@ DEF_OP(VFCMPGT) { } else { if (IsScalar) { switch (ElementSize) { - case 2: { + case IR::OpSize::i16Bit: { fcmgt(Dst.H(), Vector1.H(), Vector2.H()); break; } - case 4: - case 8: fcmgt(SubRegSize.Scalar, Dst, Vector1, Vector2); break; + case IR::OpSize::i32Bit: + case IR::OpSize::i64Bit: fcmgt(SubRegSize.Scalar, Dst, Vector1, Vector2); break; default: break; } } else { @@ -2235,7 +2240,7 @@ DEF_OP(VFCMPLE) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair248(IROp); const auto IsScalar = ElementSize == OpSize; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -2253,12 +2258,12 @@ DEF_OP(VFCMPLE) { } else { if (IsScalar) { switch (ElementSize) { - case 2: { + case IR::OpSize::i16Bit: { fcmge(Dst.H(), Vector2.H(), Vector1.H()); break; } - case 4: - case 8: fcmge(SubRegSize.Scalar, Dst, Vector2, Vector1); break; + case IR::OpSize::i32Bit: + case IR::OpSize::i64Bit: fcmge(SubRegSize.Scalar, Dst, Vector2, Vector1); break; default: break; } } else { @@ -2274,7 +2279,7 @@ DEF_OP(VFCMPORD) { const auto ElementSize = 
Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair248(IROp); const auto IsScalar = ElementSize == OpSize; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -2296,14 +2301,14 @@ DEF_OP(VFCMPORD) { } else { if (IsScalar) { switch (ElementSize) { - case 2: { + case IR::OpSize::i16Bit: { fcmge(VTMP1.H(), Vector1.H(), Vector2.H()); fcmgt(VTMP2.H(), Vector2.H(), Vector1.H()); orr(Dst.D(), VTMP1.D(), VTMP2.D()); break; } - case 4: - case 8: + case IR::OpSize::i32Bit: + case IR::OpSize::i64Bit: fcmge(SubRegSize.Scalar, VTMP1, Vector1, Vector2); fcmgt(SubRegSize.Scalar, VTMP2, Vector2, Vector1); orr(Dst.D(), VTMP1.D(), VTMP2.D()); @@ -2325,7 +2330,7 @@ DEF_OP(VFCMPUNO) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair248(IROp); const auto IsScalar = ElementSize == OpSize; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -2343,15 +2348,15 @@ DEF_OP(VFCMPUNO) { } else { if (IsScalar) { switch (ElementSize) { - case 2: { + case IR::OpSize::i16Bit: { fcmge(VTMP1.H(), Vector1.H(), Vector2.H()); fcmgt(VTMP2.H(), Vector2.H(), Vector1.H()); orr(Dst.D(), VTMP1.D(), VTMP2.D()); mvn(ARMEmitter::SubRegSize::i8Bit, Dst.D(), Dst.D()); break; } - case 4: - case 8: + case IR::OpSize::i32Bit: + case IR::OpSize::i64Bit: fcmge(SubRegSize.Scalar, VTMP1, Vector1, Vector2); fcmgt(SubRegSize.Scalar, VTMP2, Vector2, Vector1); orr(Dst.D(), VTMP1.D(), VTMP2.D()); @@ -2374,10 +2379,10 @@ DEF_OP(VUShl) { const auto ElementSize = IROp->ElementSize; const auto SubRegSize = ConvertSubRegSize8(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); - const auto MaxShift = ElementSize * 8; + const auto MaxShift = IR::OpSizeAsBits(ElementSize); const auto Dst = GetVReg(Node); auto ShiftVector = GetVReg(Op->ShiftVector.ID()); @@ -2406,7 +2411,7 @@ DEF_OP(VUShl) { lsl(SubRegSize, Dst.Z(), Mask, Dst.Z(), ShiftVector.Z()); } else { if (RangeCheck) { - if (ElementSize < 8) { + if (ElementSize < IR::OpSize::i64Bit) { movi(SubRegSize, VTMP1.Q(), MaxShift); umin(SubRegSize, VTMP1.Q(), VTMP1.Q(), ShiftVector.Q()); } else { @@ -2430,10 +2435,10 @@ DEF_OP(VUShr) { const auto ElementSize = IROp->ElementSize; const auto SubRegSize = ConvertSubRegSize8(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); - const auto MaxShift = ElementSize * 8; + const auto MaxShift = IR::OpSizeAsBits(ElementSize); const auto Dst = GetVReg(Node); auto ShiftVector = GetVReg(Op->ShiftVector.ID()); @@ -2462,7 +2467,7 @@ DEF_OP(VUShr) { lsr(SubRegSize, Dst.Z(), Mask, Dst.Z(), ShiftVector.Z()); } else { if (RangeCheck) { - if (ElementSize < 8) { + if (ElementSize < IR::OpSize::i64Bit) 
{ movi(SubRegSize, VTMP1.Q(), MaxShift); umin(SubRegSize, VTMP1.Q(), VTMP1.Q(), ShiftVector.Q()); } else { @@ -2489,10 +2494,10 @@ DEF_OP(VSShr) { const auto ElementSize = IROp->ElementSize; const auto SubRegSize = ConvertSubRegSize8(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); - const auto MaxShift = (ElementSize * 8) - 1; + const auto MaxShift = IR::OpSizeAsBits(ElementSize) - 1; const auto RangeCheck = Op->RangeCheck; const auto Dst = GetVReg(Node); @@ -2521,7 +2526,7 @@ DEF_OP(VSShr) { asr(SubRegSize, Dst.Z(), Mask, Dst.Z(), ShiftVector.Z()); } else { if (RangeCheck) { - if (ElementSize < 8) { + if (ElementSize < IR::OpSize::i64Bit) { movi(SubRegSize, VTMP1.Q(), MaxShift); umin(SubRegSize, VTMP1.Q(), VTMP1.Q(), ShiftVector.Q()); } else { @@ -2547,7 +2552,7 @@ DEF_OP(VUShlS) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize16(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -2576,7 +2581,7 @@ DEF_OP(VUShrS) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize16(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -2607,7 +2612,7 @@ DEF_OP(VUShrSWide) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize8(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -2622,7 +2627,7 @@ DEF_OP(VUShrSWide) { // NOTE: SVE LSR is a destructive operation. movprfx(Dst.Z(), Vector.Z()); } - if (ElementSize == 8) { + if (ElementSize == IR::OpSize::i64Bit) { lsr(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP1.Z()); } else { lsr_wide(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP1.Z()); @@ -2631,7 +2636,7 @@ DEF_OP(VUShrSWide) { const auto Mask = PRED_TMP_16B.Merging(); auto ShiftRegister = ShiftScalar; - if (OpSize > 8) { + if (OpSize > IR::OpSize::i64Bit) { // SVE wide shifts don't need to duplicate the low bits unless the OpSize is 16-bytes // Slightly more optimal for 8-byte opsize. dup(ARMEmitter::SubRegSize::i64Bit, VTMP1.Z(), ShiftScalar.Z(), 0); @@ -2648,7 +2653,7 @@ DEF_OP(VUShrSWide) { // NOTE: SVE LSR is a destructive operation. 
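// movprfx supplies the constructive form: it seeds Dst from Vector immediately before the
// destructive shift, so the prefixed lsr/lsr_wide computes Dst = Vector >> shift without
// clobbering the source register.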
movprfx(Dst.Z(), Vector.Z()); } - if (ElementSize == 8) { + if (ElementSize == IR::OpSize::i64Bit) { lsr(SubRegSize, Dst.Z(), Mask, Dst.Z(), ShiftRegister.Z()); } else { lsr_wide(SubRegSize, Dst.Z(), Mask, Dst.Z(), ShiftRegister.Z()); @@ -2673,7 +2678,7 @@ DEF_OP(VSShrSWide) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize8(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -2688,7 +2693,7 @@ DEF_OP(VSShrSWide) { // NOTE: SVE LSR is a destructive operation. movprfx(Dst.Z(), Vector.Z()); } - if (ElementSize == 8) { + if (ElementSize == IR::OpSize::i64Bit) { asr(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP1.Z()); } else { asr_wide(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP1.Z()); @@ -2697,7 +2702,7 @@ DEF_OP(VSShrSWide) { const auto Mask = PRED_TMP_16B.Merging(); auto ShiftRegister = ShiftScalar; - if (OpSize > 8) { + if (OpSize > IR::OpSize::i64Bit) { // SVE wide shifts don't need to duplicate the low bits unless the OpSize is 16-bytes // Slightly more optimal for 8-byte opsize. dup(ARMEmitter::SubRegSize::i64Bit, VTMP1.Z(), ShiftScalar.Z(), 0); @@ -2714,7 +2719,7 @@ DEF_OP(VSShrSWide) { // NOTE: SVE LSR is a destructive operation. movprfx(Dst.Z(), Vector.Z()); } - if (ElementSize == 8) { + if (ElementSize == IR::OpSize::i64Bit) { asr(SubRegSize, Dst.Z(), Mask, Dst.Z(), ShiftRegister.Z()); } else { asr_wide(SubRegSize, Dst.Z(), Mask, Dst.Z(), ShiftRegister.Z()); @@ -2739,7 +2744,7 @@ DEF_OP(VUShlSWide) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize8(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -2754,7 +2759,7 @@ DEF_OP(VUShlSWide) { // NOTE: SVE LSR is a destructive operation. movprfx(Dst.Z(), Vector.Z()); } - if (ElementSize == 8) { + if (ElementSize == IR::OpSize::i64Bit) { lsl(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP1.Z()); } else { lsl_wide(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP1.Z()); @@ -2763,7 +2768,7 @@ DEF_OP(VUShlSWide) { const auto Mask = PRED_TMP_16B.Merging(); auto ShiftRegister = ShiftScalar; - if (OpSize > 8) { + if (OpSize > IR::OpSize::i64Bit) { // SVE wide shifts don't need to duplicate the low bits unless the OpSize is 16-bytes // Slightly more optimal for 8-byte opsize. dup(ARMEmitter::SubRegSize::i64Bit, VTMP1.Z(), ShiftScalar.Z(), 0); @@ -2780,7 +2785,7 @@ DEF_OP(VUShlSWide) { // NOTE: SVE LSR is a destructive operation. 
movprfx(Dst.Z(), Vector.Z()); } - if (ElementSize == 8) { + if (ElementSize == IR::OpSize::i64Bit) { lsl(SubRegSize, Dst.Z(), Mask, Dst.Z(), ShiftRegister.Z()); } else { lsl_wide(SubRegSize, Dst.Z(), Mask, Dst.Z(), ShiftRegister.Z()); @@ -2803,7 +2808,7 @@ DEF_OP(VSShrS) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize16(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -2831,10 +2836,10 @@ DEF_OP(VSShrS) { DEF_OP(VInsElement) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); - const uint32_t ElementSize = Op->Header.ElementSize; + const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize16(IROp); const uint32_t DestIdx = Op->DestIdx; @@ -2858,7 +2863,7 @@ DEF_OP(VInsElement) { constexpr auto Predicate = ARMEmitter::PReg::p0; - if (ElementSize == 16) { + if (ElementSize == IR::OpSize::i128Bit) { if (DestIdx == 0) { mov(ARMEmitter::SubRegSize::i8Bit, Dst.Z(), PRED_TMP_16B.Merging(), VTMP2.Z()); } else { @@ -2866,7 +2871,7 @@ DEF_OP(VInsElement) { mov(ARMEmitter::SubRegSize::i8Bit, Dst.Z(), Predicate.Merging(), VTMP2.Z()); } } else { - const auto UpperBound = 16 >> FEXCore::ilog2(ElementSize); + const auto UpperBound = 16 >> FEXCore::ilog2(IR::OpSizeToSize(ElementSize)); const auto TargetElement = static_cast(DestIdx) - UpperBound; // FIXME: We should rework this op to avoid the NZCV spill/fill dance. 
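The hunks above and below lean on the strongly typed IR::OpSize enum and its conversion helpers rather than raw byte counts. Their exact definitions are not shown in the hunks here; the following is a minimal sketch of the assumed shape, with the enumerator and helper names taken from the call sites in this diff and the bodies purely illustrative:

#include <cstdint>

namespace FEXCore::IR {
  // Assumed encoding: each enumerator's value is its width in bytes, so the relational
  // comparisons used above (e.g. OpSize < IR::OpSize::i32Bit) order operands by width.
  enum class OpSize : uint8_t {
    i8Bit = 1,
    i16Bit = 2,
    i32Bit = 4,
    i64Bit = 8,
    i128Bit = 16,
    i256Bit = 32,
    iInvalid = 0xFF,
  };

  // Back to a plain byte count, e.g. OpSizeToSize(OpSize::i128Bit) == 16.
  constexpr uint32_t OpSizeToSize(OpSize Size) {
    return static_cast<uint32_t>(Size);
  }

  // Width in bits, e.g. OpSizeAsBits(OpSize::i32Bit) == 32.
  constexpr uint32_t OpSizeAsBits(OpSize Size) {
    return OpSizeToSize(Size) * 8u;
  }
} // namespace FEXCore::IR

Under that encoding the VInsElement hunk above still computes the per-lane element count the same way as before: 16 >> FEXCore::ilog2(IR::OpSizeToSize(ElementSize)) reduces to the old 16 >> ilog2(bytes). Expressions such as DstSize / 2 and JcxGPRSize >> 1 in the OpcodeDispatcher hunks further down assume the corresponding halving operators are defined for OpSize as well.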
@@ -2914,10 +2919,10 @@ DEF_OP(VDupElement) { const auto Index = Op->Index; const auto SubRegSize = ConvertSubRegSize16(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); - const auto Is128Bit = OpSize == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = OpSize == IR::OpSize::i128Bit; const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector.ID()); @@ -2936,7 +2941,7 @@ DEF_OP(VDupElement) { DEF_OP(VExtr) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); // AArch64 ext op has bit arrangement as [Vm:Vn] so arguments need to be swapped @@ -2947,17 +2952,17 @@ DEF_OP(VExtr) { const auto ElementSize = Op->Header.ElementSize; auto Index = Op->Index; - if (Index >= OpSize) { + if (Index >= IR::OpSizeToSize(OpSize)) { // Upper bits have moved in to the lower bits LowerBits = UpperBits; // Upper bits are all now zero UpperBits = VTMP1; movi(ARMEmitter::SubRegSize::i64Bit, VTMP1.Q(), 0); - Index -= OpSize; + Index -= IR::OpSizeToSize(OpSize); } - const auto CopyFromByte = Index * ElementSize; + const auto CopyFromByte = Index * IR::OpSizeToSize(ElementSize); if (HostSupportsSVE256 && Is256Bit) { if (Dst == LowerBits) { @@ -2973,7 +2978,7 @@ DEF_OP(VExtr) { ext(Dst.Z(), Dst.Z(), UpperBits.Z(), CopyFromByte); } } else { - if (OpSize == 8) { + if (OpSize == IR::OpSize::i64Bit) { ext(Dst.D(), LowerBits.D(), UpperBits.D(), CopyFromByte); } else { ext(Dst.Q(), LowerBits.Q(), UpperBits.Q(), CopyFromByte); @@ -2988,13 +2993,13 @@ DEF_OP(VUShrI) { const auto BitShift = Op->BitShift; const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize8(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector.ID()); - if (BitShift >= (ElementSize * 8)) { + if (BitShift >= IR::OpSizeAsBits(ElementSize)) { movi(ARMEmitter::SubRegSize::i64Bit, Dst.Q(), 0); } else { if (HostSupportsSVE256 && Is256Bit) { @@ -3030,7 +3035,7 @@ DEF_OP(VUShraI) { const auto BitShift = Op->BitShift; const auto SubRegSize = ConvertSubRegSize8(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -3072,8 +3077,8 @@ DEF_OP(VSShrI) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize8(IROp); - const auto Shift = std::min(uint8_t(ElementSize * 8 - 1), Op->BitShift); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Shift = std::min(IR::OpSizeAsBits(ElementSize) - 1, Op->BitShift); + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 
support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -3112,13 +3117,13 @@ DEF_OP(VShlI) { const auto BitShift = Op->BitShift; const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize8(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector.ID()); - if (BitShift >= (ElementSize * 8)) { + if (BitShift >= IR::OpSizeAsBits(ElementSize)) { movi(ARMEmitter::SubRegSize::i64Bit, Dst.Q(), 0); } else { if (HostSupportsSVE256 && Is256Bit) { @@ -3154,7 +3159,7 @@ DEF_OP(VUShrNI) { const auto BitShift = Op->BitShift; const auto SubRegSize = ConvertSubRegSize4(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -3174,7 +3179,7 @@ DEF_OP(VUShrNI2) { const auto BitShift = Op->BitShift; const auto SubRegSize = ConvertSubRegSize8(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -3211,7 +3216,7 @@ DEF_OP(VSXTL) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -3229,7 +3234,7 @@ DEF_OP(VSXTL2) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -3247,14 +3252,14 @@ DEF_OP(VSSHLL) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector.ID()); const auto BitShift = Op->BitShift; - LOGMAN_THROW_A_FMT(BitShift < ((IROp->ElementSize >> 1) * 8), "Bitshift size too large for source element size: {} < {}", BitShift, - (IROp->ElementSize >> 1) * 8); + LOGMAN_THROW_A_FMT(BitShift < IR::OpSizeAsBits(IROp->ElementSize / 2), "Bitshift size too large for source element size: {} < {}", + BitShift, IR::OpSizeAsBits(IROp->ElementSize / 2)); if (Is256Bit) { sunpklo(SubRegSize, Dst.Z(), Vector.Z()); @@ -3269,14 +3274,14 @@ DEF_OP(VSSHLL2) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); - const auto Is256Bit = 
OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector.ID()); const auto BitShift = Op->BitShift; - LOGMAN_THROW_A_FMT(BitShift < ((IROp->ElementSize >> 1) * 8), "Bitshift size too large for source element size: {} < {}", BitShift, - (IROp->ElementSize >> 1) * 8); + LOGMAN_THROW_A_FMT(BitShift < IR::OpSizeAsBits(IROp->ElementSize / 2), "Bitshift size too large for source element size: {} < {}", + BitShift, IR::OpSizeAsBits(IROp->ElementSize / 2)); if (Is256Bit) { sunpkhi(SubRegSize, Dst.Z(), Vector.Z()); @@ -3291,7 +3296,7 @@ DEF_OP(VUXTL) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -3309,7 +3314,7 @@ DEF_OP(VUXTL2) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -3327,7 +3332,7 @@ DEF_OP(VSQXTN) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize4(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -3379,7 +3384,7 @@ DEF_OP(VSQXTN2) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize4(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -3406,7 +3411,7 @@ DEF_OP(VSQXTN2) { } splice(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP2.Z()); } else { - if (OpSize == 8) { + if (OpSize == IR::OpSize::i64Bit) { sqxtn(SubRegSize, VTMP2, VectorUpper); mov(Dst.Q(), VectorLower.Q()); ins(ARMEmitter::SubRegSize::i32Bit, Dst, 1, VTMP2, 0); @@ -3423,7 +3428,7 @@ DEF_OP(VSQXTNPair) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize4(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -3447,7 +3452,7 @@ DEF_OP(VSQXTNPair) { // Merge. 
splice(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP2.Z()); } else { - if (OpSize == 8) { + if (OpSize == IR::OpSize::i64Bit) { zip1(ARMEmitter::SubRegSize::i64Bit, Dst.Q(), VectorLower.Q(), VectorUpper.Q()); sqxtn(SubRegSize, Dst, Dst); } else { @@ -3467,7 +3472,7 @@ DEF_OP(VSQXTUN) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize8(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -3486,7 +3491,7 @@ DEF_OP(VSQXTUN2) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize8(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -3507,7 +3512,7 @@ DEF_OP(VSQXTUN2) { } splice(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP2.Z()); } else { - if (OpSize == 8) { + if (OpSize == IR::OpSize::i64Bit) { sqxtun(SubRegSize, VTMP2, VectorUpper); mov(Dst.Q(), VectorLower.Q()); ins(ARMEmitter::SubRegSize::i32Bit, Dst, 1, VTMP2, 0); @@ -3532,7 +3537,7 @@ DEF_OP(VSQXTUNPair) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize4(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -3556,7 +3561,7 @@ DEF_OP(VSQXTUNPair) { // Merge. 
splice(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP2.Z()); } else { - if (OpSize == 8) { + if (OpSize == IR::OpSize::i64Bit) { zip1(ARMEmitter::SubRegSize::i64Bit, Dst.Q(), VectorLower.Q(), VectorUpper.Q()); sqxtun(SubRegSize, Dst, Dst); } else { @@ -3575,7 +3580,7 @@ DEF_OP(VSRSHR) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto SubRegSize = ConvertSubRegSize16(IROp); @@ -3593,7 +3598,7 @@ DEF_OP(VSRSHR) { } srshr(SubRegSize, Dst.Z(), Mask, Dst.Z(), BitShift); } else { - if (OpSize == 8) { + if (OpSize == IR::OpSize::i64Bit) { srshr(SubRegSize, Dst.D(), Vector.D(), BitShift); } else { srshr(SubRegSize, Dst.Q(), Vector.Q(), BitShift); @@ -3605,7 +3610,7 @@ DEF_OP(VSQSHL) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto SubRegSize = ConvertSubRegSize8(IROp); @@ -3623,7 +3628,7 @@ DEF_OP(VSQSHL) { } sqshl(SubRegSize, Dst.Z(), Mask, Dst.Z(), BitShift); } else { - if (OpSize == 8) { + if (OpSize == IR::OpSize::i64Bit) { sqshl(SubRegSize, Dst.D(), Vector.D(), BitShift); } else { sqshl(SubRegSize, Dst.Q(), Vector.Q(), BitShift); @@ -3635,7 +3640,7 @@ DEF_OP(VMul) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto SubRegSize = ConvertSubRegSize16(IROp); @@ -3656,7 +3661,7 @@ DEF_OP(VUMull) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -3677,7 +3682,7 @@ DEF_OP(VSMull) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -3698,7 +3703,7 @@ DEF_OP(VUMull2) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -3719,7 +3724,7 @@ DEF_OP(VSMull2) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; 
LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -3741,19 +3746,19 @@ DEF_OP(VUMulH) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize8(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); - const auto Is128Bit = OpSize == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = OpSize == IR::OpSize::i128Bit; const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1.ID()); const auto Vector2 = GetVReg(Op->Vector2.ID()); - const auto SubRegSizeLarger = ElementSize == 1 ? ARMEmitter::SubRegSize::i16Bit : - ElementSize == 2 ? ARMEmitter::SubRegSize::i32Bit : - ElementSize == 4 ? ARMEmitter::SubRegSize::i64Bit : - ARMEmitter::SubRegSize::i8Bit; + const auto SubRegSizeLarger = ElementSize == IR::OpSize::i8Bit ? ARMEmitter::SubRegSize::i16Bit : + ElementSize == IR::OpSize::i16Bit ? ARMEmitter::SubRegSize::i32Bit : + ElementSize == IR::OpSize::i32Bit ? ARMEmitter::SubRegSize::i64Bit : + ARMEmitter::SubRegSize::i8Bit; if (HostSupportsSVE256 && Is256Bit) { umulh(SubRegSize, Dst.Z(), Vector1.Z(), Vector2.Z()); @@ -3775,9 +3780,9 @@ DEF_OP(VUMulH) { } else { umulh(SubRegSize, Dst.Z(), Vector1.Z(), Vector2.Z()); } - } else if (OpSize == 8) { + } else if (OpSize == IR::OpSize::i64Bit) { umull(SubRegSizeLarger, Dst.D(), Vector1.D(), Vector2.D()); - shrn(SubRegSize, Dst.D(), Dst.D(), ElementSize * 8); + shrn(SubRegSize, Dst.D(), Dst.D(), IR::OpSizeAsBits(ElementSize)); } else { // ASIMD doesn't have a umulh. Need to emulate. umull2(SubRegSizeLarger, VTMP1.Q(), Vector1.Q(), Vector2.Q()); @@ -3792,19 +3797,19 @@ DEF_OP(VSMulH) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize8(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); - const auto Is128Bit = OpSize == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = OpSize == IR::OpSize::i128Bit; const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1.ID()); const auto Vector2 = GetVReg(Op->Vector2.ID()); - const auto SubRegSizeLarger = ElementSize == 1 ? ARMEmitter::SubRegSize::i16Bit : - ElementSize == 2 ? ARMEmitter::SubRegSize::i32Bit : - ElementSize == 4 ? ARMEmitter::SubRegSize::i64Bit : - ARMEmitter::SubRegSize::i8Bit; + const auto SubRegSizeLarger = ElementSize == IR::OpSize::i8Bit ? ARMEmitter::SubRegSize::i16Bit : + ElementSize == IR::OpSize::i16Bit ? ARMEmitter::SubRegSize::i32Bit : + ElementSize == IR::OpSize::i32Bit ? ARMEmitter::SubRegSize::i64Bit : + ARMEmitter::SubRegSize::i8Bit; if (HostSupportsSVE256 && Is256Bit) { smulh(SubRegSize, Dst.Z(), Vector1.Z(), Vector2.Z()); @@ -3826,9 +3831,9 @@ DEF_OP(VSMulH) { } else { smulh(SubRegSize, Dst.Z(), Vector1.Z(), Vector2.Z()); } - } else if (OpSize == 8) { + } else if (OpSize == IR::OpSize::i64Bit) { smull(SubRegSizeLarger, Dst.D(), Vector1.D(), Vector2.D()); - shrn(SubRegSize, Dst.D(), Dst.D(), ElementSize * 8); + shrn(SubRegSize, Dst.D(), Dst.D(), IR::OpSizeAsBits(ElementSize)); } else { // ASIMD doesn't have a umulh. 
Need to emulate. smull2(SubRegSizeLarger, VTMP1.Q(), Vector1.Q(), Vector2.Q()); @@ -3842,7 +3847,7 @@ DEF_OP(VUABDL) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -3868,7 +3873,7 @@ DEF_OP(VUABDL2) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -3898,15 +3903,15 @@ DEF_OP(VTBL1) { const auto VectorTable = GetVReg(Op->VectorTable.ID()); switch (OpSize) { - case 8: { + case IR::OpSize::i64Bit: { tbl(Dst.D(), VectorTable.Q(), VectorIndices.D()); break; } - case 16: { + case IR::OpSize::i128Bit: { tbl(Dst.Q(), VectorTable.Q(), VectorIndices.Q()); break; } - case 32: { + case IR::OpSize::i256Bit: { LOGMAN_THROW_AA_FMT(HostSupportsSVE256, "Host does not support SVE. Cannot perform 256-bit table lookup"); tbl(ARMEmitter::SubRegSize::i8Bit, Dst.Z(), VectorTable.Z(), VectorIndices.Z()); @@ -3927,7 +3932,7 @@ DEF_OP(VTBL2) { if (!ARMEmitter::AreVectorsSequential(VectorTable1, VectorTable2)) { // Vector registers aren't sequential, need to move to temporaries. - if (OpSize == 32) { + if (OpSize == IR::OpSize::i256Bit) { mov(VTMP1.Z(), VectorTable1.Z()); mov(VTMP2.Z(), VectorTable2.Z()); } else { @@ -3942,15 +3947,15 @@ DEF_OP(VTBL2) { } switch (OpSize) { - case 8: { + case IR::OpSize::i64Bit: { tbl(Dst.D(), VectorTable1.Q(), VectorTable2.Q(), VectorIndices.D()); break; } - case 16: { + case IR::OpSize::i128Bit: { tbl(Dst.Q(), VectorTable1.Q(), VectorTable2.Q(), VectorIndices.Q()); break; } - case 32: { + case IR::OpSize::i256Bit: { LOGMAN_THROW_AA_FMT(HostSupportsSVE256, "Host does not support SVE. Cannot perform 256-bit table lookup"); tbl(ARMEmitter::SubRegSize::i8Bit, Dst.Z(), VectorTable1.Z(), VectorTable2.Z(), VectorIndices.Z()); @@ -3971,19 +3976,19 @@ DEF_OP(VTBX1) { if (Dst != VectorSrcDst) { switch (OpSize) { - case 8: { + case IR::OpSize::i64Bit: { mov(VTMP1.D(), VectorSrcDst.D()); tbx(VTMP1.D(), VectorTable.Q(), VectorIndices.D()); mov(Dst.D(), VTMP1.D()); break; } - case 16: { + case IR::OpSize::i128Bit: { mov(VTMP1.Q(), VectorSrcDst.Q()); tbx(VTMP1.Q(), VectorTable.Q(), VectorIndices.Q()); mov(Dst.Q(), VTMP1.Q()); break; } - case 32: { + case IR::OpSize::i256Bit: { LOGMAN_THROW_AA_FMT(HostSupportsSVE256, "Host does not support SVE. Cannot perform 256-bit table lookup"); mov(VTMP1.Z(), VectorSrcDst.Z()); tbx(ARMEmitter::SubRegSize::i8Bit, VTMP1.Z(), VectorTable.Z(), VectorIndices.Z()); @@ -3994,15 +3999,15 @@ DEF_OP(VTBX1) { } } else { switch (OpSize) { - case 8: { + case IR::OpSize::i64Bit: { tbx(VectorSrcDst.D(), VectorTable.Q(), VectorIndices.D()); break; } - case 16: { + case IR::OpSize::i128Bit: { tbx(VectorSrcDst.Q(), VectorTable.Q(), VectorIndices.Q()); break; } - case 32: { + case IR::OpSize::i256Bit: { LOGMAN_THROW_AA_FMT(HostSupportsSVE256, "Host does not support SVE. 
Cannot perform 256-bit table lookup"); tbx(ARMEmitter::SubRegSize::i8Bit, VectorSrcDst.Z(), VectorTable.Z(), VectorIndices.Z()); @@ -4018,31 +4023,31 @@ DEF_OP(VRev32) { const auto OpSize = IROp->Size; const auto ElementSize = Op->Header.ElementSize; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector.ID()); - LOGMAN_THROW_AA_FMT(ElementSize == 1 || ElementSize == 2, "Invalid size"); - const auto SubRegSize = ElementSize == 1 ? ARMEmitter::SubRegSize::i8Bit : ARMEmitter::SubRegSize::i16Bit; + LOGMAN_THROW_AA_FMT(ElementSize == IR::OpSize::i8Bit || ElementSize == IR::OpSize::i16Bit, "Invalid size"); + const auto SubRegSize = ElementSize == IR::OpSize::i8Bit ? ARMEmitter::SubRegSize::i8Bit : ARMEmitter::SubRegSize::i16Bit; if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_32B.Merging(); switch (ElementSize) { - case 1: { + case IR::OpSize::i8Bit: { revb(ARMEmitter::SubRegSize::i32Bit, Dst.Z(), Mask, Vector.Z()); break; } - case 2: { + case IR::OpSize::i16Bit: { revh(ARMEmitter::SubRegSize::i32Bit, Dst.Z(), Mask, Vector.Z()); break; } default: LOGMAN_MSG_A_FMT("Invalid Element Size: {}", ElementSize); break; } } else { - if (OpSize == 8) { + if (OpSize == IR::OpSize::i64Bit) { rev32(SubRegSize, Dst.D(), Vector.D()); } else { rev32(SubRegSize, Dst.Q(), Vector.Q()); @@ -4057,7 +4062,7 @@ DEF_OP(VRev64) { const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize4(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -4067,22 +4072,22 @@ DEF_OP(VRev64) { const auto Mask = PRED_TMP_32B.Merging(); switch (ElementSize) { - case 1: { + case IR::OpSize::i8Bit: { revb(ARMEmitter::SubRegSize::i64Bit, Dst.Z(), Mask, Vector.Z()); break; } - case 2: { + case IR::OpSize::i16Bit: { revh(ARMEmitter::SubRegSize::i64Bit, Dst.Z(), Mask, Vector.Z()); break; } - case 4: { + case IR::OpSize::i32Bit: { revw(ARMEmitter::SubRegSize::i64Bit, Dst.Z(), Mask, Vector.Z()); break; } default: LOGMAN_MSG_A_FMT("Invalid Element Size: {}", ElementSize); break; } } else { - if (OpSize == 8) { + if (OpSize == IR::OpSize::i64Bit) { rev64(SubRegSize, Dst.D(), Vector.D()); } else { rev64(SubRegSize, Dst.Q(), Vector.Q()); @@ -4095,7 +4100,7 @@ DEF_OP(VFCADD) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -4124,7 +4129,7 @@ DEF_OP(VFCADD) { fcadd(SubRegSize, Dst.Z(), Mask, Dst.Z(), Vector2.Z(), Rotate); } } else { - if (OpSize == 8) { + if (OpSize == IR::OpSize::i64Bit) { fcadd(SubRegSize, Dst.D(), Vector1.D(), Vector2.D(), Rotate); } else { fcadd(SubRegSize, Dst.Q(), Vector1.Q(), Vector2.Q(), Rotate); @@ -4142,7 +4147,7 @@ DEF_OP(VFMLA) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); - 
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -4168,11 +4173,11 @@ DEF_OP(VFMLA) { } } else { if (IROp->ElementSize == OpSize) { - if (IROp->ElementSize == 2) { + if (IROp->ElementSize == IR::OpSize::i16Bit) { fmadd(Dst.H(), Vector1.H(), Vector2.H(), VectorAddend.H()); - } else if (IROp->ElementSize == 4) { + } else if (IROp->ElementSize == IR::OpSize::i32Bit) { fmadd(Dst.S(), Vector1.S(), Vector2.S(), VectorAddend.S()); - } else if (IROp->ElementSize == 8) { + } else if (IROp->ElementSize == IR::OpSize::i64Bit) { fmadd(Dst.D(), Vector1.D(), Vector2.D(), VectorAddend.D()); } return; @@ -4186,7 +4191,7 @@ DEF_OP(VFMLA) { } mov(DestTmp.Q(), VectorAddend.Q()); } - if (OpSize == 16) { + if (OpSize == IR::OpSize::i128Bit) { fmla(SubRegSize, DestTmp.Q(), Vector1.Q(), Vector2.Q()); } else { fmla(SubRegSize, DestTmp.D(), Vector1.D(), Vector2.D()); @@ -4208,8 +4213,8 @@ DEF_OP(VFMLS) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); - const auto Is128Bit = OpSize == Core::CPUState::XMM_SSE_REG_SIZE; - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is128Bit = OpSize == IR::OpSize::i128Bit; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -4251,11 +4256,11 @@ DEF_OP(VFMLS) { } } else { if (IROp->ElementSize == OpSize) { - if (IROp->ElementSize == 2) { + if (IROp->ElementSize == IR::OpSize::i16Bit) { fnmsub(Dst.H(), Vector1.H(), Vector2.H(), VectorAddend.H()); - } else if (IROp->ElementSize == 4) { + } else if (IROp->ElementSize == IR::OpSize::i32Bit) { fnmsub(Dst.S(), Vector1.S(), Vector2.S(), VectorAddend.S()); - } else if (IROp->ElementSize == 8) { + } else if (IROp->ElementSize == IR::OpSize::i64Bit) { fnmsub(Dst.D(), Vector1.D(), Vector2.D(), VectorAddend.D()); } return; @@ -4299,7 +4304,7 @@ DEF_OP(VFNMLA) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); - const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); @@ -4325,11 +4330,11 @@ DEF_OP(VFNMLA) { } } else { if (IROp->ElementSize == OpSize) { - if (IROp->ElementSize == 2) { + if (IROp->ElementSize == IR::OpSize::i16Bit) { fmsub(Dst.H(), Vector1.H(), Vector2.H(), VectorAddend.H()); - } else if (IROp->ElementSize == 4) { + } else if (IROp->ElementSize == IR::OpSize::i32Bit) { fmsub(Dst.S(), Vector1.S(), Vector2.S(), VectorAddend.S()); - } else if (IROp->ElementSize == 8) { + } else if (IROp->ElementSize == IR::OpSize::i64Bit) { fmsub(Dst.D(), Vector1.D(), Vector2.D(), VectorAddend.D()); } return; @@ -4344,7 +4349,7 @@ DEF_OP(VFNMLA) { } mov(DestTmp.Q(), VectorAddend.Q()); } - if (OpSize == 16) { + if (OpSize == IR::OpSize::i128Bit) { fmls(SubRegSize, DestTmp.Q(), Vector1.Q(), Vector2.Q()); } else { fmls(SubRegSize, DestTmp.D(), Vector1.D(), Vector2.D()); @@ -4367,10 +4372,10 @@ DEF_OP(VFNMLS) { const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); - 
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || (Is256Bit && HostSupportsSVE256), "Need SVE256 support in order to use {} with 256-bit operation", __func__); - const auto Is128Bit = OpSize == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = OpSize == IR::OpSize::i128Bit; const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1.ID()); @@ -4411,11 +4416,11 @@ DEF_OP(VFNMLS) { } } else { if (IROp->ElementSize == OpSize) { - if (IROp->ElementSize == 2) { + if (IROp->ElementSize == IR::OpSize::i16Bit) { fnmadd(Dst.H(), Vector1.H(), Vector2.H(), VectorAddend.H()); - } else if (IROp->ElementSize == 4) { + } else if (IROp->ElementSize == IR::OpSize::i32Bit) { fnmadd(Dst.S(), Vector1.S(), Vector2.S(), VectorAddend.S()); - } else if (IROp->ElementSize == 8) { + } else if (IROp->ElementSize == IR::OpSize::i64Bit) { fnmadd(Dst.D(), Vector1.D(), Vector2.D(), VectorAddend.D()); } return; diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index e27e184db1..fab35fb0b1 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -168,7 +168,7 @@ void OpDispatchBuilder::RETOp(OpcodeArgs) { if (Op->OP == 0xC2) { auto Offset = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags); - SP = _Add(IR::SizeToOpSize(GPRSize), SP, Offset); + SP = _Add(GPRSize, SP, Offset); } // Store the new stack pointer @@ -297,7 +297,7 @@ void OpDispatchBuilder::ADCOp(OpcodeArgs, uint32_t SrcIndex) { HandledLock = true; Ref DestMem = MakeSegmentAddress(Op, Op->Dest); - Before = _AtomicFetchAdd(IR::SizeToOpSize(Size), ALUOp, DestMem); + Before = _AtomicFetchAdd(Size, ALUOp, DestMem); } else { Before = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true}); } @@ -334,7 +334,7 @@ void OpDispatchBuilder::SBBOp(OpcodeArgs, uint32_t SrcIndex) { Ref DestMem = MakeSegmentAddress(Op, Op->Dest); auto SrcPlusCF = IncrementByCarry(OpSize, Src); - Before = _AtomicFetchSub(IR::SizeToOpSize(Size), SrcPlusCF, DestMem); + Before = _AtomicFetchSub(Size, SrcPlusCF, DestMem); } else { Before = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true}); } @@ -494,7 +494,7 @@ void OpDispatchBuilder::POPAOp(OpcodeArgs) { StoreGPRRegister(X86State::REG_RBP, Pop(Size, SP), Size); // Skip loading RSP because it'll be correct at the end - SP = _RMWHandle(_Add(OpSize::i64Bit, SP, _InlineConstant(Size))); + SP = _RMWHandle(_Add(OpSize::i64Bit, SP, _InlineConstant(IR::OpSizeToSize(Size)))); StoreGPRRegister(X86State::REG_RBX, Pop(Size, SP), Size); StoreGPRRegister(X86State::REG_RDX, Pop(Size, SP), Size); @@ -567,7 +567,7 @@ void OpDispatchBuilder::CALLOp(OpcodeArgs) { uint64_t InstRIP = Op->PC + Op->InstSize; uint64_t TargetRIP = InstRIP + TargetOffset; - Ref NewRIP = _Add(IR::SizeToOpSize(GPRSize), ConstantPC, _Constant(TargetOffset)); + Ref NewRIP = _Add(GPRSize, ConstantPC, _Constant(TargetOffset)); // Push the return address. 
Push(GPRSize, ConstantPC); @@ -715,7 +715,7 @@ void OpDispatchBuilder::CMOVOp(OpcodeArgs) { Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags); } - auto SrcCond = SelectCC(Op->OP & 0xF, IR::SizeToOpSize(std::max(OpSize::i32Bit, GetSrcSize(Op))), Src, Dest); + auto SrcCond = SelectCC(Op->OP & 0xF, std::max(OpSize::i32Bit, OpSizeFromSrc(Op)), Src, Dest); StoreResult(GPRClass, Op, SrcCond, OpSize::iInvalid); } @@ -731,7 +731,7 @@ void OpDispatchBuilder::CondJUMPOp(OpcodeArgs) { uint64_t InstRIP = Op->PC + Op->InstSize; uint64_t Target = InstRIP + TargetOffset; - if (CTX->GetGPRSize() == OpSize::i32Bit) { + if (CTX->GetGPROpSize() == OpSize::i32Bit) { // If the GPRSize is 4 then we need to be careful about PC wrapping if (TargetOffset < 0 && -TargetOffset > InstRIP) { // Invert the signed value if we are underflowing @@ -802,7 +802,7 @@ void OpDispatchBuilder::CondJUMPRCXOp(OpcodeArgs) { BlockSetRIP = true; auto JcxGPRSize = CTX->GetGPROpSize(); - JcxGPRSize = (Op->Flags & X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) ? (IR::DivideOpSize(JcxGPRSize, 2)) : JcxGPRSize; + JcxGPRSize = (Op->Flags & X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) ? (JcxGPRSize >> 1) : JcxGPRSize; uint64_t Target = Op->PC + Op->InstSize + Op->Src[0].Literal(); @@ -937,7 +937,7 @@ void OpDispatchBuilder::JUMPOp(OpcodeArgs) { uint64_t InstRIP = Op->PC + Op->InstSize; uint64_t TargetRIP = InstRIP + TargetOffset; - if (CTX->GetGPRSize() == OpSize::i32Bit) { + if (CTX->GetGPROpSize() == OpSize::i32Bit) { // If the GPRSize is 4 then we need to be careful about PC wrapping if (TargetOffset < 0 && -TargetOffset > InstRIP) { // Invert the signed value if we are underflowing @@ -1000,18 +1000,18 @@ void OpDispatchBuilder::TESTOp(OpcodeArgs, uint32_t SrcIndex) { Ref Src = LoadSource(GPRClass, Op, Op->Src[SrcIndex], Op->Flags, {.AllowUpperGarbage = true}); Ref Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true}); - auto Size = GetDstSize(Op); + const auto Size = OpSizeFromDst(Op); uint64_t Const; bool AlwaysNonnegative = false; if (IsValueConstant(WrapNode(Src), &Const)) { // Optimize out masking constants - if (Const == (Size == OpSize::i64Bit ? ~0ULL : ((1ull << Size * 8) - 1))) { + if (Const == (Size == OpSize::i64Bit ? 
~0ULL : ((1ull << IR::OpSizeAsBits(Size)) - 1))) { Src = Dest; } // Optimize test with non-sign bits - AlwaysNonnegative = (Const & (1ull << ((Size * 8) - 1))) == 0; + AlwaysNonnegative = (Const & (1ull << (IR::OpSizeAsBits(Size) - 1))) == 0; } if (Dest == Src) { @@ -1024,7 +1024,7 @@ void OpDispatchBuilder::TESTOp(OpcodeArgs, uint32_t SrcIndex) { SetNZ_ZeroCV(OpSize::i32Bit, Res); } else { HandleNZ00Write(); - CalculatePF(_AndWithFlags(IR::SizeToOpSize(Size), Dest, Src)); + CalculatePF(_AndWithFlags(Size, Dest, Src)); } InvalidateAF(); @@ -1049,7 +1049,7 @@ void OpDispatchBuilder::MOVSXDOp(OpcodeArgs) { StoreResult_WithOpSize(GPRClass, Op, Op->Dest, Src, Size, OpSize::iInvalid); } else if (Sext) { // With REX.W then Sext - Src = _Sbfe(OpSize::i64Bit, Size * 8, 0, Src); + Src = _Sbfe(OpSize::i64Bit, IR::OpSizeAsBits(Size), 0, Src); StoreResult(GPRClass, Op, Src, OpSize::iInvalid); } else { // Without REX.W then Zext (store result implicitly zero extends) @@ -1059,13 +1059,13 @@ void OpDispatchBuilder::MOVSXDOp(OpcodeArgs) { void OpDispatchBuilder::MOVSXOp(OpcodeArgs) { // Load garbage in upper bits, since we're sign extending anyway - uint8_t Size = GetSrcSize(Op); + const auto Size = OpSizeFromSrc(Op); Ref Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); // Sign-extend to DstSize and zero-extend to the register size, using a fast // path for 32-bit dests where the native 32-bit Sbfe zero extends the top. - uint8_t DstSize = GetDstSize(Op); - Src = _Sbfe(DstSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit, Size * 8, 0, Src); + const auto DstSize = OpSizeFromDst(Op); + Src = _Sbfe(DstSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, Src); StoreResult(GPRClass, Op, Op->Dest, Src, OpSize::iInvalid); } @@ -1134,10 +1134,10 @@ void OpDispatchBuilder::XCHGOp(OpcodeArgs) { void OpDispatchBuilder::CDQOp(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); - const auto SrcSize = IR::SizeToOpSize(IR::OpSizeToSize(DstSize) >> 1); + const auto SrcSize = DstSize / 2; Ref Src = LoadGPRRegister(X86State::REG_RAX, SrcSize, 0, true); - Src = _Sbfe(DstSize <= OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, SrcSize * 8, 0, Src); + Src = _Sbfe(DstSize <= OpSize::i32Bit ? 
OpSize::i32Bit : OpSize::i64Bit, IR::OpSizeAsBits(SrcSize), 0, Src); StoreResult_WithOpSize(GPRClass, Op, Op->Dest, Src, DstSize, OpSize::iInvalid); } @@ -1374,7 +1374,7 @@ void OpDispatchBuilder::XGetBVOp(OpcodeArgs) { } void OpDispatchBuilder::SHLOp(OpcodeArgs) { - const auto Size = GetSrcSize(Op); + const auto Size = OpSizeFromSrc(Op); auto Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true}); auto Src = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true}); @@ -1398,7 +1398,7 @@ void OpDispatchBuilder::SHLImmediateOp(OpcodeArgs, bool SHL1Bit) { void OpDispatchBuilder::SHROp(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); - auto Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = Size >= 4}); + auto Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = Size >= OpSize::i32Bit}); auto Src = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true}); auto ALUOp = _Lshr(std::max(OpSize::i32Bit, Size), Dest, Src); @@ -1557,29 +1557,29 @@ void OpDispatchBuilder::SHRDImmediateOp(OpcodeArgs) { } void OpDispatchBuilder::ASHROp(OpcodeArgs, bool Immediate, bool SHR1Bit) { - const auto Size = GetSrcSize(Op); + const auto Size = OpSizeFromSrc(Op); const auto OpSize = std::max(OpSize::i32Bit, OpSizeFromDst(Op)); // If Size < 4, then we Sbfe the Dest so we can have garbage. // Otherwise, if Size = Opsize, then both are 4 or 8 and match the a64 // semantics directly, so again we can have garbage. The only case where we // need zero-extension here is when the sizes mismatch. - auto Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = (OpSize == Size) || (Size < 4)}); + auto Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = (OpSize == Size) || (Size < OpSize::i32Bit)}); if (Size < OpSize::i32Bit) { - Dest = _Sbfe(OpSize::i64Bit, Size * 8, 0, Dest); + Dest = _Sbfe(OpSize::i64Bit, IR::OpSizeAsBits(Size), 0, Dest); } if (Immediate) { uint64_t Shift = LoadConstantShift(Op, SHR1Bit); - Ref Result = _Ashr(IR::SizeToOpSize(OpSize), Dest, _Constant(Shift)); + Ref Result = _Ashr(OpSize, Dest, _Constant(Shift)); CalculateFlags_SignShiftRightImmediate(OpSizeFromSrc(Op), Result, Dest, Shift); CalculateDeferredFlags(); StoreResult(GPRClass, Op, Result, OpSize::iInvalid); } else { auto Src = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true}); - Ref Result = _Ashr(IR::SizeToOpSize(OpSize), Dest, Src); + Ref Result = _Ashr(OpSize, Dest, Src); HandleShift(Op, Result, Dest, ShiftType::ASR, Src); } @@ -1660,12 +1660,12 @@ void OpDispatchBuilder::BEXTRBMIOp(OpcodeArgs) { // Essentially (Src1 >> Start) & ((1 << Length) - 1) // along with some edge-case handling and flag setting. 
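
The conversions in these hunks (Size * 8 becoming IR::OpSizeAsBits(Size), ordered comparisons such as Size >= OpSize::i32Bit, masks built from the bit width) all rely on IR::OpSize behaving like a strongly typed byte count. A minimal self-contained sketch of that pattern follows; the enumerator values and helper bodies are assumed here for illustration, and the real enum also carries entries such as f80Bit and iInvalid that the sketch omits.

#include <cassert>
#include <cstdint>

// Hypothetical stand-in for FEXCore's IR::OpSize; assumes "enumerator == byte width".
enum class OpSize : uint8_t {
  i8Bit = 1, i16Bit = 2, i32Bit = 4, i64Bit = 8,
  i128Bit = 16, i256Bit = 32,
};

// Sketches of the conversion helpers the patch leans on.
constexpr uint64_t OpSizeToSize(OpSize Size) { return static_cast<uint64_t>(Size); } // bytes
constexpr uint64_t OpSizeAsBits(OpSize Size) { return OpSizeToSize(Size) * 8; }      // bits

int main() {
  // Ordered comparisons keep their old meaning because the values grow with width.
  static_assert(OpSize::i16Bit < OpSize::i32Bit, "ordered by operand width");
  // `Size * 8` in the old code translates to OpSizeAsBits(Size).
  assert(OpSizeAsBits(OpSize::i32Bit) == 32);
  // Masks such as (1ull << (Size * 8)) - 1 translate the same way.
  assert(((1ull << OpSizeAsBits(OpSize::i16Bit)) - 1) == 0xFFFF);
  return 0;
}
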
- LOGMAN_THROW_A_FMT(Op->InstSize >= OpSize::i32Bit, "No masking needed"); + LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed"); auto* Src1 = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); auto* Src2 = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true}); const auto Size = OpSizeFromSrc(Op); - const auto SrcSize = Size * 8; + const auto SrcSize = IR::OpSizeAsBits(Size); const auto MaxSrcBit = SrcSize - 1; auto MaxSrcBitOp = _Constant(Size, MaxSrcBit); @@ -1701,8 +1701,8 @@ void OpDispatchBuilder::BEXTRBMIOp(OpcodeArgs) { void OpDispatchBuilder::BLSIBMIOp(OpcodeArgs) { // Equivalent to performing: SRC & -SRC - LOGMAN_THROW_A_FMT(Op->InstSize >= OpSize::i32Bit, "No masking needed"); - auto Size = OpSizeFromSrc(Op); + LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed"); + const auto Size = OpSizeFromSrc(Op); auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); auto NegatedSrc = _Neg(Size, Src); @@ -1715,15 +1715,15 @@ void OpDispatchBuilder::BLSIBMIOp(OpcodeArgs) { // inverted ZF. // // ZF/SF/OF set as usual. - SetNZ_ZeroCV(GetSrcSize(Op), Result); + SetNZ_ZeroCV(Size, Result); InvalidatePF_AF(); SetCFInverted(GetRFLAG(X86State::RFLAG_ZF_RAW_LOC)); } void OpDispatchBuilder::BLSMSKBMIOp(OpcodeArgs) { // Equivalent to: (Src - 1) ^ Src - LOGMAN_THROW_A_FMT(Op->InstSize >= OpSize::i32Bit, "No masking needed"); - auto Size = OpSizeFromSrc(Op); + LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed"); + const auto Size = OpSizeFromSrc(Op); auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); auto Result = _Xor(Size, _Sub(Size, Src, _InlineConstant(1)), Src); @@ -1738,24 +1738,25 @@ void OpDispatchBuilder::BLSMSKBMIOp(OpcodeArgs) { // The output of BLSMSK is always nonzero, so TST will clear Z (along with C // and O) while setting S. 
- SetNZ_ZeroCV(GetSrcSize(Op), Result); + SetNZ_ZeroCV(Size, Result); SetCFInverted(CFInv); } void OpDispatchBuilder::BLSRBMIOp(OpcodeArgs) { // Equivalent to: (Src - 1) & Src - LOGMAN_THROW_A_FMT(Op->InstSize >= OpSize::i32Bit, "No masking needed"); - auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); - auto Size = OpSizeFromSrc(Op); + LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed"); + const auto Size = OpSizeFromSrc(Op); + auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); auto Result = _And(Size, _Sub(Size, Src, _InlineConstant(1)), Src); + StoreResult(GPRClass, Op, Result, OpSize::iInvalid); auto Zero = _Constant(0); auto One = _Constant(1); auto CFInv = _Select(IR::COND_NEQ, Src, Zero, One, Zero); - SetNZ_ZeroCV(GetSrcSize(Op), Result); + SetNZ_ZeroCV(Size, Result); SetCFInverted(CFInv); InvalidatePF_AF(); } @@ -1774,13 +1775,13 @@ void OpDispatchBuilder::BMI2Shift(OpcodeArgs) { Ref Result; if (Op->OP == 0x6F7) { // SARX - Result = _Ashr(IR::SizeToOpSize(Size), Src, Shift); + Result = _Ashr(Size, Src, Shift); } else if (Op->OP == 0x5F7) { // SHLX - Result = _Lshl(IR::SizeToOpSize(Size), Src, Shift); + Result = _Lshl(Size, Src, Shift); } else { // SHRX - Result = _Lshr(IR::SizeToOpSize(Size), Src, Shift); + Result = _Lshr(Size, Src, Shift); } StoreResult(GPRClass, Op, Result, OpSize::iInvalid); @@ -1788,7 +1789,7 @@ void OpDispatchBuilder::BMI2Shift(OpcodeArgs) { void OpDispatchBuilder::BZHI(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); - const auto OperandSize = Size * 8; + const auto OperandSize = IR::OpSizeAsBits(Size); // In 32-bit mode we only look at bottom 32-bit, no 8 or 16-bit BZHI so no // need to zero-extend sources @@ -1853,13 +1854,12 @@ void OpDispatchBuilder::RORX(OpcodeArgs) { void OpDispatchBuilder::MULX(OpcodeArgs) { // RDX is the implied source operand in the instruction - const auto OperandSize = OpSizeFromSrc(Op); - const auto OpSize = IR::SizeToOpSize(OperandSize); + const auto OpSize = OpSizeFromSrc(Op); // Src1 can be a memory operand, so ensure we constrain to the // absolute width of the access in that scenario. const auto GPRSize = CTX->GetGPROpSize(); - const auto Src1Size = Op->Src[1].IsGPR() ? GPRSize : OperandSize; + const auto Src1Size = Op->Src[1].IsGPR() ? 
GPRSize : OpSize; Ref Src1 = LoadSource_WithOpSize(GPRClass, Op, Op->Src[1], Src1Size, Op->Flags); Ref Src2 = LoadGPRRegister(X86State::REG_RDX, GPRSize); @@ -1880,7 +1880,7 @@ void OpDispatchBuilder::MULX(OpcodeArgs) { } void OpDispatchBuilder::PDEP(OpcodeArgs) { - LOGMAN_THROW_A_FMT(Op->InstSize >= OpSize::i32Bit, "No masking needed"); + LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed"); auto* Input = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); auto* Mask = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true}); auto Result = _PDep(OpSizeFromSrc(Op), Input, Mask); @@ -1889,7 +1889,7 @@ void OpDispatchBuilder::PDEP(OpcodeArgs) { } void OpDispatchBuilder::PEXT(OpcodeArgs) { - LOGMAN_THROW_A_FMT(Op->InstSize >= OpSize::i32Bit, "No masking needed"); + LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed"); auto* Input = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); auto* Mask = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true}); auto Result = _PExt(OpSizeFromSrc(Op), Input, Mask); @@ -2093,7 +2093,7 @@ void OpDispatchBuilder::RCROp(OpcodeArgs) { StoreResult(GPRClass, Op, Res, OpSize::iInvalid); }, - GetSrcSize(Op) == OpSize::i32Bit ? std::make_optional(&OpDispatchBuilder::ZeroShiftResult) : std::nullopt); + OpSizeFromSrc(Op) == OpSize::i32Bit ? std::make_optional(&OpDispatchBuilder::ZeroShiftResult) : std::nullopt); } void OpDispatchBuilder::RCRSmallerOp(OpcodeArgs) { @@ -2315,7 +2315,7 @@ void OpDispatchBuilder::RCLOp(OpcodeArgs) { StoreResult(GPRClass, Op, Res, OpSize::iInvalid); }, - GetSrcSize(Op) == OpSize::i32Bit ? std::make_optional(&OpDispatchBuilder::ZeroShiftResult) : std::nullopt); + OpSizeFromSrc(Op) == OpSize::i32Bit ? std::make_optional(&OpDispatchBuilder::ZeroShiftResult) : std::nullopt); } void OpDispatchBuilder::RCLSmallerOp(OpcodeArgs) { @@ -2405,7 +2405,7 @@ void OpDispatchBuilder::BTOp(OpcodeArgs, uint32_t SrcIndex, BTAction Action) { // Get the bit selection from the src. We need to mask for 8/16-bit, but // rely on the implicit masking of Lshr for native sizes. - unsigned LshrSize = std::max(OpSize::i32Bit, Size / 8); + unsigned LshrSize = std::max(IR::OpSizeToSize(OpSize::i32Bit), Size / 8); auto BitSelect = (Size == (LshrSize * 8)) ? Src : _And(OpSize::i64Bit, Src, _Constant(Mask)); // OF/SF/ZF/AF/PF undefined. @@ -2458,7 +2458,7 @@ void OpDispatchBuilder::BTOp(OpcodeArgs, uint32_t SrcIndex, BTAction Action) { // Load the address to the memory location Ref Dest = MakeSegmentAddress(Op, Op->Dest); // Get the bit selection from the src - Ref BitSelect = _Bfe(IR::SizeToOpSize(std::max(4u, GetOpSize(Src))), 3, 0, Src); + Ref BitSelect = _Bfe(std::max(OpSize::i32Bit, GetOpSize(Src)), 3, 0, Src); // Address is provided as bits we want BYTE offsets // Extract Signed offset @@ -2523,7 +2523,7 @@ void OpDispatchBuilder::BTOp(OpcodeArgs, uint32_t SrcIndex, BTAction Action) { } // Now shift in to the correct bit location - Value = _Lshr(IR::SizeToOpSize(std::max(4u, GetOpSize(Value))), Value, BitSelect); + Value = _Lshr(std::max(OpSize::i32Bit, GetOpSize(Value)), Value, BitSelect); // OF/SF/ZF/AF/PF undefined. 
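
The most common mechanical change in this file is dropping IR::SizeToOpSize(...) at emitter call sites, as in the _Ashr/_Lshl/_Lshr calls of BMI2Shift above, because the size already arrives as IR::OpSize from OpSizeFromSrc/OpSizeFromDst. A reduced sketch of that call-site shape, with Ashr and Ref standing in for the real emitter pieces:

#include <cstdint>

enum class OpSize : uint8_t { i8Bit = 1, i16Bit = 2, i32Bit = 4, i64Bit = 8 };

// Minimal stand-ins, just enough to show the call-site change.
struct Ref { int Id; };

Ref Ashr(OpSize Size, Ref Src, Ref Shift) {
  // A real emitter would record Size on the new IR node; here it is unused.
  (void)Size; (void)Shift;
  return Src;
}

int main() {
  Ref Src {1}, Shift {2};
  const OpSize Size = OpSize::i32Bit; // previously a raw byte count
  // Old call sites wrapped the byte count at every use: Ashr(SizeToOpSize(Size), ...).
  // With the size carried as OpSize end to end, the wrapper disappears:
  Ashr(Size, Src, Shift);
  return 0;
}
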
SetCFDirect(Value, ConstantShift, true); @@ -2536,21 +2536,22 @@ void OpDispatchBuilder::IMUL1SrcOp(OpcodeArgs) { Ref Src2 = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); const auto Size = OpSizeFromSrc(Op); + const auto SizeBits = IR::OpSizeAsBits(Size); Ref Dest {}; Ref ResultHigh {}; switch (Size) { case OpSize::i8Bit: case OpSize::i16Bit: { - Src1 = _Sbfe(OpSize::i64Bit, Size * 8, 0, Src1); - Src2 = _Sbfe(OpSize::i64Bit, Size * 8, 0, Src2); + Src1 = _Sbfe(OpSize::i64Bit, SizeBits, 0, Src1); + Src2 = _Sbfe(OpSize::i64Bit, SizeBits, 0, Src2); Dest = _Mul(OpSize::i64Bit, Src1, Src2); - ResultHigh = _Sbfe(OpSize::i64Bit, Size * 8, Size * 8, Dest); + ResultHigh = _Sbfe(OpSize::i64Bit, SizeBits, SizeBits, Dest); break; } case OpSize::i32Bit: { ResultHigh = _SMull(Src1, Src2); - ResultHigh = _Sbfe(OpSize::i64Bit, Size * 8, Size * 8, ResultHigh); + ResultHigh = _Sbfe(OpSize::i64Bit, SizeBits, SizeBits, ResultHigh); // Flipped order to save a move Dest = _Mul(OpSize::i32Bit, Src1, Src2); break; @@ -2573,6 +2574,7 @@ void OpDispatchBuilder::IMUL2SrcOp(OpcodeArgs) { Ref Src2 = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true}); const auto Size = OpSizeFromSrc(Op); + const auto SizeBits = IR::OpSizeAsBits(Size); Ref Dest {}; Ref ResultHigh {}; @@ -2580,15 +2582,15 @@ void OpDispatchBuilder::IMUL2SrcOp(OpcodeArgs) { switch (Size) { case OpSize::i8Bit: case OpSize::i16Bit: { - Src1 = _Sbfe(OpSize::i64Bit, Size * 8, 0, Src1); - Src2 = _Sbfe(OpSize::i64Bit, Size * 8, 0, Src2); + Src1 = _Sbfe(OpSize::i64Bit, SizeBits, 0, Src1); + Src2 = _Sbfe(OpSize::i64Bit, SizeBits, 0, Src2); Dest = _Mul(OpSize::i64Bit, Src1, Src2); - ResultHigh = _Sbfe(OpSize::i64Bit, Size * 8, Size * 8, Dest); + ResultHigh = _Sbfe(OpSize::i64Bit, SizeBits, SizeBits, Dest); break; } case OpSize::i32Bit: { ResultHigh = _SMull(Src1, Src2); - ResultHigh = _Sbfe(OpSize::i64Bit, Size * 8, Size * 8, ResultHigh); + ResultHigh = _Sbfe(OpSize::i64Bit, SizeBits, SizeBits, ResultHigh); // Flipped order to save a move Dest = _Mul(OpSize::i32Bit, Src1, Src2); break; @@ -2608,13 +2610,14 @@ void OpDispatchBuilder::IMUL2SrcOp(OpcodeArgs) { void OpDispatchBuilder::IMULOp(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); + const auto SizeBits = IR::OpSizeAsBits(Size); Ref Src1 = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true}); Ref Src2 = LoadGPRRegister(X86State::REG_RAX); if (Size != OpSize::i64Bit) { - Src1 = _Sbfe(OpSize::i64Bit, Size * 8, 0, Src1); - Src2 = _Sbfe(OpSize::i64Bit, Size * 8, 0, Src2); + Src1 = _Sbfe(OpSize::i64Bit, SizeBits, 0, Src1); + Src2 = _Sbfe(OpSize::i64Bit, SizeBits, 0, Src2); } // 64-bit special cased to save a move @@ -2659,14 +2662,15 @@ void OpDispatchBuilder::IMULOp(OpcodeArgs) { void OpDispatchBuilder::MULOp(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); + const auto SizeBits = IR::OpSizeAsBits(Size); Ref Src1 = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true}); Ref Src2 = LoadGPRRegister(X86State::REG_RAX); Ref Result; if (Size != OpSize::i64Bit) { - Src1 = _Bfe(OpSize::i64Bit, Size * 8, 0, Src1); - Src2 = _Bfe(OpSize::i64Bit, Size * 8, 0, Src2); + Src1 = _Bfe(OpSize::i64Bit, SizeBits, 0, Src1); + Src2 = _Bfe(OpSize::i64Bit, SizeBits, 0, Src2); Result = _UMul(OpSize::i64Bit, Src1, Src2); } Ref ResultHigh {}; @@ -2709,17 +2713,19 @@ void OpDispatchBuilder::MULOp(OpcodeArgs) { void OpDispatchBuilder::NOTOp(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); + const auto SizeBits = IR::OpSizeAsBits(Size); + 
Ref MaskConst {}; if (Size == OpSize::i64Bit) { MaskConst = _Constant(~0ULL); } else { - MaskConst = _Constant((1ULL << (Size * 8)) - 1); + MaskConst = _Constant((1ULL << SizeBits) - 1); } if (DestIsLockedMem(Op)) { HandledLock = true; Ref DestMem = MakeSegmentAddress(Op, Op->Dest); - _AtomicXor(IR::SizeToOpSize(Size), MaskConst, DestMem); + _AtomicXor(Size, MaskConst, DestMem); } else if (!Op->Dest.IsGPR()) { // GPR version plays fast and loose with sizes, be safe for memory tho. Ref Src = LoadSource(GPRClass, Op, Op->Dest, Op->Flags); @@ -2742,13 +2748,13 @@ void OpDispatchBuilder::NOTOp(OpcodeArgs) { // For 8/16-bit, use 64-bit invert so we invert in place, while getting // insert behaviour. For 32-bit, use 32-bit invert to zero the upper bits. - unsigned EffectiveSize = Size == OpSize::i32Bit ? OpSize::i32Bit : GPRSize; + const auto EffectiveSize = Size == OpSize::i32Bit ? OpSize::i32Bit : GPRSize; // If we're inverting the whole thing, use Not instead of Xor to save a constant. if (Size >= OpSize::i32Bit) { - Src = _Not(IR::SizeToOpSize(EffectiveSize), Src); + Src = _Not(EffectiveSize, Src); } else { - Src = _Xor(IR::SizeToOpSize(EffectiveSize), Src, MaskConst); + Src = _Xor(EffectiveSize, Src, MaskConst); } // Always store 64-bit, the Not/Xor correctly handle the upper bits and this @@ -2816,7 +2822,7 @@ void OpDispatchBuilder::DAAOp(OpcodeArgs) { // SF, ZF, PF set according to result. CF set per above. OF undefined. StoreGPRRegister(X86State::REG_RAX, AL, OpSize::i8Bit); - SetNZ_ZeroCV(1, AL); + SetNZ_ZeroCV(OpSize::i8Bit, AL); SetCFInverted(CFInv); CalculatePF(AL); SetAFAndFixup(AF); @@ -2842,7 +2848,7 @@ void OpDispatchBuilder::DASOp(OpcodeArgs) { // SF, ZF, PF set according to result. CF set per above. OF undefined. StoreGPRRegister(X86State::REG_RAX, AL, OpSize::i8Bit); - SetNZ_ZeroCV(1, AL); + SetNZ_ZeroCV(OpSize::i8Bit, AL); SetCFDirect(NewCF); CalculatePF(AL); SetAFAndFixup(AF); @@ -2898,7 +2904,7 @@ void OpDispatchBuilder::AAMOp(OpcodeArgs) { auto Res = _AddShift(OpSize::i64Bit, URemOp, UDivOp, ShiftType::LSL, 8); StoreGPRRegister(X86State::REG_RAX, Res, OpSize::i16Bit); - SetNZ_ZeroCV(1, Res); + SetNZ_ZeroCV(OpSize::i8Bit, Res); CalculatePF(Res); InvalidateAF(); } @@ -2913,7 +2919,7 @@ void OpDispatchBuilder::AADOp(OpcodeArgs) { auto Result = _And(OpSize::i64Bit, NewAL, _Constant(0xFF)); StoreGPRRegister(X86State::REG_RAX, Result, OpSize::i16Bit); - SetNZ_ZeroCV(1, Result); + SetNZ_ZeroCV(OpSize::i8Bit, Result); CalculatePF(Result); InvalidateAF(); } @@ -2978,14 +2984,14 @@ void OpDispatchBuilder::EnterOp(OpcodeArgs) { if (Level > 0) { for (uint8_t i = 1; i < Level; ++i) { - auto Offset = _Constant(i * GPRSize); - auto MemLoc = _Sub(IR::SizeToOpSize(GPRSize), OldBP, Offset); + auto Offset = _Constant(i * IR::OpSizeToSize(GPRSize)); + auto MemLoc = _Sub(GPRSize, OldBP, Offset); auto Mem = _LoadMem(GPRClass, GPRSize, MemLoc, GPRSize); NewSP = PushValue(GPRSize, Mem); } NewSP = PushValue(GPRSize, temp_RBP); } - NewSP = _Sub(IR::SizeToOpSize(GPRSize), NewSP, _Constant(AllocSpace)); + NewSP = _Sub(GPRSize, NewSP, _Constant(AllocSpace)); StoreGPRRegister(X86State::REG_RSP, NewSP); StoreGPRRegister(X86State::REG_RBP, temp_RBP); } @@ -3186,7 +3192,7 @@ void OpDispatchBuilder::STOSOp(OpcodeArgs) { // Offset the pointer Ref TailDest = LoadGPRRegister(X86State::REG_RDI); - StoreGPRRegister(X86State::REG_RDI, OffsetByDir(TailDest, Size)); + StoreGPRRegister(X86State::REG_RDI, OffsetByDir(TailDest, IR::OpSizeToSize(Size))); } else { // FEX doesn't support partial faulting REP 
instructions. // Converting this to a `MemSet` IR op optimizes this quite significantly in our codegen. @@ -3255,7 +3261,7 @@ void OpDispatchBuilder::MOVSOp(OpcodeArgs) { // Store to memory where RDI points _StoreMemAutoTSO(GPRClass, Size, RDI, Src, Size); - auto PtrDir = LoadDir(Size); + auto PtrDir = LoadDir(IR::OpSizeToSize(Size)); RSI = _Add(OpSize::i64Bit, RSI, PtrDir); RDI = _Add(OpSize::i64Bit, RDI, PtrDir); @@ -3285,7 +3291,7 @@ void OpDispatchBuilder::CMPSOp(OpcodeArgs) { CalculateFlags_SUB(OpSizeFromSrc(Op), Src2, Src1); - auto PtrDir = LoadDir(Size); + auto PtrDir = LoadDir(IR::OpSizeToSize(Size)); // Offset the pointer Dest_RDI = _Add(OpSize::i64Bit, Dest_RDI, PtrDir); @@ -3342,11 +3348,11 @@ void OpDispatchBuilder::CMPSOp(OpcodeArgs) { StoreGPRRegister(X86State::REG_RCX, TailCounter); // Offset the pointer - Dest_RDI = _Add(OpSize::i64Bit, Dest_RDI, _Constant(PtrDir * Size)); + Dest_RDI = _Add(OpSize::i64Bit, Dest_RDI, _Constant(PtrDir * IR::OpSizeToSize(Size))); StoreGPRRegister(X86State::REG_RDI, Dest_RDI); // Offset second pointer - Dest_RSI = _Add(OpSize::i64Bit, Dest_RSI, _Constant(PtrDir * Size)); + Dest_RSI = _Add(OpSize::i64Bit, Dest_RSI, _Constant(PtrDir * IR::OpSizeToSize(Size))); StoreGPRRegister(X86State::REG_RSI, Dest_RSI); // If TailCounter != 0, compare sources. @@ -3403,7 +3409,7 @@ void OpDispatchBuilder::LODSOp(OpcodeArgs) { // Offset the pointer Ref TailDest_RSI = LoadGPRRegister(X86State::REG_RSI); - StoreGPRRegister(X86State::REG_RSI, OffsetByDir(TailDest_RSI, Size)); + StoreGPRRegister(X86State::REG_RSI, OffsetByDir(TailDest_RSI, IR::OpSizeToSize(Size))); } else { // Calculate flags early. because end of block CalculateDeferredFlags(); @@ -3452,7 +3458,7 @@ void OpDispatchBuilder::LODSOp(OpcodeArgs) { StoreGPRRegister(X86State::REG_RCX, TailCounter); // Offset the pointer - TailDest_RSI = _Add(OpSize::i64Bit, TailDest_RSI, _Constant(PtrDir * Size)); + TailDest_RSI = _Add(OpSize::i64Bit, TailDest_RSI, _Constant(PtrDir * IR::OpSizeToSize(Size))); StoreGPRRegister(X86State::REG_RSI, TailDest_RSI); // Jump back to the start, we have more work to do @@ -3487,7 +3493,7 @@ void OpDispatchBuilder::SCASOp(OpcodeArgs) { // Offset the pointer Ref TailDest_RDI = LoadGPRRegister(X86State::REG_RDI); - StoreGPRRegister(X86State::REG_RDI, OffsetByDir(TailDest_RDI, Size)); + StoreGPRRegister(X86State::REG_RDI, OffsetByDir(TailDest_RDI, IR::OpSizeToSize(Size))); } else { // Calculate flags early. 
because end of block CalculateDeferredFlags(); @@ -3536,7 +3542,7 @@ void OpDispatchBuilder::SCASOp(OpcodeArgs) { StoreGPRRegister(X86State::REG_RCX, TailCounter); // Offset the pointer - TailDest_RDI = _Add(OpSize::i64Bit, TailDest_RDI, _Constant(Dir * Size)); + TailDest_RDI = _Add(OpSize::i64Bit, TailDest_RDI, _Constant(Dir * IR::OpSizeToSize(Size))); StoreGPRRegister(X86State::REG_RDI, TailDest_RDI); CalculateDeferredFlags(); @@ -3598,7 +3604,7 @@ void OpDispatchBuilder::NEGOp(OpcodeArgs) { if (DestIsLockedMem(Op)) { Ref DestMem = MakeSegmentAddress(Op, Op->Dest); - Ref Dest = _AtomicFetchNeg(IR::SizeToOpSize(Size), DestMem); + Ref Dest = _AtomicFetchNeg(Size, DestMem); CalculateFlags_SUB(Size, ZeroConst, Dest); } else { Ref Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true}); @@ -3622,7 +3628,7 @@ void OpDispatchBuilder::DIVOp(OpcodeArgs) { auto URemOp = _URem(OpSize::i16Bit, Src1, Divisor); // AX[15:0] = concat - auto ResultAX = _Bfi(IR::SizeToOpSize(GPRSize), 8, 8, UDivOp, URemOp); + auto ResultAX = _Bfi(GPRSize, 8, 8, UDivOp, URemOp); StoreGPRRegister(X86State::REG_RAX, ResultAX, OpSize::i16Bit); } else if (Size == OpSize::i16Bit) { Ref Src1 = LoadGPRRegister(X86State::REG_RAX, Size); @@ -3636,8 +3642,8 @@ void OpDispatchBuilder::DIVOp(OpcodeArgs) { Ref Src1 = LoadGPRRegister(X86State::REG_RAX, Size); Ref Src2 = LoadGPRRegister(X86State::REG_RDX, Size); - Ref UDivOp = _Bfe(OpSize::i32Bit, Size * 8, 0, _LUDiv(OpSize::i32Bit, Src1, Src2, Divisor)); - Ref URemOp = _Bfe(OpSize::i32Bit, Size * 8, 0, _LURem(OpSize::i32Bit, Src1, Src2, Divisor)); + Ref UDivOp = _Bfe(OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, _LUDiv(OpSize::i32Bit, Src1, Src2, Divisor)); + Ref URemOp = _Bfe(OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, _LURem(OpSize::i32Bit, Src1, Src2, Divisor)); StoreGPRRegister(X86State::REG_RAX, UDivOp); StoreGPRRegister(X86State::REG_RDX, URemOp); @@ -3674,7 +3680,7 @@ void OpDispatchBuilder::IDIVOp(OpcodeArgs) { auto URemOp = _Rem(OpSize::i64Bit, Src1, Divisor); // AX[15:0] = concat - auto ResultAX = _Bfi(IR::SizeToOpSize(GPRSize), 8, 8, UDivOp, URemOp); + auto ResultAX = _Bfi(GPRSize, 8, 8, UDivOp, URemOp); StoreGPRRegister(X86State::REG_RAX, ResultAX, OpSize::i16Bit); } else if (Size == OpSize::i16Bit) { Ref Src1 = LoadGPRRegister(X86State::REG_RAX, Size); @@ -3688,8 +3694,8 @@ void OpDispatchBuilder::IDIVOp(OpcodeArgs) { Ref Src1 = LoadGPRRegister(X86State::REG_RAX, Size); Ref Src2 = LoadGPRRegister(X86State::REG_RDX, Size); - Ref UDivOp = _Bfe(OpSize::i32Bit, Size * 8, 0, _LDiv(OpSize::i32Bit, Src1, Src2, Divisor)); - Ref URemOp = _Bfe(OpSize::i32Bit, Size * 8, 0, _LRem(OpSize::i32Bit, Src1, Src2, Divisor)); + Ref UDivOp = _Bfe(OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, _LDiv(OpSize::i32Bit, Src1, Src2, Divisor)); + Ref URemOp = _Bfe(OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, _LRem(OpSize::i32Bit, Src1, Src2, Divisor)); StoreGPRRegister(X86State::REG_RAX, UDivOp); StoreGPRRegister(X86State::REG_RDX, URemOp); @@ -3728,7 +3734,7 @@ void OpDispatchBuilder::BSFOp(OpcodeArgs) { // Although Intel does not guarantee that semantic, AMD does and Intel // hardware satisfies it. We provide the stronger AMD behaviour as // applications might rely on that in the wild. 
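
In the string-instruction hunks (STOS/MOVS/CMPS/LODS/SCAS), the element size still feeds plain integer pointer arithmetic, which is why IR::OpSizeToSize(Size) appears inside _Constant(PtrDir * ...) and OffsetByDir(...). A small sketch of that boundary, with StringOpStep as an illustrative helper name that is not part of the patch:

#include <cassert>
#include <cstdint>

enum class OpSize : uint8_t { i8Bit = 1, i16Bit = 2, i32Bit = 4, i64Bit = 8 };
constexpr int64_t OpSizeToSize(OpSize Size) { return static_cast<int64_t>(Size); }

// Pointer stepping for the string instructions: the step is ordinary integer
// arithmetic (direction * element width in bytes), so the enum is converted
// explicitly at this one boundary instead of implicitly everywhere.
constexpr int64_t StringOpStep(OpSize ElementSize, bool DirectionFlag) {
  const int64_t Dir = DirectionFlag ? -1 : 1; // x86 DF set walks down, clear walks up
  return Dir * OpSizeToSize(ElementSize);
}

int main() {
  assert(StringOpStep(OpSize::i32Bit, false) == 4);
  assert(StringOpStep(OpSize::i16Bit, true) == -2);
  return 0;
}
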
- auto SelectOp = NZCVSelect(IR::SizeToOpSize(GPRSize), {COND_EQ}, Dest, Result); + auto SelectOp = NZCVSelect(GPRSize, {COND_EQ}, Dest, Result); StoreResult_WithOpSize(GPRClass, Op, Op->Dest, SelectOp, DstSize, OpSize::iInvalid); } @@ -3746,7 +3752,7 @@ void OpDispatchBuilder::BSROp(OpcodeArgs) { SetZ_InvalidateNCV(OpSizeFromSrc(Op), Src); // If Src was zero then the destination doesn't get modified - auto SelectOp = NZCVSelect(IR::SizeToOpSize(GPRSize), {COND_EQ}, Dest, Result); + auto SelectOp = NZCVSelect(GPRSize, {COND_EQ}, Dest, Result); StoreResult_WithOpSize(GPRClass, Op, Op->Dest, SelectOp, DstSize, OpSize::iInvalid); } @@ -3784,7 +3790,7 @@ void OpDispatchBuilder::CMPXCHGOp(OpcodeArgs) { if (GPRSize == OpSize::i64Bit && Size == OpSize::i32Bit) { Src1 = LoadSource_WithOpSize(GPRClass, Op, Op->Dest, GPRSize, Op->Flags, {.AllowUpperGarbage = true}); - Src1Lower = _Bfe(IR::SizeToOpSize(GPRSize), Size * 8, 0, Src1); + Src1Lower = _Bfe(GPRSize, IR::OpSizeAsBits(Size), 0, Src1); } else { Src1 = LoadSource_WithOpSize(GPRClass, Op, Op->Dest, Size, Op->Flags, {.AllowUpperGarbage = true}); Src1Lower = Src1; @@ -3797,7 +3803,7 @@ void OpDispatchBuilder::CMPXCHGOp(OpcodeArgs) { if (!Trivial) { if (GPRSize == OpSize::i64Bit && Size == OpSize::i32Bit) { // This allows us to only hit the ZEXT case on failure - Ref RAXResult = NZCVSelect(IR::i64Bit, {COND_EQ}, Src3, Src1Lower); + Ref RAXResult = NZCVSelect(OpSize::i64Bit, {COND_EQ}, Src3, Src1Lower); // When the size is 4 we need to make sure not zext the GPR when the comparison fails StoreGPRRegister(X86State::REG_RAX, RAXResult); @@ -3809,7 +3815,7 @@ void OpDispatchBuilder::CMPXCHGOp(OpcodeArgs) { // Op1 = RAX == Op1 ? Op2 : Op1 // If they match then set the rm operand to the input // else don't set the rm operand - Ref DestResult = Trivial ? Src2 : NZCVSelect(IR::i64Bit, CondClassType {COND_EQ}, Src2, Src1); + Ref DestResult = Trivial ? Src2 : NZCVSelect(OpSize::i64Bit, CondClassType {COND_EQ}, Src2, Src1); // Store in to GPR Dest if (GPRSize == OpSize::i64Bit && Size == OpSize::i32Bit) { @@ -3837,7 +3843,7 @@ void OpDispatchBuilder::CMPXCHGOp(OpcodeArgs) { // if (DataSrc == Src3) { *Src1 == Src2; } Src2 = DataSrc // This will write to memory! Careful! 
// Third operand must be a calculated guest memory address - Ref CASResult = _CAS(IR::SizeToOpSize(Size), Src3Lower, Src2, Src1); + Ref CASResult = _CAS(Size, Src3Lower, Src2, Src1); Ref RAXResult = CASResult; CalculateFlags_SUB(OpSizeFromSrc(Op), Src3Lower, CASResult); @@ -3845,7 +3851,7 @@ void OpDispatchBuilder::CMPXCHGOp(OpcodeArgs) { if (GPRSize == OpSize::i64Bit && Size == OpSize::i32Bit) { // This allows us to only hit the ZEXT case on failure - RAXResult = _NZCVSelect(IR::i64Bit, {COND_EQ}, Src3, CASResult); + RAXResult = _NZCVSelect(OpSize::i64Bit, {COND_EQ}, Src3, CASResult); Size = OpSize::i64Bit; } @@ -3885,10 +3891,10 @@ void OpDispatchBuilder::CMPXCHGPairOp(OpcodeArgs) { Ref Result_Lower = _AllocateGPR(true); Ref Result_Upper = _AllocateGPRAfter(Result_Lower); - _CASPair(IR::SizeToOpSize(Size), Expected_Lower, Expected_Upper, Desired_Lower, Desired_Upper, Src1, Result_Lower, Result_Upper); + _CASPair(Size, Expected_Lower, Expected_Upper, Desired_Lower, Desired_Upper, Src1, Result_Lower, Result_Upper); HandleNZCV_RMW(); - _CmpPairZ(IR::SizeToOpSize(Size), Result_Lower, Result_Upper, Expected_Lower, Expected_Upper); + _CmpPairZ(Size, Result_Lower, Result_Upper, Expected_Lower, Expected_Upper); CalculateDeferredFlags(); auto UpdateIfNotZF = [this](auto Reg, auto Value) { @@ -4020,7 +4026,7 @@ Ref OpDispatchBuilder::GetSegment(uint32_t Flags, uint32_t DefaultPrefix, bool O Ref OpDispatchBuilder::AppendSegmentOffset(Ref Value, uint32_t Flags, uint32_t DefaultPrefix, bool Override) { auto Segment = GetSegment(Flags, DefaultPrefix, Override); if (Segment) { - Value = _Add(IR::SizeToOpSize(std::max(OpSize::i32Bit, std::max(GetOpSize(Value), GetOpSize(Segment)))), Value, Segment); + Value = _Add(std::max(OpSize::i32Bit, std::max(GetOpSize(Value), GetOpSize(Segment))), Value, Segment); } return Value; @@ -4144,7 +4150,7 @@ Ref OpDispatchBuilder::LoadEffectiveAddress(AddressMode A, bool AddSegmentBase, if (A.Offset) { Ref Offset = _Constant(A.Offset); - Tmp = Tmp ? _Add(IR::SizeToOpSize(GPRSize), Tmp, Offset) : Offset; + Tmp = Tmp ? _Add(GPRSize, Tmp, Offset) : Offset; } if (A.Index) { @@ -4167,7 +4173,7 @@ Ref OpDispatchBuilder::LoadEffectiveAddress(AddressMode A, bool AddSegmentBase, // // If the AddrSize is not the GPRSize then we need to clear the upper bits. if ((A.AddrSize < GPRSize) && !AllowUpperGarbage && Tmp) { - Tmp = _Bfe(GPRSize, A.AddrSize * 8, 0, Tmp); + Tmp = _Bfe(GPRSize, IR::OpSizeAsBits(A.AddrSize), 0, Tmp); } if (A.Segment && AddSegmentBase) { @@ -4177,7 +4183,7 @@ Ref OpDispatchBuilder::LoadEffectiveAddress(AddressMode A, bool AddSegmentBase, return Tmp ?: _Constant(0); } -AddressMode OpDispatchBuilder::SelectAddressMode(AddressMode A, bool AtomicTSO, bool Vector, unsigned AccessSize) { +AddressMode OpDispatchBuilder::SelectAddressMode(AddressMode A, bool AtomicTSO, bool Vector, IR::OpSize AccessSize) { const auto GPRSize = CTX->GetGPROpSize(); // In the future this also needs to account for LRCPC3. @@ -4207,9 +4213,10 @@ AddressMode OpDispatchBuilder::SelectAddressMode(AddressMode A, bool AtomicTSO, } // Try a (possibly scaled) register index. 
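
SelectAddressMode now takes the access width as IR::OpSize, so the scaled-index check compares A.IndexScale against IR::OpSizeToSize(AccessSize) explicitly. A reduced model of that check, with CanUseScaledIndex as an illustrative name:

#include <cassert>
#include <cstdint>

enum class OpSize : uint8_t { i8Bit = 1, i16Bit = 2, i32Bit = 4, i64Bit = 8, i128Bit = 16 };
constexpr uint32_t OpSizeToSize(OpSize Size) { return static_cast<uint32_t>(Size); }

// A register index can be folded into the address mode when its scale is
// either 1 or exactly the width of the access, mirroring the condition above.
constexpr bool CanUseScaledIndex(uint32_t IndexScale, OpSize AccessSize) {
  return IndexScale == 1 || IndexScale == OpSizeToSize(AccessSize);
}

int main() {
  assert(CanUseScaledIndex(1, OpSize::i64Bit));
  assert(CanUseScaledIndex(8, OpSize::i64Bit));
  assert(!CanUseScaledIndex(4, OpSize::i64Bit));
  return 0;
}
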
- if (A.AddrSize == OpSize::i64Bit && A.Base && (A.Index || A.Segment) && !A.Offset && (A.IndexScale == 1 || A.IndexScale == AccessSize)) { + if (A.AddrSize == OpSize::i64Bit && A.Base && (A.Index || A.Segment) && !A.Offset && + (A.IndexScale == 1 || A.IndexScale == IR::OpSizeToSize(AccessSize))) { if (A.Index && A.Segment) { - A.Base = _Add(IR::SizeToOpSize(GPRSize), A.Base, A.Segment); + A.Base = _Add(GPRSize, A.Base, A.Segment); } else if (A.Segment) { A.Index = A.Segment; A.IndexScale = 1; @@ -4231,7 +4238,7 @@ AddressMode OpDispatchBuilder::DecodeAddress(const X86Tables::DecodedOp& Op, con AddressMode A {}; A.Segment = GetSegment(Op->Flags); - A.AddrSize = (Op->Flags & X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) != 0 ? (IR::DivideOpSize(GPRSize, 2)) : GPRSize; + A.AddrSize = (Op->Flags & X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) != 0 ? (GPRSize >> 1) : GPRSize; A.NonTSO = AccessType == MemoryAccessType::NONTSO || AccessType == MemoryAccessType::STREAM; if (Operand.IsLiteral()) { @@ -4312,7 +4319,7 @@ Ref OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, const X86T // Now extract the subregister if it was a partial load /smaller/ than SSE size // TODO: Instead of doing the VMov implicitly on load, hunt down all use cases that require partial loads and do it after load. // We don't have information here to know if the operation needs zero upper bits or can contain data. - if (!AllowUpperGarbage && OpSize < Core::CPUState::XMM_SSE_REG_SIZE) { + if (!AllowUpperGarbage && OpSize < OpSize::i128Bit) { A.Base = _VMov(OpSize, A.Base); } } else { @@ -4345,7 +4352,7 @@ Ref OpDispatchBuilder::LoadGPRRegister(uint32_t GPR, IR::OpSize Size, uint8_t Of if (AllowUpperGarbage) { Reg = _Lshr(OpSize, Reg, _Constant(Offset)); } else { - Reg = _Bfe(OpSize, Size * 8, Offset, Reg); + Reg = _Bfe(OpSize, IR::OpSizeAsBits(Size), Offset, Reg); } } return Reg; @@ -4360,7 +4367,7 @@ void OpDispatchBuilder::StoreGPRRegister(uint32_t GPR, const Ref Src, IR::OpSize Ref Reg = Src; if (Size != GPRSize || Offset != 0) { // Need to do an insert if not automatic size or zero offset. - Reg = _Bfi(GPRSize, Size * 8, Offset, LoadGPRRegister(GPR), Src); + Reg = _Bfi(GPRSize, IR::OpSizeAsBits(Size), Offset, LoadGPRRegister(GPR), Src); } StoreRegister(GPR, false, Reg); @@ -4408,7 +4415,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl LOGMAN_THROW_A_FMT(Class != IR::GPRClass, "Partial writes from GPR not allowed. Instruction: {}", Op->TableInfo->Name); // XMM-size is handled in implementations. 
- if (VectorSize != Core::CPUState::XMM_AVX_REG_SIZE || OpSize != Core::CPUState::XMM_SSE_REG_SIZE) { + if (VectorSize != OpSize::i256Bit || OpSize != OpSize::i128Bit) { auto SrcVector = LoadXMMRegister(gprIndex); Result = _VInsElement(VectorSize, OpSize, 0, 0, SrcVector, Src); } @@ -4443,7 +4450,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl AddressMode A = DecodeAddress(Op, Operand, AccessType, false /* IsLoad */); - if (OpSize == 10) { + if (OpSize == OpSize::f80Bit) { Ref MemStoreDst = LoadEffectiveAddress(A, true); // For X87 extended doubles, split before storing @@ -4547,7 +4554,7 @@ void OpDispatchBuilder::ALUOp(OpcodeArgs, FEXCore::IR::IROps ALUIROp, FEXCore::I (ALUIROp == IR::IROps::OP_XOR || ALUIROp == IR::IROps::OP_OR || ALUIROp == IR::IROps::OP_ANDWITHFLAGS)) { RoundedSize = ResultSize = CTX->GetGPROpSize(); - LOGMAN_THROW_A_FMT(Const < (1ull << (Size * 8)), "does not clobber"); + LOGMAN_THROW_A_FMT(Const < (1ull << IR::OpSizeAsBits(Size)), "does not clobber"); // For AND, we can play the same trick but we instead need the upper bits of // the constant to be all-1s instead of all-0s to preserve. We also can't @@ -4559,7 +4566,7 @@ void OpDispatchBuilder::ALUOp(OpcodeArgs, FEXCore::IR::IROps ALUIROp, FEXCore::I // adjusted constant here will inline into the arm64 and instruction, so if // flags are not needed, we save an instruction overall. if (ALUIROp == IR::IROps::OP_ANDWITHFLAGS) { - Src = _Constant(Const | ~((1ull << (Size * 8)) - 1)); + Src = _Constant(Const | ~((1ull << IR::OpSizeAsBits(Size)) - 1)); ALUIROp = IR::IROps::OP_AND; } } @@ -4570,13 +4577,13 @@ void OpDispatchBuilder::ALUOp(OpcodeArgs, FEXCore::IR::IROps ALUIROp, FEXCore::I if (DestIsLockedMem(Op)) { HandledLock = true; Ref DestMem = MakeSegmentAddress(Op, Op->Dest); - DeriveOp(FetchOp, AtomicFetchOp, _AtomicFetchAdd(IR::SizeToOpSize(Size), Src, DestMem)); + DeriveOp(FetchOp, AtomicFetchOp, _AtomicFetchAdd(Size, Src, DestMem)); Dest = FetchOp; } else { Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true}); } - const auto OpSize = IR::SizeToOpSize(RoundedSize); + const auto OpSize = RoundedSize; DeriveOp(ALUOp, ALUIROp, _AndWithFlags(OpSize, Dest, Src)); Result = ALUOp; @@ -4756,7 +4763,7 @@ void OpDispatchBuilder::MOVBEOp(OpcodeArgs) { // Rev of 16-bit value as 32-bit replaces the result in the upper 16-bits of the result. // bfxil the 16-bit result in to the GPR. 
Ref Dest = LoadSource_WithOpSize(GPRClass, Op, Op->Dest, GPRSize, Op->Flags); - auto Result = _Bfxil(IR::SizeToOpSize(GPRSize), 16, 16, Dest, Src); + auto Result = _Bfxil(GPRSize, 16, 16, Dest, Src); StoreResult_WithOpSize(GPRClass, Op, Op->Dest, Result, GPRSize, OpSize::iInvalid); } else { // 32-bit does regular zext diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index 716dfc7a19..118cd10bf1 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -938,12 +938,12 @@ class OpDispatchBuilder final : public IREmitter { void AVX128_VectorALU(OpcodeArgs, IROps IROp, IR::OpSize ElementSize); void AVX128_VectorUnary(OpcodeArgs, IROps IROp, IR::OpSize ElementSize); void AVX128_VectorUnaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize, std::function Helper); - void AVX128_VectorBinaryImpl(OpcodeArgs, size_t SrcSize, IR::OpSize ElementSize, + void AVX128_VectorBinaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize, std::function Helper); void AVX128_VectorShiftWideImpl(OpcodeArgs, IR::OpSize ElementSize, IROps IROp); void AVX128_VectorShiftImmImpl(OpcodeArgs, IR::OpSize ElementSize, IROps IROp); - void AVX128_VectorTrinaryImpl(OpcodeArgs, size_t SrcSize, size_t ElementSize, Ref Src3, - std::function Helper); + void AVX128_VectorTrinaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize, Ref Src3, + std::function Helper); enum class ShiftDirection { RIGHT, LEFT }; void AVX128_ShiftDoubleImm(OpcodeArgs, ShiftDirection Dir); @@ -993,7 +993,7 @@ class OpDispatchBuilder final : public IREmitter { template void AVX128_PExtr(OpcodeArgs); void AVX128_ExtendVectorElements(OpcodeArgs, IR::OpSize ElementSize, IR::OpSize DstElementSize, bool Signed); - template + template void AVX128_MOVMSK(OpcodeArgs); void AVX128_MOVMSKB(OpcodeArgs); void AVX128_PINSRImpl(OpcodeArgs, IR::OpSize ElementSize, const X86Tables::DecodedOperand& Src1Op, @@ -1065,7 +1065,7 @@ class OpDispatchBuilder final : public IREmitter { template void AVX128_VSHUF(OpcodeArgs); - template + template void AVX128_VPERMILImm(OpcodeArgs); template @@ -1137,7 +1137,7 @@ class OpDispatchBuilder final : public IREmitter { void StoreResult_WithAVXInsert(VectorOpType Type, FEXCore::IR::RegisterClassType Class, FEXCore::X86Tables::DecodedOp Op, Ref Value, IR::OpSize Align, MemoryAccessType AccessType = MemoryAccessType::DEFAULT) { if (Op->Dest.IsGPR() && Op->Dest.Data.GPR.GPR >= X86State::REG_XMM_0 && Op->Dest.Data.GPR.GPR <= X86State::REG_XMM_15 && - GetGuestVectorLength() == Core::CPUState::XMM_AVX_REG_SIZE && Type == VectorOpType::SSE) { + GetGuestVectorLength() == OpSize::i256Bit && Type == VectorOpType::SSE) { const auto gpr = Op->Dest.Data.GPR.GPR; const auto gprIndex = gpr - X86State::REG_XMM_0; auto DestVector = LoadXMMRegister(gprIndex); @@ -1150,7 +1150,7 @@ class OpDispatchBuilder final : public IREmitter { } void StoreXMMRegister_WithAVXInsert(VectorOpType Type, uint32_t XMM, Ref Value) { - if (GetGuestVectorLength() == Core::CPUState::XMM_AVX_REG_SIZE && Type == VectorOpType::SSE) { + if (GetGuestVectorLength() == OpSize::i256Bit && Type == VectorOpType::SSE) { ///< SSE vector stores need to insert in the low 128-bit lane of the 256-bit register. 
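
The XMM_SSE_REG_SIZE/XMM_AVX_REG_SIZE comparisons replaced throughout these hunks encode the SSE and AVX register widths of 16 and 32 bytes, which is what makes the direct OpSize::i128Bit/OpSize::i256Bit comparisons equivalent. A sketch of that correspondence, assuming the enumerator values match the byte widths:

#include <cassert>
#include <cstdint>

enum class OpSize : uint8_t { i128Bit = 16, i256Bit = 32 };

// Byte widths of the vector registers the retired constants described.
constexpr uint32_t XMM_SSE_REG_SIZE = 16; // 128-bit SSE register
constexpr uint32_t XMM_AVX_REG_SIZE = 32; // 256-bit AVX register

int main() {
  // The rewritten comparisons are equivalent as long as the enumerators carry
  // the byte width, which the conversions used elsewhere in the patch suggest.
  assert(static_cast<uint32_t>(OpSize::i128Bit) == XMM_SSE_REG_SIZE);
  assert(static_cast<uint32_t>(OpSize::i256Bit) == XMM_AVX_REG_SIZE);
  return 0;
}
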
auto DestVector = LoadXMMRegister(XMM); Value = _VInsElement(GetGuestVectorLength(), OpSize::i128Bit, 0, 0, DestVector, Value); @@ -1233,12 +1233,14 @@ class OpDispatchBuilder final : public IREmitter { // Use stp where possible to store multiple values at a time. This accelerates AVX. // TODO: this is all really confusing because of backwards iteration, // can we peel back that hack? - if ((Bits & NextBit) && !Partial && Size >= 4 && CacheIndexToContextOffset(Index - 1) == Offset - Size && (Offset - Size) / Size < 64) { + const auto SizeInt = IR::OpSizeToSize(Size); + if ((Bits & NextBit) && !Partial && Size >= OpSize::i32Bit && CacheIndexToContextOffset(Index - 1) == Offset - SizeInt && + (Offset - SizeInt) / SizeInt < 64) { LOGMAN_THROW_A_FMT(CacheIndexClass(Index - 1) == Class, "construction"); - LOGMAN_THROW_A_FMT((Offset % Size) == 0, "construction"); + LOGMAN_THROW_A_FMT((Offset % SizeInt) == 0, "construction"); Ref ValueNext = RegCache.Value[Index - 1]; - _StoreContextPair(Size, Class, ValueNext, Value, Offset - Size); + _StoreContextPair(Size, Class, ValueNext, Value, Offset - SizeInt); Bits &= ~NextBit; } else { _StoreContext(Size, Class, Value, Offset); @@ -1380,7 +1382,7 @@ class OpDispatchBuilder final : public IREmitter { Ref InsertPSOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2, const X86Tables::DecodedOperand& Imm); - Ref MPSADBWOpImpl(size_t SrcSize, Ref Src1, Ref Src2, uint8_t Select); + Ref MPSADBWOpImpl(IR::OpSize SrcSize, Ref Src1, Ref Src2, uint8_t Select); Ref PALIGNROpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2, const X86Tables::DecodedOperand& Imm, bool IsAVX); @@ -1503,7 +1505,7 @@ class OpDispatchBuilder final : public IREmitter { Ref GetRelocatedPC(const FEXCore::X86Tables::DecodedOp& Op, int64_t Offset = 0); Ref LoadEffectiveAddress(AddressMode A, bool AddSegmentBase, bool AllowUpperGarbage = false); - AddressMode SelectAddressMode(AddressMode A, bool AtomicTSO, bool Vector, unsigned AccessSize); + AddressMode SelectAddressMode(AddressMode A, bool AtomicTSO, bool Vector, IR::OpSize AccessSize); bool IsOperandMem(const X86Tables::DecodedOperand& Operand, bool Load) { // Literals are immediates as sources but memory addresses as destinations. @@ -1627,24 +1629,24 @@ class OpDispatchBuilder final : public IREmitter { NZCVDirty = true; } - void SetNZ_ZeroCV(unsigned SrcSize, Ref Res, bool SetPF = false) { + void SetNZ_ZeroCV(IR::OpSize SrcSize, Ref Res, bool SetPF = false) { HandleNZ00Write(); // x - 0 = x. NZ set according to Res. C always set. V always unset. This // matches what we want since we want carry inverted. // // This is currently worse for 8/16-bit, but that should be optimized. 
TODO - if (SrcSize >= 4) { + if (SrcSize >= OpSize::i32Bit) { if (SetPF) { - CalculatePF(_SubWithFlags(IR::SizeToOpSize(SrcSize), Res, _Constant(0))); + CalculatePF(_SubWithFlags(SrcSize, Res, _Constant(0))); } else { - _SubNZCV(IR::SizeToOpSize(SrcSize), Res, _Constant(0)); + _SubNZCV(SrcSize, Res, _Constant(0)); } PossiblySetNZCVBits |= 1u << IndexNZCV(FEXCore::X86State::RFLAG_CF_RAW_LOC); CFInverted = true; } else { - _TestNZ(IR::SizeToOpSize(SrcSize), Res, Res); + _TestNZ(SrcSize, Res, Res); CFInverted = false; if (SetPF) { @@ -1653,7 +1655,7 @@ class OpDispatchBuilder final : public IREmitter { } } - void SetNZP_ZeroCV(unsigned SrcSize, Ref Res) { + void SetNZP_ZeroCV(IR::OpSize SrcSize, Ref Res) { SetNZ_ZeroCV(SrcSize, Res, true); } @@ -1705,8 +1707,8 @@ class OpDispatchBuilder final : public IREmitter { HandleNZCVWrite(); CFInverted = true; - if (Size < 4) { - _TestNZ(OpSize::i32Bit, Src, _InlineConstant((1u << (8 * Size)) - 1)); + if (Size < OpSize::i32Bit) { + _TestNZ(OpSize::i32Bit, Src, _InlineConstant((1u << (IR::OpSizeAsBits(Size))) - 1)); } else { _TestNZ(Size, Src, Src); } @@ -1882,7 +1884,7 @@ class OpDispatchBuilder final : public IREmitter { LOGMAN_THROW_AA_FMT(Index < 64, "valid index"); uint64_t Bit = (1ull << (uint64_t)Index); - if (Size == 16 && (RegCache.Partial & Bit)) { + if (Size == OpSize::i128Bit && (RegCache.Partial & Bit)) { // We need to load the full register extend if we previously did a partial access. Ref Value = RegCache.Value[Index]; Ref Full = _LoadContext(Size, RegClass, Offset); @@ -1902,7 +1904,7 @@ class OpDispatchBuilder final : public IREmitter { RegCache.Value[Index] = _LoadContext(Size, RegClass, Offset); // We may have done a partial load, this requires special handling. - if (Size == 8) { + if (Size == OpSize::i64Bit) { RegCache.Partial |= Bit; } } else if (Index == PFIndex) { @@ -1938,12 +1940,13 @@ class OpDispatchBuilder final : public IREmitter { // Try to load a pair into the cache uint64_t Bits = (3ull << (uint64_t)Index); - if (((RegCache.Partial | RegCache.Cached) & Bits) == 0 && ((Offset / Size) < 64)) { + const auto SizeInt = IR::OpSizeToSize(Size); + if (((RegCache.Partial | RegCache.Cached) & Bits) == 0 && ((Offset / SizeInt) < 64)) { auto Values = LoadContextPair_Uncached(RegClass, Size, Offset); RegCache.Value[Index] = Values.Low; RegCache.Value[Index + 1] = Values.High; RegCache.Cached |= Bits; - if (Size == 8) { + if (Size == OpSize::i64Bit) { RegCache.Partial |= Bits; } return Values; @@ -1952,7 +1955,7 @@ class OpDispatchBuilder final : public IREmitter { // Fallback on a pair of loads return { .Low = LoadRegCache(Offset, Index, RegClass, Size), - .High = LoadRegCache(Offset + Size, Index + 1, RegClass, Size), + .High = LoadRegCache(Offset + SizeInt, Index + 1, RegClass, Size), }; } @@ -2427,10 +2430,11 @@ class OpDispatchBuilder final : public IREmitter { } AddressMode SelectPairAddressMode(AddressMode A, IR::OpSize Size) { + const auto SizeInt = IR::OpSizeToSize(Size); AddressMode Out {}; - signed OffsetEl = A.Offset / Size; - if ((A.Offset % Size) == 0 && OffsetEl >= -64 && OffsetEl < 64) { + signed OffsetEl = A.Offset / SizeInt; + if ((A.Offset % SizeInt) == 0 && OffsetEl >= -64 && OffsetEl < 64) { Out.Offset = A.Offset; A.Offset = 0; } @@ -2477,6 +2481,7 @@ class OpDispatchBuilder final : public IREmitter { void _StoreMemPairAutoTSO(FEXCore::IR::RegisterClassType Class, IR::OpSize Size, AddressMode A, Ref Value1, Ref Value2, IR::OpSize Align = IR::OpSize::i8Bit) { + const auto SizeInt = IR::OpSizeToSize(Size); bool 
AtomicTSO = IsTSOEnabled(Class) && !A.NonTSO; // Use stp if possible, otherwise fallback on two stores. @@ -2485,7 +2490,7 @@ class OpDispatchBuilder final : public IREmitter { _StoreMemPair(Class, Size, Value1, Value2, A.Base, A.Offset); } else { _StoreMemAutoTSO(Class, Size, A, Value1, OpSize::i8Bit); - A.Offset += Size; + A.Offset += SizeInt; _StoreMemAutoTSO(Class, Size, A, Value2, OpSize::i8Bit); } } diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp index a39447a453..14cba5db33 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp @@ -74,8 +74,8 @@ void OpDispatchBuilder::InstallAVX128Handlers() { {OPD(1, 0b00, 0x2F), 1, &OpDispatchBuilder::AVX128_UCOMISx}, {OPD(1, 0b01, 0x2F), 1, &OpDispatchBuilder::AVX128_UCOMISx}, - {OPD(1, 0b00, 0x50), 1, &OpDispatchBuilder::AVX128_MOVMSK<4>}, - {OPD(1, 0b01, 0x50), 1, &OpDispatchBuilder::AVX128_MOVMSK<8>}, + {OPD(1, 0b00, 0x50), 1, &OpDispatchBuilder::AVX128_MOVMSK}, + {OPD(1, 0b01, 0x50), 1, &OpDispatchBuilder::AVX128_MOVMSK}, {OPD(1, 0b00, 0x51), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorUnary, IR::OP_VFSQRT, OpSize::i32Bit>}, {OPD(1, 0b01, 0x51), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorUnary, IR::OP_VFSQRT, OpSize::i64Bit>}, @@ -158,7 +158,7 @@ void OpDispatchBuilder::InstallAVX128Handlers() { {OPD(1, 0b01, 0x6F), 1, &OpDispatchBuilder::AVX128_VMOVAPS}, {OPD(1, 0b10, 0x6F), 1, &OpDispatchBuilder::AVX128_VMOVAPS}, - {OPD(1, 0b01, 0x70), 1, &OpDispatchBuilder::AVX128_VPERMILImm<4>}, + {OPD(1, 0b01, 0x70), 1, &OpDispatchBuilder::AVX128_VPERMILImm}, {OPD(1, 0b10, 0x70), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPSHUFW, false>}, {OPD(1, 0b11, 0x70), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPSHUFW, true>}, @@ -379,8 +379,8 @@ void OpDispatchBuilder::InstallAVX128Handlers() { {OPD(3, 0b01, 0x00), 1, &OpDispatchBuilder::AVX128_VPERMQ}, {OPD(3, 0b01, 0x01), 1, &OpDispatchBuilder::AVX128_VPERMQ}, {OPD(3, 0b01, 0x02), 1, &OpDispatchBuilder::AVX128_VBLEND}, - {OPD(3, 0b01, 0x04), 1, &OpDispatchBuilder::AVX128_VPERMILImm<4>}, - {OPD(3, 0b01, 0x05), 1, &OpDispatchBuilder::AVX128_VPERMILImm<8>}, + {OPD(3, 0b01, 0x04), 1, &OpDispatchBuilder::AVX128_VPERMILImm}, + {OPD(3, 0b01, 0x05), 1, &OpDispatchBuilder::AVX128_VPERMILImm}, {OPD(3, 0b01, 0x06), 1, &OpDispatchBuilder::AVX128_VPERM2}, {OPD(3, 0b01, 0x08), 1, &OpDispatchBuilder::AVX128_VectorRound}, {OPD(3, 0b01, 0x09), 1, &OpDispatchBuilder::AVX128_VectorRound}, @@ -665,7 +665,7 @@ void OpDispatchBuilder::AVX128_VectorUnary(OpcodeArgs, IROps IROp, IR::OpSize El void OpDispatchBuilder::AVX128_VectorUnaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize, std::function Helper) { - const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = SrcSize == OpSize::i128Bit; auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); RefPair Result {}; @@ -680,9 +680,9 @@ void OpDispatchBuilder::AVX128_VectorUnaryImpl(OpcodeArgs, IR::OpSize SrcSize, I AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } -void OpDispatchBuilder::AVX128_VectorBinaryImpl(OpcodeArgs, size_t SrcSize, IR::OpSize ElementSize, +void OpDispatchBuilder::AVX128_VectorBinaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize, std::function Helper) { - const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = SrcSize 
== OpSize::i128Bit; auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit); @@ -698,9 +698,9 @@ void OpDispatchBuilder::AVX128_VectorBinaryImpl(OpcodeArgs, size_t SrcSize, IR:: AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } -void OpDispatchBuilder::AVX128_VectorTrinaryImpl(OpcodeArgs, size_t SrcSize, size_t ElementSize, Ref Src3, - std::function Helper) { - const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; +void OpDispatchBuilder::AVX128_VectorTrinaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize, Ref Src3, + std::function Helper) { + const auto Is128Bit = SrcSize == OpSize::i128Bit; auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit); @@ -984,13 +984,13 @@ void OpDispatchBuilder::AVX128_VBROADCAST(OpcodeArgs) { template void OpDispatchBuilder::AVX128_VPUNPCKL(OpcodeArgs) { - AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, + AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return _VZip(OpSize::i128Bit, _ElementSize, Src1, Src2); }); } template void OpDispatchBuilder::AVX128_VPUNPCKH(OpcodeArgs) { - AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, + AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return _VZip2(OpSize::i128Bit, _ElementSize, Src1, Src2); }); } @@ -1039,7 +1039,7 @@ void OpDispatchBuilder::AVX128_InsertCVTGPR_To_FPR(OpcodeArgs) { Result.Low = _VSToFVectorInsert(DstSize, DstElementSize, DstElementSize, Src1.Low, Src2.Low, false, false); } - [[maybe_unused]] const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE; + [[maybe_unused]] const auto Is128Bit = DstSize == OpSize::i128Bit; LOGMAN_THROW_A_FMT(Is128Bit, "Programming Error: This should never occur!"); Result.High = LoadZeroVector(OpSize::i128Bit); @@ -1073,33 +1073,33 @@ void OpDispatchBuilder::AVX128_CVTFPR_To_GPR(OpcodeArgs) { } void OpDispatchBuilder::AVX128_VANDN(OpcodeArgs) { - AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), OpSize::i128Bit, + AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), OpSize::i128Bit, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return _VAndn(OpSize::i128Bit, _ElementSize, Src2, Src1); }); } template void OpDispatchBuilder::AVX128_VPACKSS(OpcodeArgs) { - AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { + AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return _VSQXTNPair(OpSize::i128Bit, _ElementSize, Src1, Src2); }); } template void OpDispatchBuilder::AVX128_VPACKUS(OpcodeArgs) { - AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { + AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return _VSQXTUNPair(OpSize::i128Bit, _ElementSize, Src1, Src2); }); } Ref OpDispatchBuilder::AVX128_PSIGNImpl(IR::OpSize ElementSize, Ref Src1, Ref Src2) { - Ref Control = _VSQSHL(OpSize::i128Bit, ElementSize, Src2, (ElementSize * 8) - 1); - Control = _VSRSHR(OpSize::i128Bit, ElementSize, Control, (ElementSize * 8) - 1); + Ref Control = _VSQSHL(OpSize::i128Bit, ElementSize, Src2, IR::OpSizeAsBits(ElementSize) - 1); + Control = _VSRSHR(OpSize::i128Bit, ElementSize, 
Control, IR::OpSizeAsBits(ElementSize) - 1); return _VMul(OpSize::i128Bit, ElementSize, Src1, Control); } template void OpDispatchBuilder::AVX128_VPSIGN(OpcodeArgs) { - AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, + AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return AVX128_PSIGNImpl(_ElementSize, Src1, Src2); }); } @@ -1154,7 +1154,7 @@ void OpDispatchBuilder::AVX128_VFCMP(OpcodeArgs) { .CompType = CompType, }; - AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, [this, &Capture](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { + AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this, &Capture](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return VFCMPOpImpl(OpSize::i128Bit, _ElementSize, Src1, Src2, Capture.CompType); }); } @@ -1234,7 +1234,7 @@ void OpDispatchBuilder::AVX128_PExtr(OpcodeArgs) { } // AVX version only operates on 128-bit. - const uint8_t NumElements = std::min(GetSrcSize(Op), OpSize::i128Bit) / OverridenElementSize; + const uint8_t NumElements = IR::NumElements(std::min(OpSizeFromSrc(Op), OpSize::i128Bit), OverridenElementSize); Index &= NumElements - 1; if (Op->Dest.IsGPR()) { @@ -1251,14 +1251,14 @@ void OpDispatchBuilder::AVX128_PExtr(OpcodeArgs) { } void OpDispatchBuilder::AVX128_ExtendVectorElements(OpcodeArgs, IR::OpSize ElementSize, IR::OpSize DstElementSize, bool Signed) { - const auto DstSize = GetDstSize(Op); + const auto DstSize = OpSizeFromDst(Op); const auto GetSrc = [&] { if (Op->Src[0].IsGPR()) { return AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false).Low; } else { // For memory operands the 256-bit variant loads twice the size specified in the table. - const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = DstSize == OpSize::i256Bit; const auto SrcSize = OpSizeFromSrc(Op); const auto LoadSize = Is256Bit ? IR::SizeToOpSize(IR::OpSizeToSize(SrcSize) * 2) : SrcSize; @@ -1267,8 +1267,7 @@ void OpDispatchBuilder::AVX128_ExtendVectorElements(OpcodeArgs, IR::OpSize Eleme }; auto Transform = [=, this](Ref Src) { - for (auto CurrentElementSize = ElementSize; CurrentElementSize != DstElementSize; - CurrentElementSize = IR::MultiplyOpSize(CurrentElementSize, 2)) { + for (auto CurrentElementSize = ElementSize; CurrentElementSize != DstElementSize; CurrentElementSize = CurrentElementSize << 1) { if (Signed) { Src = _VSXTL(OpSize::i128Bit, CurrentElementSize, Src); } else { @@ -1286,8 +1285,8 @@ void OpDispatchBuilder::AVX128_ExtendVectorElements(OpcodeArgs, IR::OpSize Eleme Result.Low = Transform(Src); } else { // 256-bit operation is a bit special. It splits the incoming source between lower and upper registers. - size_t TotalElementCount = OpSize::i256Bit / DstElementSize; - size_t TotalElementsToSplitSize = (TotalElementCount / 2) * ElementSize; + size_t TotalElementCount = IR::NumElements(OpSize::i256Bit, DstElementSize); + size_t TotalElementsToSplitSize = (TotalElementCount / 2) * IR::OpSizeToSize(ElementSize); // Split the number of elements in half between lower and upper. 
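
Element counts that used to be written as RegisterSize / ElementSize divisions now go through IR::NumElements, as in AVX128_PExtr and AVX128_ExtendVectorElements above. A plausible, self-contained sketch of such a helper, assuming it simply divides the byte widths:

#include <cassert>
#include <cstdint>

enum class OpSize : uint8_t { i8Bit = 1, i16Bit = 2, i32Bit = 4, i64Bit = 8, i128Bit = 16, i256Bit = 32 };
constexpr uint32_t OpSizeToSize(OpSize Size) { return static_cast<uint32_t>(Size); }

// How many lanes of ElementSize fit in a register of RegisterSize; this stands
// in for dividing the raw sizes directly on the enum.
constexpr uint32_t NumElements(OpSize RegisterSize, OpSize ElementSize) {
  return OpSizeToSize(RegisterSize) / OpSizeToSize(ElementSize);
}

int main() {
  assert(NumElements(OpSize::i128Bit, OpSize::i32Bit) == 4); // 4 dwords per XMM
  assert(NumElements(OpSize::i256Bit, OpSize::i64Bit) == 4); // 4 qwords per YMM
  return 0;
}
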
Ref SrcHigh = _VDupElement(OpSize::i128Bit, IR::SizeToOpSize(TotalElementsToSplitSize), Src, 1); @@ -1303,10 +1302,10 @@ void OpDispatchBuilder::AVX128_ExtendVectorElements(OpcodeArgs, IR::OpSize Eleme AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } -template +template void OpDispatchBuilder::AVX128_MOVMSK(OpcodeArgs) { - const auto SrcSize = GetSrcSize(Op); - const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; + const auto SrcSize = OpSizeFromSrc(Op); + const auto Is128Bit = SrcSize == OpSize::i128Bit; auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); @@ -1385,7 +1384,7 @@ void OpDispatchBuilder::AVX128_MOVMSKB(OpcodeArgs) { void OpDispatchBuilder::AVX128_PINSRImpl(OpcodeArgs, IR::OpSize ElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op, const X86Tables::DecodedOperand& Imm) { - const auto NumElements = OpSize::i128Bit / ElementSize; + const auto NumElements = IR::NumElements(OpSize::i128Bit, ElementSize); const uint64_t Index = Imm.Literal() & (NumElements - 1); auto Src1 = AVX128_LoadSource_WithOpSize(Op, Src1Op, Op->Flags, false); @@ -1419,7 +1418,7 @@ void OpDispatchBuilder::AVX128_VPINSRDQ(OpcodeArgs) { } void OpDispatchBuilder::AVX128_VariableShiftImpl(OpcodeArgs, IROps IROp) { - AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSizeFromSrc(Op), [this, IROp](IR::OpSize ElementSize, Ref Src1, Ref Src2) { + AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSizeFromSrc(Op), [this, IROp](IR::OpSize ElementSize, Ref Src1, Ref Src2) { DeriveOp(Shift, IROp, _VUShr(OpSize::i128Bit, ElementSize, Src1, Src2, true)); return Shift; }); @@ -1431,7 +1430,7 @@ void OpDispatchBuilder::AVX128_ShiftDoubleImm(OpcodeArgs, ShiftDirection Dir) { const bool Right = Dir == ShiftDirection::RIGHT; const uint64_t Shift = Op->Src[1].Literal(); - const uint64_t ExtrShift = Right ? Shift : OpSize::i128Bit - Shift; + const uint64_t ExtrShift = Right ? 
Shift : IR::OpSizeToSize(OpSize::i128Bit) - Shift; auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); @@ -1486,40 +1485,40 @@ void OpDispatchBuilder::AVX128_VINSERTPS(OpcodeArgs) { template void OpDispatchBuilder::AVX128_VPHSUB(OpcodeArgs) { - AVX128_VectorBinaryImpl(Op, GetDstSize(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { + AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return PHSUBOpImpl(OpSize::i128Bit, Src1, Src2, _ElementSize); }); } void OpDispatchBuilder::AVX128_VPHSUBSW(OpcodeArgs) { - AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSize::i16Bit, + AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i16Bit, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return PHSUBSOpImpl(OpSize::i128Bit, Src1, Src2); }); } template void OpDispatchBuilder::AVX128_VADDSUBP(OpcodeArgs) { - AVX128_VectorBinaryImpl(Op, GetDstSize(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { + AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return ADDSUBPOpImpl(OpSize::i128Bit, _ElementSize, Src1, Src2); }); } template void OpDispatchBuilder::AVX128_VPMULL(OpcodeArgs) { - static_assert(ElementSize == sizeof(uint32_t), "Currently only handles 32-bit -> 64-bit"); + static_assert(ElementSize == OpSize::i32Bit, "Currently only handles 32-bit -> 64-bit"); - AVX128_VectorBinaryImpl(Op, GetDstSize(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) -> Ref { + AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) -> Ref { return PMULLOpImpl(OpSize::i128Bit, ElementSize, Signed, Src1, Src2); }); } void OpDispatchBuilder::AVX128_VPMULHRSW(OpcodeArgs) { - AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSize::i16Bit, + AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i16Bit, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) -> Ref { return PMULHRSWOpImpl(OpSize::i128Bit, Src1, Src2); }); } template void OpDispatchBuilder::AVX128_VPMULHW(OpcodeArgs) { - AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSize::i16Bit, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) -> Ref { + AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i16Bit, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) -> Ref { if (Signed) { return _VSMulH(OpSize::i128Bit, _ElementSize, Src1, Src2); } else { @@ -1546,9 +1545,9 @@ void OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Float(OpcodeArgs) { const auto SrcSize = OpSizeFromSrc(Op); const auto DstSize = OpSizeFromDst(Op); - const auto IsFloatSrc = SrcElementSize == 4; - auto Is128BitSrc = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; - auto Is128BitDst = DstSize == Core::CPUState::XMM_SSE_REG_SIZE; + const auto IsFloatSrc = SrcElementSize == OpSize::i32Bit; + auto Is128BitSrc = SrcSize == OpSize::i128Bit; + auto Is128BitDst = DstSize == OpSize::i128Bit; ///< Decompose correctly. 
if (DstElementSize > SrcElementSize && !Is128BitDst) { @@ -1630,7 +1629,7 @@ void OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Int(OpcodeArgs) { auto Convert = [this](Ref Src) -> Ref { auto ElementSize = SrcElementSize; if (Narrow) { - ElementSize = IR::DivideOpSize(ElementSize, 2); + ElementSize = ElementSize >> 1; Src = _Vector_FToF(OpSize::i128Bit, ElementSize, Src, SrcElementSize); } @@ -1663,7 +1662,7 @@ void OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Int(OpcodeArgs) { template void OpDispatchBuilder::AVX128_Vector_CVT_Int_To_Float(OpcodeArgs) { const auto Size = OpSizeFromDst(Op); - const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = Size == OpSize::i128Bit; RefPair Src = [&] { if (Widen && !Op->Src[0].IsGPR()) { @@ -1682,7 +1681,7 @@ void OpDispatchBuilder::AVX128_Vector_CVT_Int_To_Float(OpcodeArgs) { if (Widen) { DeriveOp(Extended, Op, _VSXTL(OpSize::i128Bit, ElementSize, Src)); Src = Extended; - ElementSize = IR::MultiplyOpSize(ElementSize, 2); + ElementSize = ElementSize << 1; } return _Vector_SToF(OpSize::i128Bit, ElementSize, Src); @@ -1732,23 +1731,23 @@ void OpDispatchBuilder::AVX128_VAESImc(OpcodeArgs) { } void OpDispatchBuilder::AVX128_VAESEnc(OpcodeArgs) { - AVX128_VectorTrinaryImpl(Op, GetDstSize(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit), - [this](size_t, Ref Src1, Ref Src2, Ref Src3) { return _VAESEnc(OpSize::i128Bit, Src1, Src2, Src3); }); + AVX128_VectorTrinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit), + [this](IR::OpSize, Ref Src1, Ref Src2, Ref Src3) { return _VAESEnc(OpSize::i128Bit, Src1, Src2, Src3); }); } void OpDispatchBuilder::AVX128_VAESEncLast(OpcodeArgs) { - AVX128_VectorTrinaryImpl(Op, GetDstSize(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit), - [this](size_t, Ref Src1, Ref Src2, Ref Src3) { return _VAESEncLast(OpSize::i128Bit, Src1, Src2, Src3); }); + AVX128_VectorTrinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit), + [this](IR::OpSize, Ref Src1, Ref Src2, Ref Src3) { return _VAESEncLast(OpSize::i128Bit, Src1, Src2, Src3); }); } void OpDispatchBuilder::AVX128_VAESDec(OpcodeArgs) { - AVX128_VectorTrinaryImpl(Op, GetDstSize(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit), - [this](size_t, Ref Src1, Ref Src2, Ref Src3) { return _VAESDec(OpSize::i128Bit, Src1, Src2, Src3); }); + AVX128_VectorTrinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit), + [this](IR::OpSize, Ref Src1, Ref Src2, Ref Src3) { return _VAESDec(OpSize::i128Bit, Src1, Src2, Src3); }); } void OpDispatchBuilder::AVX128_VAESDecLast(OpcodeArgs) { - AVX128_VectorTrinaryImpl(Op, GetDstSize(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit), - [this](size_t, Ref Src1, Ref Src2, Ref Src3) { return _VAESDecLast(OpSize::i128Bit, Src1, Src2, Src3); }); + AVX128_VectorTrinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit), + [this](IR::OpSize, Ref Src1, Ref Src2, Ref Src3) { return _VAESDecLast(OpSize::i128Bit, Src1, Src2, Src3); }); } void OpDispatchBuilder::AVX128_VAESKeyGenAssist(OpcodeArgs) { @@ -1838,7 +1837,7 @@ template void OpDispatchBuilder::AVX128_VDPP(OpcodeArgs) { const uint64_t Literal = Op->Src[2].Literal(); - AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, [this, Literal](IR::OpSize, Ref Src1, Ref Src2) { + AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this, Literal](IR::OpSize, Ref Src1, Ref Src2) { return DPPOpImpl(OpSize::i128Bit, Src1, Src2, Literal, ElementSize); }); } 
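Note on the recurring pattern in this patch: raw byte counts (and Core::CPUState::XMM_SSE_REG_SIZE / XMM_AVX_REG_SIZE) are replaced by the strongly typed IR::OpSize enum, and all byte/bit/element arithmetic is routed through helpers such as IR::OpSizeToSize, IR::OpSizeAsBits, IR::NumElements and the OpSize shift operators. The sketch below shows the shapes these helpers are assumed to have, purely so the size arithmetic in the hunks is easy to follow without the FEXCore headers; the enum's underlying type, the helpers' return types and the exact operator overloads are assumptions here and may differ from the real definitions.

    // Illustrative sketch only: assumed shapes of the IR::OpSize helpers used in
    // this patch. The names come from the diff; the definitions are guesses.
    #include <cstdint>

    namespace IR {
    // Operand widths as an enum whose underlying value is the width in bytes.
    enum class OpSize : uint8_t {
      iInvalid = 0,
      i8Bit = 1,
      i16Bit = 2,
      i32Bit = 4,
      i64Bit = 8,
      i128Bit = 16,
      i256Bit = 32,
    };

    // Enum -> raw byte count, for the places that still need an integer.
    constexpr uint16_t OpSizeToSize(OpSize Size) {
      return static_cast<uint16_t>(Size);
    }

    // Enum -> width in bits, replacing the old `Size * 8` arithmetic.
    constexpr uint16_t OpSizeAsBits(OpSize Size) {
      return OpSizeToSize(Size) * 8;
    }

    // Raw byte count -> enum, the inverse of OpSizeToSize.
    constexpr OpSize SizeToOpSize(uint16_t Size) {
      return static_cast<OpSize>(Size);
    }

    // How many ElementSize elements fit in a RegisterSize register,
    // replacing the old `RegisterSize / ElementSize` divisions.
    constexpr uint8_t NumElements(OpSize RegisterSize, OpSize ElementSize) {
      return OpSizeToSize(RegisterSize) / OpSizeToSize(ElementSize);
    }

    // Halving/doubling a size, used where the hunks write `Size >> 1` / `Size << 1`.
    constexpr OpSize operator>>(OpSize Size, int Shift) {
      return static_cast<OpSize>(OpSizeToSize(Size) >> Shift);
    }
    constexpr OpSize operator<<(OpSize Size, int Shift) {
      return static_cast<OpSize>(OpSizeToSize(Size) << Shift);
    }
    } // namespace IR

    // Spot checks matching uses in the surrounding hunks.
    static_assert(IR::NumElements(IR::OpSize::i128Bit, IR::OpSize::i32Bit) == 4);
    static_assert(IR::OpSizeAsBits(IR::OpSize::i16Bit) == 16);
    static_assert((IR::OpSize::i64Bit << 1) == IR::OpSize::i128Bit);

The point of the conversion is that a dedicated type keeps byte counts, bit counts and element counts from being mixed up silently, and comparisons such as `DstSize == OpSize::i256Bit` read more directly than comparing against Core::CPUState::XMM_AVX_REG_SIZE.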
@@ -1927,7 +1926,7 @@ void OpDispatchBuilder::AVX128_VSHUF(OpcodeArgs) { AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } -template +template void OpDispatchBuilder::AVX128_VPERMILImm(OpcodeArgs) { const auto SrcSize = GetSrcSize(Op); const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; @@ -1967,31 +1966,31 @@ void OpDispatchBuilder::AVX128_VPERMILImm(OpcodeArgs) { template void OpDispatchBuilder::AVX128_VHADDP(OpcodeArgs) { - AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, [this](IR::OpSize, Ref Src1, Ref Src2) { + AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this](IR::OpSize, Ref Src1, Ref Src2) { DeriveOp(Res, IROp, _VFAddP(OpSize::i128Bit, ElementSize, Src1, Src2)); return Res; }); } void OpDispatchBuilder::AVX128_VPHADDSW(OpcodeArgs) { - AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSize::i16Bit, + AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i16Bit, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return PHADDSOpImpl(OpSize::i128Bit, Src1, Src2); }); } void OpDispatchBuilder::AVX128_VPMADDUBSW(OpcodeArgs) { - AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), OpSize::i128Bit, + AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return PMADDUBSWOpImpl(OpSize::i128Bit, Src1, Src2); }); } void OpDispatchBuilder::AVX128_VPMADDWD(OpcodeArgs) { - AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), OpSize::i128Bit, + AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return PMADDWDOpImpl(OpSize::i128Bit, Src1, Src2); }); } template void OpDispatchBuilder::AVX128_VBLEND(OpcodeArgs) { const auto SrcSize = OpSizeFromSrc(Op); - const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = SrcSize == OpSize::i128Bit; const uint64_t Selector = Op->Src[2].Literal(); ///< High Selector shift depends on element size: @@ -2017,19 +2016,19 @@ void OpDispatchBuilder::AVX128_VBLEND(OpcodeArgs) { template void OpDispatchBuilder::AVX128_VHSUBP(OpcodeArgs) { - AVX128_VectorBinaryImpl(Op, GetDstSize(Op), ElementSize, + AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), ElementSize, [this](IR::OpSize, Ref Src1, Ref Src2) { return HSUBPOpImpl(OpSize::i128Bit, ElementSize, Src1, Src2); }); } void OpDispatchBuilder::AVX128_VPSHUFB(OpcodeArgs) { auto MaskVector = GeneratePSHUFBMask(OpSize::i128Bit); - AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSize::i8Bit, [this, MaskVector](IR::OpSize, Ref Src1, Ref Src2) { + AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i8Bit, [this, MaskVector](IR::OpSize, Ref Src1, Ref Src2) { return PSHUFBOpImpl(OpSize::i128Bit, Src1, Src2, MaskVector); }); } void OpDispatchBuilder::AVX128_VPSADBW(OpcodeArgs) { - AVX128_VectorBinaryImpl(Op, GetDstSize(Op), OpSize::i8Bit, + AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i8Bit, [this](IR::OpSize, Ref Src1, Ref Src2) { return PSADBWOpImpl(OpSize::i128Bit, Src1, Src2); }); } @@ -2061,7 +2060,7 @@ void OpDispatchBuilder::AVX128_VPALIGNR(OpcodeArgs) { const auto SanitizedDstSize = std::min(Size, OpSize::i128Bit); AVX128_VectorBinaryImpl(Op, Size, SanitizedDstSize, [this, Index](IR::OpSize SanitizedDstSize, Ref Src1, Ref Src2) -> Ref { - if (Index >= (SanitizedDstSize * 2)) { + if (Index >= (IR::OpSizeToSize(SanitizedDstSize) * 2)) { // If the immediate is greater than both vectors combined then it zeroes the vector return LoadZeroVector(OpSize::i128Bit); } @@ -2076,7 +2075,7 @@ void 
OpDispatchBuilder::AVX128_VPALIGNR(OpcodeArgs) { void OpDispatchBuilder::AVX128_VMASKMOVImpl(OpcodeArgs, IR::OpSize ElementSize, IR::OpSize DstSize, bool IsStore, const X86Tables::DecodedOperand& MaskOp, const X86Tables::DecodedOperand& DataOp) { - const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = DstSize == OpSize::i128Bit; auto Mask = AVX128_LoadSource_WithOpSize(Op, MaskOp, Op->Flags, !Is128Bit); @@ -2098,14 +2097,14 @@ void OpDispatchBuilder::AVX128_VMASKMOVImpl(OpcodeArgs, IR::OpSize ElementSize, auto Address = MakeAddress(DataOp); RefPair Result {}; - Result.Low = _VLoadVectorMasked(OpSize::i128Bit, ElementSize, Mask.Low, Address, Invalid(), MEM_OFFSET_SXTX, OpSize::i8Bit); + Result.Low = _VLoadVectorMasked(OpSize::i128Bit, ElementSize, Mask.Low, Address, Invalid(), MEM_OFFSET_SXTX, 1); if (Is128Bit) { Result.High = LoadZeroVector(OpSize::i128Bit); } else { ///< TODO: This can be cleaner if AVX128_LoadSource_WithOpSize could return both constructed addresses. auto AddressHigh = _Add(OpSize::i64Bit, Address, _Constant(16)); - Result.High = _VLoadVectorMasked(OpSize::i128Bit, ElementSize, Mask.High, AddressHigh, Invalid(), MEM_OFFSET_SXTX, OpSize::i8Bit); + Result.High = _VLoadVectorMasked(OpSize::i128Bit, ElementSize, Mask.High, AddressHigh, Invalid(), MEM_OFFSET_SXTX, 1); } AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } @@ -2124,7 +2123,7 @@ void OpDispatchBuilder::AVX128_VMASKMOV(OpcodeArgs) { void OpDispatchBuilder::AVX128_MASKMOV(OpcodeArgs) { ///< This instruction only supports 128-bit. const auto Size = OpSizeFromSrc(Op); - const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = Size == OpSize::i128Bit; auto MaskSrc = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); @@ -2147,11 +2146,9 @@ void OpDispatchBuilder::AVX128_MASKMOV(OpcodeArgs) { template void OpDispatchBuilder::AVX128_VectorVariableBlend(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); - const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = Size == OpSize::i128Bit; const auto Src3Selector = Op->Src[2].Literal(); - constexpr auto ElementSizeBits = ElementSize * 8; - auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit); @@ -2163,6 +2160,7 @@ void OpDispatchBuilder::AVX128_VectorVariableBlend(OpcodeArgs) { } auto Convert = [this](Ref Src1, Ref Src2, Ref Mask) { + const auto ElementSizeBits = IR::OpSizeAsBits(ElementSize); Ref Shifted = _VSShrI(OpSize::i128Bit, ElementSize, Mask, ElementSizeBits - 1); return _VBSL(OpSize::i128Bit, Shifted, Src2, Src1); }; @@ -2248,7 +2246,7 @@ void OpDispatchBuilder::AVX128_VTESTP(OpcodeArgs) { Ref ZeroConst = _Constant(0); Ref OneConst = _Constant(1); - const auto ElementSizeInBits = ElementSize * 8; + const auto ElementSizeInBits = IR::OpSizeAsBits(ElementSize); { // Calculate ZF first. @@ -2292,7 +2290,7 @@ void OpDispatchBuilder::AVX128_VTESTP(OpcodeArgs) { } // As in PTest, this sets Z appropriately while zeroing the rest of NZCV. - SetNZ_ZeroCV(32, ZF); + SetNZ_ZeroCV(OpSize::i32Bit, ZF); SetCFInverted(CFInv); ZeroPF_AF(); } @@ -2339,14 +2337,14 @@ void OpDispatchBuilder::AVX128_PTest(OpcodeArgs) { // Set ZF according to Test1. SF will be zeroed since we do a 32-bit test on // the results of a 16-bit value from the UMaxV, so the 32-bit sign bit is // cleared even if the 16-bit scalars were negative. 
- SetNZ_ZeroCV(32, Test1); + SetNZ_ZeroCV(OpSize::i32Bit, Test1); SetCFInverted(Test2); ZeroPF_AF(); } template void OpDispatchBuilder::AVX128_VPERMILReg(OpcodeArgs) { - AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, [this](size_t _ElementSize, Ref Src, Ref Indices) { + AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src, Ref Indices) { return VPERMILRegOpImpl(OpSize::i128Bit, ElementSize, Src, Indices); }); } @@ -2376,7 +2374,7 @@ void OpDispatchBuilder::AVX128_VPERMD(OpcodeArgs) { void OpDispatchBuilder::AVX128_VPCLMULQDQ(OpcodeArgs) { const auto Selector = static_cast(Op->Src[2].Literal()); - AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), OpSize::iInvalid, [this, Selector](size_t _, Ref Src1, Ref Src2) { + AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), OpSize::iInvalid, [this, Selector](IR::OpSize, Ref Src1, Ref Src2) { return _PCLMUL(OpSize::i128Bit, Src1, Src2, Selector & 0b1'0001); }); } @@ -2548,7 +2546,7 @@ void OpDispatchBuilder::AVX128_VFMAddSubImpl(OpcodeArgs, bool AddSub, uint8_t Sr OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_VPGatherImpl(OpSize Size, OpSize ElementLoadSize, OpSize AddrElementSize, RefPair Dest, RefPair Mask, RefVSIB VSIB) { LOGMAN_THROW_A_FMT(AddrElementSize == OpSize::i32Bit || AddrElementSize == OpSize::i64Bit, "Unknown address element size"); - const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = Size == OpSize::i128Bit; ///< BaseAddr doesn't need to exist, calculate that here. Ref BaseAddr = VSIB.BaseAddr; @@ -2686,17 +2684,17 @@ OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_VPGatherQPSImpl(Ref Dest, R template void OpDispatchBuilder::AVX128_VPGATHER(OpcodeArgs) { - const auto Size = GetDstSize(Op); - const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Size = OpSizeFromDst(Op); + const auto Is128Bit = Size == OpSize::i128Bit; ///< Element size is determined by W flag. const OpSize ElementLoadSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit; // We only need the high address register if the number of data elements is more than what the low half can consume. // But also the number of address elements is clamped by the destination size as well. 
- const size_t NumDataElements = Size / ElementLoadSize; - const size_t NumAddrElementBytes = std::min(Size, (NumDataElements * AddrElementSize)); - const bool NeedsHighAddrBytes = NumAddrElementBytes > OpSize::i128Bit; + const size_t NumDataElements = IR::NumElements(Size, ElementLoadSize); + const size_t NumAddrElementBytes = std::min(IR::OpSizeToSize(Size), (NumDataElements * IR::OpSizeToSize(AddrElementSize))); + const bool NeedsHighAddrBytes = NumAddrElementBytes > IR::OpSizeToSize(OpSize::i128Bit); auto Dest = AVX128_LoadSource_WithOpSize(Op, Op->Dest, Op->Flags, !Is128Bit); auto VSIB = AVX128_LoadVSIB(Op, Op->Src[0], Op->Flags, NeedsHighAddrBytes); @@ -2740,7 +2738,7 @@ void OpDispatchBuilder::AVX128_VPGATHER(OpcodeArgs) { } else if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) { Result = AVX128_VPGatherQPSImpl(Dest.Low, Mask.Low, VSIB); } else { - Result = AVX128_VPGatherImpl(SizeToOpSize(Size), ElementLoadSize, AddrElementSize, Dest, Mask, VSIB); + Result = AVX128_VPGatherImpl(Size, ElementLoadSize, AddrElementSize, Dest, Mask, VSIB); } AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); @@ -2754,8 +2752,8 @@ void OpDispatchBuilder::AVX128_VPGATHER(OpcodeArgs) { void OpDispatchBuilder::AVX128_VCVTPH2PS(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); const auto SrcSize = IR::SizeToOpSize(IR::OpSizeToSize(DstSize) / 2); - const auto Is128BitSrc = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; - const auto Is128BitDst = DstSize == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128BitSrc = SrcSize == OpSize::i128Bit; + const auto Is128BitDst = DstSize == OpSize::i128Bit; RefPair Src {}; if (Op->Src[0].IsGPR()) { @@ -2783,7 +2781,7 @@ void OpDispatchBuilder::AVX128_VCVTPH2PS(OpcodeArgs) { void OpDispatchBuilder::AVX128_VCVTPS2PH(OpcodeArgs) { const auto SrcSize = OpSizeFromSrc(Op); - const auto Is128BitSrc = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128BitSrc = SrcSize == OpSize::i128Bit; const auto StoreSize = Op->Dest.IsGPR() ? OpSize::i128Bit : IR::SizeToOpSize(IR::OpSizeToSize(SrcSize) / 2); const auto Imm8 = Op->Src[1].Literal(); @@ -2814,7 +2812,7 @@ void OpDispatchBuilder::AVX128_VCVTPS2PH(OpcodeArgs) { // We need to eliminate upper junk if we're storing into a register with // a 256-bit source (VCVTPS2PH's destination for registers is an XMM). - if (Op->Src[0].IsGPR() && SrcSize == Core::CPUState::XMM_AVX_REG_SIZE) { + if (Op->Src[0].IsGPR() && SrcSize == OpSize::i256Bit) { Result = AVX128_Zext(Result.Low); } diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Crypto.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Crypto.cpp index bf1fccdf46..2dcc7f123c 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Crypto.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Crypto.cpp @@ -322,7 +322,7 @@ void OpDispatchBuilder::AESEncOp(OpcodeArgs) { void OpDispatchBuilder::VAESEncOp(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); - [[maybe_unused]] const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE; + [[maybe_unused]] const auto Is128Bit = DstSize == OpSize::i128Bit; // TODO: Handle 256-bit VAESENC. 
LOGMAN_THROW_A_FMT(Is128Bit, "256-bit VAESENC unimplemented"); @@ -343,7 +343,7 @@ void OpDispatchBuilder::AESEncLastOp(OpcodeArgs) { void OpDispatchBuilder::VAESEncLastOp(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); - [[maybe_unused]] const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE; + [[maybe_unused]] const auto Is128Bit = DstSize == OpSize::i128Bit; // TODO: Handle 256-bit VAESENCLAST. LOGMAN_THROW_A_FMT(Is128Bit, "256-bit VAESENCLAST unimplemented"); @@ -364,7 +364,7 @@ void OpDispatchBuilder::AESDecOp(OpcodeArgs) { void OpDispatchBuilder::VAESDecOp(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); - [[maybe_unused]] const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE; + [[maybe_unused]] const auto Is128Bit = DstSize == OpSize::i128Bit; // TODO: Handle 256-bit VAESDEC. LOGMAN_THROW_A_FMT(Is128Bit, "256-bit VAESDEC unimplemented"); @@ -385,7 +385,7 @@ void OpDispatchBuilder::AESDecLastOp(OpcodeArgs) { void OpDispatchBuilder::VAESDecLastOp(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); - [[maybe_unused]] const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE; + [[maybe_unused]] const auto Is128Bit = DstSize == OpSize::i128Bit; // TODO: Handle 256-bit VAESDECLAST. LOGMAN_THROW_A_FMT(Is128Bit, "256-bit VAESDECLAST unimplemented"); diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp index 05070f8981..22a4abda08 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp @@ -139,8 +139,8 @@ Ref OpDispatchBuilder::GetPackedRFLAG(uint32_t FlagsMask) { } void OpDispatchBuilder::CalculateOF(IR::OpSize SrcSize, Ref Res, Ref Src1, Ref Src2, bool Sub) { - auto OpSize = SrcSize == 8 ? OpSize::i64Bit : OpSize::i32Bit; - uint64_t SignBit = (SrcSize * 8) - 1; + const auto OpSize = SrcSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit; + const uint64_t SignBit = IR::OpSizeAsBits(SrcSize) - 1; Ref Anded = nullptr; // For add, OF is set iff the sources have the same sign but the destination @@ -171,7 +171,7 @@ void OpDispatchBuilder::CalculateOF(IR::OpSize SrcSize, Ref Res, Ref Src1, Ref S } } - SetRFLAG(Anded, SrcSize * 8 - 1, true); + SetRFLAG(Anded, SignBit, true); } Ref OpDispatchBuilder::LoadPFRaw(bool Mask, bool Invert) { @@ -265,7 +265,7 @@ Ref OpDispatchBuilder::IncrementByCarry(OpSize OpSize, Ref Src) { Ref OpDispatchBuilder::CalculateFlags_ADC(IR::OpSize SrcSize, Ref Src1, Ref Src2) { auto Zero = _InlineConstant(0); auto One = _InlineConstant(1); - auto OpSize = SrcSize == 8 ? OpSize::i64Bit : OpSize::i32Bit; + auto OpSize = SrcSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit; Ref Res; CalculateAF(Src1, Src2); @@ -277,7 +277,7 @@ Ref OpDispatchBuilder::CalculateFlags_ADC(IR::OpSize SrcSize, Ref Src1, Ref Src2 CFInverted = false; } else { // Need to zero-extend for correct comparisons below - Src2 = _Bfe(OpSize, SrcSize * 8, 0, Src2); + Src2 = _Bfe(OpSize, IR::OpSizeAsBits(SrcSize), 0, Src2); // Note that we do not extend Src2PlusCF, since we depend on proper // 32-bit arithmetic to correctly handle the Src2 = 0xffff case. @@ -285,7 +285,7 @@ Ref OpDispatchBuilder::CalculateFlags_ADC(IR::OpSize SrcSize, Ref Src1, Ref Src2 // Need to zero-extend for the comparison. Res = _Add(OpSize, Src1, Src2PlusCF); - Res = _Bfe(OpSize, SrcSize * 8, 0, Res); + Res = _Bfe(OpSize, IR::OpSizeAsBits(SrcSize), 0, Res); // TODO: We can fold that second Bfe in (cmp uxth). 
auto SelectCFInv = _Select(FEXCore::IR::COND_UGE, Res, Src2PlusCF, One, Zero); @@ -302,7 +302,7 @@ Ref OpDispatchBuilder::CalculateFlags_ADC(IR::OpSize SrcSize, Ref Src1, Ref Src2 Ref OpDispatchBuilder::CalculateFlags_SBB(IR::OpSize SrcSize, Ref Src1, Ref Src2) { auto Zero = _InlineConstant(0); auto One = _InlineConstant(1); - auto OpSize = SrcSize == 8 ? OpSize::i64Bit : OpSize::i32Bit; + auto OpSize = SrcSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit; CalculateAF(Src1, Src2); @@ -316,13 +316,13 @@ Ref OpDispatchBuilder::CalculateFlags_SBB(IR::OpSize SrcSize, Ref Src1, Ref Src2 CFInverted = true; } else { // Zero extend for correct comparison behaviour with Src1 = 0xffff. - Src1 = _Bfe(OpSize, SrcSize * 8, 0, Src1); - Src2 = _Bfe(OpSize, SrcSize * 8, 0, Src2); + Src1 = _Bfe(OpSize, IR::OpSizeAsBits(SrcSize), 0, Src1); + Src2 = _Bfe(OpSize, IR::OpSizeAsBits(SrcSize), 0, Src2); auto Src2PlusCF = IncrementByCarry(OpSize, Src2); Res = _Sub(OpSize, Src1, Src2PlusCF); - Res = _Bfe(OpSize, SrcSize * 8, 0, Res); + Res = _Bfe(OpSize, IR::OpSizeAsBits(SrcSize), 0, Res); auto SelectCFInv = _Select(FEXCore::IR::COND_UGE, Src1, Src2PlusCF, One, Zero); @@ -345,9 +345,9 @@ Ref OpDispatchBuilder::CalculateFlags_SUB(IR::OpSize SrcSize, Ref Src1, Ref Src2 Ref Res; if (SrcSize >= OpSize::i32Bit) { - Res = _SubWithFlags(IR::SizeToOpSize(SrcSize), Src1, Src2); + Res = _SubWithFlags(SrcSize, Src1, Src2); } else { - _SubNZCV(IR::SizeToOpSize(SrcSize), Src1, Src2); + _SubNZCV(SrcSize, Src1, Src2); Res = _Sub(OpSize::i32Bit, Src1, Src2); } @@ -375,9 +375,9 @@ Ref OpDispatchBuilder::CalculateFlags_ADD(IR::OpSize SrcSize, Ref Src1, Ref Src2 Ref Res; if (SrcSize >= OpSize::i32Bit) { - Res = _AddWithFlags(IR::SizeToOpSize(SrcSize), Src1, Src2); + Res = _AddWithFlags(SrcSize, Src1, Src2); } else { - _AddNZCV(IR::SizeToOpSize(SrcSize), Src1, Src2); + _AddNZCV(SrcSize, Src1, Src2); Res = _Add(OpSize::i32Bit, Src1, Src2); } @@ -400,7 +400,7 @@ void OpDispatchBuilder::CalculateFlags_MUL(IR::OpSize SrcSize, Ref Res, Ref High // CF and OF are set if the result of the operation can't be fit in to the destination register // If the value can fit then the top bits will be zero - auto SignBit = _Sbfe(OpSize::i64Bit, 1, SrcSize * 8 - 1, Res); + auto SignBit = _Sbfe(OpSize::i64Bit, 1, IR::OpSizeAsBits(SrcSize) - 1, Res); _SubNZCV(OpSize::i64Bit, High, SignBit); // If High = SignBit, then sets to nZCv. Else sets to nzcV. Since SF/ZF @@ -415,7 +415,7 @@ void OpDispatchBuilder::CalculateFlags_UMUL(Ref High) { InvalidatePF_AF(); auto Zero = _InlineConstant(0); - OpSize Size = IR::SizeToOpSize(GetOpSize(High)); + const auto Size = GetOpSize(High); // CF and OF are set if the result of the operation can't be fit in to the destination register // The result register will be all zero if it can't fit due to how multiplication behaves @@ -442,7 +442,7 @@ void OpDispatchBuilder::CalculateFlags_ShiftLeftImmediate(IR::OpSize SrcSize, Re return; } - auto OpSize = SrcSize == 8 ? OpSize::i64Bit : OpSize::i32Bit; + auto OpSize = SrcSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit; SetNZ_ZeroCV(SrcSize, UnmaskedRes); @@ -451,7 +451,7 @@ void OpDispatchBuilder::CalculateFlags_ShiftLeftImmediate(IR::OpSize SrcSize, Re // Extract the last bit shifted in to CF. Shift is already masked, but for // 8/16-bit it might be >= SrcSizeBits, in which case CF is cleared. There's // nothing to do in that case since we already cleared CF above. 
- auto SrcSizeBits = SrcSize * 8; + const auto SrcSizeBits = IR::OpSizeAsBits(SrcSize); if (Shift < SrcSizeBits) { SetCFDirect(Src1, SrcSizeBits - Shift, true); } @@ -464,7 +464,7 @@ void OpDispatchBuilder::CalculateFlags_ShiftLeftImmediate(IR::OpSize SrcSize, Re // In the case of left shift. OF is only set from the result of XOR if (Shift == 1) { auto Xor = _Xor(OpSize, UnmaskedRes, Src1); - SetRFLAG(Xor, SrcSize * 8 - 1, true); + SetRFLAG(Xor, IR::OpSizeAsBits(SrcSize) - 1, true); } else { // Undefined, we choose to zero as part of SetNZ_ZeroCV } @@ -515,7 +515,7 @@ void OpDispatchBuilder::CalculateFlags_ShiftRightImmediate(IR::OpSize SrcSize, R // Only defined when Shift is 1 else undefined // Is set to the MSB of the original value if (Shift == 1) { - SetRFLAG(Src1, SrcSize * 8 - 1, true); + SetRFLAG(Src1, IR::OpSizeAsBits(SrcSize) - 1, true); } } } @@ -526,7 +526,7 @@ void OpDispatchBuilder::CalculateFlags_ShiftRightDoubleImmediate(IR::OpSize SrcS return; } - const auto OpSize = SrcSize == 8 ? OpSize::i64Bit : OpSize::i32Bit; + const auto OpSize = SrcSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit; CalculateFlags_ShiftRightImmediateCommon(SrcSize, Res, Src1, Shift); // OF @@ -536,7 +536,7 @@ void OpDispatchBuilder::CalculateFlags_ShiftRightDoubleImmediate(IR::OpSize SrcS // XOR of Result and Src1 if (Shift == 1) { auto val = _Xor(OpSize, Src1, Res); - SetRFLAG(val, SrcSize * 8 - 1, true); + SetRFLAG(val, IR::OpSizeAsBits(SrcSize) - 1, true); } } } @@ -549,7 +549,7 @@ void OpDispatchBuilder::CalculateFlags_ZCNT(IR::OpSize SrcSize, Ref Result) { // Now set CF if the Result = SrcSize * 8. Since SrcSize is a power-of-two and // Result is <= SrcSize * 8, we equivalently check if the log2(SrcSize * 8) // bit is set. No masking is needed because no higher bits could be set. - unsigned CarryBit = FEXCore::ilog2(SrcSize * 8u); + unsigned CarryBit = FEXCore::ilog2(IR::OpSizeAsBits(SrcSize)); SetCFDirect(Result, CarryBit); } diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp index 1950763e63..dd3b9a64da 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp @@ -418,7 +418,7 @@ void OpDispatchBuilder::InsertMMX_To_XMM_Vector_CVT_Int_To_Float(OpcodeArgs) { // Always 32-bit. 
const auto ElementSize = OpSize::i32Bit; // Always signed - Dest = _VSToFVectorInsert(IR::SizeToOpSize(DstSize), ElementSize, ElementSize, Dest, Src, true, false); + Dest = _VSToFVectorInsert(DstSize, ElementSize, ElementSize, Dest, Src, true, false); StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Dest, DstSize, OpSize::iInvalid); } @@ -482,7 +482,7 @@ Ref OpDispatchBuilder::InsertScalar_CVT_Float_To_FloatImpl(OpcodeArgs, IR::OpSiz Ref Src1 = LoadSource_WithOpSize(FPRClass, Op, Src1Op, DstSize, Op->Flags); Ref Src2 = LoadSource_WithOpSize(FPRClass, Op, Src2Op, SrcSize, Op->Flags, {.AllowUpperGarbage = true}); - return _VFToFScalarInsert(IR::SizeToOpSize(DstSize), DstElementSize, SrcElementSize, Src1, Src2, ZeroUpperBits); + return _VFToFScalarInsert(DstSize, DstElementSize, SrcElementSize, Src1, Src2, ZeroUpperBits); } template @@ -530,7 +530,7 @@ Ref OpDispatchBuilder::InsertScalarRoundImpl(OpcodeArgs, IR::OpSize DstSize, IR: Ref Src2 = LoadSource_WithOpSize(FPRClass, Op, Src2Op, SrcSize, Op->Flags, {.AllowUpperGarbage = true}); const auto SourceMode = TranslateRoundType(Mode); - auto ALUOp = _VFToIScalarInsert(IR::SizeToOpSize(DstSize), ElementSize, Src1, Src2, SourceMode, ZeroUpperBits); + auto ALUOp = _VFToIScalarInsert(DstSize, ElementSize, Src1, Src2, SourceMode, ZeroUpperBits); return ALUOp; } @@ -600,7 +600,7 @@ void OpDispatchBuilder::InsertScalarFCMPOp(OpcodeArgs) { Ref Src1 = LoadSource_WithOpSize(FPRClass, Op, Op->Dest, DstSize, Op->Flags); Ref Src2 = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], SrcSize, Op->Flags, {.AllowUpperGarbage = true}); - Ref Result = InsertScalarFCMPOpImpl(IR::SizeToOpSize(DstSize), OpSizeFromDst(Op), ElementSize, Src1, Src2, CompType, false); + Ref Result = InsertScalarFCMPOpImpl(DstSize, OpSizeFromDst(Op), ElementSize, Src1, Src2, CompType, false); StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Result, DstSize, OpSize::iInvalid); } @@ -619,7 +619,7 @@ void OpDispatchBuilder::AVXInsertScalarFCMPOp(OpcodeArgs) { Ref Src1 = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], DstSize, Op->Flags); Ref Src2 = LoadSource_WithOpSize(FPRClass, Op, Op->Src[1], SrcSize, Op->Flags, {.AllowUpperGarbage = true}); - Ref Result = InsertScalarFCMPOpImpl(IR::SizeToOpSize(DstSize), OpSizeFromDst(Op), ElementSize, Src1, Src2, CompType, true); + Ref Result = InsertScalarFCMPOpImpl(DstSize, OpSizeFromDst(Op), ElementSize, Src1, Src2, CompType, true); StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Result, DstSize, OpSize::iInvalid); } @@ -741,10 +741,10 @@ void OpDispatchBuilder::MOVMSKOp(OpcodeArgs, IR::OpSize ElementSize) { for (unsigned i = 0; i < NumElements; ++i) { // Extract the top bit of the element Ref Tmp = _VExtractToGPR(Size, ElementSize, Src, i); - Tmp = _Bfe(IR::SizeToOpSize(ElementSize), 1, ElementSize * 8 - 1, Tmp); + Tmp = _Bfe(ElementSize, 1, IR::OpSizeAsBits(ElementSize) - 1, Tmp); // Shift it to the correct location - Tmp = _Lshl(IR::SizeToOpSize(ElementSize), Tmp, _Constant(i)); + Tmp = _Lshl(ElementSize, Tmp, _Constant(i)); // Or it with the current value CurrentVal = _Or(OpSize::i64Bit, CurrentVal, Tmp); @@ -755,7 +755,7 @@ void OpDispatchBuilder::MOVMSKOp(OpcodeArgs, IR::OpSize ElementSize) { void OpDispatchBuilder::MOVMSKOpOne(OpcodeArgs) { const auto SrcSize = OpSizeFromSrc(Op); - const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = SrcSize == OpSize::i256Bit; const auto ExtractSize = Is256Bit ? 
OpSize::i32Bit : OpSize::i16Bit; Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); @@ -767,7 +767,7 @@ void OpDispatchBuilder::MOVMSKOpOne(OpcodeArgs) { // Since we also handle the MM MOVMSKB here too, // we need to clamp the lower bound. const auto VAdd1Size = std::max(SrcSize, OpSize::i128Bit); - const auto VAdd2Size = std::max(IR::DivideOpSize(SrcSize, 2), OpSize::i64Bit); + const auto VAdd2Size = std::max(SrcSize >> 1, OpSize::i64Bit); auto VAdd1 = _VAddP(VAdd1Size, OpSize::i8Bit, VAnd, VAnd); auto VAdd2 = _VAddP(VAdd2Size, OpSize::i8Bit, VAdd1, VAdd1); @@ -790,7 +790,7 @@ void OpDispatchBuilder::PUNPCKLOp(OpcodeArgs, IR::OpSize ElementSize) { void OpDispatchBuilder::VPUNPCKLOp(OpcodeArgs, IR::OpSize ElementSize) { const auto SrcSize = OpSizeFromSrc(Op); - const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = SrcSize == OpSize::i128Bit; Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags); @@ -819,8 +819,7 @@ void OpDispatchBuilder::PUNPCKHOp(OpcodeArgs, IR::OpSize ElementSize) { void OpDispatchBuilder::VPUNPCKHOp(OpcodeArgs, IR::OpSize ElementSize) { const auto SrcSize = OpSizeFromSrc(Op); - const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; - + const auto Is128Bit = SrcSize == OpSize::i128Bit; Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags); @@ -852,7 +851,7 @@ Ref OpDispatchBuilder::GeneratePSHUFBMask(IR::OpSize SrcSize) { } Ref OpDispatchBuilder::PSHUFBOpImpl(IR::OpSize SrcSize, Ref Src1, Ref Src2, Ref MaskVector) { - const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = SrcSize == OpSize::i256Bit; // We perform the 256-bit version as two 128-bit operations due to // the lane splitting behavior, so cap the maximum size at 16. @@ -1173,7 +1172,7 @@ void OpDispatchBuilder::PSHUFDOp(OpcodeArgs) { void OpDispatchBuilder::VPSHUFWOp(OpcodeArgs, IR::OpSize ElementSize, bool Low) { const auto SrcSize = OpSizeFromSrc(Op); - const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = SrcSize == OpSize::i256Bit; auto Shuffle = Op->Src[1].Literal(); Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); @@ -1195,7 +1194,7 @@ void OpDispatchBuilder::VPSHUFWOp(OpcodeArgs, IR::OpSize ElementSize, bool Low) if (Is256Bit) { for (size_t i = 0; i < 4; i++) { const auto Index = Shuffle & 0b11; - const auto UpperLaneOffset = Core::CPUState::XMM_SSE_REG_SIZE / ElementSize; + const auto UpperLaneOffset = IR::NumElements(OpSize::i128Bit, ElementSize); const auto LowDstIndex = BaseElement + i; const auto LowSrcIndex = BaseElement + Index; @@ -1224,10 +1223,10 @@ Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, IR::OpSize DstSize, IR::OpSize Ele // Since 256-bit variants and up don't lane cross, we can construct // everything in terms of the 128-variant, as each lane is essentially // its own 128-bit segment. 
- const uint8_t NumElements = Core::CPUState::XMM_SSE_REG_SIZE / ElementSize; + const uint8_t NumElements = IR::NumElements(OpSize::i128Bit, ElementSize); const uint8_t HalfNumElements = NumElements >> 1; - const bool Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE; + const bool Is256Bit = DstSize == OpSize::i256Bit; std::array Srcs {}; for (size_t i = 0; i < HalfNumElements; ++i) { @@ -1248,7 +1247,7 @@ Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, IR::OpSize DstSize, IR::OpSize Ele // AVX differs the behavior of VSHUFPD and VSHUFPS. // The same immediate bits are used for both lanes with VSHUFPS, // but VSHUFPD uses different immediate bits for each lane. - const auto SrcIndex2 = ElementSize == 4 ? SrcIndex1 : ((Shuffle >> 2) & SelectionMask); + const auto SrcIndex2 = ElementSize == OpSize::i32Bit ? SrcIndex1 : ((Shuffle >> 2) & SelectionMask); Ref Insert = _VInsElement(DstSize, ElementSize, Element, SrcIndex1, Dest, Srcs[Element]); Dest = _VInsElement(DstSize, ElementSize, Element + NumElements, SrcIndex2 + NumElements, Insert, Srcs[Element]); @@ -1442,7 +1441,7 @@ void OpDispatchBuilder::VANDNOp(OpcodeArgs) { template void OpDispatchBuilder::VHADDPOp(OpcodeArgs) { const auto SrcSize = OpSizeFromSrc(Op); - const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = SrcSize == OpSize::i256Bit; Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags); @@ -1485,7 +1484,7 @@ void OpDispatchBuilder::VBROADCASTOp(OpcodeArgs, IR::OpSize ElementSize) { Ref OpDispatchBuilder::PINSROpImpl(OpcodeArgs, IR::OpSize ElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op, const X86Tables::DecodedOperand& Imm) { const auto Size = OpSizeFromDst(Op); - const auto NumElements = Size / ElementSize; + const auto NumElements = IR::NumElements(Size, ElementSize); const uint64_t Index = Imm.Literal() & (NumElements - 1); Ref Src1 = LoadSource_WithOpSize(FPRClass, Op, Src1Op, Size, Op->Flags); @@ -1608,7 +1607,7 @@ void OpDispatchBuilder::PExtrOp(OpcodeArgs, IR::OpSize ElementSize) { } // AVX version only operates on 128-bit. 
- const uint8_t NumElements = std::min(GetSrcSize(Op), OpSize::i128Bit) / OverridenElementSize; + const uint8_t NumElements = IR::NumElements(std::min(OpSizeFromSrc(Op), OpSize::i128Bit), OverridenElementSize); Index &= NumElements - 1; if (Op->Dest.IsGPR()) { @@ -1649,8 +1648,8 @@ void OpDispatchBuilder::VEXTRACT128Op(OpcodeArgs) { Ref OpDispatchBuilder::PSIGNImpl(OpcodeArgs, IR::OpSize ElementSize, Ref Src1, Ref Src2) { const auto Size = OpSizeFromSrc(Op); - Ref Control = _VSQSHL(Size, ElementSize, Src2, (ElementSize * 8) - 1); - Control = _VSRSHR(Size, ElementSize, Control, (ElementSize * 8) - 1); + Ref Control = _VSQSHL(Size, ElementSize, Src2, IR::OpSizeAsBits(ElementSize) - 1); + Control = _VSRSHR(Size, ElementSize, Control, IR::OpSizeAsBits(ElementSize) - 1); return _VMul(Size, ElementSize, Src1, Control); } @@ -1725,7 +1724,7 @@ void OpDispatchBuilder::PSRLI(OpcodeArgs, IR::OpSize ElementSize) { void OpDispatchBuilder::VPSRLIOp(OpcodeArgs, IR::OpSize ElementSize) { const auto Size = OpSizeFromSrc(Op); - const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = Size == OpSize::i128Bit; const uint64_t ShiftConstant = Op->Src[1].Literal(); Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); @@ -1848,7 +1847,7 @@ void OpDispatchBuilder::PSRLDQ(OpcodeArgs) { Ref Dest = LoadSource(FPRClass, Op, Op->Dest, Op->Flags); Ref Result = LoadZeroVector(Size); - if (Shift < Size) { + if (Shift < IR::OpSizeToSize(Size)) { Result = _VExtr(Size, OpSize::i8Bit, Result, Dest, Shift); } StoreResult(FPRClass, Op, Result, OpSize::iInvalid); @@ -1856,7 +1855,7 @@ void OpDispatchBuilder::PSRLDQ(OpcodeArgs) { void OpDispatchBuilder::VPSRLDQOp(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); - const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = DstSize == OpSize::i128Bit; const uint64_t Shift = Op->Src[1].Literal(); Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); @@ -1872,7 +1871,7 @@ void OpDispatchBuilder::VPSRLDQOp(OpcodeArgs) { Result = LoadZeroVector(DstSize); if (Is128Bit) { - if (Shift < DstSize) { + if (Shift < IR::OpSizeToSize(DstSize)) { Result = _VExtr(DstSize, OpSize::i8Bit, Result, Src, Shift); } } else { @@ -1899,8 +1898,8 @@ void OpDispatchBuilder::PSLLDQ(OpcodeArgs) { Ref Dest = LoadSource(FPRClass, Op, Op->Dest, Op->Flags); Ref Result = LoadZeroVector(Size); - if (Shift < Size) { - Result = _VExtr(Size, OpSize::i8Bit, Dest, Result, Size - Shift); + if (Shift < IR::OpSizeToSize(Size)) { + Result = _VExtr(Size, OpSize::i8Bit, Dest, Result, IR::OpSizeToSize(Size) - Shift); } StoreResult(FPRClass, Op, Result, OpSize::iInvalid); @@ -1908,7 +1907,8 @@ void OpDispatchBuilder::PSLLDQ(OpcodeArgs) { void OpDispatchBuilder::VPSLLDQOp(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); - const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE; + const auto DstSizeInt = IR::OpSizeToSize(DstSize); + const auto Is128Bit = DstSize == OpSize::i128Bit; const uint64_t Shift = Op->Src[1].Literal(); Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); @@ -1922,13 +1922,13 @@ void OpDispatchBuilder::VPSLLDQOp(OpcodeArgs) { } else { Result = LoadZeroVector(DstSize); if (Is128Bit) { - if (Shift < DstSize) { - Result = _VExtr(DstSize, OpSize::i8Bit, Src, Result, DstSize - Shift); + if (Shift < DstSizeInt) { + Result = _VExtr(DstSize, OpSize::i8Bit, Src, Result, DstSizeInt - Shift); } } else { if (Shift < Core::CPUState::XMM_SSE_REG_SIZE) { Ref ResultBottom = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, Result, 
16 - Shift); - Ref ResultTop = _VExtr(DstSize, OpSize::i8Bit, Src, Result, DstSize - Shift); + Ref ResultTop = _VExtr(DstSize, OpSize::i8Bit, Src, Result, DstSizeInt - Shift); Result = _VInsElement(DstSize, OpSize::i128Bit, 1, 0, ResultBottom, ResultTop); } @@ -1954,7 +1954,7 @@ void OpDispatchBuilder::PSRAIOp(OpcodeArgs, IR::OpSize ElementSize) { void OpDispatchBuilder::VPSRAIOp(OpcodeArgs, IR::OpSize ElementSize) { const uint64_t Shift = Op->Src[1].Literal(); const auto Size = OpSizeFromDst(Op); - const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = Size == OpSize::i128Bit; Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); Ref Result = Src; @@ -2008,7 +2008,7 @@ void OpDispatchBuilder::MOVDDUPOp(OpcodeArgs) { void OpDispatchBuilder::VMOVDDUPOp(OpcodeArgs) { const auto SrcSize = OpSizeFromSrc(Op); const auto IsSrcGPR = Op->Src[0].IsGPR(); - const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = SrcSize == OpSize::i256Bit; const auto MemSize = Is256Bit ? OpSize::i256Bit : OpSize::i64Bit; Ref Src = IsSrcGPR ? LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], SrcSize, Op->Flags) : @@ -2112,7 +2112,7 @@ Ref OpDispatchBuilder::Vector_CVT_Int_To_FloatImpl(OpcodeArgs, IR::OpSize SrcEle auto ElementSize = SrcElementSize; if (Widen) { Src = _VSXTL(Size, ElementSize, Src); - ElementSize = IR::MultiplyOpSize(ElementSize, 2); + ElementSize = ElementSize << 1; } return _Vector_SToF(Size, ElementSize, Src); @@ -2143,8 +2143,8 @@ Ref OpDispatchBuilder::Vector_CVT_Float_To_IntImpl(OpcodeArgs, IR::OpSize SrcEle Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); if (Narrow) { - Src = _Vector_FToF(DstSize, IR::DivideOpSize(SrcElementSize, 2), Src, SrcElementSize); - ElementSize = IR::DivideOpSize(ElementSize, 2); + Src = _Vector_FToF(DstSize, SrcElementSize >> 1, Src, SrcElementSize); + ElementSize = ElementSize >> 1; } if (HostRoundingMode) { @@ -2236,17 +2236,17 @@ void OpDispatchBuilder::Vector_CVT_Float_To_Float(OpcodeArgs, IR::OpSize DstElem const auto SrcSize = OpSizeFromSrc(Op); const auto IsFloatSrc = SrcElementSize == OpSize::i32Bit; - const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = SrcSize == OpSize::i128Bit; - const auto LoadSize = IsFloatSrc && !Op->Src[0].IsGPR() ? IR::SizeToOpSize(IR::OpSizeToSize(SrcSize) / 2) : SrcSize; + const auto LoadSize = IsFloatSrc && !Op->Src[0].IsGPR() ? 
(SrcSize >> 1) : SrcSize; Ref Src = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], LoadSize, Op->Flags); Ref Result {}; if (DstElementSize > SrcElementSize) { - Result = _Vector_FToF(SrcSize, IR::MultiplyOpSize(SrcElementSize, 2), Src, SrcElementSize); + Result = _Vector_FToF(SrcSize, SrcElementSize << 1, Src, SrcElementSize); } else { - Result = _Vector_FToF(SrcSize, IR::DivideOpSize(SrcElementSize, 2), Src, SrcElementSize); + Result = _Vector_FToF(SrcSize, SrcElementSize >> 1, Src, SrcElementSize); } if (IsAVX) { @@ -2269,7 +2269,7 @@ void OpDispatchBuilder::MMX_To_XMM_Vector_CVT_Int_To_Float(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); Src = _VSXTL(DstSize, ElementSize, Src); - ElementSize = IR::MultiplyOpSize(ElementSize, 2); + ElementSize = ElementSize << 1; // Always signed Src = _Vector_SToF(DstSize, ElementSize, Src); @@ -2294,8 +2294,8 @@ void OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int(OpcodeArgs) { const auto Size = OpSizeFromDst(Op); if (Narrow) { - Src = _Vector_FToF(Size, IR::DivideOpSize(SrcElementSize, 2), Src, SrcElementSize); - ElementSize = IR::DivideOpSize(ElementSize, 2); + Src = _Vector_FToF(Size, SrcElementSize >> 1, Src, SrcElementSize); + ElementSize = ElementSize >> 1; } if constexpr (HostRoundingMode) { @@ -2816,7 +2816,7 @@ Ref OpDispatchBuilder::PALIGNROpImpl(OpcodeArgs, const X86Tables::DecodedOperand const auto DstSize = OpSizeFromDst(Op); const auto SanitizedDstSize = std::min(DstSize, OpSize::i128Bit); - const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = DstSize == OpSize::i256Bit; const auto Index = Imm.Literal(); Ref Src2Node = LoadSource(FPRClass, Op, Src2, Op->Flags); @@ -2830,7 +2830,7 @@ Ref OpDispatchBuilder::PALIGNROpImpl(OpcodeArgs, const X86Tables::DecodedOperand } Ref Src1Node = LoadSource(FPRClass, Op, Src1, Op->Flags); - if (Index >= (SanitizedDstSize * 2)) { + if (Index >= (IR::OpSizeToSize(SanitizedDstSize) * 2)) { // If the immediate is greater than both vectors combined then it zeroes the vector return LoadZeroVector(DstSize); } @@ -2891,7 +2891,7 @@ template void OpDispatchBuilder::PACKUSOp(OpcodeArgs); void OpDispatchBuilder::VPACKUSOp(OpcodeArgs, IR::OpSize ElementSize) { const auto DstSize = OpSizeFromDst(Op); - const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = DstSize == OpSize::i256Bit; Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags); @@ -2919,7 +2919,7 @@ template void OpDispatchBuilder::PACKSSOp(OpcodeArgs); void OpDispatchBuilder::VPACKSSOp(OpcodeArgs, IR::OpSize ElementSize) { const auto DstSize = OpSizeFromDst(Op); - const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = DstSize == OpSize::i256Bit; Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags); @@ -2954,7 +2954,7 @@ Ref OpDispatchBuilder::PMULLOpImpl(OpSize Size, IR::OpSize ElementSize, bool Sig template void OpDispatchBuilder::PMULLOp(OpcodeArgs) { - static_assert(ElementSize == sizeof(uint32_t), "Currently only handles 32-bit -> 64-bit"); + static_assert(ElementSize == OpSize::i32Bit, "Currently only handles 32-bit -> 64-bit"); Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags); Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); @@ -2968,7 +2968,7 @@ template void OpDispatchBuilder::PMULLOp(OpcodeArgs); template void OpDispatchBuilder::VPMULLOp(OpcodeArgs) { - static_assert(ElementSize 
== sizeof(uint32_t), "Currently only handles 32-bit -> 64-bit"); + static_assert(ElementSize == OpSize::i32Bit, "Currently only handles 32-bit -> 64-bit"); Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags); @@ -3124,15 +3124,15 @@ void OpDispatchBuilder::PMULHRWOp(OpcodeArgs) { // Implementation is more efficient for 8byte registers // Multiplies 4 16bit values in to 4 32bit values - Res = _VSMull(IR::MultiplyOpSize(Size, 2), OpSize::i16Bit, Dest, Src); + Res = _VSMull(Size << 1, OpSize::i16Bit, Dest, Src); // Load 0x0000_8000 in to each 32-bit element. Ref VConstant = _VectorImm(OpSize::i128Bit, OpSize::i32Bit, 0x80, 8); - Res = _VAdd(IR::MultiplyOpSize(Size, 2), OpSize::i32Bit, Res, VConstant); + Res = _VAdd(Size << 1, OpSize::i32Bit, Res, VConstant); // Now shift and narrow to convert 32-bit values to 16bit, storing the top 16bits - Res = _VUShrNI(IR::MultiplyOpSize(Size, 2), OpSize::i32Bit, Res, 16); + Res = _VUShrNI(Size << 1, OpSize::i32Bit, Res, 16); StoreResult(FPRClass, Op, Res, OpSize::iInvalid); } @@ -3177,7 +3177,7 @@ Ref OpDispatchBuilder::PMADDWDOpImpl(IR::OpSize Size, Ref Src1, Ref Src2) { if (Size == OpSize::i64Bit) { // MMX implementation can be slightly more optimal - Size = IR::DivideOpSize(Size, 2); + Size = Size >> 1; auto MullResult = _VSMull(Size, OpSize::i16Bit, Src1, Src2); return _VAddP(Size, OpSize::i32Bit, MullResult, MullResult); } @@ -3211,7 +3211,7 @@ void OpDispatchBuilder::VPMADDWDOp(OpcodeArgs) { Ref OpDispatchBuilder::PMADDUBSWOpImpl(IR::OpSize Size, Ref Src1, Ref Src2) { if (Size == OpSize::i64Bit) { - const auto MultSize = IR::MultiplyOpSize(Size, 2); + const auto MultSize = Size << 1; // 64bit is more efficient // Src1 is unsigned @@ -3314,11 +3314,11 @@ Ref OpDispatchBuilder::PMULHRSWOpImpl(OpSize Size, Ref Src1, Ref Src2) { Ref Res {}; if (Size == OpSize::i64Bit) { // Implementation is more efficient for 8byte registers - Res = _VSMull(IR::MultiplyOpSize(Size, 2), OpSize::i16Bit, Src1, Src2); - Res = _VSShrI(IR::MultiplyOpSize(Size, 2), OpSize::i32Bit, Res, 14); - auto OneVector = _VectorImm(IR::MultiplyOpSize(Size, 2), OpSize::i32Bit, 1); - Res = _VAdd(IR::MultiplyOpSize(Size, 2), OpSize::i32Bit, Res, OneVector); - return _VUShrNI(IR::MultiplyOpSize(Size, 2), OpSize::i32Bit, Res, 1); + Res = _VSMull(Size << 1, OpSize::i16Bit, Src1, Src2); + Res = _VSShrI(Size << 1, OpSize::i32Bit, Res, 14); + auto OneVector = _VectorImm(Size << 1, OpSize::i32Bit, 1); + Res = _VAdd(Size << 1, OpSize::i32Bit, Res, OneVector); + return _VUShrNI(Size << 1, OpSize::i32Bit, Res, 1); } else { // 128-bit and 256-bit are less efficient Ref ResultLow; @@ -3375,7 +3375,7 @@ template void OpDispatchBuilder::HSUBP(OpcodeArgs); void OpDispatchBuilder::VHSUBPOp(OpcodeArgs, IR::OpSize ElementSize) { const auto DstSize = OpSizeFromDst(Op); - const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = DstSize == OpSize::i256Bit; Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags); @@ -3409,7 +3409,7 @@ template void OpDispatchBuilder::PHSUB(OpcodeArgs); void OpDispatchBuilder::VPHSUBOp(OpcodeArgs, IR::OpSize ElementSize) { const auto DstSize = OpSizeFromDst(Op); - const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = DstSize == OpSize::i256Bit; Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags); @@ -3441,7 +3441,7 @@ 
void OpDispatchBuilder::PHADDS(OpcodeArgs) { void OpDispatchBuilder::VPHADDSWOp(OpcodeArgs) { const auto SrcSize = OpSizeFromSrc(Op); - const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = SrcSize == OpSize::i256Bit; Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags); @@ -3476,7 +3476,7 @@ void OpDispatchBuilder::PHSUBS(OpcodeArgs) { void OpDispatchBuilder::VPHSUBSWOp(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); - const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = DstSize == OpSize::i256Bit; Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags); @@ -3497,13 +3497,13 @@ Ref OpDispatchBuilder::PSADBWOpImpl(IR::OpSize Size, Ref Src1, Ref Src2) { // but it actually operates in more than 8bit space // This can be seen with `abs(0 - 0xFF)` returning a different result depending // on bit length - const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = Size == OpSize::i128Bit; if (Size == OpSize::i64Bit) { - auto AbsResult = _VUABDL(IR::MultiplyOpSize(Size, 2), OpSize::i8Bit, Src1, Src2); + auto AbsResult = _VUABDL(Size << 1, OpSize::i8Bit, Src1, Src2); // Now vector-wide add the results for each - return _VAddV(IR::MultiplyOpSize(Size, 2), OpSize::i16Bit, AbsResult); + return _VAddV(Size << 1, OpSize::i16Bit, AbsResult); } auto AbsResult_Low = _VUABDL(Size, OpSize::i8Bit, Src1, Src2); @@ -3558,7 +3558,7 @@ Ref OpDispatchBuilder::ExtendVectorElementsImpl(OpcodeArgs, IR::OpSize ElementSi return LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], DstSize, Op->Flags); } else { // For memory operands the 256-bit variant loads twice the size specified in the table. - const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = DstSize == OpSize::i256Bit; const auto SrcSize = OpSizeFromSrc(Op); const auto LoadSize = Is256Bit ? 
IR::SizeToOpSize(IR::OpSizeToSize(SrcSize) * 2) : SrcSize; @@ -3569,8 +3569,7 @@ Ref OpDispatchBuilder::ExtendVectorElementsImpl(OpcodeArgs, IR::OpSize ElementSi Ref Src = GetSrc(); Ref Result {Src}; - for (auto CurrentElementSize = ElementSize; CurrentElementSize != DstElementSize; - CurrentElementSize = IR::MultiplyOpSize(CurrentElementSize, 2)) { + for (auto CurrentElementSize = ElementSize; CurrentElementSize != DstElementSize; CurrentElementSize = CurrentElementSize << 1) { if (Signed) { Result = _VSXTL(DstSize, CurrentElementSize, Result); } else { @@ -3901,7 +3900,7 @@ void OpDispatchBuilder::VectorVariableBlend(OpcodeArgs, IR::OpSize ElementSize) // // To emulate this on AArch64 // Arithmetic shift right by the element size, then use BSL to select the registers - Mask = _VSShrI(Size, ElementSize, Mask, (ElementSize * 8) - 1); + Mask = _VSShrI(Size, ElementSize, Mask, IR::OpSizeAsBits(ElementSize) - 1); auto Result = _VBSL(Size, Mask, Src, Dest); @@ -3910,7 +3909,7 @@ void OpDispatchBuilder::VectorVariableBlend(OpcodeArgs, IR::OpSize ElementSize) void OpDispatchBuilder::AVXVectorVariableBlend(OpcodeArgs, IR::OpSize ElementSize) { const auto SrcSize = OpSizeFromSrc(Op); - const auto ElementSizeBits = ElementSize * 8; + const auto ElementSizeBits = IR::OpSizeAsBits(ElementSize); Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags); @@ -3947,7 +3946,7 @@ void OpDispatchBuilder::PTestOpImpl(OpSize Size, Ref Dest, Ref Src) { // Set ZF according to Test1. SF will be zeroed since we do a 32-bit test on // the results of a 16-bit value from the UMaxV, so the 32-bit sign bit is // cleared even if the 16-bit scalars were negative. - SetNZ_ZeroCV(32, Test1); + SetNZ_ZeroCV(OpSize::i32Bit, Test1); SetCFInverted(Test2); ZeroPF_AF(); } @@ -3962,7 +3961,7 @@ void OpDispatchBuilder::PTestOp(OpcodeArgs) { void OpDispatchBuilder::VTESTOpImpl(OpSize SrcSize, IR::OpSize ElementSize, Ref Src1, Ref Src2) { InvalidateDeferredFlags(); - const auto ElementSizeInBits = ElementSize * 8; + const auto ElementSizeInBits = IR::OpSizeAsBits(ElementSize); const auto MaskConstant = uint64_t {1} << (ElementSizeInBits - 1); Ref Mask = _VDupFromGPR(SrcSize, ElementSize, _Constant(MaskConstant)); @@ -3985,7 +3984,7 @@ void OpDispatchBuilder::VTESTOpImpl(OpSize SrcSize, IR::OpSize ElementSize, Ref Ref CFInv = _Select(IR::COND_NEQ, AndNotGPR, ZeroConst, OneConst, ZeroConst); // As in PTest, this sets Z appropriately while zeroing the rest of NZCV. 
- SetNZ_ZeroCV(32, AndGPR); + SetNZ_ZeroCV(OpSize::i32Bit, AndGPR); SetCFInverted(CFInv); ZeroPF_AF(); } @@ -4083,7 +4082,7 @@ Ref OpDispatchBuilder::DPPOpImpl(IR::OpSize DstSize, Ref Src1, Ref Src2, uint8_t // Now using the destination mask we choose where the result ends up // It can duplicate and zero results - if (ElementSize == 8) { + if (ElementSize == OpSize::i64Bit) { switch (DstMask) { case 0b01: // Dest[63:0] = Result @@ -4105,7 +4104,7 @@ Ref OpDispatchBuilder::DPPOpImpl(IR::OpSize DstSize, Ref Src1, Ref Src2, uint8_t auto BadPath = [&]() { Ref Result = ZeroVec; - for (size_t i = 0; i < (DstSize / ElementSize); ++i) { + for (size_t i = 0; i < IR::NumElements(DstSize, ElementSize); ++i) { const auto Bit = 1U << (i % 4); if ((DstMask & Bit) != 0) { @@ -4127,13 +4126,13 @@ Ref OpDispatchBuilder::DPPOpImpl(IR::OpSize DstSize, Ref Src1, Ref Src2, uint8_t // Dest[63:32] = Result // Dest[95:64] = Zero // Dest[127:96] = Zero - return _VZip(IR::DivideOpSize(DstSize, 2), ElementSize, ZeroVec, Temp); + return _VZip(DstSize >> 1, ElementSize, ZeroVec, Temp); case 0b0011: // Dest[31:0] = Result // Dest[63:32] = Result // Dest[95:64] = Zero // Dest[127:96] = Zero - return _VDupElement(IR::DivideOpSize(DstSize, 2), ElementSize, Temp, 0); + return _VDupElement(DstSize >> 1, ElementSize, Temp, 0); case 0b0100: // Dest[31:0] = Zero // Dest[63:32] = Zero @@ -4251,7 +4250,7 @@ Ref OpDispatchBuilder::VDPPSOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Ref Temp = _VFMul(DstSize, ElementSize, Src1V, Src2V); // Now we zero out elements based on src mask - for (size_t i = 0; i < (DstSize / ElementSize); ++i) { + for (size_t i = 0; i < IR::NumElements(DstSize, ElementSize); ++i) { const auto Bit = 1U << (i % 4); if ((SrcMask & Bit) == 0) { @@ -4272,7 +4271,7 @@ Ref OpDispatchBuilder::VDPPSOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& // It can duplicate and zero results Ref Result = ZeroVec; - for (size_t i = 0; i < (DstSize / ElementSize); ++i) { + for (size_t i = 0; i < IR::NumElements(DstSize, ElementSize); ++i) { const auto Bit = 1U << (i % 4); if ((DstMask & Bit) != 0) { @@ -4285,17 +4284,17 @@ Ref OpDispatchBuilder::VDPPSOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& template void OpDispatchBuilder::VDPPOp(OpcodeArgs) { - const auto DstSize = GetDstSize(Op); + const auto DstSize = OpSizeFromDst(Op); Ref Result {}; - if (ElementSize == 4 && DstSize == Core::CPUState::XMM_AVX_REG_SIZE) { + if (ElementSize == OpSize::i32Bit && DstSize == OpSize::i256Bit) { // 256-bit DPPS isn't handled by the 128-bit solution. 
Result = VDPPSOpImpl(Op, Op->Src[0], Op->Src[1], Op->Src[2]); } else { Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags); - Result = DPPOpImpl(OpSizeFromDst(Op), Src1, Src2, Op->Src[2].Literal(), ElementSize); + Result = DPPOpImpl(DstSize, Src1, Src2, Op->Src[2].Literal(), ElementSize); } // We don't need to emit a _VMov to clear the upper lane, since DPPOpImpl uses a zero vector @@ -4306,7 +4305,7 @@ void OpDispatchBuilder::VDPPOp(OpcodeArgs) { template void OpDispatchBuilder::VDPPOp(OpcodeArgs); template void OpDispatchBuilder::VDPPOp(OpcodeArgs); -Ref OpDispatchBuilder::MPSADBWOpImpl(size_t SrcSize, Ref Src1, Ref Src2, uint8_t Select) { +Ref OpDispatchBuilder::MPSADBWOpImpl(IR::OpSize SrcSize, Ref Src1, Ref Src2, uint8_t Select) { const auto LaneHelper = [&, this](uint32_t Selector_Src1, uint32_t Selector_Src2, Ref Src1, Ref Src2) { // Src2 will grab a 32bit element and duplicate it across the 128bits Ref DupSrc = _VDupElement(OpSize::i128Bit, OpSize::i32Bit, Src2, Selector_Src2); @@ -4373,7 +4372,7 @@ Ref OpDispatchBuilder::MPSADBWOpImpl(size_t SrcSize, Ref Src1, Ref Src2, uint8_t return _VAddP(OpSize::i128Bit, OpSize::i16Bit, TmpTranspose1, TmpTranspose2); }; - const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = SrcSize == OpSize::i128Bit; // Src1 needs to be in byte offset const uint8_t Select_Src1_Low = ((Select & 0b100) >> 2) * 32 / 8; @@ -4395,7 +4394,7 @@ Ref OpDispatchBuilder::MPSADBWOpImpl(size_t SrcSize, Ref Src1, Ref Src2, uint8_t void OpDispatchBuilder::MPSADBWOp(OpcodeArgs) { const uint8_t Select = Op->Src[1].Literal(); - const uint8_t SrcSize = GetSrcSize(Op); + const auto SrcSize = OpSizeFromSrc(Op); Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags); Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); @@ -4405,7 +4404,7 @@ void OpDispatchBuilder::MPSADBWOp(OpcodeArgs) { void OpDispatchBuilder::VMPSADBWOp(OpcodeArgs) { const uint8_t Select = Op->Src[2].Literal(); - const uint8_t SrcSize = GetSrcSize(Op); + const auto SrcSize = OpSizeFromSrc(Op); Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags); @@ -4463,7 +4462,7 @@ void OpDispatchBuilder::VCVTPS2PHOp(OpcodeArgs) { // We need to eliminate upper junk if we're storing into a register with // a 256-bit source (VCVTPS2PH's destination for registers is an XMM). 
- if (Op->Src[0].IsGPR() && SrcSize == Core::CPUState::XMM_AVX_REG_SIZE) { + if (Op->Src[0].IsGPR() && SrcSize == OpSize::i256Bit) { Result = _VMov(OpSize::i128Bit, Result); } @@ -4617,7 +4616,7 @@ Ref OpDispatchBuilder::VBLENDOpImpl(IR::OpSize VecSize, IR::OpSize ElementSize, void OpDispatchBuilder::VBLENDPDOp(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); - const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = DstSize == OpSize::i256Bit; const auto Selector = Op->Src[2].Literal(); Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); @@ -4642,7 +4641,7 @@ void OpDispatchBuilder::VBLENDPDOp(OpcodeArgs) { void OpDispatchBuilder::VPBLENDDOp(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); - const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = DstSize == OpSize::i256Bit; const auto Selector = Op->Src[2].Literal(); Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); @@ -4686,7 +4685,7 @@ void OpDispatchBuilder::VPBLENDDOp(OpcodeArgs) { void OpDispatchBuilder::VPBLENDWOp(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); - const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = DstSize == OpSize::i128Bit; const auto Selector = Op->Src[2].Literal(); Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); @@ -4718,7 +4717,7 @@ void OpDispatchBuilder::VPBLENDWOp(OpcodeArgs) { void OpDispatchBuilder::VZEROOp(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); - const auto IsVZEROALL = DstSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto IsVZEROALL = DstSize == OpSize::i256Bit; const auto NumRegs = CTX->Config.Is64BitMode ? 16U : 8U; if (IsVZEROALL) { @@ -4743,7 +4742,7 @@ void OpDispatchBuilder::VZEROOp(OpcodeArgs) { void OpDispatchBuilder::VPERMILImmOp(OpcodeArgs, IR::OpSize ElementSize) { const auto DstSize = OpSizeFromDst(Op); - const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = DstSize == OpSize::i256Bit; const auto Selector = Op->Src[1].Literal() & 0xFF; Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); @@ -4780,7 +4779,7 @@ Ref OpDispatchBuilder::VPERMILRegOpImpl(OpSize DstSize, IR::OpSize ElementSize, // The only difference here is that we need to add 16 to the upper lane // before doing the final addition to build up the indices for TBL. - const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = DstSize == OpSize::i256Bit; auto IsPD = ElementSize == OpSize::i64Bit; if (IsPD) { @@ -4856,7 +4855,7 @@ void OpDispatchBuilder::PCMPXSTRXOpImpl(OpcodeArgs, bool IsExplicit, bool IsMask // While the control bit immediate for the instruction itself is only ever 8 bits // in size, we use it as a 16-bit value so that we can use the 8th bit to signify // whether or not RAX and RDX should be interpreted as a 64-bit value. - const auto SrcSize = GetSrcSize(Op); + const auto SrcSize = OpSizeFromSrc(Op); const auto Is64Bit = SrcSize == OpSize::i64Bit; const auto NewControl = uint16_t(Control | (uint16_t(Is64Bit) << 8)); @@ -4935,7 +4934,7 @@ void OpDispatchBuilder::VPCMPISTRMOp(OpcodeArgs) { void OpDispatchBuilder::VFMAImpl(OpcodeArgs, IROps IROp, bool Scalar, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx) { const auto Size = OpSizeFromDst(Op); - const auto Is256Bit = Size == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = Size == OpSize::i256Bit; const OpSize ElementSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? 
OpSize::i64Bit : OpSize::i32Bit; @@ -4964,7 +4963,7 @@ void OpDispatchBuilder::VFMAImpl(OpcodeArgs, IROps IROp, bool Scalar, uint8_t Sr void OpDispatchBuilder::VFMAddSubImpl(OpcodeArgs, bool AddSub, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx) { const auto Size = OpSizeFromDst(Op); - const auto Is256Bit = Size == Core::CPUState::XMM_AVX_REG_SIZE; + const auto Is256Bit = Size == OpSize::i256Bit; const OpSize ElementSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit; @@ -5024,20 +5023,20 @@ void OpDispatchBuilder::VPGATHER(OpcodeArgs) { LOGMAN_THROW_A_FMT(AddrElementSize == OpSize::i32Bit || AddrElementSize == OpSize::i64Bit, "Unknown address element size"); const auto Size = OpSizeFromDst(Op); - const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE; + const auto Is128Bit = Size == OpSize::i128Bit; ///< Element size is determined by W flag. const OpSize ElementLoadSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit; // We only need the high address register if the number of data elements is more than what the low half can consume. // But also the number of address elements is clamped by the destination size as well. - const size_t NumDataElements = Size / ElementLoadSize; - const size_t NumAddrElementBytes = std::min(Size, (NumDataElements * AddrElementSize)); - const bool Needs128BitHighAddrBytes = NumAddrElementBytes > OpSize::i128Bit; + const size_t NumDataElements = IR::NumElements(Size, ElementLoadSize); + const size_t NumAddrElementBytes = std::min(IR::OpSizeToSize(Size), (NumDataElements * IR::OpSizeToSize(AddrElementSize))); + const bool Needs128BitHighAddrBytes = NumAddrElementBytes > IR::OpSizeToSize(OpSize::i128Bit); auto VSIB = LoadVSIB(Op, Op->Src[0], Op->Flags); - const bool SupportsSVELoad = (VSIB.Scale == 1 || VSIB.Scale == AddrElementSize) && (AddrElementSize == ElementLoadSize); + const bool SupportsSVELoad = (VSIB.Scale == 1 || VSIB.Scale == IR::OpSizeToSize(AddrElementSize)) && (AddrElementSize == ElementLoadSize); Ref Dest = LoadSource(FPRClass, Op, Op->Dest, Op->Flags); Ref Mask = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags); @@ -5067,7 +5066,7 @@ void OpDispatchBuilder::VPGATHER(OpcodeArgs) { } } - auto Result128 = AVX128_VPGatherImpl(SizeToOpSize(Size), ElementLoadSize, AddrElementSize, Dest128, Mask128, VSIB128); + auto Result128 = AVX128_VPGatherImpl(Size, ElementLoadSize, AddrElementSize, Dest128, Mask128, VSIB128); // The registers are current split, need to merge them. Result = _VInsElement(OpSize::i256Bit, OpSize::i128Bit, 1, 0, Result128.Low, Result128.High); } else { diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp index 2db05ada7f..b165dd1dfb 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp @@ -103,7 +103,7 @@ void OpDispatchBuilder::FILD(OpcodeArgs) { // Sign extend to 64bits if (ReadWidth != OpSize::i64Bit) { - Data = _Sbfe(OpSize::i64Bit, ReadWidth * 8, 0, Data); + Data = _Sbfe(OpSize::i64Bit, IR::OpSizeAsBits(ReadWidth), 0, Data); } // We're about to clobber flags to grab the sign, so save NZCV. 
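The FILD hunk above swaps the old `ReadWidth * 8` arithmetic for `IR::OpSizeAsBits(ReadWidth)`, since a scoped `enum class OpSize` no longer converts to an integer implicitly. A minimal standalone sketch of that relationship follows; the enum subset and the `OpSizeToSize` body are simplified assumptions based on the "bytes match the name" rule stated in IR.h, not the FEXCore definitions:

#include <cstdint>

// Simplified subset of IR::OpSize; enumerator values are the byte widths they name.
enum class OpSize : uint8_t { iUnsized = 0, i8Bit = 1, i16Bit = 2, i32Bit = 4, i64Bit = 8 };

// Assumed shape of IR::OpSizeToSize: recover the byte count from the enum.
constexpr uint16_t OpSizeToSize(OpSize Size) { return static_cast<uint16_t>(Size); }

// Mirrors IR::OpSizeAsBits from IR.h (bytes * 8), which FILD now feeds to _Sbfe.
constexpr uint16_t OpSizeAsBits(OpSize Size) { return OpSizeToSize(Size) * 8u; }

int main() {
  constexpr auto ReadWidth = OpSize::i16Bit;
  // The old code wrote `ReadWidth * 8`; with enum class that no longer compiles,
  // so the helper spells out the same bit width explicitly.
  static_assert(OpSizeAsBits(ReadWidth) == 16);
  static_assert(OpSizeAsBits(OpSize::i32Bit) == 32);
  return 0;
}

The same substitution drives every `* 8` to `OpSizeAsBits` change elsewhere in this patch.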
@@ -351,33 +351,33 @@ void OpDispatchBuilder::X87FNSTENV(OpcodeArgs) { _StoreMem(GPRClass, Size, Mem, FCW, Size); } - { _StoreMem(GPRClass, Size, ReconstructFSW_Helper(), Mem, _Constant(Size * 1), Size, MEM_OFFSET_SXTX, OpSize::i8Bit); } + { _StoreMem(GPRClass, Size, ReconstructFSW_Helper(), Mem, _Constant(IR::OpSizeToSize(Size) * 1), Size, MEM_OFFSET_SXTX, 1); } auto ZeroConst = _Constant(0); { // FTW - _StoreMem(GPRClass, Size, GetX87FTW_Helper(), Mem, _Constant(Size * 2), Size, MEM_OFFSET_SXTX, OpSize::i8Bit); + _StoreMem(GPRClass, Size, GetX87FTW_Helper(), Mem, _Constant(IR::OpSizeToSize(Size) * 2), Size, MEM_OFFSET_SXTX, 1); } { // Instruction Offset - _StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 3), Size, MEM_OFFSET_SXTX, OpSize::i8Bit); + _StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 3), Size, MEM_OFFSET_SXTX, 1); } { // Instruction CS selector (+ Opcode) - _StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 4), Size, MEM_OFFSET_SXTX, OpSize::i8Bit); + _StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 4), Size, MEM_OFFSET_SXTX, 1); } { // Data pointer offset - _StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 5), Size, MEM_OFFSET_SXTX, OpSize::i8Bit); + _StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 5), Size, MEM_OFFSET_SXTX, 1); } { // Data pointer selector - _StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 6), Size, MEM_OFFSET_SXTX, OpSize::i8Bit); + _StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 6), Size, MEM_OFFSET_SXTX, 1); } } @@ -407,13 +407,13 @@ void OpDispatchBuilder::X87LDENV(OpcodeArgs) { auto NewFCW = _LoadMem(GPRClass, OpSize::i16Bit, Mem, OpSize::i16Bit); _StoreContext(OpSize::i16Bit, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW)); - Ref MemLocation = _Add(OpSize::i64Bit, Mem, _Constant(Size * 1)); + Ref MemLocation = _Add(OpSize::i64Bit, Mem, _Constant(IR::OpSizeToSize(Size) * 1)); auto NewFSW = _LoadMem(GPRClass, Size, MemLocation, Size); ReconstructX87StateFromFSW_Helper(NewFSW); { // FTW - Ref MemLocation = _Add(OpSize::i64Bit, Mem, _Constant(Size * 2)); + Ref MemLocation = _Add(OpSize::i64Bit, Mem, _Constant(IR::OpSizeToSize(Size) * 2)); SetX87FTW(_LoadMem(GPRClass, Size, MemLocation, Size)); } } @@ -447,58 +447,58 @@ void OpDispatchBuilder::X87FNSAVE(OpcodeArgs) { _StoreMem(GPRClass, Size, Mem, FCW, Size); } - { _StoreMem(GPRClass, Size, ReconstructFSW_Helper(), Mem, _Constant(Size * 1), Size, MEM_OFFSET_SXTX, 1); } + { _StoreMem(GPRClass, Size, ReconstructFSW_Helper(), Mem, _Constant(IR::OpSizeToSize(Size) * 1), Size, MEM_OFFSET_SXTX, 1); } auto ZeroConst = _Constant(0); { // FTW - _StoreMem(GPRClass, Size, GetX87FTW_Helper(), Mem, _Constant(Size * 2), Size, MEM_OFFSET_SXTX, 1); + _StoreMem(GPRClass, Size, GetX87FTW_Helper(), Mem, _Constant(IR::OpSizeToSize(Size) * 2), Size, MEM_OFFSET_SXTX, 1); } { // Instruction Offset - _StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 3), Size, MEM_OFFSET_SXTX, 1); + _StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 3), Size, MEM_OFFSET_SXTX, 1); } { // Instruction CS selector (+ Opcode) - _StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 4), Size, MEM_OFFSET_SXTX, 1); + _StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 4), Size, MEM_OFFSET_SXTX, 1); } { // Data pointer offset - _StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 5), Size, MEM_OFFSET_SXTX, 1); + 
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 5), Size, MEM_OFFSET_SXTX, 1); } { // Data pointer selector - _StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 6), Size, MEM_OFFSET_SXTX, 1); + _StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(IR::OpSizeToSize(Size) * 6), Size, MEM_OFFSET_SXTX, 1); } auto OneConst = _Constant(1); auto SevenConst = _Constant(7); const auto LoadSize = ReducedPrecisionMode ? OpSize::i64Bit : OpSize::i128Bit; for (int i = 0; i < 7; ++i) { - Ref data = _LoadContextIndexed(Top, LoadSize, MMBaseOffset(), OpSize::i128Bit, FPRClass); + Ref data = _LoadContextIndexed(Top, LoadSize, MMBaseOffset(), IR::OpSizeToSize(OpSize::i128Bit), FPRClass); if (ReducedPrecisionMode) { data = _F80CVTTo(data, OpSize::i64Bit); } - _StoreMem(FPRClass, OpSize::i128Bit, data, Mem, _Constant((Size * 7) + (10 * i)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1); + _StoreMem(FPRClass, OpSize::i128Bit, data, Mem, _Constant((IR::OpSizeToSize(Size) * 7) + (10 * i)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1); Top = _And(OpSize::i32Bit, _Add(OpSize::i32Bit, Top, OneConst), SevenConst); } // The final st(7) needs a bit of special handling here - Ref data = _LoadContextIndexed(Top, LoadSize, MMBaseOffset(), OpSize::i128Bit, FPRClass); + Ref data = _LoadContextIndexed(Top, LoadSize, MMBaseOffset(), IR::OpSizeToSize(OpSize::i128Bit), FPRClass); if (ReducedPrecisionMode) { data = _F80CVTTo(data, OpSize::i64Bit); } // ST7 broken in to two parts // Lower 64bits [63:0] // upper 16 bits [79:64] - _StoreMem(FPRClass, OpSize::i64Bit, data, Mem, _Constant((Size * 7) + (7 * 10)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1); + _StoreMem(FPRClass, OpSize::i64Bit, data, Mem, _Constant((IR::OpSizeToSize(Size) * 7) + (7 * 10)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1); auto topBytes = _VDupElement(OpSize::i128Bit, OpSize::i16Bit, data, 4); - _StoreMem(FPRClass, OpSize::i16Bit, topBytes, Mem, _Constant((Size * 7) + (7 * 10) + 8), OpSize::i8Bit, MEM_OFFSET_SXTX, 1); + _StoreMem(FPRClass, OpSize::i16Bit, topBytes, Mem, _Constant((IR::OpSizeToSize(Size) * 7) + (7 * 10) + 8), OpSize::i8Bit, MEM_OFFSET_SXTX, 1); // reset to default FNINIT(Op); @@ -522,11 +522,11 @@ void OpDispatchBuilder::X87FRSTOR(OpcodeArgs) { _SetRoundingMode(roundingMode, false, roundingMode); } - auto NewFSW = _LoadMem(GPRClass, Size, Mem, _Constant(Size * 1), Size, MEM_OFFSET_SXTX, 1); + auto NewFSW = _LoadMem(GPRClass, Size, Mem, _Constant(IR::OpSizeToSize(Size) * 1), Size, MEM_OFFSET_SXTX, 1); Ref Top = ReconstructX87StateFromFSW_Helper(NewFSW); { // FTW - SetX87FTW(_LoadMem(GPRClass, Size, Mem, _Constant(Size * 2), Size, MEM_OFFSET_SXTX, 1)); + SetX87FTW(_LoadMem(GPRClass, Size, Mem, _Constant(IR::OpSizeToSize(Size) * 2), Size, MEM_OFFSET_SXTX, 1)); } auto OneConst = _Constant(1); @@ -538,14 +538,14 @@ void OpDispatchBuilder::X87FRSTOR(OpcodeArgs) { Mask = _VInsGPR(OpSize::i128Bit, OpSize::i64Bit, 1, Mask, high); const auto StoreSize = ReducedPrecisionMode ? 
OpSize::i64Bit : OpSize::i128Bit; for (int i = 0; i < 7; ++i) { - Ref Reg = _LoadMem(FPRClass, OpSize::i128Bit, Mem, _Constant((Size * 7) + (10 * i)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1); + Ref Reg = _LoadMem(FPRClass, OpSize::i128Bit, Mem, _Constant((IR::OpSizeToSize(Size) * 7) + (10 * i)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1); // Mask off the top bits Reg = _VAnd(OpSize::i128Bit, OpSize::i128Bit, Reg, Mask); if (ReducedPrecisionMode) { // Convert to double precision Reg = _F80CVT(OpSize::i64Bit, Reg); } - _StoreContextIndexed(Reg, Top, StoreSize, MMBaseOffset(), OpSize::i128Bit, FPRClass); + _StoreContextIndexed(Reg, Top, StoreSize, MMBaseOffset(), IR::OpSizeToSize(OpSize::i128Bit), FPRClass); Top = _And(OpSize::i32Bit, _Add(OpSize::i32Bit, Top, OneConst), SevenConst); } @@ -554,13 +554,14 @@ void OpDispatchBuilder::X87FRSTOR(OpcodeArgs) { // ST7 broken in to two parts // Lower 64bits [63:0] // upper 16 bits [79:64] - Ref Reg = _LoadMem(FPRClass, OpSize::i64Bit, Mem, _Constant((Size * 7) + (10 * 7)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1); - Ref RegHigh = _LoadMem(FPRClass, OpSize::i16Bit, Mem, _Constant((Size * 7) + (10 * 7) + 8), OpSize::i8Bit, MEM_OFFSET_SXTX, 1); + Ref Reg = _LoadMem(FPRClass, OpSize::i64Bit, Mem, _Constant((IR::OpSizeToSize(Size) * 7) + (10 * 7)), OpSize::i8Bit, MEM_OFFSET_SXTX, 1); + Ref RegHigh = + _LoadMem(FPRClass, OpSize::i16Bit, Mem, _Constant((IR::OpSizeToSize(Size) * 7) + (10 * 7) + 8), OpSize::i8Bit, MEM_OFFSET_SXTX, 1); Reg = _VInsElement(OpSize::i128Bit, OpSize::i16Bit, 4, 0, Reg, RegHigh); if (ReducedPrecisionMode) { Reg = _F80CVT(OpSize::i64Bit, Reg); // Convert to double precision } - _StoreContextIndexed(Reg, Top, StoreSize, MMBaseOffset(), OpSize::i128Bit, FPRClass); + _StoreContextIndexed(Reg, Top, StoreSize, MMBaseOffset(), IR::OpSizeToSize(OpSize::i128Bit), FPRClass); } // Load / Store Control Word diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp index 4537132e3e..1d357946cf 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp @@ -36,12 +36,12 @@ void OpDispatchBuilder::X87LDENVF64(OpcodeArgs) { _SetRoundingMode(roundingMode, false, roundingMode); _StoreContext(OpSize::i16Bit, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW)); - auto NewFSW = _LoadMem(GPRClass, Size, Mem, _Constant(Size * 1), Size, MEM_OFFSET_SXTX, 1); + auto NewFSW = _LoadMem(GPRClass, Size, Mem, _Constant(IR::OpSizeToSize(Size)), Size, MEM_OFFSET_SXTX, 1); ReconstructX87StateFromFSW_Helper(NewFSW); { // FTW - SetX87FTW(_LoadMem(GPRClass, Size, Mem, _Constant(Size * 2), Size, MEM_OFFSET_SXTX, 1)); + SetX87FTW(_LoadMem(GPRClass, Size, Mem, _Constant(IR::OpSizeToSize(Size) * 2), Size, MEM_OFFSET_SXTX, 1)); } } @@ -97,7 +97,7 @@ void OpDispatchBuilder::FILDF64(OpcodeArgs) { // Read from memory Ref Data = LoadSource_WithOpSize(GPRClass, Op, Op->Src[0], ReadWidth, Op->Flags); if (ReadWidth == OpSize::i16Bit) { - Data = _Sbfe(OpSize::i64Bit, ReadWidth * 8, 0, Data); + Data = _Sbfe(OpSize::i64Bit, IR::OpSizeAsBits(ReadWidth), 0, Data); } auto ConvertedData = _Float_FromGPR_S(OpSize::i64Bit, ReadWidth == OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, Data); _PushStack(ConvertedData, Data, ReadWidth, false); @@ -117,9 +117,9 @@ void OpDispatchBuilder::FISTF64(OpcodeArgs, bool Truncate) { Ref data = _ReadStackValue(0); if (Truncate) { - data = _Float_ToGPR_ZS(Size == 4 ? 
OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data); + data = _Float_ToGPR_ZS(Size == OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data); } else { - data = _Float_ToGPR_S(Size == 4 ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data); + data = _Float_ToGPR_S(Size == OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data); } StoreResult_WithOpSize(GPRClass, Op, Op->Dest, data, Size, OpSize::i8Bit); @@ -339,7 +339,7 @@ void OpDispatchBuilder::FCOMIF64(OpcodeArgs, IR::OpSize Width, bool Integer, OpD if (Width == OpSize::i16Bit) { arg = _Sbfe(OpSize::i64Bit, 16, 0, arg); } - b = _Float_FromGPR_S(OpSize::i64Bit, Width == 64 ? OpSize::i64Bit : OpSize::i32Bit, arg); + b = _Float_FromGPR_S(OpSize::i64Bit, Width == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit, arg); } else if (Width == OpSize::i32Bit) { arg = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); b = _Float_FToF(OpSize::i64Bit, OpSize::i32Bit, arg); diff --git a/FEXCore/Source/Interface/IR/IR.h b/FEXCore/Source/Interface/IR/IR.h index 86279fb79f..3e73f838ed 100644 --- a/FEXCore/Source/Interface/IR/IR.h +++ b/FEXCore/Source/Interface/IR/IR.h @@ -548,7 +548,7 @@ class NodeIterator { // This must directly match bytes to the named opsize. // Implicit sized IR operations does math to get between sizes. -enum OpSize : uint8_t { +enum class OpSize : uint8_t { iUnsized = 0, i8Bit = 1, i16Bit = 2, @@ -615,14 +615,18 @@ static inline uint16_t OpSizeAsBits(IR::OpSize Size) { return IR::OpSizeToSize(Size) * 8u; } -static inline OpSize MultiplyOpSize(IR::OpSize Size, uint8_t Multiplier) { +template<typename T> +requires (std::is_integral_v<T>) +static inline OpSize operator<<(IR::OpSize Size, T Shift) { LOGMAN_THROW_A_FMT(Size != IR::OpSize::iInvalid, "Invalid Size"); - return IR::SizeToOpSize(IR::OpSizeToSize(Size) * Multiplier); + return IR::SizeToOpSize(IR::OpSizeToSize(Size) << Shift); } -static inline OpSize DivideOpSize(IR::OpSize Size, uint8_t Divisor) { +template<typename T> +requires (std::is_integral_v<T>) +static inline OpSize operator>>(IR::OpSize Size, T Shift) { LOGMAN_THROW_A_FMT(Size != IR::OpSize::iInvalid, "Invalid Size"); - return IR::SizeToOpSize(IR::OpSizeToSize(Size) / Divisor); + return IR::SizeToOpSize(IR::OpSizeToSize(Size) >> Shift); } static inline OpSize operator/(IR::OpSize Size, IR::OpSize Divisor) { @@ -630,7 +634,9 @@ static inline OpSize operator/(IR::OpSize Size, IR::OpSize Divisor) { return IR::SizeToOpSize(IR::OpSizeToSize(Size) / IR::OpSizeToSize(Divisor)); } -static inline OpSize operator/(IR::OpSize Size, uint8_t Divisor) { +template<typename T> +requires (std::is_integral_v<T>) +static inline OpSize operator/(IR::OpSize Size, T Divisor) { LOGMAN_THROW_A_FMT(Size != IR::OpSize::iInvalid, "Invalid Size"); return IR::SizeToOpSize(IR::OpSizeToSize(Size) / Divisor); } diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json index 8d59bb8370..cad007c6a2 100644 --- a/FEXCore/Source/Interface/IR/IR.json +++ b/FEXCore/Source/Interface/IR/IR.json @@ -736,7 +736,7 @@ "HasSideEffects": true, "DestSize": "RegisterSize", "EmitValidation": [ - "Offset % RegisterSize == 0", + "Offset % IR::OpSizeToSize(RegisterSize) == 0", "RegisterSize == FEXCore::IR::OpSize::i128Bit || RegisterSize == FEXCore::IR::OpSize::i256Bit" ] }, @@ -748,7 +748,7 @@ "HasSideEffects": true, "DestSize": "RegisterSize", "EmitValidation": [ - "Offset % RegisterSize == 0", + "Offset % IR::OpSizeToSize(RegisterSize) == 0", "RegisterSize == FEXCore::IR::OpSize::i128Bit" ] }, @@ -760,7 +760,7 @@ "HasSideEffects": true,
"DestSize": "RegisterSize", "EmitValidation": [ - "Offset % RegisterSize == 0", + "Offset % IR::OpSizeToSize(RegisterSize) == 0", "RegisterSize == FEXCore::IR::OpSize::i128Bit || RegisterSize == FEXCore::IR::OpSize::i256Bit" ] } @@ -2017,7 +2017,7 @@ "TiedSource": 0, "Desc": "Unsigned shifts right each element and then narrows to the next lower element size", "DestSize": "RegisterSize", - "NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))" + "NumElements": "RegisterSize / (ElementSize >> 1)" }, "FPR = VUShrNI2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper, u8:$BitShift": { @@ -2026,73 +2026,73 @@ "Inserts results in to the high elements of the first argument" ], "DestSize": "RegisterSize", - "NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))" + "NumElements": "RegisterSize / (ElementSize >> 1)" }, "FPR = VSXTL OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc": "Sign extends elements from the source element size to the next size up", "DestSize": "RegisterSize", - "NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))" + "NumElements": "RegisterSize / (ElementSize << 1)" }, "FPR = VSXTL2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc": ["Sign extends elements from the source element size to the next size up", "Source elements come from the upper half of the register" ], "DestSize": "RegisterSize", - "NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))" + "NumElements": "RegisterSize / (ElementSize << 1)" }, "FPR = VSSHLL OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, u8:$BitShift{0}": { "Desc": "Sign extends elements from the source element size to the next size up", "DestSize": "RegisterSize", - "NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))" + "NumElements": "RegisterSize / (ElementSize << 1)" }, "FPR = VSSHLL2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, u8:$BitShift{0}": { "Desc": ["Sign extends elements from the source element size to the next size up", "Source elements come from the upper half of the register" ], "DestSize": "RegisterSize", - "NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))" + "NumElements": "RegisterSize / (ElementSize << 1)" }, "FPR = VUXTL OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc": "Zero extends elements from the source element size to the next size up", "DestSize": "RegisterSize", - "NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))" + "NumElements": "RegisterSize / (ElementSize << 1)" }, "FPR = VUXTL2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc": ["Zero extends elements from the source element size to the next size up", "Source elements come from the upper half of the register" ], "DestSize": "RegisterSize", - "NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))" + "NumElements": "RegisterSize / (ElementSize << 1)" }, "FPR = VSQXTN OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "TiedSource": 0, "DestSize": "RegisterSize", - "NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))" + "NumElements": "RegisterSize / (ElementSize >> 1)" }, "FPR = VSQXTN2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": { "TiedSource": 0, "DestSize": "RegisterSize", - "NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))" + "NumElements": "RegisterSize / (ElementSize >> 1)" }, "FPR = VSQXTNPair OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, 
FPR:$VectorUpper": { "Desc": ["Does both VSQXTN and VSQXTN2 in a combined operation." ], "DestSize": "RegisterSize", - "NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))" + "NumElements": "RegisterSize / (ElementSize >> 1)" }, "FPR = VSQXTUN OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "DestSize": "RegisterSize", - "NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))" + "NumElements": "RegisterSize / (ElementSize >> 1)" }, "FPR = VSQXTUN2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": { "DestSize": "RegisterSize", - "NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))" + "NumElements": "RegisterSize / (ElementSize >> 1)" }, "FPR = VSQXTUNPair OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": { "Desc": ["Does both VSQXTUN and VSQXTUN2 in a combined operation." ], "DestSize": "RegisterSize", - "NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))" + "NumElements": "RegisterSize / (ElementSize >> 1)" }, "FPR = VSRSHR OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, u8:$BitShift": { "Desc": ["Signed rounding shift right by immediate", @@ -2271,24 +2271,24 @@ }, "FPR = VUMull OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", - "NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))" + "NumElements": "RegisterSize / (ElementSize << 1)" }, "FPR = VSMull OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "Desc": [ "Does a signed integer multiply with extend.", "ElementSize is the source size" ], "DestSize": "RegisterSize", - "NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))" + "NumElements": "RegisterSize / (ElementSize << 1)" }, "FPR = VUMull2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "Desc": "Multiplies the high elements with size extension", "DestSize": "RegisterSize", - "NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))" + "NumElements": "RegisterSize / (ElementSize << 1)" }, "FPR = VSMull2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "Desc": "Multiplies the high elements with size extension", "DestSize": "RegisterSize", - "NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))" + "NumElements": "RegisterSize / (ElementSize << 1)" }, "FPR = VUMulH OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "Desc": "Wide unsigned multiply returning the high results", @@ -2305,14 +2305,14 @@ "Desc": ["Unsigned Absolute Difference Long" ], "DestSize": "RegisterSize", - "NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))" + "NumElements": "RegisterSize / (ElementSize << 1)" }, "FPR = VUABDL2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "Desc": ["Unsigned Absolute Difference Long", "Using the high elements of the source vectors" ], "DestSize": "RegisterSize", - "NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))" + "NumElements": "RegisterSize / (ElementSize << 1)" }, "FPR = VUShl OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, FPR:$ShiftVector, i1:$RangeCheck": { "TiedSource": 0, @@ -2580,7 +2580,7 @@ "Selecting from the high half of the register." 
], "DestSize": "RegisterSize", - "NumElements": "RegisterSize / (IR::MultiplyOpSize(ElementSize, 2))", + "NumElements": "RegisterSize / (ElementSize << 1)", "EmitValidation": [ "RegisterSize != FEXCore::IR::OpSize::i256Bit && \"What does 256-bit mean in this context?\"" ] @@ -2594,7 +2594,7 @@ "F64->F32, F32->F16" ], "DestSize": "RegisterSize", - "NumElements": "RegisterSize / (IR::DivideOpSize(ElementSize, 2))", + "NumElements": "RegisterSize / (ElementSize >> 1)", "EmitValidation": [ "RegisterSize != FEXCore::IR::OpSize::i256Bit && \"What does 256-bit mean in this context?\"" ] diff --git a/FEXCore/Source/Interface/IR/IRDumper.cpp b/FEXCore/Source/Interface/IR/IRDumper.cpp index c2029179d4..c1fef63d10 100644 --- a/FEXCore/Source/Interface/IR/IRDumper.cpp +++ b/FEXCore/Source/Interface/IR/IRDumper.cpp @@ -112,17 +112,17 @@ static void PrintArg(fextl::stringstream* out, const IRListView* IR, OrderedNode } if (GetHasDest(IROp->Op)) { - uint32_t ElementSize = IROp->ElementSize; - uint32_t NumElements = IROp->Size; - if (!IROp->ElementSize) { + auto ElementSize = IROp->ElementSize; + uint32_t NumElements = 0; + if (IROp->ElementSize == OpSize::iUnsized) { ElementSize = IROp->Size; } - if (ElementSize) { - NumElements /= ElementSize; + if (ElementSize != OpSize::iUnsized) { + NumElements = IR::NumElements(IROp->Size, ElementSize); } - *out << " i" << std::dec << (ElementSize * 8); + *out << " i" << std::dec << IR::OpSizeAsBits(ElementSize); if (NumElements > 1) { *out << "v" << std::dec << NumElements; @@ -296,11 +296,11 @@ void Dump(fextl::stringstream* out, const IRListView* IR, IR::RegisterAllocation auto ElementSize = IROp->ElementSize; uint8_t NumElements = 0; - if (!IROp->ElementSize) { + if (IROp->ElementSize == OpSize::iUnsized) { ElementSize = IROp->Size; } - if (ElementSize) { + if (ElementSize != OpSize::iUnsized) { NumElements = IR::NumElements(IROp->Size, ElementSize); } @@ -324,7 +324,7 @@ void Dump(fextl::stringstream* out, const IRListView* IR, IR::RegisterAllocation } } - *out << " i" << std::dec << (ElementSize * 8); + *out << " i" << std::dec << IR::OpSizeAsBits(ElementSize); if (NumElements > 1) { *out << "v" << std::dec << NumElements; @@ -334,16 +334,16 @@ void Dump(fextl::stringstream* out, const IRListView* IR, IR::RegisterAllocation } else { auto ElementSize = IROp->ElementSize; - if (!IROp->ElementSize) { + if (IROp->ElementSize == OpSize::iUnsized) { ElementSize = IROp->Size; } uint32_t NumElements = 0; - if (ElementSize) { + if (ElementSize != OpSize::iUnsized) { NumElements = IR::NumElements(IROp->Size, ElementSize); } *out << "(%" << std::dec << ID << ' '; - *out << 'i' << std::dec << (ElementSize * 8); + *out << 'i' << std::dec << IR::OpSizeAsBits(ElementSize); if (NumElements > 1) { *out << 'v' << std::dec << NumElements; } diff --git a/FEXCore/Source/Interface/IR/IREmitter.h b/FEXCore/Source/Interface/IR/IREmitter.h index bf33844393..0cfc4027be 100644 --- a/FEXCore/Source/Interface/IR/IREmitter.h +++ b/FEXCore/Source/Interface/IR/IREmitter.h @@ -71,19 +71,18 @@ class IREmitter { return _Jump(InvalidNode); } IRPair _CondJump(Ref ssa0, CondClassType cond = {COND_NEQ}) { - return _CondJump(ssa0, _Constant(0), InvalidNode, InvalidNode, cond, IR::SizeToOpSize(GetOpSize(ssa0))); + return _CondJump(ssa0, _Constant(0), InvalidNode, InvalidNode, cond, GetOpSize(ssa0)); } IRPair _CondJump(Ref ssa0, Ref ssa1, Ref ssa2, CondClassType cond = {COND_NEQ}) { - return _CondJump(ssa0, _Constant(0), ssa1, ssa2, cond, IR::SizeToOpSize(GetOpSize(ssa0))); + return _CondJump(ssa0,
_Constant(0), ssa1, ssa2, cond, GetOpSize(ssa0)); } // TODO: Work to remove this implicit sized Select implementation. - IRPair _Select(uint8_t Cond, Ref ssa0, Ref ssa1, Ref ssa2, Ref ssa3, uint8_t CompareSize = 0) { - if (CompareSize == 0) { - CompareSize = std::max(4, std::max(GetOpSize(ssa0), GetOpSize(ssa1))); + IRPair _Select(uint8_t Cond, Ref ssa0, Ref ssa1, Ref ssa2, Ref ssa3, IR::OpSize CompareSize = OpSize::iUnsized) { + if (CompareSize == OpSize::iUnsized) { + CompareSize = std::max(OpSize::i32Bit, std::max(GetOpSize(ssa0), GetOpSize(ssa1))); } - return _Select(IR::SizeToOpSize(std::max(4, std::max(GetOpSize(ssa2), GetOpSize(ssa3)))), - IR::SizeToOpSize(CompareSize), CondClassType {Cond}, ssa0, ssa1, ssa2, ssa3); + return _Select(std::max(OpSize::i32Bit, std::max(GetOpSize(ssa2), GetOpSize(ssa3))), CompareSize, CondClassType {Cond}, ssa0, ssa1, ssa2, ssa3); } IRPair _LoadMem(FEXCore::IR::RegisterClassType Class, IR::OpSize Size, Ref ssa0, IR::OpSize Align = OpSize::i8Bit) { return _LoadMem(Class, Size, ssa0, Invalid(), Align, MEM_OFFSET_SXTX, 1); diff --git a/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp b/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp index 5ead4ffe79..7209e329a1 100644 --- a/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp +++ b/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp @@ -29,7 +29,7 @@ desc: ConstProp, ZExt elim, const pooling, fcmp reduction, const inlining namespace FEXCore::IR { uint64_t getMask(IROp_Header* Op) { - uint64_t NumBits = Op->Size * 8; + uint64_t NumBits = IR::OpSizeAsBits(Op->Size); return (~0ULL) >> (64 - NumBits); } @@ -91,7 +91,7 @@ class ConstProp final : public FEXCore::IR::Pass { // We don't allow 8/16-bit operations to have constants, since no // constant would be in bounds after the JIT's 24/16 shift. auto Filter = [&IROp](uint64_t X) { - return ARMEmitter::IsImmAddSub(X) && IROp->Size >= 4; + return ARMEmitter::IsImmAddSub(X) && IROp->Size >= OpSize::i32Bit; }; return InlineIf(IREmit, CurrentIR, CodeNode, IROp, Index, Filter); @@ -112,7 +112,7 @@ class ConstProp final : public FEXCore::IR::Pass { IsSIMM9 &= (SupportsTSOImm9 || !TSO); // Extended offsets for regular loadstore only. - bool IsExtended = (Imm & (IROp->Size - 1)) == 0 && Imm / IROp->Size <= 4095; + bool IsExtended = (Imm & (IR::OpSizeToSize(IROp->Size) - 1)) == 0 && Imm / IR::OpSizeToSize(IROp->Size) <= 4095; IsExtended &= !TSO; if (IsSIMM9 || IsExtended) { @@ -204,7 +204,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current /* IsImmAddSub assumes the constants are sign-extended, take care of that * here so we get the optimization for 32-bit adds too. 
*/ - if (Op->Header.Size == 4) { + if (Op->Header.Size == OpSize::i32Bit) { Constant1 = (int64_t)(int32_t)Constant1; Constant2 = (int64_t)(int32_t)Constant2; } @@ -290,12 +290,12 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current } if (!Replaced) { - InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IROp->Size * 8); }); + InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IR::OpSizeAsBits(IROp->Size)); }); } break; } case OP_OR: { - InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IROp->Size * 8); }); + InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IR::OpSizeAsBits(IROp->Size)); }); break; } case OP_XOR: { @@ -325,7 +325,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current } if (!Replaced) { - InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IROp->Size * 8); }); + InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IR::OpSizeAsBits(IROp->Size)); }); } } break; @@ -333,7 +333,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current case OP_ANDWITHFLAGS: case OP_ANDN: case OP_TESTNZ: { - InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IROp->Size * 8); }); + InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IR::OpSizeAsBits(IROp->Size)); }); break; } case OP_NEG: { @@ -356,7 +356,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current if (IREmit->IsValueConstant(IROp->Args[0], &Constant1) && IREmit->IsValueConstant(IROp->Args[1], &Constant2)) { // Shifts mask the shift amount by 63 or 31 depending on operating size; - uint64_t ShiftMask = IROp->Size == 8 ? 63 : 31; + uint64_t ShiftMask = IROp->Size == OpSize::i64Bit ? 63 : 31; uint64_t NewConstant = (Constant1 << (Constant2 & ShiftMask)) & getMask(IROp); IREmit->ReplaceWithConstant(CodeNode, NewConstant); } else if (IREmit->IsValueConstant(IROp->Args[1], &Constant2) && Constant2 == 0) { @@ -384,7 +384,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current auto Op = IROp->C(); uint64_t Constant; - if (IROp->Size <= 8 && IREmit->IsValueConstant(Op->Src, &Constant)) { + if (IROp->Size <= OpSize::i64Bit && IREmit->IsValueConstant(Op->Src, &Constant)) { uint64_t SourceMask = Op->Width == 64 ? ~0ULL : ((1ULL << Op->Width) - 1); SourceMask <<= Op->lsb; @@ -400,7 +400,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current if (IREmit->IsValueConstant(Op->Src, &Constant)) { // SBFE of a constant can be converted to a constant. uint64_t SourceMask = Op->Width == 64 ? ~0ULL : ((1ULL << Op->Width) - 1); - uint64_t DestSizeInBits = IROp->Size * 8; + uint64_t DestSizeInBits = IR::OpSizeAsBits(IROp->Size); uint64_t DestMask = DestSizeInBits == 64 ? 
~0ULL : ((1ULL << DestSizeInBits) - 1); SourceMask <<= Op->lsb; @@ -424,11 +424,11 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current uint64_t NewConstant = SourceMask << Op->lsb; if (ConstantSrc & 1) { - auto orr = IREmit->_Or(IR::SizeToOpSize(IROp->Size), CurrentIR.GetNode(IROp->Args[0]), IREmit->_Constant(NewConstant)); + auto orr = IREmit->_Or(IROp->Size, CurrentIR.GetNode(IROp->Args[0]), IREmit->_Constant(NewConstant)); IREmit->ReplaceAllUsesWith(CodeNode, orr); } else { // We are wanting to clear the bitfield. - auto andn = IREmit->_Andn(IR::SizeToOpSize(IROp->Size), CurrentIR.GetNode(IROp->Args[0]), IREmit->_Constant(NewConstant)); + auto andn = IREmit->_Andn(IROp->Size, CurrentIR.GetNode(IROp->Args[0]), IREmit->_Constant(NewConstant)); IREmit->ReplaceAllUsesWith(CodeNode, andn); } } @@ -596,7 +596,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current case OP_SELECT: { InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, ARMEmitter::IsImmAddSub); - uint64_t AllOnes = IROp->Size == 8 ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull; + uint64_t AllOnes = IROp->Size == OpSize::i64Bit ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull; uint64_t Constant2 {}; uint64_t Constant3 {}; @@ -614,7 +614,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current // We always allow source 1 to be zero, but source 0 can only be a // special 1/~0 constant if source 1 is 0. if (InlineIfZero(IREmit, CurrentIR, CodeNode, IROp, 1)) { - uint64_t AllOnes = IROp->Size == 8 ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull; + uint64_t AllOnes = IROp->Size == OpSize::i64Bit ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull; InlineIf(IREmit, CurrentIR, CodeNode, IROp, 0, [&AllOnes](uint64_t X) { return X == 1 || X == AllOnes; }); } break; @@ -632,7 +632,7 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current auto EO = NewRIP->C(); IREmit->SetWriteCursor(CurrentIR.GetNode(Op->NewRIP)); - IREmit->ReplaceNodeArgument(CodeNode, 0, IREmit->_InlineEntrypointOffset(IR::SizeToOpSize(EO->Header.Size), EO->Offset)); + IREmit->ReplaceNodeArgument(CodeNode, 0, IREmit->_InlineEntrypointOffset(EO->Header.Size, EO->Offset)); } } break; diff --git a/FEXCore/Source/Interface/IR/Passes/IRValidation.cpp b/FEXCore/Source/Interface/IR/Passes/IRValidation.cpp index 082c8f59bb..244e0026a7 100644 --- a/FEXCore/Source/Interface/IR/Passes/IRValidation.cpp +++ b/FEXCore/Source/Interface/IR/Passes/IRValidation.cpp @@ -79,12 +79,12 @@ void IRValidation::Run(IREmitter* IREmit) { for (auto [CodeNode, IROp] : CurrentIR.GetCode(BlockNode)) { const auto ID = CurrentIR.GetID(CodeNode); - const uint8_t OpSize = IROp->Size; + const auto OpSize = IROp->Size; if (GetHasDest(IROp->Op)) { - HadError |= OpSize == 0; + HadError |= OpSize == IR::OpSize::iInvalid; // Does the op have a destination of size 0? - if (OpSize == 0) { + if (OpSize == IR::OpSize::iInvalid) { Errors << "%" << ID << ": Had destination but with no size" << std::endl; } diff --git a/FEXCore/Source/Interface/IR/Passes/RedundantFlagCalculationElimination.cpp b/FEXCore/Source/Interface/IR/Passes/RedundantFlagCalculationElimination.cpp index 379e725a8f..fb17894312 100644 --- a/FEXCore/Source/Interface/IR/Passes/RedundantFlagCalculationElimination.cpp +++ b/FEXCore/Source/Interface/IR/Passes/RedundantFlagCalculationElimination.cpp @@ -521,7 +521,7 @@ void DeadFlagCalculationEliminination::FoldBranch(IREmitter* IREmit, IRListView& // Pattern match a branch fed by a compare. 
We could also handle bit tests // here, but tbz/tbnz has a limited offset range which we don't have a way to // deal with yet. Let's hope that's not a big deal. - if (!(Op->Cond == COND_NEQ || Op->Cond == COND_EQ) || (Prev->Size < 4)) { + if (!(Op->Cond == COND_NEQ || Op->Cond == COND_EQ) || (Prev->Size < OpSize::i32Bit)) { return; } @@ -534,7 +534,7 @@ void DeadFlagCalculationEliminination::FoldBranch(IREmitter* IREmit, IRListView& IREmit->ReplaceNodeArgument(CodeNode, 0, CurrentIR.GetNode(Prev->Args[0])); IREmit->ReplaceNodeArgument(CodeNode, 1, CurrentIR.GetNode(Prev->Args[1])); Op->FromNZCV = false; - Op->CompareSize = IR::SizeToOpSize(Prev->Size); + Op->CompareSize = Prev->Size; } else { return; } @@ -612,7 +612,7 @@ bool DeadFlagCalculationEliminination::ProcessBlock(IREmitter* IREmit, IRListVie // this flag is outside of the if, since the TestNZ might result from // optimizing AndWithFlags, and we need to converge locally in a single // iteration. - if (IROp->Op == OP_TESTNZ && IROp->Size < 4 && !(FlagsRead & (FLAG_N | FLAG_C))) { + if (IROp->Op == OP_TESTNZ && IROp->Size < OpSize::i32Bit && !(FlagsRead & (FLAG_N | FLAG_C))) { IROp->Op = OP_TESTZ; } diff --git a/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp b/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp index 92d86cc120..b93fcde297 100644 --- a/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp @@ -582,7 +582,7 @@ void ConstrainedRAPass::Run(IREmitter* IREmit_) { if (Reg.Class == FPRFixedClass) { IROp_Header* Header = IR->GetOp(Old); - Copy = IREmit->_VMov(IR::SizeToOpSize(Header->Size), Map(Old)); + Copy = IREmit->_VMov(Header->Size, Map(Old)); } else { Copy = IREmit->_Copy(Map(Old)); } diff --git a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp index 4e542a9f5e..f342006530 100644 --- a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp @@ -731,7 +731,7 @@ void X87StackOptimization::Run(IREmitter* Emit) { } else { auto* SourceNode = CurrentIR.GetNode(Op->X80Src); auto* OriginalNode = CurrentIR.GetNode(Op->OriginalValue); - StackData.push(StackMemberInfo {SourceNode, OriginalNode, SizeToOpSize(Op->LoadSize), Op->Float}); + StackData.push(StackMemberInfo {SourceNode, OriginalNode, Op->LoadSize, Op->Float}); } break; } @@ -793,7 +793,7 @@ void X87StackOptimization::Run(IREmitter* Emit) { // or similar. As long as the source size and dest size are one and the same. // This will avoid any conversions between source and stack element size and conversion back. if (!SlowPath && Value->Source && Value->Source->first == Op->StoreSize && Value->InterpretAsFloat) { - IREmit->_StoreMem(Value->InterpretAsFloat ? FPRClass : GPRClass, IR::SizeToOpSize(Op->StoreSize), AddrNode, Value->Source->second); + IREmit->_StoreMem(Value->InterpretAsFloat ? FPRClass : GPRClass, Op->StoreSize, AddrNode, Value->Source->second); } else { if (ReducedPrecisionMode) { switch (Op->StoreSize) { @@ -826,7 +826,7 @@ void X87StackOptimization::Run(IREmitter* Emit) { auto DestAddr = IREmit->_Add(OpSize::i64Bit, AddrNode, GetConstant(8)); IREmit->_StoreMem(GPRClass, OpSize::i16Bit, DestAddr, Upper, OpSize::i64Bit); } else { - IREmit->_StoreMem(FPRClass, IR::SizeToOpSize(Op->StoreSize), AddrNode, StackNode); + IREmit->_StoreMem(FPRClass, Op->StoreSize, AddrNode, StackNode); } } }
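As a closing note on the IR.h/IR.json side of this patch: the `ElementSize << 1` / `ElementSize >> 1` expressions in the NumElements fields rely on the new integral-shift operators for `OpSize`. A standalone sketch of how they compose with an element count follows; the `OpSizeToSize`, `SizeToOpSize`, and `NumElements` bodies are assumptions inferred from their uses in this diff, and the LOGMAN validation is omitted:

#include <cstdint>
#include <type_traits>

enum class OpSize : uint8_t { iUnsized = 0, i8Bit = 1, i16Bit = 2, i32Bit = 4, i64Bit = 8, i128Bit = 16, i256Bit = 32 };

// Assumed byte-count conversions (the enumerator value is the byte width it names).
constexpr uint16_t OpSizeToSize(OpSize Size) { return static_cast<uint16_t>(Size); }
constexpr OpSize SizeToOpSize(uint16_t Size) { return static_cast<OpSize>(Size); }

// Same shape as the shift operators added to IR.h in this patch.
template<typename T>
requires (std::is_integral_v<T>)
constexpr OpSize operator<<(OpSize Size, T Shift) {
  return SizeToOpSize(OpSizeToSize(Size) << Shift);
}

template<typename T>
requires (std::is_integral_v<T>)
constexpr OpSize operator>>(OpSize Size, T Shift) {
  return SizeToOpSize(OpSizeToSize(Size) >> Shift);
}

// Assumed shape of IR::NumElements as used by the dumper and dispatcher changes.
constexpr uint32_t NumElements(OpSize RegisterSize, OpSize ElementSize) {
  return OpSizeToSize(RegisterSize) / OpSizeToSize(ElementSize);
}

int main() {
  // Widening ops (VSXTL, VUMull, ...) count elements at double the source
  // element size, hence "RegisterSize / (ElementSize << 1)".
  static_assert((OpSize::i16Bit << 1) == OpSize::i32Bit);
  static_assert(NumElements(OpSize::i128Bit, OpSize::i16Bit << 1) == 4);

  // Narrowing ops (VUShrNI, VSQXTN, ...) count elements at half the source
  // element size, hence "RegisterSize / (ElementSize >> 1)".
  static_assert((OpSize::i32Bit >> 1) == OpSize::i16Bit);
  static_assert(NumElements(OpSize::i128Bit, OpSize::i32Bit >> 1) == 8);
  return 0;
}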