diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp index 7c8d0554a0..07f4633609 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp @@ -857,7 +857,7 @@ void OpDispatchBuilder::AVX128_VMOVLP(OpcodeArgs) { ///< VMOVLPS/PD xmm1, xmm2, mem64 // Bits[63:0] come from Src2[63:0] // Bits[127:64] come from Src1[127:64] - auto Src2 = LoadSource_WithOpSize(GPRClass, Op, Op->Src[1], OpSize::i64Bit, Op->Flags, {.LoadData = false}); + auto Src2 = MakeSegmentAddress(Op, Op->Src[1]); Ref Result_Low = _VLoadVectorElement(OpSize::i128Bit, OpSize::i64Bit, Src1.Low, 0, Src2); Ref ZeroVector = LoadZeroVector(OpSize::i128Bit); @@ -879,11 +879,11 @@ void OpDispatchBuilder::AVX128_VMOVHP(OpcodeArgs) { if (!Op->Dest.IsGPR()) { ///< VMOVHPS/PD mem64, xmm1 // Need to store Bits[127:64]. Use a vector element store. - auto Dest = LoadSource_WithOpSize(GPRClass, Op, Op->Dest, OpSize::i64Bit, Op->Flags, {.LoadData = false}); + auto Dest = MakeSegmentAddress(Op, Op->Dest); _VStoreVectorElement(OpSize::i128Bit, OpSize::i64Bit, Src1.Low, 1, Dest); } else if (!Op->Src[1].IsGPR()) { ///< VMOVHPS/PD xmm2, xmm1, mem64 - auto Src2 = LoadSource_WithOpSize(GPRClass, Op, Op->Src[1], OpSize::i64Bit, Op->Flags, {.LoadData = false}); + auto Src2 = MakeSegmentAddress(Op, Op->Src[1]); // Bits[63:0] come from Src1[63:0] // Bits[127:64] come from Src2[63:0] @@ -1236,7 +1236,7 @@ void OpDispatchBuilder::AVX128_PExtr(OpcodeArgs) { } // If we are storing to memory then we store the size of the element extracted - Ref Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false}); + Ref Dest = MakeSegmentAddress(Op, Op->Dest); _VStoreVectorElement(OpSize::i128Bit, OverridenElementSize, Src.Low, Index, Dest); } @@ -1386,7 +1386,7 @@ void OpDispatchBuilder::AVX128_PINSRImpl(OpcodeArgs, size_t ElementSize, const X Result.Low = _VInsGPR(OpSize::i128Bit, ElementSize, Index, Src1.Low, Src2); } else { // If loading from memory then we only load the element size - auto Src2 = LoadSource_WithOpSize(GPRClass, Op, Src2Op, ElementSize, Op->Flags, {.LoadData = false}); + auto Src2 = MakeSegmentAddress(Op, Src2Op); Result.Low = _VLoadVectorElement(OpSize::i128Bit, ElementSize, Src1.Low, Index, Src2); } diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp index 7e989554a4..4e1eff5088 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp @@ -106,7 +106,7 @@ void OpDispatchBuilder::MOVHPDOp(OpcodeArgs) { } else { // If the destination is a GPR then the source is memory // xmm1[127:64] = src - Ref Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.LoadData = false}); + Ref Src = MakeSegmentAddress(Op, Op->Src[0]); Ref Dest = LoadSource_WithOpSize(FPRClass, Op, Op->Dest, 16, Op->Flags); auto Result = _VLoadVectorElement(16, 8, Dest, 1, Src); StoreResult(FPRClass, Op, Result, -1); @@ -115,7 +115,7 @@ void OpDispatchBuilder::MOVHPDOp(OpcodeArgs) { // In this case memory is the destination and the high bits of the XMM are source // Mem64 = xmm1[127:64] Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); - Ref Dest = LoadSource_WithOpSize(GPRClass, Op, Op->Dest, 8, Op->Flags, {.LoadData = false}); + Ref Dest = MakeSegmentAddress(Op, Op->Dest); _VStoreVectorElement(16, 8, Src, 1, Dest); } } @@ -144,7 +144,7 @@ void OpDispatchBuilder::MOVLPOp(OpcodeArgs) { StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Result, 16, 16); } else { auto DstSize = GetDstSize(Op); - Ref Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.Align = 8, .LoadData = false}); + Ref Src = MakeSegmentAddress(Op, Op->Src[0]); Ref Dest = LoadSource_WithOpSize(FPRClass, Op, Op->Dest, DstSize, Op->Flags); auto Result = _VLoadVectorElement(16, 8, Dest, 0, Src); StoreResult(FPRClass, Op, Result, -1); @@ -1515,7 +1515,7 @@ Ref OpDispatchBuilder::PINSROpImpl(OpcodeArgs, size_t ElementSize, const X86Tabl } // If loading from memory then we only load the element size - auto Src2 = LoadSource_WithOpSize(GPRClass, Op, Src2Op, ElementSize, Op->Flags, {.LoadData = false}); + Ref Src2 = MakeSegmentAddress(Op, Src2Op); return _VLoadVectorElement(Size, ElementSize, Src1, Index, Src2); } @@ -1639,7 +1639,7 @@ void OpDispatchBuilder::PExtrOp(OpcodeArgs, size_t ElementSize) { } // If we are storing to memory then we store the size of the element extracted - Ref Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false}); + Ref Dest = MakeSegmentAddress(Op, Op->Dest); _VStoreVectorElement(16, OverridenElementSize, Src, Index, Dest); } diff --git a/unittests/ASM/FEX_bugs/tls_vector_element.asm b/unittests/ASM/FEX_bugs/tls_vector_element.asm new file mode 100644 index 0000000000..5bd57082a4 --- /dev/null +++ b/unittests/ASM/FEX_bugs/tls_vector_element.asm @@ -0,0 +1,138 @@ +%ifdef CONFIG +{ + "RegData": { + "RAX": "0x4142434445464748", + "RBX": "0x4142434445464748", + "RCX": "0x0000000000000056", + "RDX": "0x4142434445464748", + "RSI": "0x0000000000004748", + "RDI": "0x4142434445464748", + "RSP": "0x0000000045464748", + "RBP": "0x6162636465666768", + "R8": "0x4142434445464748", + "R9": "0x4142434445464748", + "R10": "0x0000000000000056", + "R11": "0x4142434445464748", + "R12": "0x4142434445464748", + "R13": "0x0000000000004748", + "R14": "0x0000000045464748", + "R15": "0x6162636465666768", + "XMM0": ["0x5152535455565758", "0x4142434445464748"], + "XMM1": ["0x4142434445464748", "0x6162636465666768"], + "XMM2": ["0x5152535455564858", "0x6162636465666768"], + "XMM3": ["0x5152535455565758", "0x4142434445464748"], + "XMM4": ["0x4142434445464748", "0x6162636465666768"], + "XMM5": ["0x4546474855565758", "0x6162636465666768"], + "XMM6": ["0x5152535455565758", "0x4142434445464748"], + "XMM7": ["0x5152535447485758", "0x6162636465666768"], + "XMM8": ["0x5152535455565758", "0x4142434445464748"], + "XMM9": ["0x4142434445464748", "0x4142434445464748"], + "XMM10": ["0x5152535455564858", "0x6162636465666768"], + "XMM11": ["0x5152535455565758", "0x4142434445464748"], + "XMM12": ["0x4142434445464748", "0x4142434445464748"], + "XMM13": ["0x4546474855565758", "0x6162636465666768"], + "XMM14": ["0x5152535455565758", "0x4142434445464748"], + "XMM15": ["0x5152535447485758", "0x6162636465666768"] + } +} +%endif + +; FEX-Emu had a bug where TLS vector element loadstores weren't correctly prefixing the segment on the address. +; This caused a crash in the game Halls of Torment (steamid 2218750) where it had some TLS vector data loaded with movhps. +; This tests all the vector element loadstores that FEX had missed. + +; Setup TLS segment +mov rax, 0xe000_0000 +wrgsbase rax + +movups xmm0, [rel .data_setup] +movups xmm1, [rel .data_setup] +movups xmm2, [rel .data_setup] +movups xmm3, [rel .data_setup] +movups xmm4, [rel .data_setup] +movups xmm5, [rel .data_setup] +movups xmm6, [rel .data_setup] +movups xmm7, [rel .data_setup] +movups xmm8, [rel .data_setup] +movups xmm9, [rel .data_setup] +movups xmm10, [rel .data_setup] +movups xmm11, [rel .data_setup] +movups xmm12, [rel .data_setup] +movups xmm13, [rel .data_setup] +movups xmm14, [rel .data_setup] +movups xmm15, [rel .data_setup] + +mov rax, [rel .data] +mov [gs:0], rax + +jmp .test +.test: + +; SSE loads +movhps xmm0, [gs:0] +movlps xmm1, [gs:0] +pinsrb xmm2, [gs:0], 1 +movhpd xmm3, [gs:0] +movlpd xmm4, [gs:0] +pinsrd xmm5, [gs:0], 1 +pinsrq xmm6, [gs:0], 1 +pinsrw xmm7, [gs:0], 1 + +; AVX loads +vmovhps xmm8, xmm0, [gs:0] +vmovlps xmm9, xmm0, [gs:0] +vpinsrb xmm10, [gs:0], 1 +vmovhpd xmm11, xmm0, [gs:0] +vmovlpd xmm12, xmm0, [gs:0] +vpinsrd xmm13, [gs:0], 1 +vpinsrq xmm14, [gs:0], 1 +vpinsrw xmm15, [gs:0], 1 + +; SSE stores +movhps [gs:(0 * 8)], xmm0 +movlps [gs:(1 * 8)], xmm1 +pextrb [gs:(2 * 8)], xmm2, 2 +movhpd [gs:(3 * 8)], xmm3 +movlpd [gs:(4 * 8)], xmm4 +pextrw [gs:(5 * 8)], xmm5, 2 +pextrd [gs:(6 * 8)], xmm6, 2 +pextrq [gs:(7 * 8)], xmm7, 1 + +; AVX stores +vmovhps [gs:(8 * 8)], xmm0 +vmovlps [gs:(9 * 8)], xmm1 +vpextrb [gs:(10 * 8)], xmm2, 2 +vmovhpd [gs:(11 * 8)], xmm3 +vmovlpd [gs:(12 * 8)], xmm4 +vpextrw [gs:(13 * 8)], xmm5, 2 +vpextrd [gs:(14 * 8)], xmm6, 2 +vpextrq [gs:(15 * 8)], xmm7, 1 + +; Load the results back in to GPRs +mov rax, [gs:(0 * 8)] +mov rbx, [gs:(1 * 8)] +mov rcx, [gs:(2 * 8)] +mov rdx, [gs:(3 * 8)] +mov rdi, [gs:(4 * 8)] +mov rsi, [gs:(5 * 8)] +mov rsp, [gs:(6 * 8)] +mov rbp, [gs:(7 * 8)] +mov r8, [gs:(8 * 8)] +mov r9, [gs:(9 * 8)] +mov r10, [gs:(10 * 8)] +mov r11, [gs:(11 * 8)] +mov r12, [gs:(12 * 8)] +mov r13, [gs:(13 * 8)] +mov r14, [gs:(14 * 8)] +mov r15, [gs:(15 * 8)] + +hlt + +.data: +dq 0x4142434445464748 + +.data_setup: +dq 0x5152535455565758, 0x6162636465666768 + +.data_result: +dq 0, 0, 0, 0, 0, 0, 0, 0