Skip to content

Commit

Permalink
Merge pull request #4102 from Sonicadvance1/fix_vector_element_loadstore
Browse files Browse the repository at this point in the history
OpcodeDispatcher: Fixes segment prefixing on vector element loadstores
  • Loading branch information
lioncash authored Oct 4, 2024
2 parents b89f5b8 + 407dc5d commit 2726f35
Show file tree
Hide file tree
Showing 3 changed files with 148 additions and 10 deletions.
10 changes: 5 additions & 5 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -857,7 +857,7 @@ void OpDispatchBuilder::AVX128_VMOVLP(OpcodeArgs) {
///< VMOVLPS/PD xmm1, xmm2, mem64
// Bits[63:0] come from Src2[63:0]
// Bits[127:64] come from Src1[127:64]
auto Src2 = LoadSource_WithOpSize(GPRClass, Op, Op->Src[1], OpSize::i64Bit, Op->Flags, {.LoadData = false});
auto Src2 = MakeSegmentAddress(Op, Op->Src[1]);
Ref Result_Low = _VLoadVectorElement(OpSize::i128Bit, OpSize::i64Bit, Src1.Low, 0, Src2);
Ref ZeroVector = LoadZeroVector(OpSize::i128Bit);

Expand All @@ -879,11 +879,11 @@ void OpDispatchBuilder::AVX128_VMOVHP(OpcodeArgs) {
if (!Op->Dest.IsGPR()) {
///< VMOVHPS/PD mem64, xmm1
// Need to store Bits[127:64]. Use a vector element store.
auto Dest = LoadSource_WithOpSize(GPRClass, Op, Op->Dest, OpSize::i64Bit, Op->Flags, {.LoadData = false});
auto Dest = MakeSegmentAddress(Op, Op->Dest);
_VStoreVectorElement(OpSize::i128Bit, OpSize::i64Bit, Src1.Low, 1, Dest);
} else if (!Op->Src[1].IsGPR()) {
///< VMOVHPS/PD xmm2, xmm1, mem64
auto Src2 = LoadSource_WithOpSize(GPRClass, Op, Op->Src[1], OpSize::i64Bit, Op->Flags, {.LoadData = false});
auto Src2 = MakeSegmentAddress(Op, Op->Src[1]);

// Bits[63:0] come from Src1[63:0]
// Bits[127:64] come from Src2[63:0]
Expand Down Expand Up @@ -1236,7 +1236,7 @@ void OpDispatchBuilder::AVX128_PExtr(OpcodeArgs) {
}

// If we are storing to memory then we store the size of the element extracted
Ref Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
Ref Dest = MakeSegmentAddress(Op, Op->Dest);
_VStoreVectorElement(OpSize::i128Bit, OverridenElementSize, Src.Low, Index, Dest);
}

Expand Down Expand Up @@ -1386,7 +1386,7 @@ void OpDispatchBuilder::AVX128_PINSRImpl(OpcodeArgs, size_t ElementSize, const X
Result.Low = _VInsGPR(OpSize::i128Bit, ElementSize, Index, Src1.Low, Src2);
} else {
// If loading from memory then we only load the element size
auto Src2 = LoadSource_WithOpSize(GPRClass, Op, Src2Op, ElementSize, Op->Flags, {.LoadData = false});
auto Src2 = MakeSegmentAddress(Op, Src2Op);
Result.Low = _VLoadVectorElement(OpSize::i128Bit, ElementSize, Src1.Low, Index, Src2);
}

Expand Down
10 changes: 5 additions & 5 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ void OpDispatchBuilder::MOVHPDOp(OpcodeArgs) {
} else {
// If the destination is a GPR then the source is memory
// xmm1[127:64] = src
Ref Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.LoadData = false});
Ref Src = MakeSegmentAddress(Op, Op->Src[0]);
Ref Dest = LoadSource_WithOpSize(FPRClass, Op, Op->Dest, 16, Op->Flags);
auto Result = _VLoadVectorElement(16, 8, Dest, 1, Src);
StoreResult(FPRClass, Op, Result, -1);
Expand All @@ -115,7 +115,7 @@ void OpDispatchBuilder::MOVHPDOp(OpcodeArgs) {
// In this case memory is the destination and the high bits of the XMM are source
// Mem64 = xmm1[127:64]
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Dest = LoadSource_WithOpSize(GPRClass, Op, Op->Dest, 8, Op->Flags, {.LoadData = false});
Ref Dest = MakeSegmentAddress(Op, Op->Dest);
_VStoreVectorElement(16, 8, Src, 1, Dest);
}
}
Expand Down Expand Up @@ -144,7 +144,7 @@ void OpDispatchBuilder::MOVLPOp(OpcodeArgs) {
StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Result, 16, 16);
} else {
auto DstSize = GetDstSize(Op);
Ref Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.Align = 8, .LoadData = false});
Ref Src = MakeSegmentAddress(Op, Op->Src[0]);
Ref Dest = LoadSource_WithOpSize(FPRClass, Op, Op->Dest, DstSize, Op->Flags);
auto Result = _VLoadVectorElement(16, 8, Dest, 0, Src);
StoreResult(FPRClass, Op, Result, -1);
Expand Down Expand Up @@ -1515,7 +1515,7 @@ Ref OpDispatchBuilder::PINSROpImpl(OpcodeArgs, size_t ElementSize, const X86Tabl
}

// If loading from memory then we only load the element size
auto Src2 = LoadSource_WithOpSize(GPRClass, Op, Src2Op, ElementSize, Op->Flags, {.LoadData = false});
Ref Src2 = MakeSegmentAddress(Op, Src2Op);
return _VLoadVectorElement(Size, ElementSize, Src1, Index, Src2);
}

Expand Down Expand Up @@ -1639,7 +1639,7 @@ void OpDispatchBuilder::PExtrOp(OpcodeArgs, size_t ElementSize) {
}

// If we are storing to memory then we store the size of the element extracted
Ref Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
Ref Dest = MakeSegmentAddress(Op, Op->Dest);
_VStoreVectorElement(16, OverridenElementSize, Src, Index, Dest);
}

Expand Down
138 changes: 138 additions & 0 deletions unittests/ASM/FEX_bugs/tls_vector_element.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
%ifdef CONFIG
{
"RegData": {
"RAX": "0x4142434445464748",
"RBX": "0x4142434445464748",
"RCX": "0x0000000000000056",
"RDX": "0x4142434445464748",
"RSI": "0x0000000000004748",
"RDI": "0x4142434445464748",
"RSP": "0x0000000045464748",
"RBP": "0x6162636465666768",
"R8": "0x4142434445464748",
"R9": "0x4142434445464748",
"R10": "0x0000000000000056",
"R11": "0x4142434445464748",
"R12": "0x4142434445464748",
"R13": "0x0000000000004748",
"R14": "0x0000000045464748",
"R15": "0x6162636465666768",
"XMM0": ["0x5152535455565758", "0x4142434445464748"],
"XMM1": ["0x4142434445464748", "0x6162636465666768"],
"XMM2": ["0x5152535455564858", "0x6162636465666768"],
"XMM3": ["0x5152535455565758", "0x4142434445464748"],
"XMM4": ["0x4142434445464748", "0x6162636465666768"],
"XMM5": ["0x4546474855565758", "0x6162636465666768"],
"XMM6": ["0x5152535455565758", "0x4142434445464748"],
"XMM7": ["0x5152535447485758", "0x6162636465666768"],
"XMM8": ["0x5152535455565758", "0x4142434445464748"],
"XMM9": ["0x4142434445464748", "0x4142434445464748"],
"XMM10": ["0x5152535455564858", "0x6162636465666768"],
"XMM11": ["0x5152535455565758", "0x4142434445464748"],
"XMM12": ["0x4142434445464748", "0x4142434445464748"],
"XMM13": ["0x4546474855565758", "0x6162636465666768"],
"XMM14": ["0x5152535455565758", "0x4142434445464748"],
"XMM15": ["0x5152535447485758", "0x6162636465666768"]
}
}
%endif

; FEX-Emu had a bug where TLS vector element loadstores weren't correctly prefixing the segment on the address.
; This caused a crash in the game Halls of Torment (steamid 2218750) where it had some TLS vector data loaded with movhps.
; This tests all the vector element loadstores that FEX had missed.

; Setup TLS segment
mov rax, 0xe000_0000
wrgsbase rax

movups xmm0, [rel .data_setup]
movups xmm1, [rel .data_setup]
movups xmm2, [rel .data_setup]
movups xmm3, [rel .data_setup]
movups xmm4, [rel .data_setup]
movups xmm5, [rel .data_setup]
movups xmm6, [rel .data_setup]
movups xmm7, [rel .data_setup]
movups xmm8, [rel .data_setup]
movups xmm9, [rel .data_setup]
movups xmm10, [rel .data_setup]
movups xmm11, [rel .data_setup]
movups xmm12, [rel .data_setup]
movups xmm13, [rel .data_setup]
movups xmm14, [rel .data_setup]
movups xmm15, [rel .data_setup]

mov rax, [rel .data]
mov [gs:0], rax

jmp .test
.test:

; SSE loads
movhps xmm0, [gs:0]
movlps xmm1, [gs:0]
pinsrb xmm2, [gs:0], 1
movhpd xmm3, [gs:0]
movlpd xmm4, [gs:0]
pinsrd xmm5, [gs:0], 1
pinsrq xmm6, [gs:0], 1
pinsrw xmm7, [gs:0], 1

; AVX loads
vmovhps xmm8, xmm0, [gs:0]
vmovlps xmm9, xmm0, [gs:0]
vpinsrb xmm10, [gs:0], 1
vmovhpd xmm11, xmm0, [gs:0]
vmovlpd xmm12, xmm0, [gs:0]
vpinsrd xmm13, [gs:0], 1
vpinsrq xmm14, [gs:0], 1
vpinsrw xmm15, [gs:0], 1

; SSE stores
movhps [gs:(0 * 8)], xmm0
movlps [gs:(1 * 8)], xmm1
pextrb [gs:(2 * 8)], xmm2, 2
movhpd [gs:(3 * 8)], xmm3
movlpd [gs:(4 * 8)], xmm4
pextrw [gs:(5 * 8)], xmm5, 2
pextrd [gs:(6 * 8)], xmm6, 2
pextrq [gs:(7 * 8)], xmm7, 1

; AVX stores
vmovhps [gs:(8 * 8)], xmm0
vmovlps [gs:(9 * 8)], xmm1
vpextrb [gs:(10 * 8)], xmm2, 2
vmovhpd [gs:(11 * 8)], xmm3
vmovlpd [gs:(12 * 8)], xmm4
vpextrw [gs:(13 * 8)], xmm5, 2
vpextrd [gs:(14 * 8)], xmm6, 2
vpextrq [gs:(15 * 8)], xmm7, 1

; Load the results back in to GPRs
mov rax, [gs:(0 * 8)]
mov rbx, [gs:(1 * 8)]
mov rcx, [gs:(2 * 8)]
mov rdx, [gs:(3 * 8)]
mov rdi, [gs:(4 * 8)]
mov rsi, [gs:(5 * 8)]
mov rsp, [gs:(6 * 8)]
mov rbp, [gs:(7 * 8)]
mov r8, [gs:(8 * 8)]
mov r9, [gs:(9 * 8)]
mov r10, [gs:(10 * 8)]
mov r11, [gs:(11 * 8)]
mov r12, [gs:(12 * 8)]
mov r13, [gs:(13 * 8)]
mov r14, [gs:(14 * 8)]
mov r15, [gs:(15 * 8)]

hlt

.data:
dq 0x4142434445464748

.data_setup:
dq 0x5152535455565758, 0x6162636465666768

.data_result:
dq 0, 0, 0, 0, 0, 0, 0, 0

0 comments on commit 2726f35

Please sign in to comment.