Skip to content

Commit

Permalink
Merge pull request #4125 from Sonicadvance1/unify_pshuflhw
Browse files (browse the repository at this point in the history)
OpcodeDispatcher: Unify PSHUF{L,H}W implementations
  • Loading branch information
lioncash authored Oct 21, 2024
2 parents c4306f2 + fafc04a commit 5d1fda7
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 229 deletions.
4 changes: 2 additions & 2 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,7 @@ class OpDispatchBuilder final : public IREmitter {
void PUNPCKLOp(OpcodeArgs, size_t ElementSize);
void PUNPCKHOp(OpcodeArgs, size_t ElementSize);
void PSHUFBOp(OpcodeArgs);
Ref PShufWLane(size_t Size, FEXCore::IR::IndexNamedVectorConstant IndexConstant, bool LowLane, Ref IncomingLane, uint8_t Shuffle);
void PSHUFWOp(OpcodeArgs, bool Low);
void PSHUFW8ByteOp(OpcodeArgs);
void PSHUFDOp(OpcodeArgs);
Expand Down Expand Up @@ -1061,8 +1062,7 @@ class OpDispatchBuilder final : public IREmitter {
void AVX128_VDPP(OpcodeArgs);
void AVX128_VPERMQ(OpcodeArgs);

template<size_t ElementSize, bool Low>
void AVX128_VPSHUF(OpcodeArgs);
void AVX128_VPSHUFW(OpcodeArgs, bool Low);

template<size_t ElementSize>
void AVX128_VSHUF(OpcodeArgs);
Expand Down
30 changes: 18 additions & 12 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,8 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
{OPD(1, 0b10, 0x6F), 1, &OpDispatchBuilder::AVX128_VMOVAPS},

{OPD(1, 0b01, 0x70), 1, &OpDispatchBuilder::AVX128_VPERMILImm<4>},
{OPD(1, 0b10, 0x70), 1, &OpDispatchBuilder::AVX128_VPSHUF<2, false>},
{OPD(1, 0b11, 0x70), 1, &OpDispatchBuilder::AVX128_VPSHUF<2, true>},
{OPD(1, 0b10, 0x70), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPSHUFW, false>},
{OPD(1, 0b11, 0x70), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPSHUFW, true>},

{OPD(1, 0b01, 0x74), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VCMPEQ, 1>},
{OPD(1, 0b01, 0x75), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VCMPEQ, 2>},
Expand Down Expand Up @@ -1871,20 +1871,26 @@ void OpDispatchBuilder::AVX128_VPERMQ(OpcodeArgs) {
AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
}

// NOTE(review): this listing appears to come from a rendered diff without +/- markers.
// Lines of the replaced template handler (AVX128_VPSHUF, which built the result with a
// four-iteration _VInsElement loop) are interleaved with the lines of the new
// AVX128_VPSHUFW handler. Read the two implementations separately.
template<size_t ElementSize, bool Low>
void OpDispatchBuilder::AVX128_VPSHUF(OpcodeArgs) {
// AVX128_VPSHUFW: `Low` is a runtime argument here instead of a template parameter;
// it selects between the PSHUFLW and PSHUFHW index constants further down.
void OpDispatchBuilder::AVX128_VPSHUFW(OpcodeArgs, bool Low) {
// The shuffle selector is the literal carried by the second source operand.
auto Shuffle = Op->Src[1].Literal();

AVX128_VectorUnaryImpl(Op, GetSrcSize(Op), ElementSize, [this, Shuffle](size_t _, Ref Src) {
Ref Result = Src;
const size_t BaseElement = Low ? 0 : 4;
// Bundles everything the per-lane lambda needs so it can be captured by value as one object.
struct DataPacking {
OpDispatchBuilder* This;
uint8_t Shuffle;
bool Low;
};

for (size_t i = 0; i < 4; i++) {
const auto Index = (Shuffle >> (2 * i)) & 0b11;
Result = _VInsElement(OpSize::i128Bit, ElementSize, BaseElement + i, BaseElement + Index, Result, Src);
}
DataPacking Pack {
.This = this,
// Explicitly narrowed to the uint8_t selector type that PShufWLane takes.
.Shuffle = static_cast<uint8_t>(Shuffle),
.Low = Low,
};

// The lambda receives one source value at a time and returns the shuffled value for it.
AVX128_VectorUnaryImpl(Op, GetSrcSize(Op), OpSize::i16Bit, [Pack](size_t _, Ref Src) {
const auto IndexedVectorConstant = Pack.Low ? FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFLW :
FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFHW;

return Result;
// Always invoked with a 128-bit size; the shuffle itself is implemented by PShufWLane.
return Pack.This->PShufWLane(OpSize::i128Bit, IndexedVectorConstant, Pack.Low, Src, Pack.Shuffle);
});
}

Expand Down
88 changes: 33 additions & 55 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -890,87 +890,65 @@ void OpDispatchBuilder::VPSHUFBOp(OpcodeArgs) {
StoreResult(FPRClass, Op, Result, -1);
}

// NOTE(review): this listing appears to come from a rendered diff without +/- markers.
// Statements of the previous PSHUFW8ByteOp body (the ones that assign `Dest` and end in
// `break;`) are interleaved with the statements of the new PShufWLane helper (the ones
// that use `IncomingLane` and `return`). Read the two implementations separately.
void OpDispatchBuilder::PSHUFW8ByteOp(OpcodeArgs) {
// PShufWLane: emits the IR for one 16-bit-element shuffle of `IncomingLane`.
//   Size          - operation size; compared against OpSize::i128Bit below.
//   IndexConstant - named vector constant table used by the generic (default) path.
//   LowLane       - true to shuffle the low four elements, false for the high four.
//   IncomingLane  - source value to shuffle.
//   Shuffle       - selector; two bits per destination element.
// Returns the shuffled value.
Ref OpDispatchBuilder::PShufWLane(size_t Size, FEXCore::IR::IndexNamedVectorConstant IndexConstant, bool LowLane, Ref IncomingLane, uint8_t Shuffle) {
// Selector in which every element picks itself (3,2,1,0).
constexpr auto IdentityCopy = 0b11'10'01'00;

uint16_t Shuffle = Op->Src[1].Data.Literal.Value;
const auto Size = GetSrcSize(Op);
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Dest {};
const bool Is128BitLane = Size == OpSize::i128Bit;
// Number of 16-bit elements in `Size` bytes, and half of that (the first element index of
// the high half).
const uint8_t NumElements = Size / 2;
const uint8_t HalfNumElements = NumElements >> 1;

// TODO: There can be more optimized copies here.
switch (Shuffle) {
case IdentityCopy: {
// Special case identity copy.
Dest = Src;
break;
return IncomingLane;
}
// All four two-bit fields select the same element.
case 0b00'00'00'00:
case 0b01'01'01'01:
case 0b10'10'10'10:
case 0b11'11'11'11: {
// Special case element duplicate and broadcasts.
Dest = _VDupElement(Size, 2, Src, (Shuffle & 0b11));
break;
// Special case element duplicate and broadcast to low or high 64-bits.
Ref Dup = _VDupElement(Size, OpSize::i16Bit, IncomingLane, (LowLane ? 0 : HalfNumElements) + (Shuffle & 0b11));
// Only the 128-bit case merges the duplicated half with the source; otherwise the
// duplicated value is returned as is.
if (Is128BitLane) {
if (LowLane) {
// DUP goes low.
// Source goes high.
Dup = _VTrn2(Size, OpSize::i64Bit, Dup, IncomingLane);
} else {
// DUP goes high.
// Source goes low.
Dup = _VTrn(Size, OpSize::i64Bit, IncomingLane, Dup);
}
}

return Dup;
}
default: {
// PSHUFLW needs to scale index by 16.
// PSHUFHW needs to scale index by 16.
// PSHUFW (mmx) also needs to scale by 16 to get correct low element.
auto LookupIndexes =
LoadAndCacheIndexedNamedVectorConstant(Size, FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFLW, Shuffle * 16);
Dest = _VTBL1(Size, Src, LookupIndexes);
break;
// Generic path: fetch the index vector for this selector from the caller-supplied table
// and apply it with a single table lookup.
auto LookupIndexes = LoadAndCacheIndexedNamedVectorConstant(Size, IndexConstant, Shuffle * 16);
return _VTBL1(Size, IncomingLane, LookupIndexes);
}
}
}

// Shuffles the 16-bit elements of the source operand according to the instruction's
// immediate selector and stores the result to the destination.
//
// The shuffle itself is produced by PShufWLane, called with the PSHUFLW index table
// and LowLane = true.
void OpDispatchBuilder::PSHUFW8ByteOp(OpcodeArgs) {
  // PShufWLane takes the selector as uint8_t, so narrow explicitly here instead of
  // relying on an implicit conversion at the call site (mirrors AVX128_VPSHUFW).
  const auto Shuffle = static_cast<uint8_t>(Op->Src[1].Data.Literal.Value);
  const auto Size = GetSrcSize(Op);

  Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);

  // Enumerator spelled with its enum qualifier, consistent with PSHUFWOp.
  constexpr auto IndexedVectorConstant = FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFLW;
  constexpr bool LowLane = true;
  Ref Dest = PShufWLane(Size, IndexedVectorConstant, LowLane, Src, Shuffle);

  StoreResult(FPRClass, Op, Dest, -1);
}

// NOTE(review): this listing appears to come from a rendered diff without +/- markers.
// The previous inline implementation (the `switch` that assigns `Dest` in each case) is
// interleaved with the new implementation, which only selects an index table and
// delegates to PShufWLane. Read the two implementations separately.
//
// PSHUFWOp: loads the source operand, shuffles its low (`Low` == true) or high
// (`Low` == false) 16-bit elements according to the immediate selector, and stores the
// result to the destination.
void OpDispatchBuilder::PSHUFWOp(OpcodeArgs, bool Low) {
constexpr auto IdentityCopy = 0b11'10'01'00;

// Shuffle selector taken from the second source operand's literal value.
uint16_t Shuffle = Op->Src[1].Data.Literal.Value;
const auto Size = GetSrcSize(Op);
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Dest {};
// Index table matching the half being shuffled; consumed by PShufWLane's generic path.
const auto IndexedVectorConstant = Low ? FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFLW :
FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFHW;

const uint8_t NumElements = Size / 2;
const uint8_t HalfNumElements = NumElements >> 1;

// TODO: There can be more optimized copies here.
switch (Shuffle) {
case IdentityCopy: {
// Special case identity copy.
Dest = Src;
break;
}
case 0b00'00'00'00:
case 0b01'01'01'01:
case 0b10'10'10'10:
case 0b11'11'11'11: {
// Special case element duplicate and broadcast to low or high 64-bits.
auto DUP = _VDupElement(Size, 2, Src, (Low ? 0 : HalfNumElements) + (Shuffle & 0b11));
if (Low) {
// DUP goes low.
// Source goes high.
Dest = _VTrn2(Size, 8, DUP, Src);
} else {
// DUP goes high.
// Source goes low.
Dest = _VTrn(Size, 8, Src, DUP);
}
break;
}
default: {
// PSHUFLW needs to scale index by 16.
// PSHUFHW needs to scale index by 16.
// PSHUFW (mmx) also needs to scale by 16 to get correct low element.
const auto IndexedVectorConstant = Low ? FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFLW :
FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFHW;
auto LookupIndexes = LoadAndCacheIndexedNamedVectorConstant(Size, IndexedVectorConstant, Shuffle * 16);
Dest = _VTBL1(Size, Src, LookupIndexes);
break;
}
}
// The identity, broadcast and table-lookup cases are all handled inside PShufWLane.
Ref Dest = PShufWLane(Size, IndexedVectorConstant, Low, Src, Shuffle);

StoreResult(FPRClass, Op, Dest, -1);
}
Expand Down
Loading

0 comments on commit 5d1fda7

Please sign in to comment.