[X86] Improve KnownBits for X86ISD::PSADBW nodes (llvm#83830)
Don't just return the known zero upper bits; compute the absdiff KnownBits and perform the horizontal sum.

Add implementations that handle both the X86ISD::PSADBW nodes and the INTRINSIC_WO_CHAIN intrinsics (pre-legalization).
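
For context: PSADBW computes each 64-bit result lane as the sum of eight byte-wise absolute differences, so a lane can never exceed 8 * 255 = 2040 and only its low 16 bits can ever be set. A minimal scalar model of one lane (an illustration for this page, not code from the patch):

#include <cstdint>
#include <cstdlib>

// One 64-bit PSADBW lane: the sum of eight byte absolute differences.
// The maximum value is 8 * 255 = 2040, so the result always fits in the
// low 16 bits (in fact the low 11) of the i64 lane.
uint64_t psadbwLane(const uint8_t LHS[8], const uint8_t RHS[8]) {
  uint64_t Sum = 0;
  for (int I = 0; I != 8; ++I)
    Sum += std::abs(int(LHS[I]) - int(RHS[I]));
  return Sum;
}

The new hook mirrors this structure: KnownBits::absdiff on the byte pairs, a zero-extension to 16 bits, then three computeForAddSub steps for the log2(8) = 3 levels of the pairwise horizontal sum.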
RKSimon authored Mar 6, 2024
1 parent c371ee9 commit 0bd9255
Showing 3 changed files with 63 additions and 71 deletions.
46 changes: 42 additions & 4 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36738,6 +36738,26 @@ X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
return TLO.CombineTo(Op, NewOp);
}

+static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
+                                      KnownBits &Known,
+                                      const APInt &DemandedElts,
+                                      const SelectionDAG &DAG, unsigned Depth) {
+  KnownBits Known2;
+  unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
+  APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
+  Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
+  Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
+  Known = KnownBits::absdiff(Known, Known2).zext(16);
+  // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
+  Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
+                                      Known, Known);
+  Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
+                                      Known, Known);
+  Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
+                                      Known, Known);
+  Known = Known.zext(64);
+}
+
void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
@@ -36887,12 +36907,13 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
break;
}
case X86ISD::PSADBW: {
+    SDValue LHS = Op.getOperand(0);
+    SDValue RHS = Op.getOperand(1);
    assert(VT.getScalarType() == MVT::i64 &&
-           Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
+           LHS.getValueType() == RHS.getValueType() &&
+           LHS.getValueType().getScalarType() == MVT::i8 &&
           "Unexpected PSADBW types");
-
-    // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
-    Known.Zero.setBitsFrom(16);
+    computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
break;
}
case X86ISD::PCMPGT:
@@ -37046,6 +37067,23 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
}
break;
}
+  case ISD::INTRINSIC_WO_CHAIN: {
+    switch (Op->getConstantOperandVal(0)) {
+    case Intrinsic::x86_sse2_psad_bw:
+    case Intrinsic::x86_avx2_psad_bw:
+    case Intrinsic::x86_avx512_psad_bw_512: {
+      SDValue LHS = Op.getOperand(1);
+      SDValue RHS = Op.getOperand(2);
+      assert(VT.getScalarType() == MVT::i64 &&
+             LHS.getValueType() == RHS.getValueType() &&
+             LHS.getValueType().getScalarType() == MVT::i8 &&
+             "Unexpected PSADBW types");
+      computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
+      break;
+    }
+    }
+    break;
+  }
}

// Handle target shuffles.
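
The three back-to-back computeForAddSub calls above model the horizontal sum as a balanced tree (8 -> 4 -> 2 -> 1 partial sums). A standalone sketch of the same bound arithmetic, using plain integers rather than LLVM's KnownBits (names here are illustrative only):

#include <cstdint>
#include <cstdio>

// If every byte of both PSADBW inputs is known to be <= MaxByte, each byte
// absolute difference is also <= MaxByte, and three doubling steps bound the
// per-lane sum by 8 * MaxByte, mirroring the three computeForAddSub calls.
uint64_t psadbwLaneUpperBound(uint64_t MaxByte) {
  uint64_t Bound = MaxByte; // bound on one absolute difference
  for (int Level = 0; Level != 3; ++Level)
    Bound += Bound;         // pairwise adds: 8 -> 4 -> 2 -> 1 partial sums
  return Bound;
}

int main() {
  // Bytes masked with 1, as in the psadbw.ll tests below: each lane is at
  // most 8, so it provably fits in an i32.
  std::printf("%llu\n", (unsigned long long)psadbwLaneUpperBound(1));
  return 0;
}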
72 changes: 18 additions & 54 deletions llvm/test/CodeGen/X86/psadbw.ll
@@ -50,21 +50,14 @@ define i64 @combine_psadbw_demandedelt(<16 x i8> %0, <16 x i8> %1) nounwind {
define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: combine_psadbw_cmp_knownbits:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    pxor %xmm1, %xmm1
-; X86-SSE-NEXT:    psadbw %xmm0, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; X86-SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    xorps %xmm0, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: combine_psadbw_cmp_knownbits:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: pxor %xmm1, %xmm1
-; X64-SSE-NEXT:    psadbw %xmm0, %xmm1
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; X64-SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE-NEXT:    psadbw %xmm1, %xmm0
; X64-SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: retq
;
@@ -82,96 +75,67 @@ define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
ret <2 x i64> %ext
}

-; TODO: No need to scalarize the sitofp as the PSADBW results are smaller than i32.
+; No need to scalarize the sitofp as the PSADBW results are smaller than i32.
define <2 x double> @combine_psadbw_sitofp_knownbits(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: combine_psadbw_sitofp_knownbits:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT:    pushl %ebp
-; X86-SSE-NEXT:    movl %esp, %ebp
-; X86-SSE-NEXT:    andl $-8, %esp
-; X86-SSE-NEXT:    subl $32, %esp
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: psadbw %xmm0, %xmm1
-; X86-SSE-NEXT:    movq %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; X86-SSE-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    fildll {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    fstpl {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    fildll {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    fstpl (%esp)
-; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; X86-SSE-NEXT:    movl %ebp, %esp
-; X86-SSE-NEXT:    popl %ebp
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X86-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: combine_psadbw_sitofp_knownbits:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: pxor %xmm1, %xmm1
; X64-SSE-NEXT: psadbw %xmm0, %xmm1
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    xorps %xmm0, %xmm0
-; X64-SSE-NEXT:    cvtsi2sd %eax, %xmm0
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    xorps %xmm1, %xmm1
-; X64-SSE-NEXT:    cvtsi2sd %eax, %xmm1
-; X64-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; X64-SSE-NEXT: retq
;
; AVX2-LABEL: combine_psadbw_sitofp_knownbits:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vcvtdq2pd %xmm0, %xmm1
-; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2sd %eax, %xmm2, %xmm0
-; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX2-NEXT: retq
%mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
%sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
%cvt = sitofp <2 x i64> %sad to <2 x double>
ret <2 x double> %cvt
}

-; TODO: Convert from uitofp to sitofp as the PSADBW results are zero-extended.
+; Convert from uitofp to sitofp as the PSADBW results are zero-extended.
define <2 x double> @combine_psadbw_uitofp_knownbits(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: combine_psadbw_uitofp_knownbits:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT:    psadbw %xmm1, %xmm0
-; X86-SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movapd {{.*#+}} xmm1 = [0,1160773632,0,1160773632]
-; X86-SSE-NEXT:    subpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE-NEXT:    addpd %xmm1, %xmm0
+; X86-SSE-NEXT:    psadbw %xmm0, %xmm1
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X86-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: combine_psadbw_uitofp_knownbits:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: pxor %xmm1, %xmm1
-; X64-SSE-NEXT:    psadbw %xmm1, %xmm0
-; X64-SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE-NEXT:    movapd {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
-; X64-SSE-NEXT:    subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE-NEXT:    addpd %xmm1, %xmm0
+; X64-SSE-NEXT:    psadbw %xmm0, %xmm1
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; X64-SSE-NEXT: retq
;
; AVX2-LABEL: combine_psadbw_uitofp_knownbits:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vmovddup {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
-; AVX2-NEXT:    # xmm1 = mem[0,0]
-; AVX2-NEXT:    vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX2-NEXT: retq
%mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
%sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
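
Note on the psadbw.ll diffs above: with the input bytes masked to 0 or 1, every PSADBW lane is now known to lie in [0, 8], so the i64 results provably fit in their low 32 bits. That lets the sitofp test use a packed cvtdq2pd instead of scalarizing through cvtsi2sd (or x87 fildll on 32-bit), and lets the uitofp test fold to the same sitofp sequence, dropping the por/subpd/addpd uint-to-double expansion.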
16 changes: 3 additions & 13 deletions llvm/test/CodeGen/X86/sad.ll
@@ -989,9 +989,7 @@ define dso_local i32 @sad_unroll_nonzero_initial(ptr %arg, ptr %arg1, ptr %arg2,
; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: sad_unroll_nonzero_initial:
@@ -1053,9 +1051,7 @@ define dso_local i32 @sad_double_reduction(ptr %arg, ptr %arg1, ptr %arg2, ptr %
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: sad_double_reduction:
@@ -1067,8 +1063,6 @@ define dso_local i32 @sad_double_reduction(ptr %arg, ptr %arg1, ptr %arg2, ptr %
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
bb:
@@ -1115,9 +1109,7 @@ define dso_local i32 @sad_double_reduction_abs(ptr %arg, ptr %arg1, ptr %arg2, p
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: sad_double_reduction_abs:
@@ -1129,8 +1121,6 @@ define dso_local i32 @sad_double_reduction_abs(ptr %arg, ptr %arg1, ptr %arg2, p
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
bb:

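Note on the sad.ll diffs: the scalar reduction tail was adding (or or-ing) the odd dwords into the even ones before the final movd, but those odd dwords are the high halves of PSADBW-derived i64 lanes, which the improved KnownBits now proves are zero. The trailing pshufd/paddd and pshufd/por pairs therefore fold away, and a plain movd of the low element suffices.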