From ef5cffdead8cc94e0af2ffdab6af9b71715008c8 Mon Sep 17 00:00:00 2001 From: RipleyTom Date: Fri, 16 Feb 2024 22:59:07 +0100 Subject: [PATCH] Fixes --- rpcs3/Emu/Cell/SPULLVMRecompiler.cpp | 100 ++++++++++++++------------- 1 file changed, 52 insertions(+), 48 deletions(-) diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index ff9554c368e8..ff2705d69c35 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -5637,18 +5637,18 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator }; // FM(a, re_accurate(div)) - if (const auto [ok_re_acc, div, one] = match_expr(b, re_accurate(match(), match())); ok_re_acc) + if (const auto [ok_re_acc, div] = match_expr(b, re_accurate(match())); ok_re_acc) { full_fm_accurate(a, div); - erase_stores(one, b); + erase_stores(b); return; } // FM(re_accurate(div), b) - if (const auto [ok_re_acc, div, one] = match_expr(a, re_accurate(match(), match())); ok_re_acc) + if (const auto [ok_re_acc, div] = match_expr(a, re_accurate(match())); ok_re_acc) { full_fm_accurate(b, div); - erase_stores(one, a); + erase_stores(a); return; } } @@ -5973,10 +5973,10 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator return llvm_calli{"spu_fma", {std::forward(a), std::forward(b), std::forward(c)}}.set_order_equality_hint(1, 1, 0); } - template - static llvm_calli re_accurate(T&& a, U&& b) + template + static llvm_calli re_accurate(T&& a) { - return {"spu_re_acc", {std::forward(a), std::forward(b)}}; + return {"spu_re_acc", {std::forward(a)}}; } void FMA(spu_opcode_t op) @@ -5995,26 +5995,18 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator const auto b = value(ci->getOperand(1)); const auto c = value(ci->getOperand(2)); - if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) - { - const auto ma = sext(fcmp_uno(a != fsplat(0.))); - const auto mb = sext(fcmp_uno(b != fsplat(0.))); - const auto ca = bitcast(bitcast(a) & mb); - const auto cb = bitcast(bitcast(b) & ma); - return fma32x4(eval(ca), eval(cb), c); - } - else - { - return fma32x4(a, b, c); - } + const auto ma = sext(fcmp_uno(a != fsplat(0.))); + const auto mb = sext(fcmp_uno(b != fsplat(0.))); + const auto ca = bitcast(bitcast(a) & mb); + const auto cb = bitcast(bitcast(b) & ma); + + return fma32x4(eval(ca), eval(cb), c); }); register_intrinsic("spu_re_acc", [&](llvm::CallInst* ci) { const auto div = value(ci->getOperand(0)); - const auto the_one = value(ci->getOperand(1)); - - const auto div_result = the_one / div; + const auto div_result = fsplat(1.0f) / div; // From ps3 hardware testing: Inf => NaN and NaN => Zero, Signed Zero => Zero // This results in full accuracy within 1ulp(Currently x86 seems to be rounding up?) @@ -6029,6 +6021,8 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator return bitcast(((bitcast(div_result) & and_mask) & and_mask_zero) | or_mask); }); + constexpr f32 ONEISH = std::bit_cast(std::bit_cast(1.0f) + 1); + const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rc); static const auto MT = match(); const auto full_expr = fma(a, b, c); @@ -6056,57 +6050,67 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (check_sqrt_pattern_for_float(1.0f)) return; - if (check_sqrt_pattern_for_float(std::bit_cast(std::bit_cast(1.0f) + 1))) + if (check_sqrt_pattern_for_float(ONEISH)) return; - auto check_accurate_reciprocal_pattern_for_float = [&](f32 float_value) -> bool + // Full reciprocal patterns + // FMA(FNMS(div <*> spu_re(div), float_value) <*> spu_re(div), spu_re(div)) + if (auto [ok_c, div] = match_expr(c, spu_re(MT)); ok_c) { - // FMA(FNMS(div <*> spu_re(div), float_value) <*> spu_re(div), spu_re(div)) - if (auto [ok_c, div] = match_expr(c, spu_re(MT)); ok_c) + auto check_accurate_reciprocal_pattern_for_float = [&](f32 float_value) -> bool { if (auto [ok_fma] = match_expr(full_expr, fma(fnms(div, c, fsplat(float_value)), c, c)); ok_fma) { erase_stores(a, b, c); - set_vr(op.rt4, re_accurate(div, fsplat(float_value))); + set_vr(op.rt4, re_accurate(div)); return true; } - } - - return false; - }; + return false; + }; - if (check_accurate_reciprocal_pattern_for_float(1.0f)) - return; + if (check_accurate_reciprocal_pattern_for_float(1.0f)) + return; - if (check_accurate_reciprocal_pattern_for_float(std::bit_cast(std::bit_cast(1.0f) + 1))) - return; + if (check_accurate_reciprocal_pattern_for_float(ONEISH)) + return; - // GOW 3(uses 1.0f * spu_re(div) instead of just spu_re(div) in the pattern) - if (auto [ok_fm, div] = match_expr(c, fm(spu_re(MT), fsplat(1.0f))); ok_fm) - { - if (auto [ok_fma] = match_expr(full_expr, fma(fnms(c, div, fsplat(1.0f)), spu_re(div), c)); ok_fma) + // Generate dynamic pattern for when float is unknown because of scope + if (auto [ok_fma, cursed_float] = match_expr(full_expr, fma(fnms(div, c, MT), c, c)); ok_fma) { erase_stores(a, b, c); - set_vr(op.rt4, re_accurate(div, fsplat(1.0f))); + const auto bitcast_float = bitcast(cursed_float); + set_vr(op.rt4, select(bitcast_float == splat(0x3F800000) | bitcast_float == splat(0x3F800001), re_accurate(div), fma(fnms(spu_re(div), div, cursed_float), spu_re(div), spu_re(div)))); return; } } - // NFS Most Wanted doesn't like this if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::relaxed) { - // Those patterns are not safe vs non optimization as inaccuracy from spu_re will spread with early fm before the accuracy is improved - // Match division (fast) - // FMA(FNMS(fm(diva<*> spu_re(divb)), divb, diva), spu_re(divb), fm(diva<*> spu_re(divb))) + // FMA(FNMS(fm(diva <*> spu_re(divb)), divb, diva), spu_re(divb), fm(diva <*> spu_re(divb))) + // NFS: Most Wanted doesn't like this pattern + + auto full_fast_div = [&](const auto& diva, const auto& divb) + { + const auto div_result = diva / divb; + const auto result_and = bitcast(div_result) & 0x7FFFFFFFu; + const auto result_cmp_inf = sext(result_and == splat(0x7F800000u)); + const auto result_cmp_nan = sext(result_and <= splat(0x7F800000u)); + const auto and_mask_zero = bitcast(sext(result_and != splat(0u))); + const auto and_mask = bitcast(result_cmp_nan) & splat(0xFFFFFFFFu); + const auto or_mask = bitcast(result_cmp_inf) & splat(0xFFFFFFFu); + const auto final_result = bitcast(((bitcast(div_result) & and_mask) & and_mask_zero) | or_mask); + set_vr(op.rt4, final_result); + }; + if (auto [ok_fnma, divb, diva] = match_expr(a, fnms(c, MT, MT)); ok_fnma) { if (auto [ok_fm, fm1, fm2] = match_expr(c, fm(MT, MT)); ok_fm && ((fm1.eq(diva) && fm2.eq(b)) || (fm1.eq(b) && fm2.eq(diva)))) { if (auto [ok_re] = match_expr(b, spu_re(divb)); ok_re) { - erase_stores(b, c); - set_vr(op.rt4, diva / divb); + erase_stores(a, b, c); + full_fast_div(diva, divb); return; } } @@ -6119,8 +6123,8 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { if (auto [ok_re] = match_expr(a, spu_re(divb)); ok_re) { - erase_stores(a, c); - set_vr(op.rt4, diva / divb); + erase_stores(a, b, c); + full_fast_div(diva, divb); return; } }