From 8ec6187dc7657d833cac446cf007f49b71705e3a Mon Sep 17 00:00:00 2001 From: Elad Ashkenazi <18193363+elad335@users.noreply.github.com> Date: Thu, 27 Jun 2024 09:52:03 +0300 Subject: [PATCH] SPU Analyzer: Fix support for multi-block value merge --- rpcs3/Emu/Cell/SPUCommonRecompiler.cpp | 215 ++++++++++++++++++------- rpcs3/Emu/Cell/SPURecompiler.h | 19 ++- 2 files changed, 177 insertions(+), 57 deletions(-) diff --git a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp index 69e1ace6e4..a069ab9eb8 100644 --- a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp @@ -2627,18 +2627,18 @@ reg_state_t reg_state_t::downgrade() const { if (flag & vf::is_const) { - return reg_state_t{vf::is_mask, 0, umax, this->value, ~this->value}; + return reg_state_t{vf::is_mask, 0, umax, this->value, ~this->value, this->origin}; } if (!(flag - vf::is_null)) { - return reg_state_t{vf::is_mask, 0, this->tag, 0, 0}; + return reg_state_t{vf::is_mask, 0, this->tag, 0, 0, this->origin}; } return *this; } -reg_state_t reg_state_t::merge(const reg_state_t& rhs) const +reg_state_t reg_state_t::merge(const reg_state_t& rhs, u32 current_pc) const { if (rhs == *this) { @@ -2661,12 +2661,13 @@ reg_state_t reg_state_t::merge(const reg_state_t& rhs) const { // Success (create new value tag) res.tag = reg_state_t::alloc_tag(); + res.origin = current_pc; return res; } } } - return make_unknown(); + return make_unknown(current_pc); } reg_state_t reg_state_t::build_on_top_of(const reg_state_t& rhs) const @@ -2728,9 +2729,17 @@ u32 reg_state_t::alloc_tag(bool reset) noexcept return ++g_tls_tag; } +void reg_state_t::invalidate_if_created(u32 current_pc) +{ + if (!is_const() && origin == current_pc) + { + tag = reg_state_t::alloc_tag(); + } +} + // Converge 2 register states to the same flow in execution template -static void merge(std::array& result, const std::array& lhs, const std::array& rhs) +static void merge(std::array& result, const 
std::array& lhs, const std::array& rhs, u32 current_pc) { usz index = umax; @@ -2738,7 +2747,7 @@ static void merge(std::array& result, const std::array create(u32 pc) noexcept { - auto ptr = new block_reg_info{ pc, reg_state_t::make_unknown() }; + auto ptr = new block_reg_info{ pc, reg_state_t::make_unknown(pc) }; for (reg_state_t& f : ptr->local_state) { @@ -4882,7 +4891,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s const bool should_search_patterns = target_count < 300u; // Treat start of function as an unknown value with tag (because it is) - const reg_state_t start_program_count = reg_state_t::make_unknown(); + const reg_state_t start_program_count = reg_state_t::make_unknown(entry_point - 1); // Initialize reg_state_it.emplace_back(entry_point); @@ -5375,10 +5384,20 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s vregs[reg] = reg_state_t::from_value(value); }; - const auto inherit_const_value = [&](u32 reg, bs_t flag, u32 value) + const auto inherit_const_value = [&](u32 reg, const reg_state_t& ra, const reg_state_t& rb, u32 value, u32 pos) { - flag -= vf::is_null; - vregs[reg] = reg_state_t{flag, value, flag & vf::is_const ? u32{umax} : reg_state_t::alloc_tag()}; + if (ra.origin != rb.origin) + { + pos = reg_state_it[wi].pc; + } + else + { + pos = ra.origin; + } + + const bs_t flag = (ra.flag & rb.flag) - vf::is_null; + + vregs[reg] = reg_state_t{flag, value, flag & vf::is_const ? 
u32{umax} : reg_state_t::alloc_tag(), 0, 0, pos}; }; const auto inherit_const_mask_value = [&](u32 reg, reg_state_t state, u32 mask_ones, u32 mask_zeroes) @@ -5407,12 +5426,12 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s } ensure(state.tag != umax); - vregs[reg] = reg_state_t{vf::is_mask, 0, state.tag, ones, zeroes}; + vregs[reg] = reg_state_t{vf::is_mask, 0, state.tag, ones, zeroes, state.origin}; }; - const auto unconst = [&](u32 reg) + const auto unconst = [&](u32 reg, u32 pc) { - vregs[reg] = {{}, {}, reg_state_t::alloc_tag()}; + vregs[reg] = reg_state_t::make_unknown(pc); }; const auto add_block = [&](u32 target) @@ -5467,6 +5486,14 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s } } + if (atomic16.active) + { + for (auto state : {&atomic16.lsa, &atomic16.ls, &atomic16.ls_offs}) + { + state->invalidate_if_created(pos); + } + } + const u32 data = std::bit_cast>(::at32(result.data, (pos - lsa) / 4)); const auto op = spu_opcode_t{data}; const auto type = g_spu_itype.decode(data); @@ -5650,7 +5677,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s } case MFC_Cmd: { - const auto [af, av, atagg, _3, _5] = get_reg(op.rt); + const auto [af, av, atagg, _3, _5, apc] = get_reg(op.rt); if (!is_pattern_match) { @@ -5908,14 +5935,25 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s // Merge pattern attributes between different code paths, may cause detection of failures atomic16_t& existing = it->second; - if (existing.lsa_pc != atomic16.lsa_pc || existing.put_pc != atomic16.put_pc || existing.lsa != atomic16.lsa) + auto compare_tag_and_reg = [](std::pair a, std::pair b) + { + if (b.first->is_const() && a.first->is_const()) + { + return a.first->compare_with_mask_indifference(*b.first, SPU_LS_MASK_1); + } + + // Compare register source + return a.second == b.second; + }; + + if (existing.lsa_pc != atomic16.lsa_pc || existing.put_pc != atomic16.put_pc 
|| !existing.lsa.compare_with_mask_indifference(atomic16.lsa, SPU_LS_MASK_128)) { // Register twice break_putllc16(22, atomic16.discard()); break_putllc16(22, existing.discard()); } - if (existing.active && existing.ls_access && atomic16.ls_access && (!existing.ls.compare_with_mask_indifference(atomic16.ls, SPU_LS_MASK_1) || existing.ls_offs != atomic16.ls_offs)) + if (existing.active && existing.ls_access && atomic16.ls_access && (!compare_tag_and_reg({&existing.ls, existing.reg}, {&atomic16.ls, atomic16.reg}) || existing.ls_offs != atomic16.ls_offs || existing.reg2 != atomic16.reg2)) { // Conflicting loads with stores in more than one code path break_putllc16(27, atomic16.set_invalid_ls(existing.ls_write || atomic16.ls_write)); @@ -5938,6 +5976,8 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s { // Propagate LS access existing.ls = atomic16.ls; + existing.reg = atomic16.reg; + existing.reg2 = atomic16.reg2; existing.ls_offs = atomic16.ls_offs; } @@ -5989,7 +6029,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s if (invalidate) { - unconst(op.rt); + unconst(op.rt, pos); } break; @@ -6068,7 +6108,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s } // Unconst - unconst(op.rt); + unconst(op.rt, pos); break; } @@ -6237,7 +6277,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s } // Unconst - unconst(op.rt); + unconst(op.rt, pos); break; } case spu_itype::STQA: @@ -6291,7 +6331,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s } // Unconst - unconst(op.rt); + unconst(op.rt, pos); break; } @@ -6371,14 +6411,14 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s } // Unconst - unconst(op.rt); + unconst(op.rt, pos); break; } case spu_itype::HBR: { hbr_loc = spu_branch_target(pos, op.roh << 7 | op.rt); - const auto [af, av, at, ao, az] = get_reg(op.ra); + const auto [af, av, at, ao, az, apc] = 
get_reg(op.ra); hbr_tg = af & vf::is_const && !op.c ? av & 0x3fffc : -1; break; } @@ -6443,9 +6483,13 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s break; } - const auto [af, av, at, ao, az] = get_reg(op.ra); - const auto [bf, bv, _2, _4, _6] = get_reg(op.rb); - inherit_const_value(op.rt, af & bf, bv | av); + const auto ra = get_reg(op.ra); + const auto rb = get_reg(op.rb); + + const auto [af, av, at, ao, az, apc] = ra; + const auto [bf, bv, bt, bo, bz, bpc] = rb; + + inherit_const_value(op.rt, ra, rb, av | bv, pos); break; } case spu_itype::XORI: @@ -6456,8 +6500,11 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s break; } - const auto [af, av, at, ao, az] = get_reg(op.ra); - inherit_const_value(op.rt, af, av ^ op.si10); + const auto ra = get_reg(op.ra); + + const auto [af, av, at, ao, az, apc] = ra; + + inherit_const_value(op.rt, ra, ra, av ^ op.si10, pos); break; } case spu_itype::XOR: @@ -6468,16 +6515,24 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s break; } - const auto [af, av, at, ao, az] = get_reg(op.ra); - const auto [bf, bv, _2, _4, _6] = get_reg(op.rb); - inherit_const_value(op.rt, af & bf, bv ^ av); + const auto ra = get_reg(op.ra); + const auto rb = get_reg(op.rb); + + const auto [af, av, at, ao, az, apc] = ra; + const auto [bf, bv, bt, bo, bz, bpc] = rb; + + inherit_const_value(op.rt, ra, rb, bv ^ av, pos); break; } case spu_itype::NOR: { - const auto [af, av, at, ao, az] = get_reg(op.ra); - const auto [bf, bv, _2, _4, _6] = get_reg(op.rb); - inherit_const_value(op.rt, af & bf, ~(bv | av)); + const auto ra = get_reg(op.ra); + const auto rb = get_reg(op.rb); + + const auto [af, av, at, ao, az, apc] = ra; + const auto [bf, bv, bt, bo, bz, bpc] = rb; + + inherit_const_value(op.rt, ra, rb, ~(bv | av), pos); break; } case spu_itype::ANDI: @@ -6494,9 +6549,13 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s break; } - const auto 
[af, av, at, ao, az] = get_reg(op.ra); - const auto [bf, bv, _2, _4, _6] = get_reg(op.rb); - inherit_const_value(op.rt, af & bf, bv & av); + const auto ra = get_reg(op.ra); + const auto rb = get_reg(op.rb); + + const auto [af, av, at, ao, az, apc] = ra; + const auto [bf, bv, bt, bo, bz, bpc] = rb; + + inherit_const_value(op.rt, ra, rb, bv & av, pos); break; } case spu_itype::AI: @@ -6508,9 +6567,9 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s } const auto ra = get_reg(op.ra); - const auto [af, av, at, ao, az] = ra; + const auto [af, av, at, ao, az, apc] = ra; - inherit_const_value(op.rt, af, av + op.si10); + inherit_const_value(op.rt, ra, ra, av + op.si10, pos); if (u32 mask = ra.get_known_zeroes() & ~op.si10; mask & 1) { @@ -6525,10 +6584,10 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s const auto ra = get_reg(op.ra); const auto rb = get_reg(op.rb); - const auto [af, av, at, ao, az] = ra; - const auto [bf, bv, bt, bo, bz] = rb; + const auto [af, av, at, ao, az, apc] = ra; + const auto [bf, bv, bt, bo, bz, bpc] = rb; - inherit_const_value(op.rt, af & bf, bv + av); + inherit_const_value(op.rt, ra, rb, bv + av, pos); if (u32 mask = ra.get_known_zeroes() & rb.get_known_zeroes(); mask & 1) { @@ -6540,8 +6599,10 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s } case spu_itype::SFI: { - const auto [af, av, at, ao, az] = get_reg(op.ra); - inherit_const_value(op.rt, af, op.si10 - av); + const auto ra = get_reg(op.ra); + const auto [af, av, at, ao, az, apc] = get_reg(op.ra); + + inherit_const_value(op.rt, ra, ra, op.si10 - av, pos); break; } case spu_itype::SF: @@ -6549,10 +6610,10 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s const auto ra = get_reg(op.ra); const auto rb = get_reg(op.rb); - const auto [af, av, at, ao, az] = ra; - const auto [bf, bv, bt, bo, bz] = rb; + const auto [af, av, at, ao, az, apc] = ra; + const auto [bf, bv, bt, bo, bz, 
bpc] = rb; - inherit_const_value(op.rt, af & bf, bv - av); + inherit_const_value(op.rt, ra, rb, bv - av, pos); if (u32 mask = ra.get_known_zeroes() & rb.get_known_zeroes(); mask & 1) { @@ -6588,8 +6649,10 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s break; } - const auto [af, av, at, ao, az] = get_reg(op.ra); - inherit_const_value(op.rt, af, av >> ((0 - op.i7) & 0x1f)); + const auto ra = get_reg(op.ra); + const auto [af, av, at, ao, az, apc] = get_reg(op.ra); + + inherit_const_value(op.rt, ra, ra, av >> ((0 - op.i7) & 0x1f), pos); break; } case spu_itype::SHLI: @@ -6606,8 +6669,10 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s break; } - const auto [af, av, at, ao, az] = get_reg(op.ra); - inherit_const_value(op.rt, af, av << (op.i7 & 0x1f)); + const auto ra = get_reg(op.ra); + const auto [af, av, at, ao, az, apc] = ra; + + inherit_const_value(op.rt, ra, ra, av << (op.i7 & 0x1f), pos); break; } case spu_itype::SELB: @@ -6616,7 +6681,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s const auto rb = get_reg(op.rb); // Ignore RC, perform a value merge which also respect bitwise information - vregs[op.rt4] = ra.merge(rb); + vregs[op.rt4] = ra.merge(rb, pos); break; } case spu_itype::SHLQBYI: @@ -6641,7 +6706,49 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s if (!(type & spu_itype::zregmod)) { const u32 op_rt = type & spu_itype::_quadrop ? 
+op.rt4 : +op.rt; - unconst(op_rt); + + u32 ra = s_reg_max, rb = s_reg_max, rc = s_reg_max; + + if (m_use_ra.test(pos / 4)) + { + ra = op.ra; + } + + if (m_use_rb.test(pos / 4)) + { + rb = op.rb; + } + + if (type & spu_itype::_quadrop && m_use_rc.test(pos / 4)) + { + rc = op.rc; + } + + u32 reg_pos = SPU_LS_SIZE; + + for (u32 reg : {ra, rb, rc}) + { + if (reg != s_reg_max) + { + if (reg_pos == SPU_LS_SIZE) + { + reg_pos = vregs[reg].origin; + } + else if (reg_pos != vregs[reg].origin) + { + const u32 block_start = reg_state_it[wi].pc; + + // if (vregs[reg].origin >= block_start && vregs[reg].origin <= pos) + // { + // reg_pos = std::max(vregs[reg].origin, reg_pos); + // } + reg_pos = block_start; + break; + } + } + } + + unconst(op_rt, reg_pos == SPU_LS_SIZE ? pos : reg_pos); } break; @@ -7714,7 +7821,7 @@ std::array& block_reg_info::evaluate_start_state(const s } else { - merge(res_state, res_state, *arg_state); + merge(res_state, res_state, *arg_state, it->block_pc); } } diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index 31e6022de0..c29a24c3f3 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -208,6 +208,7 @@ public: u32 tag = umax; u32 known_ones{}; u32 known_zeroes{}; + u32 origin = SPU_LS_SIZE; bool is_const() const; @@ -222,21 +223,33 @@ public: bool compare_with_mask_indifference(u32 imm, u32 mask_bits) const; bool unequal_with_mask_indifference(const reg_state_t& r, u32 mask_bits) const; + // Convert constant-based value to mask-based value reg_state_t downgrade() const; - reg_state_t merge(const reg_state_t& rhs) const; + + // Connect two register states between different blocks + reg_state_t merge(const reg_state_t& rhs, u32 current_pc) const; + + // Override value with newer value if needed reg_state_t build_on_top_of(const reg_state_t& rhs) const; + // Get known zeroes mask u32 get_known_zeroes() const; + + // Get known ones mask u32 get_known_ones() const; + // Invalidate value if 
non-constant and reached the point in history of its creation + void invalidate_if_created(u32 current_pc); + template - static std::conditional_t> make_unknown() noexcept + static std::conditional_t> make_unknown(u32 pc) noexcept { if constexpr (Count == 1) { reg_state_t v{}; v.tag = alloc_tag(); v.flag = {}; + v.origin = pc; return v; } else @@ -245,7 +258,7 @@ public: for (reg_state_t& state : result) { - state = make_unknown<1>(); + state = make_unknown<1>(pc); } return result;