diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index fbb3c89142..033210c1ef 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -2246,6 +2246,19 @@ void ppu_thread::cpu_on_stop() dump_all(ret); ppu_log.notice("thread context: %s", ret); } + + if (is_stopped()) + { + if (last_succ == 0 && last_fail == 0 && exec_bytes == 0) + { + perf_log.notice("PPU thread perf stats are not available."); + } + else + { + perf_log.notice("Perf stats for STCX reload: success %u, failure %u", last_succ, last_fail); + perf_log.notice("Perf stats for instructions: total %u", exec_bytes / 4); + } + } } void ppu_thread::exec_task() @@ -2287,8 +2300,6 @@ void ppu_thread::exec_task() ppu_thread::~ppu_thread() { - perf_log.notice("Perf stats for STCX reload: success %u, failure %u", last_succ, last_fail); - perf_log.notice("Perf stats for instructions: total %u", exec_bytes / 4); } ppu_thread::ppu_thread(const ppu_thread_params& param, std::string_view name, u32 _prio, int detached) diff --git a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp index 313fddc59f..20e435d42d 100644 --- a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp @@ -29,6 +29,8 @@ const extern spu_decoder g_spu_itype; const extern spu_decoder g_spu_iname; const extern spu_decoder g_spu_iflag; +constexpr u32 s_reg_max = spu_recompiler_base::s_reg_max; + // Move 4 args for calling native function from a GHC calling convention function #if defined(ARCH_X64) static u8* move_args_ghc_to_native(u8* raw) @@ -1270,11 +1272,13 @@ spu_runtime::spu_runtime() return; } - fs::create_dir(m_cache_path + "llvm/"); - fs::remove_all(m_cache_path + "llvm/", false); - if (g_cfg.core.spu_debug) { + if (!fs::create_dir(m_cache_path + "llvm/")) + { + fs::remove_all(m_cache_path + "llvm/", false); + } + fs::file(m_cache_path + "spu.log", fs::rewrite); fs::file(m_cache_path + "spu-ir.log", fs::rewrite); } @@ -2479,6 +2483,329 @@ std::vector spu_thread::discover_functions(u32 base_addr, std::spanflag.all_of(to_test); +} + +bool reg_state_t::is_less_than(u32 imm) const +{ + if (flag & vf::is_const && value < imm) + { + return true; + } + + if (flag & vf::is_mask && ~known_zeroes < imm) + { + return true; + } + + return false; +} + +bool reg_state_t::operator==(const reg_state_t& r) const +{ + if ((flag ^ r.flag) - (vf::is_null + vf::is_mask)) + { + return false; + } + + return (flag & vf::is_const ? 
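+		// Constants compare by value alone; any other state is equal only if it
+		// comes from the same tagged origin with identical known-bit masks
+		// (is_null/is_mask flag differences are already ignored above).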
value == r.value : (tag == r.tag && known_ones == r.known_ones && known_zeroes == r.known_zeroes)); +} + +bool reg_state_t::operator==(u32 imm) const +{ + return flag == vf::is_const && value == imm; +} + +// Compare equality but try to ignore changes in unmasked bits +bool reg_state_t::compare_with_mask_indifference(const reg_state_t& r, u32 mask_bits) const +{ + if (!mask_bits) + { + return true; + } + + if ((r.flag & flag) & vf::is_const) + { + // Simplified path for consts + if (((value ^ r.value) & mask_bits) == 0) + { + return true; + } + + return false; + } + + const bool is_equal = *this == r; + + if (is_equal) + { + return true; + } + + const auto _this = this->downgrade(); + const auto _r = r.downgrade(); + + const bool is_mask_equal = (_this.tag == _r.tag && _this.flag == _r.flag && !((_this.known_ones ^ _r.known_ones) & mask_bits) && !((_this.known_zeroes ^ _r.known_zeroes) & mask_bits)); + + return is_mask_equal; +} + +bool reg_state_t::compare_with_mask_indifference(u32 imm, u32 mask_bits) const +{ + if (!mask_bits) + { + return true; + } + + if (flag & vf::is_const) + { + if (((value ^ imm) & mask_bits) == 0) + { + return true; + } + } + + return false; +} + +// Ensure unequality but try to ignore changes in unmasked bits +bool reg_state_t::unequal_with_mask_indifference(const reg_state_t& r, u32 mask_bits) const +{ + if (!mask_bits) + { + return true; + } + + if ((r.flag & flag) & vf::is_const) + { + // Simplified path for consts + if ((value ^ r.value) & mask_bits) + { + return true; + } + + return false; + } + + const bool is_equal = *this == r; + + if (is_equal) + { + return false; + } + + // Full path + const auto _this = this->downgrade(); + const auto _r = r.downgrade(); + + const bool is_base_value_equal = (_this.tag == _r.tag && _this.flag == _r.flag); + + if (!is_base_value_equal) + { + // Cannot ascertain unequality if the value origin is different + return false; + } + + // Find at least one bit that is known to be X state at value 'r', and known to be X^1 state at the objects' value + return (((_this.known_ones ^ _r.known_ones) & mask_bits) & ((_this.known_zeroes ^ _r.known_zeroes) & mask_bits)) != 0; +} + +reg_state_t reg_state_t::downgrade() const +{ + if (flag & vf::is_const) + { + return reg_state_t{vf::is_mask, 0, umax, this->value, ~this->value}; + } + + if (!(flag - vf::is_null)) + { + return reg_state_t{vf::is_mask, 0, this->tag, 0, 0}; + } + + return *this; +} + +reg_state_t reg_state_t::merge(const reg_state_t& rhs) const +{ + if (rhs == *this) + { + // Perfect state: no conflicts + return rhs; + } + + if ((rhs.flag + flag).all_of(vf::is_const + vf::is_mask)) + { + // Try to downgrade to a known-bits type value + const reg_state_t _rhs = rhs.downgrade(); + const reg_state_t _this = this->downgrade(); + + if ((_rhs.flag & _this.flag) & vf::is_mask) + { + // Now it is possible to merge the two values + reg_state_t res{vf::is_mask, 0, 0, _rhs.known_ones & _this.known_ones, _rhs.known_zeroes & _this.known_zeroes}; + + if (res.known_zeroes | res.known_ones) + { + // Success + res.tag = reg_state_t::alloc_tag(); + return res; + } + } + } + + return make_unknown(); +} + +reg_state_t reg_state_t::build_on_top_of(const reg_state_t& rhs) const +{ + if (flag & vf::is_null) + { + // Value unmodified + return rhs; + } + + if (rhs == *this) + { + // Perfect state: no conflicts + return rhs; + } + + return *this; +} + +u32 reg_state_t::get_known_zeroes() const +{ + if (flag & vf::is_const) + { + return ~value; + } + + return known_zeroes; +} + + +u32 
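+// Counterpart of get_known_zeroes(): for a constant every set bit is a known
+// one; otherwise fall back to the accumulated known_ones mask.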
reg_state_t::get_known_ones() const +{ + if (flag & vf::is_const) + { + return value; + } + + return known_ones; +} + +reg_state_t reg_state_t::from_value(u32 value) noexcept +{ + reg_state_t v{}; + v.value = value; + v.flag = vf::is_const; + return v; +} + +u32 reg_state_t::alloc_tag(bool reset) noexcept +{ + static thread_local u32 g_tls_tag = 0; + + if (reset) + { + g_tls_tag = 0; + return 0; + } + + return ++g_tls_tag; +} + +// Converge 2 register states to the same flow in execution +template +static void merge(std::array& result, const std::array& lhs, const std::array& rhs) +{ + usz index = umax; + + for (reg_state_t& state : result) + { + index++; + + state = lhs[index].merge(rhs[index]); + } +} + +// Override RHS state with the newer LHS state +template +static void build_on_top_of(std::array& result, const std::array& lhs, const std::array& rhs) +{ + usz index = umax; + + for (reg_state_t& state : result) + { + index++; + + state = lhs[index].build_on_top_of(rhs[index]); + } +} + +struct block_reg_info +{ + u32 pc = SPU_LS_SIZE; // Address + std::array local_state{}; + bool has_true_state = false; + std::array start_reg_state{}; + std::array end_reg_state{}; + std::array addend_reg_state{}; + std::array walkby_state{}; // State that is made by merging state_predecessor and iterating over instructions for final instrucion walk + + usz next_nodes_count = 0; + + struct node_t + { + u32 prev_pc = umax; + }; + + std::vector prev_nodes; + + static std::unique_ptr create(u32 pc) noexcept + { + auto ptr = new block_reg_info{ pc, reg_state_t::make_unknown() }; + + for (reg_state_t& f : ptr->local_state) + { + f.flag += vf::is_null; + } + + ptr->start_reg_state = ptr->local_state; + return std::unique_ptr(ptr); + } + + // Evaluate registers state + std::array& evaluate_start_state(const std::map>& map); + + // This function creates new node if not found and links the proceeding node to the old node + // In a manner in which no duplicate paths are formed + static void create_node(u32 pc_rhs, u32 parent_pc, std::map>& map) + { + //ensure(parent_node != pc_rhs); + ensure(map[parent_pc]); + + if (!map[pc_rhs]) + { + map[pc_rhs] = create(pc_rhs); + } + + node_t prev_node{parent_pc}; + map[parent_pc]->next_nodes_count++; + map[pc_rhs]->prev_nodes.emplace_back(prev_node); + } +}; + spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, std::map>* out_target_list) { // Result: addr + raw instruction data @@ -2508,22 +2835,8 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s m_bbs.clear(); m_chunks.clear(); m_funcs.clear(); - - // Value flags (TODO: only is_const is implemented) - enum class vf : u32 - { - is_const, - is_mask, - is_rel, - - __bitset_enum_max - }; - - // Weak constant propagation context (for guessing branch targets) - std::array, 128> vflags{}; - - // Associated constant values for 32-bit preferred slot - std::array values; + m_inst_attrs.clear(); + m_patterns.clear(); // SYNC instruction found bool sync = false; @@ -2539,6 +2852,12 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s { } + // Weak constant propagation context (for guessing branch targets) + std::array, 128> vflags{}; + + // Associated constant values for 32-bit preferred slot + std::array values; + for (u32 wi = 0, wa = workload[0]; wi < workload.size();) { const auto next_block = [&] @@ -2613,7 +2932,6 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s case spu_itype::DFCMGT: case spu_itype::DFTSV: { - // Stop 
before invalid instructions (TODO) next_block(); continue; } @@ -3344,6 +3662,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s } limit = std::min(limit, lsa + ::size32(result.data) * 4); + m_inst_attrs.resize(result.data.size()); // Cleanup block info for (u32 i = 0; i < workload.size(); i++) @@ -3445,6 +3764,1655 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s } } + struct putllc16_statistics_t + { + atomic_t all = 0; + atomic_t single = 0; + atomic_t nowrite = 0; + std::array, 128> breaking_reason{}; + + std::vector> get_reasons() + { + std::vector> map; + for (usz i = 0; i < breaking_reason.size(); i++) + { + if (u64 v = breaking_reason[i]) + { + map.emplace_back(i, v); + } + } + + std::stable_sort(map.begin(), map.end(), FN(x.second > y.second)); + return map; + } + }; + + struct atomic16_t + { + bool active = false; // GETLLAR happened + u32 lsa_pc = SPU_LS_SIZE; // PC of first LSA write + u32 lsa_last_pc = SPU_LS_SIZE; // PC of first LSA write + u32 put_pc = SPU_LS_SIZE; // PC of PUTLLC + reg_state_t ls{}; // state of LS load/store address register + reg_state_t ls_offs = reg_state_t::from_value(0); // Added value to ls + reg_state_t lsa{}; // state of LSA register on GETLLAR + reg_state_t ls_reg[8]{}; // stores/loads using register bundles with offset + reg_state_t ls_abs[8]{}; // stores/loads using absolute address + u32 reg = s_reg_max; // Source of address register of LS load/store + u32 reg2 = s_reg_max; // Source 2 of address register of LS load/store (STQX/LQX) + //u32 ls_offs[8]{}; // LS offset from register (0 if const) + bool ls_pc_rel = false; // For STQR/LQR + bool ls_access = false; // LS accessed + bool ls_write = false; // LS written + bool ls_invalid = false; // From this point and on, any store will cancel the optimization + bool select_16_or_0_at_runtime = false; + bool put_active = false; // PUTLLC happened + bool get_rdatomic = false; // True if MFC_RdAtomicStat was read after GETLLAR + u32 mem_count = 0; + + // Return old state for error reporting + atomic16_t discard() + { + const u32 pc = lsa_pc; + const u32 last_pc = lsa_last_pc; + + const atomic16_t old = *this; + *this = atomic16_t{}; + + // Keep some members + lsa_pc = pc; + lsa_last_pc = last_pc; + return old; + } + + // Conditional breakage (break if a full 128-byte reservation is needed) + atomic16_t set_invalid_ls(bool write) + { + ls_invalid = true; + ls_write |= write; + + if (ls_write) + { + return discard(); + } + + return atomic16_t{}; + } + }; + + // Reset tags + reg_state_t::alloc_tag(true); + + std::map> infos; + infos.emplace(entry_point, block_reg_info::create(entry_point)); + + struct block_reg_state_iterator + { + u32 pc{}; + usz parent_iterator_index = umax; + + // PUTLLC16 optimization analysis tracker + atomic16_t atomic16{}; + + block_reg_state_iterator(u32 _pc, usz _parent_iterator_index = umax) noexcept + : pc(_pc) + , parent_iterator_index(_parent_iterator_index) + { + } + }; + + std::vector> reg_state_it; + + std::map atomic16_all; // RdAtomicStat location -> atomic loop optimization state + std::map getllar_starts; // True for failed loops + std::map run_on_block; + + std::array* true_state_walkby = nullptr; + + atomic16_t dummy16{}; + + bool likely_putllc_loop = false; + + for (u32 i = 0, count = 0; i < result.data.size(); i++) + { + const u32 inst = std::bit_cast>(result.data[i]); + + if (spu_opcode_t{inst}.ra == MFC_RdAtomicStat && g_spu_itype.decode(inst) == spu_itype::RDCH) + { + count++; + + if (count == 2) + { + 
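+				// A second MFC_RdAtomicStat read implies a GETLLAR/PUTLLC pair,
+				// i.e. an atomic update loop worth pattern-matching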
likely_putllc_loop = true; + break; + } + } + } + + usz target_count = 0; + + for (auto& [pc, loc] : m_targets) + { + target_count += loc.size(); + } + + const bool should_search_patterns = likely_putllc_loop && target_count < 100; + + // Treat start of function as an unknown value with tag (because it is) + const reg_state_t start_program_count = reg_state_t::make_unknown(); + + // Initialize + reg_state_it.emplace_back(std::make_unique(entry_point)); + run_on_block[entry_point / 4] = true; + + enum spu_addr_mask_t : u32 + { + SPU_LS_MASK_128 = (SPU_LS_SIZE - 1) & -128, + SPU_LS_MASK_16 = (SPU_LS_SIZE - 1) & -16, + SPU_LS_MASK_4 = (SPU_LS_SIZE - 1) & -4, + SPU_LS_MASK_1 = (SPU_LS_SIZE - 1), + }; + + for (u32 wf = 0, wi = 0, wa = entry_point, bpc = wa; wf <= 1;) + { + const bool is_form_block = wf == 0; + const bool is_pattern_match = wf == 1; + + dummy16.active = false; + + if (!is_form_block && wa == bpc) + { + if (!should_search_patterns) + { + // TODO: Enable constant search for all + break; + } + + if (wi == 0) + { + for (auto& [addr, block] : infos) + { + // Evaluate state for all blocks + block->evaluate_start_state(infos); + } + } + + true_state_walkby = &infos[bpc]->evaluate_start_state(infos); + + for (reg_state_t& f : *true_state_walkby) + { + if (f.flag & vf::is_null) + { + // Evaluate locally + //f.tag = reg_state_t::alloc_tag(); + f.flag -= vf::is_null; + } + } + } + + auto& vregs = is_form_block ? infos[bpc]->local_state : *true_state_walkby; + auto& atomic16 = is_pattern_match ? ::at32(reg_state_it, wi)->atomic16 : dummy16; + + const u32 pos = wa; + + wa += 4; + + const auto break_putllc16 = [&](u32 cause, atomic16_t previous) + { + if (previous.active && likely_putllc_loop && getllar_starts.contains(previous.lsa_pc)) + { + const bool is_first = !std::exchange(getllar_starts[previous.lsa_pc], true); + + if (!is_first) + { + return; + } + + g_fxo->get().breaking_reason[cause]++; + + if (!spu_log.notice) + { + return; + } + + std::string break_error = fmt::format("PUTLLC pattern breakage [%x cause=%u] (lsa_pc=0x%x)", pos, cause, previous.lsa_pc); + + const auto values = g_fxo->get().get_reasons(); + + std::string tracing = "Top Breaking Reasons:"; + + usz i = 0; + usz fail_count = 0; + bool switched_to_minimal = false; + + for (auto it = values.begin(); it != values.end(); i++, it++) + { + fail_count += it->second; + + if (i >= 12) + { + continue; + } + + if (i < 8 && it->second > 1) + { + fmt::append(tracing, " [cause=%u, n=%d]", it->first, it->second); + } + else + { + if (!std::exchange(switched_to_minimal, true)) + { + fmt::append(tracing, "; More:"); + } + + fmt::append(tracing, " %u", it->first); + } + } + + fmt::append(tracing, " of %d failures", fail_count); + spu_log.notice("%s\n%s", break_error, tracing); + + if (cause == 17 || cause == 20) + { + + } + } + }; + + const auto calculate_absolute_ls_difference = [](u32 addr1, u32 addr2) + { + addr1 &= SPU_LS_MASK_1; + addr2 &= SPU_LS_MASK_1; + + const u32 abs_diff = (addr1 >= addr2 ? addr1 - addr2 : addr2 - addr1); + + // Because memory is wrapping-around, take the gap that is smaller + return abs_diff >= SPU_LS_SIZE / 2 ? 
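+			// e.g. addr1=0x3fff0, addr2=0x10: raw diff 0x3ffe0, wrapped diff 0x20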
SPU_LS_SIZE - abs_diff : abs_diff; + }; + + const auto next_block = [&]() + { + // Reset value information + wi++; + + if (wf == 0) + { + auto& block = infos[bpc]; + + if (g_cfg.core.spu_block_size != spu_block_size_type::safe && (m_ret_info[bpc / 4] || m_entry_info[bpc / 4] || pos == entry_point)) + { + // Do not allow value passthrough + for (reg_state_t& f : block->start_reg_state) + { + f.flag -= vf::is_null; + } + + for (reg_state_t& f : block->local_state) + { + f.flag -= vf::is_null; + } + + // Block has an external origin, discard all previous information + block->end_reg_state = block->local_state; + block->has_true_state = true; + } + + block->addend_reg_state = block->local_state; + } + + if (wi >= reg_state_it.size()) + { + wf++; + wi = 0; + + if (wf == 1) + { + reg_state_it.clear(); + + if (!infos.empty()) + { + reg_state_it.emplace_back(std::make_unique(infos.begin()->second->pc)); + } + } + } + + if (wi < reg_state_it.size()) + { + wa = ::at32(reg_state_it, wi)->pc; + bpc = wa; + } + }; + + const auto get_reg = [&](u32 reg) -> const reg_state_t& + { + return vregs[reg]; + }; + + const auto move_reg = [&](u32 dst, u32 src) + { + if (dst == src || vregs[src] == vregs[dst]) + { + return; + } + + vregs[dst] = vregs[src]; + + // Register storage has changed + vregs[dst].flag -= vf::is_null; + }; + + const auto set_const_value = [&](u32 reg, u32 value) + { + vregs[reg] = reg_state_t::from_value(value); + }; + + const auto inherit_const_value = [&](u32 reg, bs_t flag, u32 value) + { + flag -= vf::is_null; + vregs[reg] = reg_state_t{flag, value, flag & vf::is_const ? u32{umax} : reg_state_t::alloc_tag()}; + }; + + const auto inherit_const_mask_value = [&](u32 reg, reg_state_t state, u32 mask_ones, u32 mask_zeroes) + { + if ((mask_ones | mask_zeroes) == 0) + { + state.flag -= vf::is_null; + vregs[reg] = state; + return; + } + + if (state.flag & vf::is_const) + { + vregs[reg] = reg_state_t::from_value((state.value | mask_ones) & ~mask_zeroes); + return; + } + + const u32 ones = (state.known_ones | mask_ones) & ~mask_zeroes; + const u32 zeroes = (state.known_zeroes | mask_zeroes) & ~mask_ones; + + if ((ones ^ zeroes) == umax) + { + // Special case: create a constant from full masks + vregs[reg] = reg_state_t::from_value(ones); + return; + } + + ensure(state.tag != umax); + vregs[reg] = reg_state_t{vf::is_mask, 0, state.tag, ones, zeroes}; + }; + + const auto unconst = [&](u32 reg) + { + vregs[reg] = {{}, {}, reg_state_t::alloc_tag()}; + }; + + const auto add_block = [&](u32 target) + { + // Validate new target (TODO) + if (target >= lsa && (target & -4) < limit) + { + if (!infos[target]) + { + infos[target] = block_reg_info::create(target); + } + + if (is_form_block) + { + block_reg_info::create_node(target, bpc, infos); + + if (!run_on_block[target / 4]) + { + reg_state_it.emplace_back(std::make_unique(target)); + run_on_block[target / 4] = true; + } + + return; + } + + // Check block duplication (terminating infinite loops) + // Even if duplicated, this still has impact by registering the end of the possible code path outcome + std::set positions; + + for (usz i = wi;;) + { + auto& entry = ::at32(reg_state_it, i); + AUDIT(positions.emplace(entry->pc).second); + + if (entry->pc == target) + { + return; + } + + const usz parent = entry->parent_iterator_index; + + if (parent == umax) + { + break; + } + + ensure(i != parent); + i = parent; + } + + auto& next = reg_state_it.emplace_back(std::make_unique(target, wi)); + next->atomic16 = ::at32(reg_state_it, wi)->atomic16; + } + }; + + if 
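+		// pos walks instruction addresses; [lsa, limit) bounds the program body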
(pos < lsa || pos >= limit) + { + // Don't analyse if already beyond the limit + next_block(); + continue; + } + + if (bpc != pos && m_preds.count(pos)) + { + // End of block reached + next_block(); + continue; + } + + if (g_cfg.core.spu_block_size != spu_block_size_type::safe && (m_ret_info[pos / 4] || m_entry_info[pos / 4] || pos == entry_point)) + { + ensure(bpc == pos); + + // Block has an external origin, discard all previous information + // TODO: Make the optimizations conditional at runtime instead + if (!is_form_block) + { + // Call for external code + break_putllc16(25, atomic16.discard()); + } + } + + const u32 data = std::bit_cast>(::at32(result.data, (pos - lsa) / 4)); + const auto op = spu_opcode_t{data}; + const auto type = g_spu_itype.decode(data); + + // For debugging + if (false && likely_putllc_loop && is_pattern_match) + { + SPUDisAsm dis_asm(cpu_disasm_mode::dump, reinterpret_cast(result.data.data()), result.lower_bound); + dis_asm.disasm(pos); + + std::string consts; + + for (auto _use : std::initializer_list>{{op.ra, m_use_ra.test(pos / 4)} + , {op.rb, m_use_rb.test(pos / 4)}, {op.rc, m_use_rc.test(pos / 4)}}) + { + if (!_use.second) + { + continue; + } + + if (!consts.empty()) + { + consts += ','; + } + + const u32 reg_file = _use.first; + const auto& reg = get_reg(reg_file); + if (reg.is_const()) + { + fmt::append(consts, " r%d=0x%x", reg_file, reg.value); + } + else + { + if (u32 mask = reg.known_zeroes | reg.known_ones) + { + fmt::append(consts, " r%d=#%d-&|0x%x", reg_file, reg.tag, mask); + } + else + { + fmt::append(consts, " r%d=#%d", reg_file, reg.tag); + } + } + } + + if (!consts.empty()) + { + consts = " {" + consts + " }"; + } + + if (dis_asm.last_opcode.ends_with('\n')) + { + dis_asm.last_opcode.pop_back(); + } + + spu_log.always()("[SPU=0%x, wi=%d] %s%s", pos, wi, dis_asm.last_opcode, consts); + } + + // Analyse instruction + switch (type) + { + case spu_itype::UNK: + case spu_itype::DFCEQ: + case spu_itype::DFCMEQ: + case spu_itype::DFCGT: + case spu_itype::DFCMGT: + case spu_itype::DFTSV: + { + // Stop before invalid instructions (TODO) + next_block(); + continue; + } + + case spu_itype::SYNC: + case spu_itype::STOP: + case spu_itype::STOPD: + { + if (data == 0) + { + // Stop before null data + next_block(); + continue; + } + + if (g_cfg.core.spu_block_size == spu_block_size_type::safe) + { + // Stop on special instructions (TODO) + next_block(); + break; + } + + if (type == spu_itype::SYNC) + { + // Remember + sync = true; + } + + break; + } + + case spu_itype::IRET: + { + next_block(); + break; + } + + case spu_itype::BI: + case spu_itype::BISL: + case spu_itype::BISLED: + case spu_itype::BIZ: + case spu_itype::BINZ: + case spu_itype::BIHZ: + case spu_itype::BIHNZ: + { + if (op.e) + { + break_putllc16(27, atomic16.discard()); + } + + break; + } + + case spu_itype::BRSL: + case spu_itype::BRASL: + { + break; + } + + case spu_itype::BRA: + { + break; + } + + case spu_itype::BR: + case spu_itype::BRZ: + case spu_itype::BRNZ: + case spu_itype::BRHZ: + case spu_itype::BRHNZ: + { + break; + } + + case spu_itype::DSYNC: + case spu_itype::HEQ: + case spu_itype::HEQI: + case spu_itype::HGT: + case spu_itype::HGTI: + case spu_itype::HLGT: + case spu_itype::HLGTI: + case spu_itype::LNOP: + case spu_itype::NOP: + case spu_itype::MTSPR: + case spu_itype::FSCRWR: + { + // Do nothing + break; + } + + case spu_itype::WRCH: + { + switch (op.ra) + { + case MFC_EAL: + { + move_reg(s_reg_mfc_eal, op.rt); + break; + } + case MFC_LSA: + { + auto rt = get_reg(op.rt); + 
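+					// Mirror the written value into the tracked LSA register,
+					// keeping only the bits that form a valid LS address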
inherit_const_mask_value(s_reg_mfc_lsa, rt, 0, ~SPU_LS_MASK_1); + + if (is_pattern_match) + { + atomic16.lsa_last_pc = pos; + } + + break; + } + case MFC_TagID: + { + break; + } + case MFC_Size: + { + break; + } + case MFC_Cmd: + { + const auto [af, av, atagg, _3, _5] = get_reg(op.rt); + + if (!is_pattern_match) + { + // + } + else if (af & vf::is_const) + { + switch (av) + { + case MFC_GETLLAR_CMD: + { + break_putllc16(30, atomic16.discard()); + + // If LSA write has not happened, use block start + atomic16.lsa_pc = atomic16.lsa_last_pc == SPU_LS_SIZE ? bpc : atomic16.lsa_last_pc; + atomic16.active = true; + + const auto lsa = get_reg(s_reg_mfc_lsa); + inherit_const_mask_value(s_reg_mfc_lsa, lsa, 0, ~SPU_LS_MASK_128); + atomic16.lsa = get_reg(s_reg_mfc_lsa); + + if (likely_putllc_loop) + { + // Register loop entry + if (getllar_starts.emplace(atomic16.lsa_pc, false).second) + { + g_fxo->get().all++; + spu_log.notice("[0x%05x] GETLLAR pattern entry point", pos); + } + } + + break; + } + case MFC_PUTLLC_CMD: + { + if (atomic16.active) + { + const auto _lsa = get_reg(s_reg_mfc_lsa); + + // Search the value of LS address stoire/load in latest register file + if (atomic16.ls_access && atomic16.ls_write && !atomic16.ls_pc_rel && !atomic16.ls.is_const()) + { + usz reg_it = umax; + u32 regs[2]{s_reg_max, s_reg_max}; + + for (auto val : {&atomic16.ls, &atomic16.ls_offs}) + { + reg_it++; + + if (val->is_const()) + { + regs[reg_it] = 0; + continue; + } + + if (vregs[s_reg_mfc_lsa].compare_with_mask_indifference(*val, SPU_LS_MASK_1)) + { + regs[reg_it] = s_reg_mfc_lsa; + continue; + } + + for (u32 i = 0; i <= s_reg_127; i++) + { + const auto& _reg = vregs[i]; + + if (_reg == *val) + { + regs[reg_it] = i; + break; + } + } + } + + if (regs[0] == s_reg_max || regs[1] == s_reg_max) + { + break_putllc16(3, atomic16.discard()); + break; + } + + atomic16.reg = regs[0]; + + if (!atomic16.ls_offs.is_const()) + { + atomic16.reg2 = regs[1]; + } + } + + if (atomic16.ls_access && atomic16.ls_write && !atomic16.lsa.compare_with_mask_indifference(_lsa, SPU_LS_MASK_128)) + { + // LSA latest value mismatches with the one written with GETLLAR + + if (atomic16.lsa.flag != _lsa.flag) + { + break_putllc16(1, atomic16.discard()); + } + else + { + break_putllc16(2, atomic16.discard()); + } + + break; + } + + if (atomic16.ls_access && atomic16.ls_write) + { + atomic16.select_16_or_0_at_runtime = false; + + bool ok = false; + + if (atomic16.ls_pc_rel || !atomic16.ls_offs.is_const()) + { + // + } + else if (atomic16.lsa.is_const()) + { + if (atomic16.ls.is_const()) + { + if (atomic16.ls_offs != 0) + { + // Rebase constant so we can get rid of ls_offs + atomic16.ls.value = spu_ls_target(atomic16.ls_offs.value + atomic16.ls.value); + atomic16.ls_offs = reg_state_t::from_value(0); + } + + if (atomic16.ls.value >= (atomic16.lsa.value & -128) && atomic16.ls.value < utils::align(atomic16.lsa.value + 1, 128)) + { + ok = true; + } + } + else if (atomic16.ls_offs.value >= (atomic16.lsa.value & -128) && atomic16.ls_offs.value < utils::align(atomic16.lsa.value + 1, 128) && atomic16.ls.is_less_than(128 - (atomic16.lsa.value & 127))) + { + ok = true; + } + } + else if (!atomic16.lsa.is_const() && atomic16.lsa.compare_with_mask_indifference(atomic16.ls, SPU_LS_MASK_128) && atomic16.ls_offs == 0) + { + // Unknown value with known offset of less than 128 bytes + ok = true; + } + + if (!ok) + { + // This is quite common.. let's try to select between putllc16 and putllc0 at runtime! 
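+									// (the generated code then tests at runtime whether the
+									// accessed 16 bytes fall inside the reserved 128-byte line,
+									// degrading to the PUTLLC0 path otherwise)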
+ // break_putllc16(100); + // atomic16.discard(); + // break; + atomic16.select_16_or_0_at_runtime = true; + } + } + + if (!atomic16.get_rdatomic) + { + // MFC_RdAtomicStat must have been read, otherwise GETLLAR may not be executed (according to HW tests) + break_putllc16(21, atomic16.discard()); + } + + atomic16.put_pc = pos; + atomic16.put_active = true; + } + + break; + } + default: + { + break_putllc16(4, atomic16.discard()); + break; + } + } + } + else + { + break_putllc16(5, atomic16.discard()); + } + + if (!atomic16.active) + { + // Propagate failure + for (auto& atm : atomic16_all) + { + if (atm.second.active && atm.second.put_pc == pos) + { + break_putllc16(31, atm.second.discard()); + } + } + } + + break; + } + case MFC_EAH: + case SPU_WrDec: + case SPU_WrSRR0: + case SPU_WrEventAck: + case SPU_Set_Bkmk_Tag: + case SPU_PM_Start_Ev: + case SPU_PM_Stop_Ev: + case MFC_WrTagMask: + //case MFC_WrTagUpdate: // Technically correct to ignore but risky + break; + default: + { + break_putllc16(6, atomic16.discard()); + break; + } + } + + break; + } + + case spu_itype::RCHCNT: + case spu_itype::RDCH: + { + const bool is_read = type == spu_itype::RDCH; + bool invalidate = is_read; + + switch (op.ra) + { + case MFC_RdAtomicStat: + { + if (!is_read) + { + break; + } + + if (atomic16.active) + { + if (atomic16.put_active) + { + if (getllar_starts.contains(atomic16.lsa_pc) && getllar_starts[atomic16.lsa_pc]) + { + break_putllc16(24, atomic16.discard()); + break; + } + + const auto it = atomic16_all.find(pos); + + if (it == atomic16_all.end()) + { + // Fresh new pattern detected in a single code path + atomic16_all.emplace(pos, atomic16); + } + else if (it->second.active) + { + // Merge pattern attributes between different code paths, may cause detection of failures + atomic16_t& existing = it->second; + + if (existing.lsa_pc != atomic16.lsa_pc || existing.put_pc != atomic16.put_pc || existing.lsa != atomic16.lsa) + { + // Register twice + break_putllc16(22, atomic16.discard()); + break_putllc16(22, existing.discard()); + } + + if (existing.active && existing.ls_access && atomic16.ls_access && (!existing.ls.compare_with_mask_indifference(atomic16.ls, SPU_LS_MASK_1) || existing.ls_offs != atomic16.ls_offs)) + { + // Conflicting loads with stores in more than one code path + break_putllc16(27, atomic16.set_invalid_ls(existing.ls_write || atomic16.ls_write)); + + if (!atomic16.active) + { + existing.active = false; + } + } + + if (existing.active && (existing.ls_write || atomic16.ls_write) && (existing.ls_invalid || atomic16.ls_invalid)) + { + // Conflicting loads with stores in more than one code path + break_putllc16(33, atomic16.discard()); + existing.active = false; + existing.ls_invalid = true; + } + + if (existing.active && !existing.ls_access && atomic16.ls_access) + { + // Propagate LS access + existing.ls = atomic16.ls; + existing.ls_offs = atomic16.ls_offs; + } + + existing.ls_write |= atomic16.ls_write; + existing.ls_invalid |= atomic16.ls_invalid; + existing.ls_access |= atomic16.ls_access; + existing.mem_count = std::max(existing.mem_count, atomic16.mem_count); + existing.select_16_or_0_at_runtime |= atomic16.select_16_or_0_at_runtime; + } + + atomic16.discard(); + } + else if (!atomic16.get_rdatomic) + { + atomic16.get_rdatomic = true; + + // Go above and beyond and also set the constant for it + set_const_value(op.rt, MFC_GETLLAR_SUCCESS); + invalidate = false; + } + } + + break; + } + // Let's be safe here and no not allow multi-threaded communications + case SPU_WrOutMbox: + case 
SPU_WrOutIntrMbox: + case SPU_RdSigNotify1: + case SPU_RdSigNotify2: + case SPU_RdInMbox: + //case SPU_RdEventStat: + { + if (is_read) + { + break_putllc16(28, atomic16.discard()); + } + else + { + break_putllc16(29, atomic16.discard()); + } + + break; + } + default: + { + break; + } + } + + if (invalidate) + { + unconst(op.rt); + } + + break; + } + case spu_itype::STQR: + case spu_itype::LQR: + { + const bool is_store = type == spu_itype::STQR; + + if (atomic16.active) + { + atomic16.mem_count++; + + // Do not clear lower 16 bytes addressing because the program can move on 4-byte basis + const u32 offs = spu_branch_target(pos - result.lower_bound, op.si16); + + if (atomic16.lsa.is_const() && [&]() + { + bool hack = false; + + if (offs % 16 == 0 && (pos - result.lower_bound + op.si16 * 4) == offs) + { + const u32 reservation_bound = utils::align(atomic16.lsa.value + 1, 128); + const u32 min_offs = offs; + + // Hack: assume there is no overflow in relative instruction offset + // Thus, use instruction position + offset as a lower bound for reservation access + if (min_offs >= reservation_bound) + { + spu_log.success("STQR/LQR Atomic Loop Hack: abs_pos=0x%x, abs=0x%x, i16*4=0x%x, ls_bound=0x%x", offs, pos + op.si16 * 4, op.si16 * 4, reservation_bound); + hack = true; + } + } + + return hack; + }()) + { + // Ignore memory access in this case + } + else if (atomic16.ls_invalid && is_store) + { + break_putllc16(35, atomic16.set_invalid_ls(is_store)); + } + else if (atomic16.ls_access && atomic16.ls != start_program_count) + { + break_putllc16(7, atomic16.set_invalid_ls(is_store)); + } + else if (atomic16.ls_access && offs != atomic16.ls_offs) + { + if (atomic16.ls_offs.compare_with_mask_indifference(offs, SPU_LS_MASK_1)) + { + atomic16.ls_write |= is_store; + } + else + { + // Sad + break_putllc16(8, atomic16.set_invalid_ls(is_store)); + } + } + else + { + atomic16.ls = start_program_count; + atomic16.ls_offs = reg_state_t::from_value(offs); + atomic16.ls_pc_rel = true; + atomic16.ls_write |= is_store; + atomic16.ls_access = true; + } + + // atomic16.ls_reg[offs % 128 / 16] = start_program_count; + // atomic16.ls_offs[offs % 128 / 16] = offs; + } + + if (is_store) + { + break; + } + + // Unconst + unconst(op.rt); + break; + } + + case spu_itype::STQX: + case spu_itype::LQX: + { + const bool is_store = type == spu_itype::STQX; + + if (atomic16.active) + { + atomic16.mem_count++; + + auto ra = get_reg(op.ra); + ra.value &= SPU_LS_MASK_1; + auto rb = get_reg(op.rb); + rb.value &= SPU_LS_MASK_1; + + const u32 const_flags = u32{ra.is_const()} + u32{rb.is_const()}; + + switch (const_flags) + { + case 2: + { + auto add_res = ra; + add_res.value += rb.value; + add_res.value &= SPU_LS_MASK_16; + add_res.tag = umax; + + if (atomic16.lsa.unequal_with_mask_indifference(add_res, SPU_LS_MASK_128)) + { + // Unrelated, ignore + } + else if (atomic16.ls_invalid && is_store) + { + break_putllc16(20, atomic16.set_invalid_ls(is_store)); + } + else if (atomic16.ls_access && add_res != atomic16.ls) + { + if (atomic16.ls.unequal_with_mask_indifference(add_res, SPU_LS_MASK_128) && atomic16.ls_offs == 0) + { + // Ok + } + else if (atomic16.ls_pc_rel) + { + break_putllc16(8, atomic16.set_invalid_ls(is_store)); + } + else + { + // Sad + break_putllc16(9, atomic16.set_invalid_ls(is_store)); + } + } + else + { + atomic16.ls = reg_state_t::from_value(add_res.value); + atomic16.ls_offs = reg_state_t::from_value(0); + atomic16.ls_pc_rel = false; + atomic16.ls_write |= is_store; + atomic16.ls_access = true; + } + + break; + } 
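+						// const_flags == 1: exactly one of RA/RB is a known constant;
+						// treat it as the byte offset and the other register as the base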
+ case 1: + { + const auto& state = ra.is_const() ? rb : ra; + const auto& _lsa = atomic16.lsa; + const u32 offs = (ra.is_const() ? ra.value : rb.value) & SPU_LS_MASK_1; + const u32 abs_diff = calculate_absolute_ls_difference(offs, 0); + + if ((_lsa.unequal_with_mask_indifference(state, SPU_LS_MASK_128) && offs == 0) || + (_lsa.compare_with_mask_indifference(state, SPU_LS_MASK_1) && abs_diff >= 128u) || + (_lsa.compare_with_mask_indifference(state, SPU_LS_MASK_128) && abs_diff >= 256u) + ) + { + // We already know it's an unrelated load/store + // The reason for SPU_LS_SIZE - 128 check is that in case LSA is not aligned, it detects the possible wraparound + } + else if (atomic16.ls_invalid && is_store) + { + break_putllc16(23, atomic16.set_invalid_ls(is_store)); + } + else if (atomic16.ls_access && atomic16.ls != state) + { + if (atomic16.ls.unequal_with_mask_indifference(state, SPU_LS_MASK_128) && offs == 0) + { + // Ok + } + else if (atomic16.ls_pc_rel) + { + break_putllc16(36, atomic16.set_invalid_ls(is_store)); + } + else + { + // Sad + break_putllc16(11, atomic16.set_invalid_ls(is_store)); + } + } + else if (atomic16.ls_access) + { + ensure(!atomic16.ls.is_const()); + + if (atomic16.ls_offs.compare_with_mask_indifference(offs, SPU_LS_MASK_1)) + { + // Ok + atomic16.ls_write |= is_store; + } + else if (atomic16.ls_offs.is_const() && atomic16.ls_offs.value / 16 == offs / 16 && state.get_known_zeroes() % 16 >= std::max(offs % 16, atomic16.ls_offs.value % 16)) + { + // For special case observed in games (offset cannot cause the address to roll over the next 16 bytes) + atomic16.ls_write |= is_store; + } + else + { + break_putllc16(12, atomic16.set_invalid_ls(is_store)); + } + } + else + { + atomic16.ls = state; + atomic16.ls_offs = reg_state_t::from_value(offs); + atomic16.ls_pc_rel = false; + atomic16.ls_write |= is_store; + atomic16.ls_access = true; + } + + break; + } + case 0: + { + const bool is_ra_first = atomic16.ls_access ? ra == atomic16.ls : op.ra <= op.rb; + + const auto& state1 = is_ra_first ? ra : rb; + const auto& state2 = is_ra_first ? 
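+						// canonical operand order: prefer the register already tracked
+						// as the base, else order by register index, so the ls/ls_offs
+						// comparison below is insensitive to operand order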
rb : ra; + + if (atomic16.ls_access && (atomic16.ls != state1 || atomic16.ls_offs != state2)) + { + if (atomic16.ls_pc_rel) + { + break_putllc16(32, atomic16.set_invalid_ls(is_store)); + } + else + { + // Sad + break_putllc16(13, atomic16.set_invalid_ls(is_store)); + } + } + else + { + atomic16.ls = state1; + atomic16.ls_offs = state2; + atomic16.ls_pc_rel = false; + atomic16.ls_write |= is_store; + atomic16.ls_access = true; + } + + break; + } + default: fmt::throw_exception("Unreachable!"); + } + } + + if (is_store) + { + break; + } + + // Unconst + unconst(op.rt); + break; + } + case spu_itype::STQA: + case spu_itype::LQA: + { + const bool is_store = type == spu_itype::STQA; + + if (atomic16.active) + { + atomic16.mem_count++; + + const reg_state_t ca = reg_state_t::from_value(spu_ls_target(0, op.i16)); + + if (atomic16.lsa.unequal_with_mask_indifference(ca, SPU_LS_MASK_128)) + { + // We already know it's an unrelated load/store + } + else if (atomic16.ls_invalid && is_store) + { + break_putllc16(37, atomic16.set_invalid_ls(is_store)); + } + else if (atomic16.ls_access && ca != atomic16.ls) + { + if (atomic16.ls.unequal_with_mask_indifference(ca, SPU_LS_MASK_128) && atomic16.ls_offs == 0) + { + // Ok + } + else if (atomic16.ls_pc_rel) + { + break_putllc16(14, atomic16.set_invalid_ls(is_store)); + } + else + { + // Sad + break_putllc16(15, atomic16.set_invalid_ls(is_store)); + } + } + else + { + atomic16.ls = ca; + atomic16.ls_offs = reg_state_t::from_value(0); + atomic16.ls_pc_rel = false; + atomic16.ls_write |= is_store; + atomic16.ls_access = true; + } + } + + if (is_store) + { + break; + } + + // Unconst + unconst(op.rt); + break; + } + + case spu_itype::STQD: + case spu_itype::LQD: + { + const bool is_store = type == spu_itype::STQD; + + if (atomic16.active) + { + atomic16.mem_count++; + + auto ra = get_reg(op.ra); + const auto& _lsa = atomic16.lsa; + + ra.value = ra.is_const() ? spu_ls_target(ra.value, op.si10 * 4) : 0; + const u32 offs = ra.is_const() ? 
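+					// for a constant base the si10 displacement was already folded
+					// into ra.value above, so no separate offset remains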
0 : spu_ls_target(0, op.si10 * 4); + const u32 abs_diff = calculate_absolute_ls_difference(offs, 0); + const u32 const_flags = u32{ra.is_const()} + u32{atomic16.ls.is_const()}; + const u32 const_lsa_flags = u32{ra.is_const()} + u32{_lsa.is_const()}; + + if ((_lsa.unequal_with_mask_indifference(ra, SPU_LS_MASK_128) && offs == 0) || + (_lsa.compare_with_mask_indifference(ra, SPU_LS_MASK_1) && abs_diff >= 128u) || + (_lsa.compare_with_mask_indifference(ra, SPU_LS_MASK_128) && abs_diff >= 256u) + ) + { + // We already know it's an unrelated load/store + // The reason for SPU_LS_SIZE - 128 check is that in case LSA is not aligned, it detects the possible wraparound + } + else if (atomic16.ls_invalid && is_store) + { + break_putllc16(34, atomic16.set_invalid_ls(is_store)); + } + else if (atomic16.ls_access && atomic16.ls != ra) + { + if (atomic16.ls.unequal_with_mask_indifference(ra, SPU_LS_MASK_128) && (offs == 0 && atomic16.ls_offs == 0)) + { + // Ok + } + else if (atomic16.ls_pc_rel) + { + break_putllc16(16, atomic16.set_invalid_ls(is_store)); + } + else + { + // Sad + break_putllc16(17, atomic16.set_invalid_ls(is_store)); + } + } + else if (atomic16.ls_access) + { + if (atomic16.ls_offs.compare_with_mask_indifference(offs, SPU_LS_MASK_1)) + { + atomic16.ls_write |= is_store; + } + else if (atomic16.ls_offs.is_const() && atomic16.ls_offs.value / 16 == offs / 16 && ra.get_known_zeroes() % 16 >= std::max(offs % 16, atomic16.ls_offs.value % 16)) + { + // For special case observed in games (offset cannot cause the address to roll over the next 16 bytes) + atomic16.ls_write |= is_store; + } + else + { + break_putllc16(18, atomic16.set_invalid_ls(is_store)); + } + } + else + { + atomic16.ls = ra; + atomic16.ls_offs = reg_state_t::from_value(offs); + atomic16.ls_pc_rel = false; + atomic16.ls_write |= is_store; + atomic16.ls_access = true; + } + } + + if (type == spu_itype::STQD) + { + break; + } + + // Unconst + unconst(op.rt); + break; + } + + case spu_itype::HBR: + { + hbr_loc = spu_branch_target(pos, op.roh << 7 | op.rt); + const auto [af, av, at, ao, az] = get_reg(op.ra); + hbr_tg = af & vf::is_const && !op.c ? 
av & 0x3fffc : -1; + break; + } + + case spu_itype::HBRA: + { + hbr_loc = spu_branch_target(pos, op.r0h << 7 | op.rt); + hbr_tg = spu_branch_target(0x0, op.i16); + break; + } + + case spu_itype::HBRR: + { + hbr_loc = spu_branch_target(pos, op.r0h << 7 | op.rt); + hbr_tg = spu_branch_target(pos, op.i16); + break; + } + + case spu_itype::IL: + { + set_const_value(op.rt, op.si16); + break; + } + case spu_itype::ILA: + { + set_const_value(op.rt, op.i18); + break; + } + case spu_itype::ILH: + { + set_const_value(op.rt, op.i16 << 16 | op.i16); + break; + } + case spu_itype::ILHU: + { + set_const_value(op.rt, op.i16 << 16); + break; + } + case spu_itype::IOHL: + { + const auto rt = get_reg(op.rt); + inherit_const_mask_value(op.rt, rt, op.i16, 0); + break; + } + case spu_itype::ORI: + { + if (!op.si10) + { + move_reg(op.rt, op.ra); + break; + } + + const auto ra = get_reg(op.ra); + inherit_const_mask_value(op.rt, ra, op.si10, 0); + break; + } + case spu_itype::OR: + { + if (op.ra == op.rb) + { + move_reg(op.rt, op.ra); + break; + } + + const auto [af, av, at, ao, az] = get_reg(op.ra); + const auto [bf, bv, _2, _4, _6] = get_reg(op.rb); + inherit_const_value(op.rt, af & bf, bv | av); + break; + } + case spu_itype::XORI: + { + if (!op.si10) + { + move_reg(op.rt, op.ra); + break; + } + + const auto [af, av, at, ao, az] = get_reg(op.ra); + inherit_const_value(op.rt, af, av ^ op.si10); + break; + } + case spu_itype::XOR: + { + if (op.ra == op.rb) + { + set_const_value(op.rt, 0); + break; + } + + const auto [af, av, at, ao, az] = get_reg(op.ra); + const auto [bf, bv, _2, _4, _6] = get_reg(op.rb); + inherit_const_value(op.rt, af & bf, bv ^ av); + break; + } + case spu_itype::NOR: + { + const auto [af, av, at, ao, az] = get_reg(op.ra); + const auto [bf, bv, _2, _4, _6] = get_reg(op.rb); + inherit_const_value(op.rt, af & bf, ~(bv | av)); + break; + } + case spu_itype::ANDI: + { + const auto ra = get_reg(op.ra); + inherit_const_mask_value(op.rt, ra, 0, ~op.si10); + break; + } + case spu_itype::AND: + { + if (op.ra == op.rb) + { + move_reg(op.rt, op.ra); + break; + } + + const auto [af, av, at, ao, az] = get_reg(op.ra); + const auto [bf, bv, _2, _4, _6] = get_reg(op.rb); + inherit_const_value(op.rt, af & bf, bv & av); + break; + } + case spu_itype::AI: + { + if (!op.si10) + { + move_reg(op.rt, op.ra); + break; + } + + const auto ra = get_reg(op.ra); + const auto [af, av, at, ao, az] = ra; + + inherit_const_value(op.rt, af, av + op.si10); + + if (u32 mask = ra.get_known_zeroes() & ~op.si10; mask & 1) + { + // Added zeroes are always zeroes which comes in handy later + inherit_const_mask_value(op.rt, vregs[op.rt], 0, (1u << std::countr_one(mask)) - 1); + } + + break; + } + case spu_itype::A: + { + const auto ra = get_reg(op.ra); + const auto rb = get_reg(op.rb); + + const auto [af, av, at, ao, az] = ra; + const auto [bf, bv, bt, bo, bz] = rb; + + inherit_const_value(op.rt, af & bf, bv + av); + + if (u32 mask = ra.get_known_zeroes() & rb.get_known_zeroes(); mask & 1) + { + // Added zeroes are always zeroes which comes in handy later + inherit_const_mask_value(op.rt, vregs[op.rt], 0, (1u << std::countr_one(mask)) - 1); + } + + break; + } + case spu_itype::SFI: + { + const auto [af, av, at, ao, az] = get_reg(op.ra); + inherit_const_value(op.rt, af, op.si10 - av); + break; + } + case spu_itype::SF: + { + const auto ra = get_reg(op.ra); + const auto rb = get_reg(op.rb); + + const auto [af, av, at, ao, az] = ra; + const auto [bf, bv, bt, bo, bz] = rb; + + inherit_const_value(op.rt, af & bf, bv - av); + + if (u32 
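+				// e.g. subtracting two 16-byte-aligned values stays 16-byte aligned;
+				// the mask below records those guaranteed low zero bits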
mask = ra.get_known_zeroes() & rb.get_known_zeroes(); mask & 1) + { + // Subtracted zeroes are always zeroes which comes in handy later + inherit_const_mask_value(op.rt, vregs[op.rt], 0, (1u << std::countr_one(mask)) - 1); + } + + break; + } + case spu_itype::FSMBI: + { + const u32 mask = (op.i16 >> 12); + + const u32 value = (mask & 1 ? 0xff : 0) | + (mask & 2 ? 0xff00 : 0) | + (mask & 4 ? 0xff0000 : 0) | + (mask & 8 ? 0xff000000u : 0); + + set_const_value(op.rt, value); + break; + } + case spu_itype::ROTMI: + { + if ((0 - op.i7) & 0x20) + { + set_const_value(op.rt, 0); + break; + } + + if (!op.i7) + { + move_reg(op.rt, op.ra); + break; + } + + const auto [af, av, at, ao, az] = get_reg(op.ra); + inherit_const_value(op.rt, af, av >> ((0 - op.i7) & 0x1f)); + break; + } + case spu_itype::SHLI: + { + if (op.i7 & 0x20) + { + set_const_value(op.rt, 0); + break; + } + + if (!op.i7) + { + move_reg(op.rt, op.ra); + break; + } + + const auto [af, av, at, ao, az] = get_reg(op.ra); + inherit_const_value(op.rt, af, av << (op.i7 & 0x1f)); + break; + } + case spu_itype::SELB: + { + const auto ra = get_reg(op.ra); + const auto rb = get_reg(op.rb); + + // Ignore RC, perform a value merge which also respect bitwise information + vregs[op.rt4] = ra.merge(rb); + break; + } + case spu_itype::SHLQBYI: + { + if (op.i7 & 0x10) + { + set_const_value(op.rt, 0); + break; + } + + if (!op.i7) + { + move_reg(op.rt, op.ra); + break; + } + + [[fallthrough]]; + } + default: + { + // Make unknown value + if (!(type & spu_itype::zregmod)) + { + const u32 op_rt = type & spu_itype::_quadrop ? +op.rt4 : +op.rt; + unconst(op_rt); + } + + break; + } + } + + if (m_targets.count(pos)) + { + for (u32 next_target : ::at32(m_targets, pos)) + { + add_block(next_target); + } + + next_block(); + } + } + // Fill block info for (auto& pred : m_preds) { @@ -4414,6 +6382,96 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s } } + std::string func_hash; + if (!result.data.empty()) + { + sha1_context ctx; + u8 output[20]{}; + + sha1_starts(&ctx); + sha1_update(&ctx, reinterpret_cast(result.data.data()), result.data.size() * 4); + sha1_finish(&ctx, output); + fmt::append(func_hash, "%s", fmt::base57(output)); + } + + for (const auto& [pc_commited, pattern] : atomic16_all) + { + if (!pattern.active) + { + continue; + } + + if (getllar_starts.contains(pattern.lsa_pc) && getllar_starts[pattern.lsa_pc]) + { + continue; + } + + auto& stats = g_fxo->get(); + + if (!pattern.ls_write) + { + spu_log.success("PUTLLC0 Pattern Detected! 
(put_pc=0x%x, %s) (putllc0=%d, putllc16+0=%d, all=%d)", pattern.put_pc, func_hash, ++stats.nowrite, ++stats.single, +stats.all); + add_pattern(false, inst_attr::putllc0, pattern.put_pc - lsa); + continue; + } + + ensure(!pattern.ls_invalid); + + union putllc16_info + { + u32 data; + bf_t type; + bf_t runtime16_select; + bf_t reg; + bf_t off18; + bf_t reg2; + } value{}; + + enum : u32 + { + v_const = 0, + v_relative = 1, + v_reg_offs = 2, + v_reg2 = 3, + }; + + value.runtime16_select = pattern.select_16_or_0_at_runtime; + value.reg = s_reg_max; + + if (pattern.ls.is_const()) + { + ensure(pattern.reg == s_reg_max && pattern.reg2 == s_reg_max && pattern.ls_offs.is_const(), "Unexpected register usage"); + value.type = v_const; + value.off18 = pattern.ls.value & SPU_LS_MASK_1; + } + else if (pattern.ls == start_program_count) + { + ensure(pattern.ls_offs.is_const(), "Unexpected register2 usage"); + value.type = v_relative; + value.off18 = pattern.ls_offs.value & SPU_LS_MASK_1; + } + else if (pattern.ls_offs.is_const()) + { + ensure(pattern.reg != s_reg_max, "Not found register usage"); + value.type = v_reg_offs; + value.reg = pattern.reg; + value.off18 = pattern.ls_offs.value; + } + else + { + ensure(pattern.reg != s_reg_max, "Not found register usage"); + ensure(pattern.reg2 != s_reg_max, "Not found register2 usage"); + value.type = v_reg2; + value.reg = pattern.reg; + value.reg2 = pattern.reg2; + } + + add_pattern(false, inst_attr::putllc16, pattern.put_pc - result.entry_point, value.data); + + spu_log.success("PUTLLC16 Pattern Detected! (mem_count=%d, put_pc=0x%x, pc_rel=%d, offset=0x%x, const=%u, two_regs=%d, reg=%u, runtime=%d, 0x%x-%s) (putllc0=%d, putllc16+0=%d, all=%d)" + , pattern.mem_count, pattern.put_pc, value.type == v_relative, value.off18, value.type == v_const, value.type == v_reg2, value.reg, value.runtime16_select, entry_point, func_hash, +stats.nowrite, ++stats.single, +stats.all); + } + if (result.data.empty()) { // Blocks starting from 0x0 or invalid instruction won't be compiled, may need special interpreter fallback @@ -5212,6 +7270,275 @@ std::unique_ptr spu_recompiler_base::make_fast_llvm_recompi return std::make_unique(); } +std::array& block_reg_info::evaluate_start_state(const std::map>& map) +{ + if (!has_true_state) + { + std::basic_string been_there; + + struct iterator_info + { + u32 block_pc = SPU_LS_SIZE; + + struct state_t + { + u32 block_pc = SPU_LS_SIZE; + std::array reg_state; + bool disconnected = false; + bool state_written = false; + }; + + std::vector state_prev; + usz completed = 0; + usz parent_iterator_index = umax; + }; + + std::vector info_queue; + + iterator_info first_entry{pc, {}, 0, umax}; + info_queue.emplace_back(std::move(first_entry)); + + // info_queue may grow + for (usz qi = 0; qi < info_queue.size();) + { + const auto it = std::addressof(info_queue[qi]); + ensure(qi == info_queue.size() - 1); + + auto& cur_node = ::at32(map, it->block_pc); + + ensure(it->parent_iterator_index == qi - 1); + + if (cur_node->has_true_state) + { + // Evaluted somewhen before + if (qi != 0) + { + ensure(!been_there.empty()); + been_there.pop_back(); + info_queue.pop_back(); + qi--; + continue; + } + else + { + break; + } + } + + if (it->state_prev.empty()) + { + // Build the list here to avoid code duplication + const usz real_size = cur_node->prev_nodes.size(); + + if (real_size) + { + it->state_prev.resize(real_size); + + for (usz i = 0; i < real_size; i++) + { + it->state_prev[i].block_pc = cur_node->prev_nodes[i].prev_pc; + } + } + } + + const usz 
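+			// 'completed' counts predecessors already visited; once it reaches
+			// state_prev.size(), every input state is available for the merge below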
next_entry_idx = it->completed; + + if (next_entry_idx == it->state_prev.size()) + { + // Result merge from all predecessors + + // Flag to mark the state as resolved + bool is_all_resolved = true; + bool has_past_state = false; + + for (usz bi = 0; bi < it->state_prev.size(); bi++) + { + if (it->state_prev[bi].disconnected) + { + is_all_resolved = false; + continue; + } + + has_past_state = true; + + const u32 node_pc = it->state_prev[bi].block_pc; + const auto& node = ::at32(map, node_pc); + + // Check if the node is resolved + if (!node->has_true_state) + { + // Assume this block cannot be resolved at the moment + is_all_resolved = false; + break; + } + } + + std::array temp; + + if (qi == 0) + { + // TODO: First block is always resolved here, but this logic can be improved to detect more cases of opportunistic resolving + is_all_resolved = true; + } + + auto& res_state = is_all_resolved ? cur_node->start_reg_state : temp; + + for (usz bi = 0; bi < it->state_prev.size(); bi++) + { + if (it->state_prev[bi].disconnected) + { + // Loop state, even if not ignored for a million times the result would still be the same + // So ignore it + continue; + } + + std::array* arg_state{}; + const auto& node = ::at32(map, it->state_prev[bi].block_pc); + + if (node->has_true_state) + { + // State is resolved, use the entry's state + arg_state = std::addressof(node->end_reg_state); + } + else + { + // Use accumulated state from one path of code history + arg_state = std::addressof(it->state_prev[bi].reg_state); + ensure(it->state_prev[bi].state_written); + } + + if (bi == 0) + { + res_state = *arg_state; + } + else + { + merge(res_state, res_state, *arg_state); + } + } + + std::array* result_storage{}; + + if (is_all_resolved) + { + // Complete state of this block + result_storage = std::addressof(cur_node->end_reg_state); + cur_node->has_true_state = true; + } + else + { + // Patch incomplete state into saved state entry of parent block + ensure(it->parent_iterator_index != qi); + ensure(it->parent_iterator_index != umax); + + auto& state = ::at32(info_queue, it->parent_iterator_index).state_prev; + + for (usz i = 0;; i++) + { + ensure(i < state.size()); + + if (state[i].block_pc == it->block_pc) + { + result_storage = std::addressof(state[i].reg_state); + state[i].state_written = true; + break; + } + } + } + + // Stack the newer state on top of the old (if exists) + if (has_past_state) + { + build_on_top_of(*result_storage, cur_node->addend_reg_state, res_state); + } + else + { + *result_storage = cur_node->addend_reg_state; + } + + if (qi != 0) + { + ensure(!been_there.empty()); + been_there.pop_back(); + info_queue.pop_back(); + qi--; + } + else + { + ensure(cur_node->has_true_state); + break; + } + } + else + { + const u32 prev_pc = cur_node->prev_nodes[it->completed++].prev_pc; + const auto& prev_node = ::at32(map, prev_pc); + + // Queue for resolving if needed + if (!prev_node->has_true_state) + { + const bool loop_detected = been_there.find_first_of(prev_pc) != umax; + const bool avoid_extensive_analysis = qi >= 500; + + if (!loop_detected && !avoid_extensive_analysis) + { + info_queue.emplace_back(iterator_info{prev_pc, {}, 0, qi}); + been_there.push_back(prev_pc); + qi++; + } + else + { + auto& state = it->state_prev; + + for (usz i = 0;; i++) + { + ensure(i < state.size()); + + if (state[i].block_pc == prev_pc) + { + // Loop state, even if not ignored for a million times the result would be the same + // This is similar to multiplying zero a million times + // This is true at least for now, 
that any register difference is considered an unknown state change + // So ignore it + state[i].disconnected = true; + break; + } + } + + // Repeat + // qi += 0; + } + } + else + { + // Repeat + // qi += 0; + } + } + } + + ensure(has_true_state); + } + + walkby_state = start_reg_state; + return walkby_state; +} + +void spu_recompiler_base::add_pattern(bool fill_all, inst_attr attr, u32 start, u32 end) +{ + if (end == umax) + { + end = start; + } + + m_patterns[start] = pattern_info{utils::address_range::start_end(start, end)}; + + for (u32 i = start; i <= (fill_all ? end : start); i += 4) + { + m_inst_attrs[i / 4] = attr; + } +} + extern std::string format_spu_func_info(u32 addr, cpu_thread* spu) { spu_thread* _spu = static_cast(spu); diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index af44d05a21..b11f439fda 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -5,6 +5,7 @@ #include "Emu/system_config.h" #include "Emu/IdManager.h" #include "Emu/Cell/timers.hpp" +#include "Emu/Memory/vm_reservation.h" #include "Crypto/sha1.h" #include "Utilities/JIT.h" @@ -535,6 +536,14 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator return m_ir->CreateGEP(get_type(), base, m_ir->getInt64(offset)); } + template + llvm::Value* _ptr(llvm::Value* base, llvm::Value* offset) + { + const auto off = m_ir->CreateGEP(get_type(), base, offset); + const auto ptr = m_ir->CreateBitCast(off, get_type()); + return ptr; + } + template llvm::Value* spu_ptr(Args... offset_args) { @@ -1079,6 +1088,273 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_ir->SetInsertPoint(_body); } + void putllc16_pattern(const spu_program& prog, utils::address_range range) + { + // Prevent store elimination + m_block->store_context_ctr[s_reg_mfc_eal]++; + m_block->store_context_ctr[s_reg_mfc_lsa]++; + m_block->store_context_ctr[s_reg_mfc_tag]++; + m_block->store_context_ctr[s_reg_mfc_size]++; + + static const auto on_fail = [](spu_thread* _spu, u32 addr) + { + if (const u32 raddr = _spu->raddr) + { + // Last check for event before we clear the reservation + if (~_spu->ch_events.load().events & SPU_EVENT_LR) + { + if (raddr == addr) + { + _spu->set_events(SPU_EVENT_LR); + } + else + { + _spu->get_events(SPU_EVENT_LR); + } + } + + _spu->raddr = 0; + } + }; + + const union putllc16_info + { + u32 data; + bf_t type; + bf_t runtime16_select; + bf_t reg; + bf_t off18; + bf_t reg2; + } info = std::bit_cast(range.end); + + enum : u32 + { + v_const = 0, + v_relative = 1, + v_reg_offs = 2, + v_reg2 = 3, + }; + + const auto _raddr_match = llvm::BasicBlock::Create(m_context, "__raddr_match", m_function); + const auto _lock_success = llvm::BasicBlock::Create(m_context, "__putllc16_lock", m_function); + const auto _begin_op = llvm::BasicBlock::Create(m_context, "__putllc16_begin", m_function); + const auto _repeat_lock = llvm::BasicBlock::Create(m_context, "__putllc16_repeat", m_function); + const auto _repeat_lock_fail = llvm::BasicBlock::Create(m_context, "__putllc16_lock_fail", m_function); + const auto _success = llvm::BasicBlock::Create(m_context, "__putllc16_success", m_function); + const auto _inc_res = llvm::BasicBlock::Create(m_context, "__putllc16_inc_resv", m_function); + const auto _inc_res_unlocked = llvm::BasicBlock::Create(m_context, "__putllc16_inc_resv_unlocked", m_function); + const auto _success_and_unlock = llvm::BasicBlock::Create(m_context, "__putllc16_succ_unlock", m_function); + const 
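+		// (failure and exit blocks; all paths converge at __putllc16_final)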
+		const auto _raddr_match = llvm::BasicBlock::Create(m_context, "__raddr_match", m_function);
+		const auto _lock_success = llvm::BasicBlock::Create(m_context, "__putllc16_lock", m_function);
+		const auto _begin_op = llvm::BasicBlock::Create(m_context, "__putllc16_begin", m_function);
+		const auto _repeat_lock = llvm::BasicBlock::Create(m_context, "__putllc16_repeat", m_function);
+		const auto _repeat_lock_fail = llvm::BasicBlock::Create(m_context, "__putllc16_lock_fail", m_function);
+		const auto _success = llvm::BasicBlock::Create(m_context, "__putllc16_success", m_function);
+		const auto _inc_res = llvm::BasicBlock::Create(m_context, "__putllc16_inc_resv", m_function);
+		const auto _inc_res_unlocked = llvm::BasicBlock::Create(m_context, "__putllc16_inc_resv_unlocked", m_function);
+		const auto _success_and_unlock = llvm::BasicBlock::Create(m_context, "__putllc16_succ_unlock", m_function);
+		const auto _fail = llvm::BasicBlock::Create(m_context, "__putllc16_fail", m_function);
+		const auto _fail_and_unlock = llvm::BasicBlock::Create(m_context, "__putllc16_unlock", m_function);
+		const auto _final = llvm::BasicBlock::Create(m_context, "__putllc16_final", m_function);
+
+		const auto _eal = (get_reg_fixed(s_reg_mfc_eal) & -128).eval(m_ir);
+		const auto _raddr = m_ir->CreateLoad(get_type<u32>(), spu_ptr(&spu_thread::raddr));
+
+		m_ir->CreateCondBr(m_ir->CreateAnd(m_ir->CreateICmpEQ(_eal, _raddr), m_ir->CreateIsNotNull(_raddr)), _raddr_match, _fail, m_md_likely);
+		m_ir->SetInsertPoint(_raddr_match);
+
+		value_t<u32> eal_val;
+		eal_val.value = _eal;
+
+		auto get_reg32 = [&](u32 reg)
+		{
+			if (get_reg_type(reg) != get_type())
+			{
+				return get_reg_fixed(reg, get_type());
+			}
+
+			return extract(get_reg_fixed(reg), 3).eval(m_ir);
+		};
+
+		const auto _lsa = (get_reg_fixed(s_reg_mfc_lsa) & 0x3ff80).eval(m_ir);
+
+		llvm::Value* dest{};
+
+		if (info.type == v_const)
+		{
+			dest = m_ir->getInt32(info.off18);
+		}
+		else if (info.type == v_relative)
+		{
+			dest = m_ir->CreateAnd(get_pc(spu_branch_target(info.off18 + m_base)), 0x3fff0);
+		}
+		else if (info.type == v_reg_offs)
+		{
+			dest = m_ir->CreateAnd(m_ir->CreateAdd(get_reg32(info.reg), m_ir->getInt32(info.off18)), 0x3fff0);
+		}
+		else
+		{
+			dest = m_ir->CreateAnd(m_ir->CreateAdd(get_reg32(info.reg), get_reg32(info.reg2)), 0x3fff0);
+		}
+
+		const auto diff = m_ir->CreateZExt(m_ir->CreateSub(dest, _lsa), get_type<u64>());
+
+		const auto _new = m_ir->CreateAlignedLoad(get_type<u128>(), _ptr(m_lsptr, dest), llvm::MaybeAlign{16});
+		const auto _rdata = m_ir->CreateAlignedLoad(get_type<u128>(), _ptr(spu_ptr(&spu_thread::rdata), m_ir->CreateAnd(diff, 0x7f)), llvm::MaybeAlign{16});
+
+		const bool is_accurate_op = false && !!g_cfg.core.spu_accurate_reservations;
+
+		const auto compare_data_change_res = is_accurate_op ? m_ir->getTrue() : m_ir->CreateICmpNE(_new, _rdata);
+
+		if (info.runtime16_select)
+		{
+			m_ir->CreateCondBr(m_ir->CreateAnd(m_ir->CreateICmpULT(diff, m_ir->getInt64(128)), compare_data_change_res), _begin_op, _inc_res, m_md_likely);
+		}
+		else
+		{
+			m_ir->CreateCondBr(compare_data_change_res, _begin_op, _inc_res, m_md_unlikely);
+		}
+
+		m_ir->SetInsertPoint(_begin_op);
+
+		// Touch memory (on the opposite side of the page)
+		m_ir->CreateAtomicRMW(llvm::AtomicRMWInst::Or, _ptr(m_memptr, m_ir->CreateXor(_eal, 4096 / 2)), m_ir->getInt8(0), llvm::MaybeAlign{16}, llvm::AtomicOrdering::SequentiallyConsistent);
+
+		const auto rptr = _ptr(m_ir->CreateLoad(get_type<u8*>(), spu_ptr(&spu_thread::reserv_base_addr)), ((eal_val & 0xff80) >> 1).eval(m_ir));
+		const auto rtime = m_ir->CreateLoad(get_type<u64>(), spu_ptr(&spu_thread::rtime));
+
+		m_ir->CreateBr(_repeat_lock);
+		m_ir->SetInsertPoint(_repeat_lock);
+
+		const auto rval = m_ir->CreatePHI(get_type<u64>(), 2);
+		rval->addIncoming(rtime, _begin_op);
+
+		// Lock reservation
+		const auto cmp_res = m_ir->CreateAtomicCmpXchg(rptr, rval, m_ir->CreateOr(rval, 0x7f), llvm::MaybeAlign{16}, llvm::AtomicOrdering::SequentiallyConsistent, llvm::AtomicOrdering::SequentiallyConsistent);
+
+		m_ir->CreateCondBr(m_ir->CreateExtractValue(cmp_res, 1), _lock_success, _repeat_lock_fail, m_md_likely);
+
+		m_ir->SetInsertPoint(_repeat_lock_fail);
+
+		const auto last_rval = m_ir->CreateExtractValue(cmp_res, 0);
+		rval->addIncoming(last_rval, _repeat_lock_fail);
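+		// Failed CAS: retry while the reservation reads unlocked (low 7 bits clear, the
+		// counter merely moved on); if another writer holds the lock, the store fails.
+		// Under accurate reservations, retry only on a spurious failure (value unchanged).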
+		m_ir->CreateCondBr(is_accurate_op ? m_ir->CreateICmpEQ(last_rval, rval) : m_ir->CreateIsNull(m_ir->CreateAnd(last_rval, 0x7f)), _repeat_lock, _fail);
+
+		m_ir->SetInsertPoint(_lock_success);
+
+		// Commit 16 bytes compare-exchange
+		const auto sudo_ptr = _ptr(m_ir->CreateLoad(get_type<u8*>(), spu_ptr(&spu_thread::memory_sudo_addr)), _eal);
+
+		m_ir->CreateCondBr(
+			m_ir->CreateExtractValue(m_ir->CreateAtomicCmpXchg(_ptr(sudo_ptr, diff), _rdata, _new, llvm::MaybeAlign{16}, llvm::AtomicOrdering::SequentiallyConsistent, llvm::AtomicOrdering::SequentiallyConsistent), 1)
+			, _success_and_unlock
+			, _fail_and_unlock);
+
+		// Unlock and notify
+		m_ir->SetInsertPoint(_success_and_unlock);
+		m_ir->CreateAlignedStore(m_ir->CreateAdd(rval, m_ir->getInt64(128)), rptr, llvm::MaybeAlign{8});
+		call("atomic_wait_engine::notify_all", static_cast(atomic_wait_engine::notify_all), rptr);
+		m_ir->CreateBr(_success);
+
+		// Perform unlocked vm::reservation_update if no physical memory changes are needed
+		m_ir->SetInsertPoint(_inc_res);
+		const auto rptr2 = _ptr(m_ir->CreateLoad(get_type<u8*>(), spu_ptr(&spu_thread::reserv_base_addr)), ((eal_val & 0xff80) >> 1).eval(m_ir));
+
+		llvm::Value* old_val{};
+
+		if (is_accurate_op)
+		{
+			old_val = m_ir->CreateLoad(get_type<u64>(), spu_ptr(&spu_thread::rtime));
+		}
+		else
+		{
+			old_val = m_ir->CreateAlignedLoad(get_type<u64>(), rptr2, llvm::MaybeAlign{8});
+			m_ir->CreateCondBr(m_ir->CreateIsNotNull(m_ir->CreateAnd(old_val, 0x7f)), _success, _inc_res_unlocked);
+			m_ir->SetInsertPoint(_inc_res_unlocked);
+		}
+
+		const auto cmp_res2 = m_ir->CreateAtomicCmpXchg(rptr2, old_val, m_ir->CreateAdd(old_val, m_ir->getInt64(128)), llvm::MaybeAlign{16}, llvm::AtomicOrdering::SequentiallyConsistent, llvm::AtomicOrdering::SequentiallyConsistent);
+
+		if (is_accurate_op)
+		{
+			m_ir->CreateCondBr(m_ir->CreateExtractValue(cmp_res2, 1), _success, _fail);
+		}
+		else
+		{
+			m_ir->CreateBr(_success);
+		}
+
+		m_ir->SetInsertPoint(_success);
+		m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_SUCCESS), spu_ptr(&spu_thread::ch_atomic_stat));
+		m_ir->CreateStore(m_ir->getInt32(0), spu_ptr(&spu_thread::raddr));
+		m_ir->CreateBr(_final);
+
+		m_ir->SetInsertPoint(_fail_and_unlock);
+		m_ir->CreateAlignedStore(rval, rptr, llvm::MaybeAlign{8});
+		m_ir->CreateBr(_fail);
+
+		m_ir->SetInsertPoint(_fail);
+		call("PUTLLC16_fail", +on_fail, m_thread, _eal);
+		m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_FAILURE), spu_ptr(&spu_thread::ch_atomic_stat));
+		m_ir->CreateBr(_final);
+
+		m_ir->SetInsertPoint(_final);
+	}
+
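+	// PUTLLC pattern with no data changes: nothing is written to memory, so the
+	// reservation counter is simply bumped (or the attempt fails under accurate reservations).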
+	void putllc0_pattern(const spu_program& prog, utils::address_range range)
+	{
+		// Prevent store elimination
+		m_block->store_context_ctr[s_reg_mfc_eal]++;
+		m_block->store_context_ctr[s_reg_mfc_lsa]++;
+		m_block->store_context_ctr[s_reg_mfc_tag]++;
+		m_block->store_context_ctr[s_reg_mfc_size]++;
+
+		static const auto on_fail = [](spu_thread* _spu, u32 addr)
+		{
+			if (const u32 raddr = _spu->raddr)
+			{
+				// Last check for event before we clear the reservation
+				if (~_spu->ch_events.load().events & SPU_EVENT_LR)
+				{
+					if (raddr == addr)
+					{
+						_spu->set_events(SPU_EVENT_LR);
+					}
+					else
+					{
+						_spu->get_events(SPU_EVENT_LR);
+					}
+				}
+				_spu->raddr = 0;
+			}
+		};
+
+		const auto _next = llvm::BasicBlock::Create(m_context, "", m_function);
+		const auto _next0 = llvm::BasicBlock::Create(m_context, "", m_function);
+		const auto _fail = llvm::BasicBlock::Create(m_context, "", m_function);
+		const auto _final = llvm::BasicBlock::Create(m_context, "", m_function);
+
+		const auto _eal = (get_reg_fixed(s_reg_mfc_eal) & -128).eval(m_ir);
+		const auto _raddr = m_ir->CreateLoad(get_type<u32>(), spu_ptr(&spu_thread::raddr));
+
+		m_ir->CreateCondBr(m_ir->CreateAnd(m_ir->CreateICmpEQ(_eal, _raddr), m_ir->CreateIsNotNull(_raddr)), _next, _fail, m_md_likely);
+		m_ir->SetInsertPoint(_next);
+
+		value_t<u32> eal_val;
+		eal_val.value = _eal;
+
+		const auto rptr = _ptr(m_ir->CreateLoad(get_type<u8*>(), spu_ptr(&spu_thread::reserv_base_addr)), ((eal_val & 0xff80) >> 1).eval(m_ir));
+		const auto rval = m_ir->CreateLoad(get_type<u64>(), spu_ptr(&spu_thread::rtime));
+		m_ir->CreateCondBr(
+			m_ir->CreateExtractValue(m_ir->CreateAtomicCmpXchg(rptr, rval, m_ir->CreateAdd(rval, m_ir->getInt64(128)), llvm::MaybeAlign{16}, llvm::AtomicOrdering::SequentiallyConsistent, llvm::AtomicOrdering::SequentiallyConsistent), 1)
+			, _next0
+			, g_cfg.core.spu_accurate_reservations ? _fail : _next0); // Succeed unconditionally when reservations are not accurate
+
+		m_ir->SetInsertPoint(_next0);
+		//call("atomic_wait_engine::notify_all", static_cast(atomic_wait_engine::notify_all), rptr);
+		m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_SUCCESS), spu_ptr(&spu_thread::ch_atomic_stat));
+		m_ir->CreateBr(_final);
+
+		m_ir->SetInsertPoint(_fail);
+		call("PUTLLC0_fail", +on_fail, m_thread, _eal);
+		m_ir->CreateStore(m_ir->getInt64(spu_channel::bit_count | MFC_PUTLLC_FAILURE), spu_ptr(&spu_thread::ch_atomic_stat));
+		m_ir->CreateBr(_final);
+
+		m_ir->SetInsertPoint(_final);
+		m_ir->CreateStore(m_ir->getInt32(0), spu_ptr(&spu_thread::raddr));
+	}
+
 public:
 	spu_llvm_recompiler(u8 interp_magn = 0)
 		: spu_recompiler_base()
@@ -1622,6 +1898,26 @@ public:
 			else
 				m_next_op = func.data[(m_pos - start) / 4 + 1];
 
+			switch (m_inst_attrs[(m_pos - start) / 4])
+			{
+			case inst_attr::putllc0:
+			{
+				putllc0_pattern(func, m_patterns.at(m_pos - start).range);
+				continue;
+			}
+			case inst_attr::putllc16:
+			{
+				putllc16_pattern(func, m_patterns.at(m_pos - start).range);
+				continue;
+			}
+			case inst_attr::omit:
+			{
+				// TODO
+				continue;
+			}
+			default: break;
+			}
+
 			// Execute recompiler function (TODO)
 			(this->*decode(op))({op});
 		}
diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h
index ba1dfce412..31e6022de0 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.h
+++ b/rpcs3/Emu/Cell/SPURecompiler.h
@@ -2,6 +2,7 @@
 #include "Utilities/File.h"
 #include "Utilities/lockless.h"
+#include "Utilities/address_range.h"
 #include "SPUThread.h"
 #include
 #include
@@ -189,6 +190,72 @@ public:
 		interrupt_call,
 	};
 
+	// Value flags (TODO: only is_const is implemented)
+	enum class vf : u32
+	{
+		is_const,
+		is_mask,
+		is_rel,
+		is_null,
+
+		__bitset_enum_max
+	};
+
+	struct reg_state_t
+	{
+		bs_t<vf> flag{+vf::is_null};
+		u32 value{};
+		u32 tag = umax;
+		u32 known_ones{};
+		u32 known_zeroes{};
+
+		bool is_const() const;
+
+		bool operator&(vf to_test) const;
+
+		bool is_less_than(u32 imm) const;
+		bool operator==(const reg_state_t& r) const;
+		bool operator==(u32 imm) const;
+
+		// Compare equality but try to ignore changes in unmasked bits
+		bool compare_with_mask_indifference(const reg_state_t& r, u32 mask_bits) const;
+		bool compare_with_mask_indifference(u32 imm, u32 mask_bits) const;
+		bool unequal_with_mask_indifference(const reg_state_t& r, u32 mask_bits) const;
+
+		reg_state_t downgrade() const;
+		reg_state_t merge(const reg_state_t& rhs) const;
+		reg_state_t build_on_top_of(const reg_state_t& rhs) const;
+
+		u32 get_known_zeroes() const;
+		u32 get_known_ones() const;
+
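+		// A fresh unknown value receives a unique tag, so two unknowns compare equal
+		// only when they share the same origin; Count > 1 yields distinct unknown states.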
+		template <usz Count = 1>
+		static std::conditional_t<Count == 1, reg_state_t, std::array<reg_state_t, Count>> make_unknown() noexcept
+		{
+			if constexpr (Count == 1)
+			{
+				reg_state_t v{};
+				v.tag = alloc_tag();
+				v.flag = {};
+				return v;
+			}
+			else
+			{
+				std::array<reg_state_t, Count> result{};
+
+				for (reg_state_t& state : result)
+				{
+					state = make_unknown<1>();
+				}
+
+				return result;
+			}
+		}
+
+		static reg_state_t from_value(u32 value) noexcept;
+		static u32 alloc_tag(bool reset = false) noexcept;
+	};
+
 protected:
 	spu_runtime* m_spurt{};
@@ -298,6 +365,27 @@ protected:
 	// Sorted function info
 	std::map m_funcs;
 
+	// TODO: Add patterns
+	// Not a bitset, to allow more possibilities
+	enum class inst_attr : u8
+	{
+		none,
+		omit,
+		putllc16,
+		putllc0,
+	};
+
+	std::vector<inst_attr> m_inst_attrs;
+
+	struct pattern_info
+	{
+		utils::address_range range;
+	};
+
+	std::unordered_map<u32, pattern_info> m_patterns;
+
+	void add_pattern(bool fill_all, inst_attr attr, u32 start, u32 end = -1);
+
 private:
 	// For private use
 	std::bitset<0x10000> m_bits;
diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 10bc33fe4f..ad1d84ae93 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -3778,13 +3778,16 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 	if (raddr)
 	{
 		// Last check for event before we clear the reservation
-		if (raddr == addr)
+		if (~ch_events.load().events & SPU_EVENT_LR)
 		{
-			set_events(SPU_EVENT_LR);
-		}
-		else
-		{
-			get_events(SPU_EVENT_LR);
+			if (raddr == addr)
+			{
+				set_events(SPU_EVENT_LR);
+			}
+			else
+			{
+				get_events(SPU_EVENT_LR);
+			}
 		}
 	}
 
diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h
index 82cf4bc189..539617f30d 100644
--- a/rpcs3/Emu/Cell/SPUThread.h
+++ b/rpcs3/Emu/Cell/SPUThread.h
@@ -668,6 +668,8 @@ public:
 	// May be used by recompilers.
 	u8* memory_base_addr = vm::g_base_addr;
+	u8* memory_sudo_addr = vm::g_sudo_addr;
+	u8* reserv_base_addr = vm::g_reservations;
 
 	// General-Purpose Registers
 	std::array<v128, 128> gpr;