diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp index b47d10016f..4567bc53ba 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp @@ -45,9 +45,9 @@ void spu_recompiler::init() } } -spu_function_t spu_recompiler::compile(std::vector&& _func) +spu_function_t spu_recompiler::compile(spu_program&& _func) { - const u32 start0 = _func[0]; + const u32 start0 = _func.entry_point; const auto add_loc = m_spurt->add_empty(std::move(_func)); @@ -61,9 +61,9 @@ spu_function_t spu_recompiler::compile(std::vector&& _func) return add_loc->compiled; } - const std::vector& func = add_loc->data; + const spu_program& func = add_loc->data; - if (func[0] != start0) + if (func.entry_point != start0) { // Wait for the duplicate while (!add_loc->compiled) @@ -84,7 +84,7 @@ spu_function_t spu_recompiler::compile(std::vector&& _func) u8 output[20]; sha1_starts(&ctx); - sha1_update(&ctx, reinterpret_cast(func.data() + 1), func.size() * 4 - 4); + sha1_update(&ctx, reinterpret_cast(func.data.data()), func.data.size() * 4); sha1_finish(&ctx, output); be_t hash_start; @@ -168,18 +168,18 @@ spu_function_t spu_recompiler::compile(std::vector&& _func) u32 words_align = 8; // Start compilation - m_pos = func[0]; - m_base = func[0]; - m_size = ::size32(func) * 4 - 4; + m_pos = func.lower_bound; + m_base = func.entry_point; + m_size = ::size32(func.data) * 4; const u32 start = m_pos; const u32 end = start + m_size; // Create block labels - for (u32 i = 1; i < func.size(); i++) + for (u32 i = 0; i < func.data.size(); i++) { - if (func[i] && m_block_info[i - 1 + start / 4]) + if (func.data[i] && m_block_info[i + start / 4]) { - instr_labels[i * 4 - 4 + start] = c->newLabel(); + instr_labels[i * 4 + start] = c->newLabel(); } } @@ -211,7 +211,7 @@ spu_function_t spu_recompiler::compile(std::vector&& _func) for (u32 addr = starta, m = 1; addr < enda && m; addr += 4, m <<= 1) { // Filter out if out of range, or is a hole 
- if (addr >= start && addr < end && func[(addr - start) / 4 + 1]) + if (addr >= start && addr < end && func.data[(addr - start) / 4]) { result |= m; } @@ -226,7 +226,7 @@ spu_function_t spu_recompiler::compile(std::vector&& _func) // Skip holes at the beginning (giga only) for (u32 j = start; j < end; j += 4) { - if (!func[(j - start) / 4 + 1]) + if (!func.data[(j - start) / 4]) { starta += 4; } @@ -261,7 +261,7 @@ spu_function_t spu_recompiler::compile(std::vector&& _func) } else if (m_size == 8) { - c->mov(x86::rax, static_cast(func[2]) << 32 | func[1]); + c->mov(x86::rax, static_cast(func.data[1]) << 32 | func.data[0]); c->cmp(x86::rax, x86::qword_ptr(*ls, *pc0)); c->jnz(label_diff); @@ -272,7 +272,7 @@ spu_function_t spu_recompiler::compile(std::vector&& _func) } else if (m_size == 4) { - c->cmp(x86::dword_ptr(*ls, *pc0), func[1]); + c->cmp(x86::dword_ptr(*ls, *pc0), func.data[0]); c->jnz(label_diff); if (utils::has_avx()) @@ -351,7 +351,7 @@ spu_function_t spu_recompiler::compile(std::vector&& _func) for (u32 i = j; i < j + 64; i += 4) { - words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0); + words.push_back(i >= start && i < end ? func.data[(i - start) / 4] : 0); } code_off += 64; @@ -391,7 +391,7 @@ spu_function_t spu_recompiler::compile(std::vector&& _func) for (u32 i = starta; i < enda; i += 4) { - words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0); + words.push_back(i >= start && i < end ? func.data[(i - start) / 4] : 0); } } else if (sizea == 2 && (end - start) <= 32) @@ -408,7 +408,7 @@ spu_function_t spu_recompiler::compile(std::vector&& _func) for (u32 i = starta; i < starta + 32; i += 4) { - words.push_back(i >= start ? func[(i - start) / 4 + 1] : i + 32 < end ? func[(i + 32 - start) / 4 + 1] : 0); + words.push_back(i >= start ? func.data[(i - start) / 4] : i + 32 < end ? 
func.data[(i + 32 - start) / 4] : 0); } } else @@ -471,7 +471,7 @@ spu_function_t spu_recompiler::compile(std::vector&& _func) for (u32 i = j; i < j + 32; i += 4) { - words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0); + words.push_back(i >= start && i < end ? func.data[(i - start) / 4] : 0); } code_off += 32; @@ -513,7 +513,7 @@ spu_function_t spu_recompiler::compile(std::vector&& _func) for (u32 i = starta; i < enda; i += 4) { - words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0); + words.push_back(i >= start && i < end ? func.data[(i - start) / 4] : 0); } } else if (sizea == 2 && (end - start) <= 32) @@ -530,7 +530,7 @@ spu_function_t spu_recompiler::compile(std::vector&& _func) for (u32 i = starta; i < starta + 32; i += 4) { - words.push_back(i >= start ? func[(i - start) / 4 + 1] : i + 32 < end ? func[(i + 32 - start) / 4 + 1] : 0); + words.push_back(i >= start ? func.data[(i - start) / 4] : i + 32 < end ? func.data[(i + 32 - start) / 4] : 0); } } else @@ -605,7 +605,7 @@ spu_function_t spu_recompiler::compile(std::vector&& _func) for (u32 i = j; i < j + 32; i += 4) { - words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0); + words.push_back(i >= start && i < end ? func.data[(i - start) / 4] : 0); } code_off += 32; @@ -675,10 +675,10 @@ spu_function_t spu_recompiler::compile(std::vector&& _func) } // Determine which value will be duplicated at hole positions - const u32 w3 = func.at((j - start + ~utils::cntlz32(cmask, true) % 4 * 4) / 4 + 1); - words.push_back(cmask & 1 ? func[(j - start + 0) / 4 + 1] : w3); - words.push_back(cmask & 2 ? func[(j - start + 4) / 4 + 1] : w3); - words.push_back(cmask & 4 ? func[(j - start + 8) / 4 + 1] : w3); + const u32 w3 = func.data.at((j - start + ~utils::cntlz32(cmask, true) % 4 * 4) / 4); + words.push_back(cmask & 1 ? func.data[(j - start + 0) / 4] : w3); + words.push_back(cmask & 2 ? func.data[(j - start + 4) / 4] : w3); + words.push_back(cmask & 4 ? 
func.data[(j - start + 8) / 4] : w3); words.push_back(w3); // PSHUFD immediate table for all possible hole mask values, holes repeat highest valid word @@ -771,10 +771,10 @@ spu_function_t spu_recompiler::compile(std::vector&& _func) m_pos = -1; } - for (u32 i = 1; i < func.size(); i++) + for (u32 i = 0; i < func.data.size(); i++) { - const u32 pos = start + (i - 1) * 4; - const u32 op = se_storage::swap(func[i]); + const u32 pos = start + i * 4; + const u32 op = std::bit_cast>(func.data[i]); if (!op) { @@ -908,7 +908,7 @@ spu_function_t spu_recompiler::compile(std::vector&& _func) const bool added = !add_loc->compiled && add_loc->compiled.compare_and_swap_test(nullptr, fn); // Rebuild trampoline if necessary - if (!m_spurt->rebuild_ubertrampoline(func[1])) + if (!m_spurt->rebuild_ubertrampoline(func.data[0])) { return nullptr; } diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h index 4848ebfe5d..e56df3f79e 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h @@ -13,7 +13,7 @@ public: virtual void init() override; - virtual spu_function_t compile(std::vector&&) override; + virtual spu_function_t compile(spu_program&&) override; private: // ASMJIT runtime diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index a0cca6ae01..bdbb1849fa 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -291,9 +291,9 @@ spu_cache::~spu_cache() { } -std::deque> spu_cache::get() +std::deque spu_cache::get() { - std::deque> result; + std::deque result; if (!m_file) { @@ -314,41 +314,44 @@ std::deque> spu_cache::get() break; } - func.resize(size + 1); - func[0] = addr; + func.resize(size); - if (m_file.read(func.data() + 1, func.size() * 4 - 4) != func.size() * 4 - 4) + if (m_file.read(func.data(), func.size() * 4) != func.size() * 4) { break; } - if (!size || !func[1]) + if (!size || !func[0]) { // Skip old format Giga entries continue; } - 
result.emplace_front(std::move(func)); + spu_program res; + res.entry_point = addr; + res.lower_bound = addr; + res.data = std::move(func); + result.emplace_front(std::move(res)); } return result; } -void spu_cache::add(const std::vector& func) +void spu_cache::add(const spu_program& func) { if (!m_file) { return; } - be_t size = ::size32(func) - 1; - be_t addr = func[0]; + be_t size = ::size32(func.data); + be_t addr = func.entry_point; const fs::iovec_clone gather[3] { {&size, sizeof(size)}, {&addr, sizeof(addr)}, - {func.data() + 1, func.size() * 4 - 4} + {func.data.data(), func.data.size() * 4} }; // Append data @@ -458,7 +461,7 @@ void spu_cache::initialize() // Build functions for (std::size_t func_i = fnext++; func_i < func_list.size(); func_i = fnext++) { - const std::vector& func = std::as_const(func_list)[func_i]; + const spu_program& func = std::as_const(func_list)[func_i]; if (Emu.IsStopped() || fail_flag) { @@ -467,21 +470,21 @@ void spu_cache::initialize() } // Get data start - const u32 start = func[0]; - const u32 size0 = ::size32(func); + const u32 start = func.lower_bound; + const u32 size0 = ::size32(func.data); // Initialize LS with function data only - for (u32 i = 1, pos = start; i < size0; i++, pos += 4) + for (u32 i = 0, pos = start; i < size0; i++, pos += 4) { - ls[pos / 4] = se_storage::swap(func[i]); + ls[pos / 4] = std::bit_cast>(func.data[i]); } // Call analyser - std::vector func2 = compiler->analyse(ls.data(), func[0]); + spu_program func2 = compiler->analyse(ls.data(), func.entry_point); if (func2 != func) { - LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed, %u vs %u", func2[0], func2.size() - 1, size0 - 1); + LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed, %u vs %u", func2.entry_point, func2.data.size(), size0); } else if (!compiler->compile(std::move(func2))) { @@ -523,51 +526,39 @@ void spu_cache::initialize() g_fxo->init(std::move(cache)); } -bool spu_runtime::func_compare::operator()(const std::vector& lhs, const std::vector& rhs) 
const +bool spu_program::operator==(const spu_program& rhs) const noexcept { - if (lhs.empty()) - return !rhs.empty(); - else if (rhs.empty()) - return false; + // TODO + return entry_point - lower_bound == rhs.entry_point - rhs.lower_bound && data == rhs.data; +} - const u32 lhs_addr = lhs[0]; - const u32 rhs_addr = rhs[0]; - - if (lhs_addr < rhs_addr) - return true; - else if (lhs_addr > rhs_addr) - return false; +bool spu_program::operator<(const spu_program& rhs) const noexcept +{ + const u32 lhs_offs = (entry_point - lower_bound) / 4; + const u32 rhs_offs = (rhs.entry_point - rhs.lower_bound) / 4; // Select range for comparison - std::basic_string_view lhs_data(lhs.data() + 1, lhs.size() - 1); - std::basic_string_view rhs_data(rhs.data() + 1, rhs.size() - 1); + std::basic_string_view lhs_data(data.data() + lhs_offs, data.size() - lhs_offs); + std::basic_string_view rhs_data(rhs.data.data() + rhs_offs, rhs.data.size() - rhs_offs); + const auto cmp0 = lhs_data.compare(rhs_data); - if (lhs_data.empty()) - return !rhs_data.empty(); - else if (rhs_data.empty()) + if (cmp0 < 0) + return true; + else if (cmp0 > 0) return false; - if (false) - { - // In Giga mode, compare instructions starting from the entry point first - lhs_data.remove_prefix(lhs_addr / 4); - rhs_data.remove_prefix(rhs_addr / 4); - const auto cmp0 = lhs_data.compare(rhs_data); + // Compare from address 0 to the point before the entry point (TODO: undesirable) + lhs_data = {data.data(), lhs_offs}; + rhs_data = {rhs.data.data(), rhs_offs}; + const auto cmp1 = lhs_data.compare(rhs_data); - if (cmp0 < 0) - return true; - else if (cmp0 > 0) - return false; + if (cmp1 < 0) + return true; + else if (cmp1 > 0) + return false; - // Compare from address 0 to the point before the entry point (undesirable) - lhs_data = {lhs.data() + 1, lhs_addr / 4}; - rhs_data = {rhs.data() + 1, rhs_addr / 4}; - return lhs_data < rhs_data; - } - else - { - return lhs_data < rhs_data; - } + // TODO + return lhs_offs < rhs_offs; 
} spu_runtime::spu_runtime() @@ -590,9 +581,9 @@ spu_runtime::spu_runtime() } } -spu_item* spu_runtime::add_empty(std::vector&& data) +spu_item* spu_runtime::add_empty(spu_program&& data) { - if (data.size() <= 1) + if (data.data.empty()) { return nullptr; } @@ -601,12 +592,9 @@ spu_item* spu_runtime::add_empty(std::vector&& data) spu_item* prev = nullptr; //Try to add item that doesn't exist yet - const auto ret = m_stuff[data[1] >> 12].push_if([&](spu_item& _new, spu_item& _old) + const auto ret = m_stuff[data.data[0] >> 12].push_if([&](spu_item& _new, spu_item& _old) { - std::basic_string_view lhs{_new.data.data() + 1, _new.data.size() - 1}; - std::basic_string_view rhs{_old.data.data() + 1, _old.data.size() - 1}; - - if (lhs == rhs) + if (_new.data == _old.data) { prev = &_old; return false; @@ -643,7 +631,8 @@ spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst) { if (const auto ptr = it->compiled.load()) { - std::basic_string_view range{it->data.data() + 1, it->data.size() - 1}; + std::basic_string_view range{it->data.data.data(), it->data.data.size()}; + range.remove_prefix((it->data.entry_point - it->data.lower_bound) / 4); m_flat_list.emplace_back(range, ptr); } else @@ -997,7 +986,8 @@ spu_function_t spu_runtime::find(const u32* ls, u32 addr) const { if (const auto ptr = item.compiled.load()) { - std::basic_string_view range{item.data.data() + 1, item.data.size() - 1}; + std::basic_string_view range{item.data.data.data(), item.data.data.size()}; + range.remove_prefix((item.data.entry_point - item.data.lower_bound) / 4); if (addr / 4 + range.size() > 0x10000) { @@ -1194,12 +1184,13 @@ catch (const std::exception& e) LOG_NOTICE(GENERAL, "\n%s", spu.dump()); } -std::vector spu_recompiler_base::analyse(const be_t* ls, u32 entry_point) +spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point) { // Result: addr + raw instruction data - std::vector result; - result.reserve(10000); - result.push_back(entry_point); + spu_program result; 
+ result.data.reserve(10000); + result.entry_point = entry_point; + result.lower_bound = entry_point; // Initialize block entries m_block_info.reset(); @@ -1400,7 +1391,7 @@ std::vector spu_recompiler_base::analyse(const be_t* ls, u32 entry_poi { const u32 target = spu_branch_target(av); - LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to 0x%x%s", result[0], pos, target, op.d ? " (D)" : op.e ? " (E)" : ""); + LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to 0x%x%s", entry_point, pos, target, op.d ? " (D)" : op.e ? " (E)" : ""); m_targets[pos].push_back(target); @@ -1408,7 +1399,7 @@ std::vector spu_recompiler_base::analyse(const be_t* ls, u32 entry_poi { if (sync) { - LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring %scall to 0x%x (SYNC)", result[0], pos, sl ? "" : "tail ", target); + LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring %scall to 0x%x (SYNC)", entry_point, pos, sl ? "" : "tail ", target); if (target > entry_point) { @@ -1505,17 +1496,17 @@ std::vector spu_recompiler_base::analyse(const be_t* ls, u32 entry_poi if (jt_abs.size() >= jt_rel.size()) { - const u32 new_size = (start - lsa) / 4 + 1 + jt_abs.size(); + const u32 new_size = (start - lsa) / 4 + jt_abs.size(); - if (result.size() < new_size) + if (result.data.size() < new_size) { - result.resize(new_size); + result.data.resize(new_size); } for (u32 i = 0; i < jt_abs.size(); i++) { add_block(jt_abs[i]); - result[(start - lsa) / 4 + 1 + i] = se_storage::swap(jt_abs[i]); + result.data[(start - lsa) / 4 + i] = std::bit_cast>(jt_abs[i]); m_targets[start + i * 4]; } @@ -1524,17 +1515,17 @@ std::vector spu_recompiler_base::analyse(const be_t* ls, u32 entry_poi if (jt_rel.size() >= jt_abs.size()) { - const u32 new_size = (start - lsa) / 4 + 1 + jt_rel.size(); + const u32 new_size = (start - lsa) / 4 + jt_rel.size(); - if (result.size() < new_size) + if (result.data.size() < new_size) { - result.resize(new_size); + result.data.resize(new_size); } for (u32 i = 0; i < jt_rel.size(); i++) { add_block(jt_rel[i]); - 
result[(start - lsa) / 4 + 1 + i] = se_storage::swap(jt_rel[i] - start); + result.data[(start - lsa) / 4 + i] = std::bit_cast>(jt_rel[i] - start); m_targets[start + i * 4]; } @@ -1569,7 +1560,7 @@ std::vector spu_recompiler_base::analyse(const be_t* ls, u32 entry_poi } else if (type == spu_itype::BI && sync) { - LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring indirect branch (SYNC)", result[0], pos); + LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring indirect branch (SYNC)", entry_point, pos); } if (type == spu_itype::BI || sl) @@ -1630,7 +1621,7 @@ std::vector spu_recompiler_base::analyse(const be_t* ls, u32 entry_poi { if (g_cfg.core.spu_block_size == spu_block_size_type::giga) { - LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring fixed call to 0x%x (SYNC)", result[0], pos, target); + LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring fixed call to 0x%x (SYNC)", entry_point, pos, target); } if (target > entry_point) @@ -1656,7 +1647,7 @@ std::vector spu_recompiler_base::analyse(const be_t* ls, u32 entry_poi { if (g_cfg.core.spu_block_size == spu_block_size_type::giga) { - LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring fixed tail call to 0x%x (SYNC)", result[0], pos, target); + LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring fixed tail call to 0x%x (SYNC)", entry_point, pos, target); } if (target > entry_point) @@ -1914,31 +1905,33 @@ std::vector spu_recompiler_base::analyse(const be_t* ls, u32 entry_poi } // Insert raw instruction value - if (result.size() - 1 <= (pos - lsa) / 4) + const u32 new_size = (pos - lsa) / 4; + + if (result.data.size() <= new_size) { - if (result.size() - 1 < (pos - lsa) / 4) + if (result.data.size() < new_size) { - result.resize((pos - lsa) / 4 + 1); + result.data.resize(new_size); } - result.emplace_back(se_storage::swap(data)); + result.data.emplace_back(std::bit_cast>(data)); } - else if (u32& raw_val = result[(pos - lsa) / 4 + 1]) + else if (u32& raw_val = result.data[new_size]) { - verify(HERE), raw_val == se_storage::swap(data); + verify(HERE), raw_val == 
std::bit_cast>(data); } else { - raw_val = se_storage::swap(data); + raw_val = std::bit_cast>(data); } } while (lsa > 0 || limit < 0x40000) { - const u32 initial_size = result.size(); + const u32 initial_size = result.data.size(); // Check unreachable blocks - limit = std::min(limit, lsa + initial_size * 4 - 4); + limit = std::min(limit, lsa + initial_size * 4); for (auto& pair : m_preds) { @@ -1961,7 +1954,7 @@ std::vector spu_recompiler_base::analyse(const be_t* ls, u32 entry_poi for (u32 j = workload[i];; j -= 4) { // Go backward from an address until the entry point is reached - if (j == result[0]) + if (j == entry_point) { reachable = true; break; @@ -1994,7 +1987,7 @@ std::vector spu_recompiler_base::analyse(const be_t* ls, u32 entry_poi // Check for possible fallthrough predecessor if (!had_fallthrough) { - if (result.at((j - lsa) / 4) == 0 || m_targets.count(j - 4)) + if (result.data.at((j - lsa) / 4 - 1) == 0 || m_targets.count(j - 4)) { break; } @@ -2018,16 +2011,16 @@ std::vector spu_recompiler_base::analyse(const be_t* ls, u32 entry_poi } } - result.resize((limit - lsa) / 4 + 1); + result.data.resize((limit - lsa) / 4); // Check holes in safe mode (TODO) u32 valid_size = 0; - for (u32 i = 1; i < result.size(); i++) + for (u32 i = 0; i < result.data.size(); i++) { - if (result[i] == 0) + if (result.data[i] == 0) { - const u32 pos = lsa + (i - 1) * 4; + const u32 pos = lsa + i * 4; const u32 data = ls[pos / 4]; // Allow only NOP or LNOP instructions in holes @@ -2038,34 +2031,34 @@ std::vector spu_recompiler_base::analyse(const be_t* ls, u32 entry_poi if (g_cfg.core.spu_block_size != spu_block_size_type::giga) { - result.resize(valid_size + 1); + result.data.resize(valid_size); break; } } else { - valid_size = i; + valid_size = i + 1; } } // Even if NOP or LNOP, should be removed at the end - result.resize(valid_size + 1); + result.data.resize(valid_size); // Repeat if blocks were removed - if (result.size() == initial_size) + if (result.data.size() == 
initial_size) { break; } } - limit = std::min(limit, lsa + ::size32(result) * 4 - 4); + limit = std::min(limit, lsa + ::size32(result.data) * 4); // Cleanup block info for (u32 i = 0; i < workload.size(); i++) { const u32 addr = workload[i]; - if (addr < lsa || addr >= limit || !result[(addr - lsa) / 4 + 1]) + if (addr < lsa || addr >= limit || !result.data[(addr - lsa) / 4]) { m_block_info[addr / 4] = false; m_entry_info[addr / 4] = false; @@ -2104,7 +2097,7 @@ std::vector spu_recompiler_base::analyse(const be_t* ls, u32 entry_poi const u32 prev = (it->first - 4) & 0x3fffc; // TODO: check the correctness - if (m_targets.count(prev) == 0 && prev >= lsa && prev < limit && result[(prev - lsa) / 4 + 1]) + if (m_targets.count(prev) == 0 && prev >= lsa && prev < limit && result.data[(prev - lsa) / 4]) { // Add target and the predecessor m_targets[prev].push_back(it->first); @@ -2127,25 +2120,25 @@ std::vector spu_recompiler_base::analyse(const be_t* ls, u32 entry_poi } // Fill holes which contain only NOP and LNOP instructions (TODO: compile) - for (u32 i = 1, nnop = 0, vsize = 0; i <= result.size(); i++) + for (u32 i = 0, nnop = 0, vsize = 0; i <= result.data.size(); i++) { - if (i >= result.size() || result[i]) + if (i >= result.data.size() || result.data[i]) { - if (nnop && nnop == i - vsize - 1) + if (nnop && nnop == i - vsize) { // Write only complete NOP sequence - for (u32 j = vsize + 1; j < i; j++) + for (u32 j = vsize; j < i; j++) { - result[j] = se_storage::swap(ls[lsa / 4 + j - 1]); + result.data[j] = std::bit_cast>(ls[lsa / 4 + j]); } } nnop = 0; - vsize = i; + vsize = i + 1; } else { - const u32 pos = lsa + (i - 1) * 4; + const u32 pos = lsa + i * 4; const u32 data = ls[pos / 4]; if (data == 0x200000 || (data & 0xffffff80) == 0x40200000) @@ -2169,7 +2162,7 @@ std::vector spu_recompiler_base::analyse(const be_t* ls, u32 entry_poi block.size++; // Decode instruction - const spu_opcode_t op{se_storage::swap(result[(ia - lsa) / 4 + 1])}; + const spu_opcode_t 
op{std::bit_cast>(result.data[(ia - lsa) / 4])}; const auto type = s_spu_itype.decode(op.opcode); @@ -2663,7 +2656,7 @@ std::vector spu_recompiler_base::analyse(const be_t* ls, u32 entry_poi for (u32 ia = addr; ia < addr + bb.size * 4; ia += 4) { // Decode instruction again - op.opcode = se_storage::swap(result[(ia - lsa) / 4 + 1]); + op.opcode = std::bit_cast>(result.data[(ia - lsa) / 4]); last_inst = s_spu_itype.decode(op.opcode); // Propagate some constants @@ -3117,24 +3110,18 @@ std::vector spu_recompiler_base::analyse(const be_t* ls, u32 entry_poi } } - if (result.size() == 1) + if (result.data.empty()) { // Blocks starting from 0x0 or invalid instruction won't be compiled, may need special interpreter fallback - result.clear(); } return result; } -void spu_recompiler_base::dump(const std::vector& result, std::string& out) +void spu_recompiler_base::dump(const spu_program& result, std::string& out) { SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode); - dis_asm.offset = reinterpret_cast(result.data() + 1); - - if (true) - { - dis_asm.offset -= result[0]; - } + dis_asm.offset = reinterpret_cast(result.data.data()) - result.lower_bound; std::string hash; { @@ -3142,12 +3129,12 @@ void spu_recompiler_base::dump(const std::vector& result, std::string& out) u8 output[20]; sha1_starts(&ctx); - sha1_update(&ctx, reinterpret_cast(result.data() + 1), result.size() * 4 - 4); + sha1_update(&ctx, reinterpret_cast(result.data.data()), result.data.size() * 4); sha1_finish(&ctx, output); fmt::append(hash, "%s", fmt::base57(output)); } - fmt::append(out, "========== SPU BLOCK 0x%05x (size %u, %s) ==========\n", result[0], result.size() - 1, hash); + fmt::append(out, "========== SPU BLOCK 0x%05x (size %u, %s) ==========\n", result.entry_point, result.data.size(), hash); for (auto& bb : m_bbs) { @@ -4162,14 +4149,14 @@ public: } } - virtual spu_function_t compile(std::vector&& _func) override + virtual spu_function_t compile(spu_program&& _func) override { - if (_func.empty() && 
m_interp_magn) + if (_func.data.empty() && m_interp_magn) { return compile_interpreter(); } - const u32 start0 = _func[0]; + const u32 start0 = _func.entry_point; const auto add_loc = m_spurt->add_empty(std::move(_func)); @@ -4178,9 +4165,9 @@ public: return nullptr; } - const std::vector& func = add_loc->data; + const spu_program& func = add_loc->data; - if (func[0] != start0) + if (func.entry_point != start0) { // Wait for the duplicate while (!add_loc->compiled) @@ -4203,22 +4190,22 @@ public: u8 output[20]; sha1_starts(&ctx); - sha1_update(&ctx, reinterpret_cast(func.data() + 1), func.size() * 4 - 4); + sha1_update(&ctx, reinterpret_cast(func.data.data()), func.data.size() * 4); sha1_finish(&ctx, output); m_hash.clear(); - fmt::append(m_hash, "spu-0x%05x-%s", func[0], fmt::base57(output)); + fmt::append(m_hash, "spu-0x%05x-%s", func.entry_point, fmt::base57(output)); be_t hash_start; std::memcpy(&hash_start, output, sizeof(hash_start)); m_hash_start = hash_start; } - LOG_NOTICE(SPU, "Building function 0x%x... (size %u, %s)", func[0], func.size() - 1, m_hash); + LOG_NOTICE(SPU, "Building function 0x%x... 
(size %u, %s)", func.entry_point, func.data.size(), m_hash); - m_pos = func[0]; - m_base = func[0]; - m_size = (func.size() - 1) * 4; + m_pos = func.lower_bound; + m_base = func.entry_point; + m_size = ::size32(func.data) * 4; const u32 start = m_pos; const u32 end = start + m_size; @@ -4279,16 +4266,16 @@ public: // Disable check (unsafe) m_ir->CreateBr(label_body); } - else if (func.size() - 1 == 1) + else if (func.data.size() == 1) { const auto pu32 = m_ir->CreateBitCast(m_ir->CreateGEP(m_lsptr, m_base_pc), get_type()); - const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(pu32), m_ir->getInt32(func[1])); + const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(pu32), m_ir->getInt32(func.data[0])); m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely); } - else if (func.size() - 1 == 2) + else if (func.data.size() == 2) { const auto pu64 = m_ir->CreateBitCast(m_ir->CreateGEP(m_lsptr, m_base_pc), get_type()); - const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(pu64), m_ir->getInt64(static_cast(func[2]) << 32 | func[1])); + const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(pu64), m_ir->getInt64(static_cast(func.data[1]) << 32 | func.data[0])); m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely); } else @@ -4298,7 +4285,7 @@ public: // Skip holes at the beginning (giga only) for (u32 j = start; j < end; j += 4) { - if (!func[(j - start) / 4 + 1]) + if (!func.data[(j - start) / 4]) { starta += 4; } @@ -4324,7 +4311,7 @@ public: { const u32 k = j + i * 4; - if (k < start || k >= end || !func[(k - start) / 4 + 1]) + if (k < start || k >= end || !func.data[(k - start) / 4]) { indices[i] = 8; holes = true; @@ -4357,7 +4344,7 @@ public: for (u32 i = 0; i < 8; i++) { const u32 k = j + i * 4; - words[i] = k >= start && k < end ? func[(k - start) / 4 + 1] : 0; + words[i] = k >= start && k < end ? 
func.data[(k - start) / 4] : 0; } vls = m_ir->CreateXor(vls, ConstantDataVector::get(m_context, words)); @@ -4598,7 +4585,7 @@ public: break; } - const u32 op = se_storage::swap(func[(m_pos - start) / 4 + 1]); + const u32 op = std::bit_cast>(func.data[(m_pos - start) / 4]); if (!op) { @@ -4744,7 +4731,7 @@ public: if (g_cfg.core.spu_debug) { - fmt::append(log, "LLVM IR at 0x%x:\n", func[0]); + fmt::append(log, "LLVM IR at 0x%x:\n", func.entry_point); out << *module; // print IR out << "\n\n"; } @@ -4752,7 +4739,7 @@ public: if (verifyModule(*module, &out)) { out.flush(); - LOG_ERROR(SPU, "LLVM: Verification failed at 0x%x:\n%s", func[0], log); + LOG_ERROR(SPU, "LLVM: Verification failed at 0x%x:\n%s", func.entry_point, log); if (g_cfg.core.spu_debug) { @@ -4781,7 +4768,7 @@ public: add_loc->compiled = fn; // Rebuild trampoline if necessary - if (!m_spurt->rebuild_ubertrampoline(func[1])) + if (!m_spurt->rebuild_ubertrampoline(func.data[0])) { return nullptr; } @@ -8398,7 +8385,7 @@ struct spu_llvm } // Start compiling - const std::vector& func = found_it->second->data; + const spu_program& func = found_it->second->data; // Old function pointer (pre-recompiled) const spu_function_t _old = found_it->second->compiled; @@ -8407,21 +8394,21 @@ struct spu_llvm enqueued.erase(found_it); // Get data start - const u32 start = func[0]; - const u32 size0 = ::size32(func); + const u32 start = func.lower_bound; + const u32 size0 = ::size32(func.data); // Initialize LS with function data only - for (u32 i = 1, pos = start; i < size0; i++, pos += 4) + for (u32 i = 0, pos = start; i < size0; i++, pos += 4) { - ls[pos / 4] = se_storage::swap(func[i]); + ls[pos / 4] = std::bit_cast>(func.data[i]); } // Call analyser - std::vector func2 = compiler->analyse(ls.data(), func[0]); + spu_program func2 = compiler->analyse(ls.data(), func.entry_point); if (func2 != func) { - LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed, %u vs %u", func2[0], func2.size() - 1, size0 - 1); + LOG_ERROR(SPU, 
"[0x%05x] SPU Analyser failed, %u vs %u", func2.entry_point, func2.data.size(), size0); } else if (const auto target = compiler->compile(std::move(func2))) { @@ -8444,7 +8431,7 @@ struct spu_llvm } else { - LOG_FATAL(SPU, "[0x%05x] Compilation failed.", func[0]); + LOG_FATAL(SPU, "[0x%05x] Compilation failed.", func.entry_point); Emu.Pause(); return; } @@ -8469,7 +8456,7 @@ struct spu_fast : public spu_recompiler_base } } - virtual spu_function_t compile(std::vector&& _func) override + virtual spu_function_t compile(spu_program&& _func) override { const auto add_loc = m_spurt->add_empty(std::move(_func)); @@ -8483,7 +8470,7 @@ struct spu_fast : public spu_recompiler_base return add_loc->compiled; } - const std::vector& func = add_loc->data; + const spu_program& func = add_loc->data; if (g_cfg.core.spu_debug && !add_loc->logged.exchange(1)) { @@ -8493,22 +8480,22 @@ struct spu_fast : public spu_recompiler_base } // Allocate executable area with necessary size - const auto result = jit_runtime::alloc(22 + 1 + 9 + (::size32(func) - 1) * (16 + 16) + 36 + 47, 16); + const auto result = jit_runtime::alloc(22 + 1 + 9 + ::size32(func.data) * (16 + 16) + 36 + 47, 16); if (!result) { return nullptr; } - m_pos = func[0]; - m_size = (::size32(func) - 1) * 4; + m_pos = func.lower_bound; + m_size = ::size32(func.data) * 4; { sha1_context ctx; u8 output[20]; sha1_starts(&ctx); - sha1_update(&ctx, reinterpret_cast(func.data() + 1), func.size() * 4 - 4); + sha1_update(&ctx, reinterpret_cast(func.data.data()), func.data.size() * 4); sha1_finish(&ctx, output); be_t hash_start; @@ -8554,9 +8541,9 @@ struct spu_fast : public spu_recompiler_base *raw++ = 0x00; // Verification (slow) - for (u32 i = 1; i < func.size(); i++) + for (u32 i = 0; i < func.data.size(); i++) { - if (!func[i]) + if (!func.data[i]) { continue; } @@ -8564,8 +8551,8 @@ struct spu_fast : public spu_recompiler_base // cmp dword ptr [rcx + off], opc *raw++ = 0x81; *raw++ = 0xb9; - const u32 off = (i - 1) * 4; - const 
u32 opc = func[i]; + const u32 off = i * 4; + const u32 opc = func.data[i]; std::memcpy(raw + 0, &off, 4); std::memcpy(raw + 4, &opc, 4); raw += 8; @@ -8627,16 +8614,16 @@ struct spu_fast : public spu_recompiler_base *raw++ = 0x4c; *raw++ = 0x8d; *raw++ = 0x35; - const u32 epi_off = (::size32(func) - 1) * 16; + const u32 epi_off = ::size32(func.data) * 16; std::memcpy(raw, &epi_off, 4); raw += 4; // Instructions (each instruction occupies fixed number of bytes) - for (u32 i = 1; i < func.size(); i++) + for (u32 i = 0; i < func.data.size(); i++) { - const u32 pos = m_pos + (i - 1) * 4; + const u32 pos = m_pos + i * 4; - if (!func[i]) + if (!func.data[i]) { // Save pc: mov [rbp + spu_thread::pc], r12d *raw++ = 0x44; @@ -8658,7 +8645,7 @@ struct spu_fast : public spu_recompiler_base } // Fix endianness - const spu_opcode_t op{se_storage::swap(func[i])}; + const spu_opcode_t op{std::bit_cast>(func.data[i])}; switch (auto type = s_spu_itype.decode(op.opcode)) { @@ -8797,7 +8784,7 @@ struct spu_fast : public spu_recompiler_base } // Rebuild trampoline if necessary - if (!m_spurt->rebuild_ubertrampoline(func[1])) + if (!m_spurt->rebuild_ubertrampoline(func.data[0])) { return nullptr; } diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index 5a8fe3f136..ce4ac19c90 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -29,18 +29,39 @@ public: return m_file.operator bool(); } - std::deque> get(); + std::deque get(); - void add(const std::vector& func); + void add(const struct spu_program& func); static void initialize(); }; +struct spu_program +{ + // Address of the entry point in LS + u32 entry_point; + + // Address of the data in LS + u32 lower_bound; + + // Program data with intentionally wrong endianness (on LE platform opcode values are swapped) + std::vector data; + + bool operator==(const spu_program& rhs) const noexcept; + + bool operator!=(const spu_program& rhs) const noexcept + { + return !(*this == rhs); + 
} + + bool operator<(const spu_program& rhs) const noexcept; +}; + class spu_item { public: // SPU program - const std::vector data; + const spu_program data; // Compiled function pointer atomic_t compiled = nullptr; @@ -51,7 +72,7 @@ public: atomic_t cached = false; atomic_t logged = false; - spu_item(std::vector&& data) + spu_item(spu_program&& data) : data(std::move(data)) { } @@ -64,12 +85,6 @@ public: // Helper class class spu_runtime { - struct func_compare - { - // Comparison function for SPU programs - bool operator()(const std::vector& lhs, const std::vector& rhs) const; - }; - // All functions (2^20 bunches) std::array, (1 << 20)> m_stuff; @@ -109,7 +124,7 @@ private: public: // Return new pointer for add() - spu_item* add_empty(std::vector&&); + spu_item* add_empty(spu_program&&); // Find existing function spu_function_t find(const u32* ls, u32 addr) const; @@ -292,7 +307,7 @@ public: virtual void init() = 0; // Compile function - virtual spu_function_t compile(std::vector&&) = 0; + virtual spu_function_t compile(spu_program&&) = 0; // Default dispatch function fallback (second arg is unused) static void dispatch(spu_thread&, void*, u8* rip); @@ -304,10 +319,10 @@ public: static void old_interpreter(spu_thread&, void* ls, u8*); // Get the function data at specified address - std::vector analyse(const be_t* ls, u32 lsa); + spu_program analyse(const be_t* ls, u32 lsa); // Print analyser internal state - void dump(const std::vector& result, std::string& out); + void dump(const spu_program& result, std::string& out); // Get SPU Runtime spu_runtime& get_runtime()