SPU: internal refactoring, add spu_program

Use struct (spu_program) instead of std::vector<u32>.
This commit is contained in:
Nekotekina 2019-11-23 19:30:54 +03:00
parent 4caf747729
commit 1b9a3e6077
4 changed files with 215 additions and 213 deletions

View file

@ -45,9 +45,9 @@ void spu_recompiler::init()
}
}
spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
spu_function_t spu_recompiler::compile(spu_program&& _func)
{
const u32 start0 = _func[0];
const u32 start0 = _func.entry_point;
const auto add_loc = m_spurt->add_empty(std::move(_func));
@ -61,9 +61,9 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
return add_loc->compiled;
}
const std::vector<u32>& func = add_loc->data;
const spu_program& func = add_loc->data;
if (func[0] != start0)
if (func.entry_point != start0)
{
// Wait for the duplicate
while (!add_loc->compiled)
@ -84,7 +84,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
u8 output[20];
sha1_starts(&ctx);
sha1_update(&ctx, reinterpret_cast<const u8*>(func.data() + 1), func.size() * 4 - 4);
sha1_update(&ctx, reinterpret_cast<const u8*>(func.data.data()), func.data.size() * 4);
sha1_finish(&ctx, output);
be_t<u64> hash_start;
@ -168,18 +168,18 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
u32 words_align = 8;
// Start compilation
m_pos = func[0];
m_base = func[0];
m_size = ::size32(func) * 4 - 4;
m_pos = func.lower_bound;
m_base = func.entry_point;
m_size = ::size32(func.data) * 4;
const u32 start = m_pos;
const u32 end = start + m_size;
// Create block labels
for (u32 i = 1; i < func.size(); i++)
for (u32 i = 0; i < func.data.size(); i++)
{
if (func[i] && m_block_info[i - 1 + start / 4])
if (func.data[i] && m_block_info[i + start / 4])
{
instr_labels[i * 4 - 4 + start] = c->newLabel();
instr_labels[i * 4 + start] = c->newLabel();
}
}
@ -211,7 +211,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
for (u32 addr = starta, m = 1; addr < enda && m; addr += 4, m <<= 1)
{
// Filter out if out of range, or is a hole
if (addr >= start && addr < end && func[(addr - start) / 4 + 1])
if (addr >= start && addr < end && func.data[(addr - start) / 4])
{
result |= m;
}
@ -226,7 +226,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
// Skip holes at the beginning (giga only)
for (u32 j = start; j < end; j += 4)
{
if (!func[(j - start) / 4 + 1])
if (!func.data[(j - start) / 4])
{
starta += 4;
}
@ -261,7 +261,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
}
else if (m_size == 8)
{
c->mov(x86::rax, static_cast<u64>(func[2]) << 32 | func[1]);
c->mov(x86::rax, static_cast<u64>(func.data[1]) << 32 | func.data[0]);
c->cmp(x86::rax, x86::qword_ptr(*ls, *pc0));
c->jnz(label_diff);
@ -272,7 +272,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
}
else if (m_size == 4)
{
c->cmp(x86::dword_ptr(*ls, *pc0), func[1]);
c->cmp(x86::dword_ptr(*ls, *pc0), func.data[0]);
c->jnz(label_diff);
if (utils::has_avx())
@ -351,7 +351,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
for (u32 i = j; i < j + 64; i += 4)
{
words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0);
words.push_back(i >= start && i < end ? func.data[(i - start) / 4] : 0);
}
code_off += 64;
@ -391,7 +391,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
for (u32 i = starta; i < enda; i += 4)
{
words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0);
words.push_back(i >= start && i < end ? func.data[(i - start) / 4] : 0);
}
}
else if (sizea == 2 && (end - start) <= 32)
@ -408,7 +408,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
for (u32 i = starta; i < starta + 32; i += 4)
{
words.push_back(i >= start ? func[(i - start) / 4 + 1] : i + 32 < end ? func[(i + 32 - start) / 4 + 1] : 0);
words.push_back(i >= start ? func.data[(i - start) / 4] : i + 32 < end ? func.data[(i + 32 - start) / 4] : 0);
}
}
else
@ -471,7 +471,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
for (u32 i = j; i < j + 32; i += 4)
{
words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0);
words.push_back(i >= start && i < end ? func.data[(i - start) / 4] : 0);
}
code_off += 32;
@ -513,7 +513,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
for (u32 i = starta; i < enda; i += 4)
{
words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0);
words.push_back(i >= start && i < end ? func.data[(i - start) / 4] : 0);
}
}
else if (sizea == 2 && (end - start) <= 32)
@ -530,7 +530,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
for (u32 i = starta; i < starta + 32; i += 4)
{
words.push_back(i >= start ? func[(i - start) / 4 + 1] : i + 32 < end ? func[(i + 32 - start) / 4 + 1] : 0);
words.push_back(i >= start ? func.data[(i - start) / 4] : i + 32 < end ? func.data[(i + 32 - start) / 4] : 0);
}
}
else
@ -605,7 +605,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
for (u32 i = j; i < j + 32; i += 4)
{
words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0);
words.push_back(i >= start && i < end ? func.data[(i - start) / 4] : 0);
}
code_off += 32;
@ -675,10 +675,10 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
}
// Determine which value will be duplicated at hole positions
const u32 w3 = func.at((j - start + ~utils::cntlz32(cmask, true) % 4 * 4) / 4 + 1);
words.push_back(cmask & 1 ? func[(j - start + 0) / 4 + 1] : w3);
words.push_back(cmask & 2 ? func[(j - start + 4) / 4 + 1] : w3);
words.push_back(cmask & 4 ? func[(j - start + 8) / 4 + 1] : w3);
const u32 w3 = func.data.at((j - start + ~utils::cntlz32(cmask, true) % 4 * 4) / 4);
words.push_back(cmask & 1 ? func.data[(j - start + 0) / 4] : w3);
words.push_back(cmask & 2 ? func.data[(j - start + 4) / 4] : w3);
words.push_back(cmask & 4 ? func.data[(j - start + 8) / 4] : w3);
words.push_back(w3);
// PSHUFD immediate table for all possible hole mask values, holes repeat highest valid word
@ -771,10 +771,10 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
m_pos = -1;
}
for (u32 i = 1; i < func.size(); i++)
for (u32 i = 0; i < func.data.size(); i++)
{
const u32 pos = start + (i - 1) * 4;
const u32 op = se_storage<u32>::swap(func[i]);
const u32 pos = start + i * 4;
const u32 op = std::bit_cast<be_t<u32>>(func.data[i]);
if (!op)
{
@ -908,7 +908,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& _func)
const bool added = !add_loc->compiled && add_loc->compiled.compare_and_swap_test(nullptr, fn);
// Rebuild trampoline if necessary
if (!m_spurt->rebuild_ubertrampoline(func[1]))
if (!m_spurt->rebuild_ubertrampoline(func.data[0]))
{
return nullptr;
}

View file

@ -13,7 +13,7 @@ public:
virtual void init() override;
virtual spu_function_t compile(std::vector<u32>&&) override;
virtual spu_function_t compile(spu_program&&) override;
private:
// ASMJIT runtime

View file

@ -291,9 +291,9 @@ spu_cache::~spu_cache()
{
}
std::deque<std::vector<u32>> spu_cache::get()
std::deque<spu_program> spu_cache::get()
{
std::deque<std::vector<u32>> result;
std::deque<spu_program> result;
if (!m_file)
{
@ -314,41 +314,44 @@ std::deque<std::vector<u32>> spu_cache::get()
break;
}
func.resize(size + 1);
func[0] = addr;
func.resize(size);
if (m_file.read(func.data() + 1, func.size() * 4 - 4) != func.size() * 4 - 4)
if (m_file.read(func.data(), func.size() * 4) != func.size() * 4)
{
break;
}
if (!size || !func[1])
if (!size || !func[0])
{
// Skip old format Giga entries
continue;
}
result.emplace_front(std::move(func));
spu_program res;
res.entry_point = addr;
res.lower_bound = addr;
res.data = std::move(func);
result.emplace_front(std::move(res));
}
return result;
}
void spu_cache::add(const std::vector<u32>& func)
void spu_cache::add(const spu_program& func)
{
if (!m_file)
{
return;
}
be_t<u32> size = ::size32(func) - 1;
be_t<u32> addr = func[0];
be_t<u32> size = ::size32(func.data);
be_t<u32> addr = func.entry_point;
const fs::iovec_clone gather[3]
{
{&size, sizeof(size)},
{&addr, sizeof(addr)},
{func.data() + 1, func.size() * 4 - 4}
{func.data.data(), func.data.size() * 4}
};
// Append data
@ -458,7 +461,7 @@ void spu_cache::initialize()
// Build functions
for (std::size_t func_i = fnext++; func_i < func_list.size(); func_i = fnext++)
{
const std::vector<u32>& func = std::as_const(func_list)[func_i];
const spu_program& func = std::as_const(func_list)[func_i];
if (Emu.IsStopped() || fail_flag)
{
@ -467,21 +470,21 @@ void spu_cache::initialize()
}
// Get data start
const u32 start = func[0];
const u32 size0 = ::size32(func);
const u32 start = func.lower_bound;
const u32 size0 = ::size32(func.data);
// Initialize LS with function data only
for (u32 i = 1, pos = start; i < size0; i++, pos += 4)
for (u32 i = 0, pos = start; i < size0; i++, pos += 4)
{
ls[pos / 4] = se_storage<u32>::swap(func[i]);
ls[pos / 4] = std::bit_cast<be_t<u32>>(func.data[i]);
}
// Call analyser
std::vector<u32> func2 = compiler->analyse(ls.data(), func[0]);
spu_program func2 = compiler->analyse(ls.data(), func.entry_point);
if (func2 != func)
{
LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed, %u vs %u", func2[0], func2.size() - 1, size0 - 1);
LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed, %u vs %u", func2.entry_point, func2.data.size(), size0);
}
else if (!compiler->compile(std::move(func2)))
{
@ -523,51 +526,39 @@ void spu_cache::initialize()
g_fxo->init<spu_cache>(std::move(cache));
}
bool spu_runtime::func_compare::operator()(const std::vector<u32>& lhs, const std::vector<u32>& rhs) const
bool spu_program::operator==(const spu_program& rhs) const noexcept
{
if (lhs.empty())
return !rhs.empty();
else if (rhs.empty())
return false;
// TODO
return entry_point - lower_bound == rhs.entry_point - rhs.lower_bound && data == rhs.data;
}
const u32 lhs_addr = lhs[0];
const u32 rhs_addr = rhs[0];
if (lhs_addr < rhs_addr)
return true;
else if (lhs_addr > rhs_addr)
return false;
bool spu_program::operator<(const spu_program& rhs) const noexcept
{
const u32 lhs_offs = (entry_point - lower_bound) / 4;
const u32 rhs_offs = (rhs.entry_point - rhs.lower_bound) / 4;
// Select range for comparison
std::basic_string_view<u32> lhs_data(lhs.data() + 1, lhs.size() - 1);
std::basic_string_view<u32> rhs_data(rhs.data() + 1, rhs.size() - 1);
std::basic_string_view<u32> lhs_data(data.data() + lhs_offs, data.size() - lhs_offs);
std::basic_string_view<u32> rhs_data(rhs.data.data() + rhs_offs, rhs.data.size() - rhs_offs);
const auto cmp0 = lhs_data.compare(rhs_data);
if (lhs_data.empty())
return !rhs_data.empty();
else if (rhs_data.empty())
if (cmp0 < 0)
return true;
else if (cmp0 > 0)
return false;
if (false)
{
// In Giga mode, compare instructions starting from the entry point first
lhs_data.remove_prefix(lhs_addr / 4);
rhs_data.remove_prefix(rhs_addr / 4);
const auto cmp0 = lhs_data.compare(rhs_data);
// Compare from address 0 to the point before the entry point (TODO: undesirable)
lhs_data = {data.data(), lhs_offs};
rhs_data = {rhs.data.data(), rhs_offs};
const auto cmp1 = lhs_data.compare(rhs_data);
if (cmp0 < 0)
return true;
else if (cmp0 > 0)
return false;
if (cmp1 < 0)
return true;
else if (cmp1 > 0)
return false;
// Compare from address 0 to the point before the entry point (undesirable)
lhs_data = {lhs.data() + 1, lhs_addr / 4};
rhs_data = {rhs.data() + 1, rhs_addr / 4};
return lhs_data < rhs_data;
}
else
{
return lhs_data < rhs_data;
}
// TODO
return lhs_offs < rhs_offs;
}
spu_runtime::spu_runtime()
@ -590,9 +581,9 @@ spu_runtime::spu_runtime()
}
}
spu_item* spu_runtime::add_empty(std::vector<u32>&& data)
spu_item* spu_runtime::add_empty(spu_program&& data)
{
if (data.size() <= 1)
if (data.data.empty())
{
return nullptr;
}
@ -601,12 +592,9 @@ spu_item* spu_runtime::add_empty(std::vector<u32>&& data)
spu_item* prev = nullptr;
// Try to add an item that doesn't exist yet
const auto ret = m_stuff[data[1] >> 12].push_if([&](spu_item& _new, spu_item& _old)
const auto ret = m_stuff[data.data[0] >> 12].push_if([&](spu_item& _new, spu_item& _old)
{
std::basic_string_view<u32> lhs{_new.data.data() + 1, _new.data.size() - 1};
std::basic_string_view<u32> rhs{_old.data.data() + 1, _old.data.size() - 1};
if (lhs == rhs)
if (_new.data == _old.data)
{
prev = &_old;
return false;
@ -643,7 +631,8 @@ spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst)
{
if (const auto ptr = it->compiled.load())
{
std::basic_string_view<u32> range{it->data.data() + 1, it->data.size() - 1};
std::basic_string_view<u32> range{it->data.data.data(), it->data.data.size()};
range.remove_prefix((it->data.entry_point - it->data.lower_bound) / 4);
m_flat_list.emplace_back(range, ptr);
}
else
@ -997,7 +986,8 @@ spu_function_t spu_runtime::find(const u32* ls, u32 addr) const
{
if (const auto ptr = item.compiled.load())
{
std::basic_string_view<u32> range{item.data.data() + 1, item.data.size() - 1};
std::basic_string_view<u32> range{item.data.data.data(), item.data.data.size()};
range.remove_prefix((item.data.entry_point - item.data.lower_bound) / 4);
if (addr / 4 + range.size() > 0x10000)
{
@ -1194,12 +1184,13 @@ catch (const std::exception& e)
LOG_NOTICE(GENERAL, "\n%s", spu.dump());
}
std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point)
spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point)
{
// Result: addr + raw instruction data
std::vector<u32> result;
result.reserve(10000);
result.push_back(entry_point);
spu_program result;
result.data.reserve(10000);
result.entry_point = entry_point;
result.lower_bound = entry_point;
// Initialize block entries
m_block_info.reset();
@ -1400,7 +1391,7 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
{
const u32 target = spu_branch_target(av);
LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to 0x%x%s", result[0], pos, target, op.d ? " (D)" : op.e ? " (E)" : "");
LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to 0x%x%s", entry_point, pos, target, op.d ? " (D)" : op.e ? " (E)" : "");
m_targets[pos].push_back(target);
@ -1408,7 +1399,7 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
{
if (sync)
{
LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring %scall to 0x%x (SYNC)", result[0], pos, sl ? "" : "tail ", target);
LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring %scall to 0x%x (SYNC)", entry_point, pos, sl ? "" : "tail ", target);
if (target > entry_point)
{
@ -1505,17 +1496,17 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
if (jt_abs.size() >= jt_rel.size())
{
const u32 new_size = (start - lsa) / 4 + 1 + jt_abs.size();
const u32 new_size = (start - lsa) / 4 + jt_abs.size();
if (result.size() < new_size)
if (result.data.size() < new_size)
{
result.resize(new_size);
result.data.resize(new_size);
}
for (u32 i = 0; i < jt_abs.size(); i++)
{
add_block(jt_abs[i]);
result[(start - lsa) / 4 + 1 + i] = se_storage<u32>::swap(jt_abs[i]);
result.data[(start - lsa) / 4 + i] = std::bit_cast<u32, be_t<u32>>(jt_abs[i]);
m_targets[start + i * 4];
}
@ -1524,17 +1515,17 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
if (jt_rel.size() >= jt_abs.size())
{
const u32 new_size = (start - lsa) / 4 + 1 + jt_rel.size();
const u32 new_size = (start - lsa) / 4 + jt_rel.size();
if (result.size() < new_size)
if (result.data.size() < new_size)
{
result.resize(new_size);
result.data.resize(new_size);
}
for (u32 i = 0; i < jt_rel.size(); i++)
{
add_block(jt_rel[i]);
result[(start - lsa) / 4 + 1 + i] = se_storage<u32>::swap(jt_rel[i] - start);
result.data[(start - lsa) / 4 + i] = std::bit_cast<u32, be_t<u32>>(jt_rel[i] - start);
m_targets[start + i * 4];
}
@ -1569,7 +1560,7 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
}
else if (type == spu_itype::BI && sync)
{
LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring indirect branch (SYNC)", result[0], pos);
LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring indirect branch (SYNC)", entry_point, pos);
}
if (type == spu_itype::BI || sl)
@ -1630,7 +1621,7 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
{
if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
{
LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring fixed call to 0x%x (SYNC)", result[0], pos, target);
LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring fixed call to 0x%x (SYNC)", entry_point, pos, target);
}
if (target > entry_point)
@ -1656,7 +1647,7 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
{
if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
{
LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring fixed tail call to 0x%x (SYNC)", result[0], pos, target);
LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring fixed tail call to 0x%x (SYNC)", entry_point, pos, target);
}
if (target > entry_point)
@ -1914,31 +1905,33 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
}
// Insert raw instruction value
if (result.size() - 1 <= (pos - lsa) / 4)
const u32 new_size = (pos - lsa) / 4;
if (result.data.size() <= new_size)
{
if (result.size() - 1 < (pos - lsa) / 4)
if (result.data.size() < new_size)
{
result.resize((pos - lsa) / 4 + 1);
result.data.resize(new_size);
}
result.emplace_back(se_storage<u32>::swap(data));
result.data.emplace_back(std::bit_cast<u32, be_t<u32>>(data));
}
else if (u32& raw_val = result[(pos - lsa) / 4 + 1])
else if (u32& raw_val = result.data[new_size])
{
verify(HERE), raw_val == se_storage<u32>::swap(data);
verify(HERE), raw_val == std::bit_cast<u32, be_t<u32>>(data);
}
else
{
raw_val = se_storage<u32>::swap(data);
raw_val = std::bit_cast<u32, be_t<u32>>(data);
}
}
while (lsa > 0 || limit < 0x40000)
{
const u32 initial_size = result.size();
const u32 initial_size = result.data.size();
// Check unreachable blocks
limit = std::min<u32>(limit, lsa + initial_size * 4 - 4);
limit = std::min<u32>(limit, lsa + initial_size * 4);
for (auto& pair : m_preds)
{
@ -1961,7 +1954,7 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
for (u32 j = workload[i];; j -= 4)
{
// Go backward from an address until the entry point is reached
if (j == result[0])
if (j == entry_point)
{
reachable = true;
break;
@ -1994,7 +1987,7 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
// Check for possible fallthrough predecessor
if (!had_fallthrough)
{
if (result.at((j - lsa) / 4) == 0 || m_targets.count(j - 4))
if (result.data.at((j - lsa) / 4 - 1) == 0 || m_targets.count(j - 4))
{
break;
}
@ -2018,16 +2011,16 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
}
}
result.resize((limit - lsa) / 4 + 1);
result.data.resize((limit - lsa) / 4);
// Check holes in safe mode (TODO)
u32 valid_size = 0;
for (u32 i = 1; i < result.size(); i++)
for (u32 i = 0; i < result.data.size(); i++)
{
if (result[i] == 0)
if (result.data[i] == 0)
{
const u32 pos = lsa + (i - 1) * 4;
const u32 pos = lsa + i * 4;
const u32 data = ls[pos / 4];
// Allow only NOP or LNOP instructions in holes
@ -2038,34 +2031,34 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
if (g_cfg.core.spu_block_size != spu_block_size_type::giga)
{
result.resize(valid_size + 1);
result.data.resize(valid_size);
break;
}
}
else
{
valid_size = i;
valid_size = i + 1;
}
}
// Even if NOP or LNOP, should be removed at the end
result.resize(valid_size + 1);
result.data.resize(valid_size);
// Repeat if blocks were removed
if (result.size() == initial_size)
if (result.data.size() == initial_size)
{
break;
}
}
limit = std::min<u32>(limit, lsa + ::size32(result) * 4 - 4);
limit = std::min<u32>(limit, lsa + ::size32(result.data) * 4);
// Cleanup block info
for (u32 i = 0; i < workload.size(); i++)
{
const u32 addr = workload[i];
if (addr < lsa || addr >= limit || !result[(addr - lsa) / 4 + 1])
if (addr < lsa || addr >= limit || !result.data[(addr - lsa) / 4])
{
m_block_info[addr / 4] = false;
m_entry_info[addr / 4] = false;
@ -2104,7 +2097,7 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
const u32 prev = (it->first - 4) & 0x3fffc;
// TODO: check the correctness
if (m_targets.count(prev) == 0 && prev >= lsa && prev < limit && result[(prev - lsa) / 4 + 1])
if (m_targets.count(prev) == 0 && prev >= lsa && prev < limit && result.data[(prev - lsa) / 4])
{
// Add target and the predecessor
m_targets[prev].push_back(it->first);
@ -2127,25 +2120,25 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
}
// Fill holes which contain only NOP and LNOP instructions (TODO: compile)
for (u32 i = 1, nnop = 0, vsize = 0; i <= result.size(); i++)
for (u32 i = 0, nnop = 0, vsize = 0; i <= result.data.size(); i++)
{
if (i >= result.size() || result[i])
if (i >= result.data.size() || result.data[i])
{
if (nnop && nnop == i - vsize - 1)
if (nnop && nnop == i - vsize)
{
// Write only complete NOP sequence
for (u32 j = vsize + 1; j < i; j++)
for (u32 j = vsize; j < i; j++)
{
result[j] = se_storage<u32>::swap(ls[lsa / 4 + j - 1]);
result.data[j] = std::bit_cast<u32, be_t<u32>>(ls[lsa / 4 + j]);
}
}
nnop = 0;
vsize = i;
vsize = i + 1;
}
else
{
const u32 pos = lsa + (i - 1) * 4;
const u32 pos = lsa + i * 4;
const u32 data = ls[pos / 4];
if (data == 0x200000 || (data & 0xffffff80) == 0x40200000)
@ -2169,7 +2162,7 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
block.size++;
// Decode instruction
const spu_opcode_t op{se_storage<u32>::swap(result[(ia - lsa) / 4 + 1])};
const spu_opcode_t op{std::bit_cast<be_t<u32>>(result.data[(ia - lsa) / 4])};
const auto type = s_spu_itype.decode(op.opcode);
@ -2663,7 +2656,7 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
for (u32 ia = addr; ia < addr + bb.size * 4; ia += 4)
{
// Decode instruction again
op.opcode = se_storage<u32>::swap(result[(ia - lsa) / 4 + 1]);
op.opcode = std::bit_cast<be_t<u32>>(result.data[(ia - lsa) / 4]);
last_inst = s_spu_itype.decode(op.opcode);
// Propagate some constants
@ -3117,24 +3110,18 @@ std::vector<u32> spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_poi
}
}
if (result.size() == 1)
if (result.data.empty())
{
// Blocks starting from 0x0 or invalid instruction won't be compiled, may need special interpreter fallback
result.clear();
}
return result;
}
void spu_recompiler_base::dump(const std::vector<u32>& result, std::string& out)
void spu_recompiler_base::dump(const spu_program& result, std::string& out)
{
SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode);
dis_asm.offset = reinterpret_cast<const u8*>(result.data() + 1);
if (true)
{
dis_asm.offset -= result[0];
}
dis_asm.offset = reinterpret_cast<const u8*>(result.data.data()) - result.lower_bound;
std::string hash;
{
@ -3142,12 +3129,12 @@ void spu_recompiler_base::dump(const std::vector<u32>& result, std::string& out)
u8 output[20];
sha1_starts(&ctx);
sha1_update(&ctx, reinterpret_cast<const u8*>(result.data() + 1), result.size() * 4 - 4);
sha1_update(&ctx, reinterpret_cast<const u8*>(result.data.data()), result.data.size() * 4);
sha1_finish(&ctx, output);
fmt::append(hash, "%s", fmt::base57(output));
}
fmt::append(out, "========== SPU BLOCK 0x%05x (size %u, %s) ==========\n", result[0], result.size() - 1, hash);
fmt::append(out, "========== SPU BLOCK 0x%05x (size %u, %s) ==========\n", result.entry_point, result.data.size(), hash);
for (auto& bb : m_bbs)
{
@ -4162,14 +4149,14 @@ public:
}
}
virtual spu_function_t compile(std::vector<u32>&& _func) override
virtual spu_function_t compile(spu_program&& _func) override
{
if (_func.empty() && m_interp_magn)
if (_func.data.empty() && m_interp_magn)
{
return compile_interpreter();
}
const u32 start0 = _func[0];
const u32 start0 = _func.entry_point;
const auto add_loc = m_spurt->add_empty(std::move(_func));
@ -4178,9 +4165,9 @@ public:
return nullptr;
}
const std::vector<u32>& func = add_loc->data;
const spu_program& func = add_loc->data;
if (func[0] != start0)
if (func.entry_point != start0)
{
// Wait for the duplicate
while (!add_loc->compiled)
@ -4203,22 +4190,22 @@ public:
u8 output[20];
sha1_starts(&ctx);
sha1_update(&ctx, reinterpret_cast<const u8*>(func.data() + 1), func.size() * 4 - 4);
sha1_update(&ctx, reinterpret_cast<const u8*>(func.data.data()), func.data.size() * 4);
sha1_finish(&ctx, output);
m_hash.clear();
fmt::append(m_hash, "spu-0x%05x-%s", func[0], fmt::base57(output));
fmt::append(m_hash, "spu-0x%05x-%s", func.entry_point, fmt::base57(output));
be_t<u64> hash_start;
std::memcpy(&hash_start, output, sizeof(hash_start));
m_hash_start = hash_start;
}
LOG_NOTICE(SPU, "Building function 0x%x... (size %u, %s)", func[0], func.size() - 1, m_hash);
LOG_NOTICE(SPU, "Building function 0x%x... (size %u, %s)", func.entry_point, func.data.size(), m_hash);
m_pos = func[0];
m_base = func[0];
m_size = (func.size() - 1) * 4;
m_pos = func.lower_bound;
m_base = func.entry_point;
m_size = ::size32(func.data) * 4;
const u32 start = m_pos;
const u32 end = start + m_size;
@ -4279,16 +4266,16 @@ public:
// Disable check (unsafe)
m_ir->CreateBr(label_body);
}
else if (func.size() - 1 == 1)
else if (func.data.size() == 1)
{
const auto pu32 = m_ir->CreateBitCast(m_ir->CreateGEP(m_lsptr, m_base_pc), get_type<u32*>());
const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(pu32), m_ir->getInt32(func[1]));
const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(pu32), m_ir->getInt32(func.data[0]));
m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely);
}
else if (func.size() - 1 == 2)
else if (func.data.size() == 2)
{
const auto pu64 = m_ir->CreateBitCast(m_ir->CreateGEP(m_lsptr, m_base_pc), get_type<u64*>());
const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(pu64), m_ir->getInt64(static_cast<u64>(func[2]) << 32 | func[1]));
const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(pu64), m_ir->getInt64(static_cast<u64>(func.data[1]) << 32 | func.data[0]));
m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely);
}
else
@ -4298,7 +4285,7 @@ public:
// Skip holes at the beginning (giga only)
for (u32 j = start; j < end; j += 4)
{
if (!func[(j - start) / 4 + 1])
if (!func.data[(j - start) / 4])
{
starta += 4;
}
@ -4324,7 +4311,7 @@ public:
{
const u32 k = j + i * 4;
if (k < start || k >= end || !func[(k - start) / 4 + 1])
if (k < start || k >= end || !func.data[(k - start) / 4])
{
indices[i] = 8;
holes = true;
@ -4357,7 +4344,7 @@ public:
for (u32 i = 0; i < 8; i++)
{
const u32 k = j + i * 4;
words[i] = k >= start && k < end ? func[(k - start) / 4 + 1] : 0;
words[i] = k >= start && k < end ? func.data[(k - start) / 4] : 0;
}
vls = m_ir->CreateXor(vls, ConstantDataVector::get(m_context, words));
@ -4598,7 +4585,7 @@ public:
break;
}
const u32 op = se_storage<u32>::swap(func[(m_pos - start) / 4 + 1]);
const u32 op = std::bit_cast<be_t<u32>>(func.data[(m_pos - start) / 4]);
if (!op)
{
@ -4744,7 +4731,7 @@ public:
if (g_cfg.core.spu_debug)
{
fmt::append(log, "LLVM IR at 0x%x:\n", func[0]);
fmt::append(log, "LLVM IR at 0x%x:\n", func.entry_point);
out << *module; // print IR
out << "\n\n";
}
@ -4752,7 +4739,7 @@ public:
if (verifyModule(*module, &out))
{
out.flush();
LOG_ERROR(SPU, "LLVM: Verification failed at 0x%x:\n%s", func[0], log);
LOG_ERROR(SPU, "LLVM: Verification failed at 0x%x:\n%s", func.entry_point, log);
if (g_cfg.core.spu_debug)
{
@ -4781,7 +4768,7 @@ public:
add_loc->compiled = fn;
// Rebuild trampoline if necessary
if (!m_spurt->rebuild_ubertrampoline(func[1]))
if (!m_spurt->rebuild_ubertrampoline(func.data[0]))
{
return nullptr;
}
@ -8398,7 +8385,7 @@ struct spu_llvm
}
// Start compiling
const std::vector<u32>& func = found_it->second->data;
const spu_program& func = found_it->second->data;
// Old function pointer (pre-recompiled)
const spu_function_t _old = found_it->second->compiled;
@ -8407,21 +8394,21 @@ struct spu_llvm
enqueued.erase(found_it);
// Get data start
const u32 start = func[0];
const u32 size0 = ::size32(func);
const u32 start = func.lower_bound;
const u32 size0 = ::size32(func.data);
// Initialize LS with function data only
for (u32 i = 1, pos = start; i < size0; i++, pos += 4)
for (u32 i = 0, pos = start; i < size0; i++, pos += 4)
{
ls[pos / 4] = se_storage<u32>::swap(func[i]);
ls[pos / 4] = std::bit_cast<be_t<u32>>(func.data[i]);
}
// Call analyser
std::vector<u32> func2 = compiler->analyse(ls.data(), func[0]);
spu_program func2 = compiler->analyse(ls.data(), func.entry_point);
if (func2 != func)
{
LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed, %u vs %u", func2[0], func2.size() - 1, size0 - 1);
LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed, %u vs %u", func2.entry_point, func2.data.size(), size0);
}
else if (const auto target = compiler->compile(std::move(func2)))
{
@ -8444,7 +8431,7 @@ struct spu_llvm
}
else
{
LOG_FATAL(SPU, "[0x%05x] Compilation failed.", func[0]);
LOG_FATAL(SPU, "[0x%05x] Compilation failed.", func.entry_point);
Emu.Pause();
return;
}
@ -8469,7 +8456,7 @@ struct spu_fast : public spu_recompiler_base
}
}
virtual spu_function_t compile(std::vector<u32>&& _func) override
virtual spu_function_t compile(spu_program&& _func) override
{
const auto add_loc = m_spurt->add_empty(std::move(_func));
@ -8483,7 +8470,7 @@ struct spu_fast : public spu_recompiler_base
return add_loc->compiled;
}
const std::vector<u32>& func = add_loc->data;
const spu_program& func = add_loc->data;
if (g_cfg.core.spu_debug && !add_loc->logged.exchange(1))
{
@ -8493,22 +8480,22 @@ struct spu_fast : public spu_recompiler_base
}
// Allocate executable area with necessary size
const auto result = jit_runtime::alloc(22 + 1 + 9 + (::size32(func) - 1) * (16 + 16) + 36 + 47, 16);
const auto result = jit_runtime::alloc(22 + 1 + 9 + ::size32(func.data) * (16 + 16) + 36 + 47, 16);
if (!result)
{
return nullptr;
}
m_pos = func[0];
m_size = (::size32(func) - 1) * 4;
m_pos = func.lower_bound;
m_size = ::size32(func.data) * 4;
{
sha1_context ctx;
u8 output[20];
sha1_starts(&ctx);
sha1_update(&ctx, reinterpret_cast<const u8*>(func.data() + 1), func.size() * 4 - 4);
sha1_update(&ctx, reinterpret_cast<const u8*>(func.data.data()), func.data.size() * 4);
sha1_finish(&ctx, output);
be_t<u64> hash_start;
@ -8554,9 +8541,9 @@ struct spu_fast : public spu_recompiler_base
*raw++ = 0x00;
// Verification (slow)
for (u32 i = 1; i < func.size(); i++)
for (u32 i = 0; i < func.data.size(); i++)
{
if (!func[i])
if (!func.data[i])
{
continue;
}
@ -8564,8 +8551,8 @@ struct spu_fast : public spu_recompiler_base
// cmp dword ptr [rcx + off], opc
*raw++ = 0x81;
*raw++ = 0xb9;
const u32 off = (i - 1) * 4;
const u32 opc = func[i];
const u32 off = i * 4;
const u32 opc = func.data[i];
std::memcpy(raw + 0, &off, 4);
std::memcpy(raw + 4, &opc, 4);
raw += 8;
@ -8627,16 +8614,16 @@ struct spu_fast : public spu_recompiler_base
*raw++ = 0x4c;
*raw++ = 0x8d;
*raw++ = 0x35;
const u32 epi_off = (::size32(func) - 1) * 16;
const u32 epi_off = ::size32(func.data) * 16;
std::memcpy(raw, &epi_off, 4);
raw += 4;
// Instructions (each instruction occupies fixed number of bytes)
for (u32 i = 1; i < func.size(); i++)
for (u32 i = 0; i < func.data.size(); i++)
{
const u32 pos = m_pos + (i - 1) * 4;
const u32 pos = m_pos + i * 4;
if (!func[i])
if (!func.data[i])
{
// Save pc: mov [rbp + spu_thread::pc], r12d
*raw++ = 0x44;
@ -8658,7 +8645,7 @@ struct spu_fast : public spu_recompiler_base
}
// Fix endianness
const spu_opcode_t op{se_storage<u32>::swap(func[i])};
const spu_opcode_t op{std::bit_cast<be_t<u32>>(func.data[i])};
switch (auto type = s_spu_itype.decode(op.opcode))
{
@ -8797,7 +8784,7 @@ struct spu_fast : public spu_recompiler_base
}
// Rebuild trampoline if necessary
if (!m_spurt->rebuild_ubertrampoline(func[1]))
if (!m_spurt->rebuild_ubertrampoline(func.data[0]))
{
return nullptr;
}

View file

@ -29,18 +29,39 @@ public:
return m_file.operator bool();
}
std::deque<std::vector<u32>> get();
std::deque<struct spu_program> get();
void add(const std::vector<u32>& func);
void add(const struct spu_program& func);
static void initialize();
};
// A single analysed SPU program: raw instruction data plus the LS addresses describing it.
// Replaces the old std::vector<u32> representation where element 0 was the entry point.
struct spu_program
{
// Address of the entry point in LS
u32 entry_point;
// Address of the data in LS
u32 lower_bound;
// Program data with intentionally wrong endianness (on LE platform opcode values are swapped)
std::vector<u32> data;
// True when the entry-point offset (entry_point - lower_bound) and the instruction data both match
bool operator==(const spu_program& rhs) const noexcept;
bool operator!=(const spu_program& rhs) const noexcept
{
return !(*this == rhs);
}
// Ordering for program lookup: compares instruction data starting at the entry point first,
// then the prologue words before it (see spu_program::operator< definition)
bool operator<(const spu_program& rhs) const noexcept;
};
class spu_item
{
public:
// SPU program
const std::vector<u32> data;
const spu_program data;
// Compiled function pointer
atomic_t<spu_function_t> compiled = nullptr;
@ -51,7 +72,7 @@ public:
atomic_t<u8> cached = false;
atomic_t<u8> logged = false;
spu_item(std::vector<u32>&& data)
spu_item(spu_program&& data)
: data(std::move(data))
{
}
@ -64,12 +85,6 @@ public:
// Helper class
class spu_runtime
{
struct func_compare
{
// Comparison function for SPU programs
bool operator()(const std::vector<u32>& lhs, const std::vector<u32>& rhs) const;
};
// All functions (2^20 bunches)
std::array<lf_bunch<spu_item>, (1 << 20)> m_stuff;
@ -109,7 +124,7 @@ private:
public:
// Return new pointer for add()
spu_item* add_empty(std::vector<u32>&&);
spu_item* add_empty(spu_program&&);
// Find existing function
spu_function_t find(const u32* ls, u32 addr) const;
@ -292,7 +307,7 @@ public:
virtual void init() = 0;
// Compile function
virtual spu_function_t compile(std::vector<u32>&&) = 0;
virtual spu_function_t compile(spu_program&&) = 0;
// Default dispatch function fallback (second arg is unused)
static void dispatch(spu_thread&, void*, u8* rip);
@ -304,10 +319,10 @@ public:
static void old_interpreter(spu_thread&, void* ls, u8*);
// Get the function data at specified address
std::vector<u32> analyse(const be_t<u32>* ls, u32 lsa);
spu_program analyse(const be_t<u32>* ls, u32 lsa);
// Print analyser internal state
void dump(const std::vector<u32>& result, std::string& out);
void dump(const spu_program& result, std::string& out);
// Get SPU Runtime
spu_runtime& get_runtime()