From b307aff9eb093d3e21dbc30a672b92401f1caafb Mon Sep 17 00:00:00 2001 From: eladash Date: Fri, 22 Mar 2019 09:58:04 +0200 Subject: [PATCH] Prefetch byteswapped opcodes in ppu interpreter --- rpcs3/Emu/Cell/PPUThread.cpp | 105 ++++++++++++++----------------- rpcs3/Emu/Cell/PPUTranslator.cpp | 4 +- rpcs3/Emu/Memory/vm.cpp | 4 +- 3 files changed, 52 insertions(+), 61 deletions(-) diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index 8dab37663c..21e78f3a5e 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -169,13 +169,13 @@ static void ppu_initialize2(class jit_compiler& jit, const ppu_module& module_pa extern void ppu_execute_syscall(ppu_thread& ppu, u64 code); // Get pointer to executable cache -static u32& ppu_ref(u32 addr) +static u64& ppu_ref(u32 addr) { - return *reinterpret_cast(vm::g_exec_addr + addr); + return *reinterpret_cast(vm::g_exec_addr + (u64)addr * 2); } // Get interpreter cache value -static u32 ppu_cache(u32 addr) +static u64 ppu_cache(u32 addr) { // Select opcode table const auto& table = *( @@ -183,7 +183,8 @@ static u32 ppu_cache(u32 addr) g_cfg.core.ppu_decoder == ppu_decoder_type::fast ? &g_ppu_interpreter_fast.get_table() : (fmt::throw_exception("Invalid PPU decoder"), nullptr)); - return ::narrow(reinterpret_cast(table[ppu_decode(vm::read32(addr))])); + const u32 value = vm::read32(addr); + return (u64)value << 32 | ::narrow(reinterpret_cast(table[ppu_decode(value)])); } static bool ppu_fallback(ppu_thread& ppu, ppu_opcode_t op) @@ -207,20 +208,19 @@ void ppu_recompiler_fallback(ppu_thread& ppu) } const auto& table = g_ppu_interpreter_fast.get_table(); - const auto base = vm::g_base_addr; const auto cache = vm::g_exec_addr; while (true) { // Run instructions in interpreter - if (const u32 op = *reinterpret_cast*>(base + ppu.cia); + if (const u32 op = *reinterpret_cast(cache + (u64)ppu.cia * 2 + 4); LIKELY(table[ppu_decode(op)](ppu, { op }))) { ppu.cia += 4; continue; } - if (uptr func = *reinterpret_cast(cache + ppu.cia); + if (uptr func = *reinterpret_cast(cache + (u64)ppu.cia * 2); func != reinterpret_cast(ppu_recompiler_fallback)) { // We found a recompiler function at cia, return @@ -269,7 +269,7 @@ extern void ppu_register_range(u32 addr, u32 size) } // Register executable range at - utils::memory_commit(&ppu_ref(addr), size, utils::protection::rw); + utils::memory_commit(&ppu_ref(addr), size * 2, utils::protection::rw); const u32 fallback = ::narrow(g_cfg.core.ppu_decoder == ppu_decoder_type::llvm ? reinterpret_cast(ppu_recompiler_fallback) : reinterpret_cast(ppu_fallback)); @@ -277,7 +277,7 @@ extern void ppu_register_range(u32 addr, u32 size) size &= ~3; // Loop assumes `size = n * 4`, enforce that by rounding down while (size) { - ppu_ref(addr) = fallback; + ppu_ref(addr) = (u64)vm::read32(addr) << 32 | fallback; addr += 4; size -= 4; } @@ -288,7 +288,7 @@ extern void ppu_register_function_at(u32 addr, u32 size, ppu_function_t ptr) // Initialize specific function if (ptr) { - ppu_ref(addr) = ::narrow(reinterpret_cast(ptr)); + *reinterpret_cast(&ppu_ref(addr)) = ::narrow(reinterpret_cast(ptr)); return; } @@ -312,7 +312,7 @@ extern void ppu_register_function_at(u32 addr, u32 size, ppu_function_t ptr) while (size) { - if (ppu_ref(addr) == fallback) + if ((u32)ppu_ref(addr) == fallback) { ppu_ref(addr) = ppu_cache(addr); } @@ -357,7 +357,7 @@ extern void ppu_breakpoint(u32 addr, bool isAdding) if (isAdding) { // Set breakpoint - ppu_ref(addr) = _break; + *reinterpret_cast(&ppu_ref(addr)) = _break; } else { @@ -376,9 +376,9 @@ extern void ppu_set_breakpoint(u32 addr) const auto _break = ::narrow(reinterpret_cast(&ppu_break)); - if (ppu_ref(addr) != _break) + if ((u32)ppu_ref(addr) != _break) { - ppu_ref(addr) = _break; + *reinterpret_cast(&ppu_ref(addr)) = _break; } } @@ -392,7 +392,7 @@ extern void ppu_remove_breakpoint(u32 addr) const auto _break = ::narrow(reinterpret_cast(&ppu_break)); - if (ppu_ref(addr) == _break) + if ((u32)ppu_ref(addr) == _break) { ppu_ref(addr) = ppu_cache(addr); } @@ -420,7 +420,7 @@ extern bool ppu_patch(u32 addr, u32 value) const u32 _break = ::narrow(reinterpret_cast(&ppu_break)); const u32 fallback = ::narrow(reinterpret_cast(&ppu_fallback)); - if (ppu_ref(addr) != _break && ppu_ref(addr) != fallback) + if ((u32)ppu_ref(addr) != _break && (u32)ppu_ref(addr) != fallback) { ppu_ref(addr) = ppu_cache(addr); } @@ -622,81 +622,72 @@ void ppu_thread::exec_task() { while (!(state & (cpu_flag::ret + cpu_flag::exit + cpu_flag::stop + cpu_flag::dbg_global_stop))) { - reinterpret_cast(static_cast(ppu_ref(cia)))(*this); + reinterpret_cast(static_cast((u32)ppu_ref(cia)))(*this); } return; } - const auto base = vm::_ptr(0); const auto cache = vm::g_exec_addr; - const auto bswap4 = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3); - - v128 _op; using func_t = decltype(&ppu_interpreter::UNK); - func_t func0, func1, func2, func3, func4, func5; while (true) { - if (UNLIKELY(state)) + const auto exec_op = [this](u64 op) { - if (check_state()) return; + return reinterpret_cast((uptr)(u32)op)(*this, {u32(op >> 32)}); + }; + + if (cia % 8 || !s_use_ssse3 || UNLIKELY(state)) + { + if (test_stopped()) return; // Decode single instruction (may be step) - const u32 op = *reinterpret_cast*>(base + cia); - if (reinterpret_cast((std::uintptr_t)ppu_ref(cia))(*this, {op})) { cia += 4; } + if (exec_op(*reinterpret_cast(cache + (u64)cia * 2))) { cia += 4; } continue; } - if (cia % 16 || !s_use_ssse3) - { - // Unaligned - const u32 op = *reinterpret_cast*>(base + cia); - if (reinterpret_cast((std::uintptr_t)ppu_ref(cia))(*this, {op})) { cia += 4; } - continue; - } + u64 op0, op1, op2, op3; + u64 _pos = (u64)cia * 2; // Reinitialize { - const v128 x = v128::fromV(_mm_load_si128(reinterpret_cast(cache + cia))); - func0 = reinterpret_cast((std::uintptr_t)x._u32[0]); - func1 = reinterpret_cast((std::uintptr_t)x._u32[1]); - func2 = reinterpret_cast((std::uintptr_t)x._u32[2]); - func3 = reinterpret_cast((std::uintptr_t)x._u32[3]); - _op.vi = _mm_shuffle_epi8(_mm_load_si128(reinterpret_cast(base + cia)), bswap4); + const v128 _op0 = *reinterpret_cast(cache + _pos); + const v128 _op1 = *reinterpret_cast(cache + _pos + 16); + op0 = _op0._u64[0]; + op1 = _op0._u64[1]; + op2 = _op1._u64[0]; + op3 = _op1._u64[1]; } - while (LIKELY(func0(*this, {_op._u32[0]}))) + while (LIKELY(exec_op(op0))) { cia += 4; - if (LIKELY(func1(*this, {_op._u32[1]}))) + if (LIKELY(exec_op(op1))) { cia += 4; - const v128 x = v128::fromV(_mm_load_si128(reinterpret_cast(cache + cia + 8))); - func0 = reinterpret_cast((std::uintptr_t)x._u32[0]); - func1 = reinterpret_cast((std::uintptr_t)x._u32[1]); - func4 = reinterpret_cast((std::uintptr_t)x._u32[2]); - func5 = reinterpret_cast((std::uintptr_t)x._u32[3]); - - if (LIKELY(func2(*this, {_op._u32[2]}))) + if (LIKELY(exec_op(op2))) { cia += 4; - if (LIKELY(func3(*this, {_op._u32[3]}))) + if (LIKELY(exec_op(op3))) { cia += 4; - func2 = func4; - func3 = func5; - if (UNLIKELY(state)) { break; } - _op.vi = _mm_shuffle_epi8(_mm_load_si128(reinterpret_cast(base + cia)), bswap4); + _pos += 32; + const v128 _op0 = *reinterpret_cast(cache + _pos); + const v128 _op1 = *reinterpret_cast(cache + _pos + 16); + op0 = _op0._u64[0]; + op1 = _op0._u64[1]; + op2 = _op1._u64[0]; + op3 = _op1._u64[1]; continue; } break; @@ -1296,7 +1287,7 @@ extern void ppu_initialize(const ppu_module& info) if (g_cfg.core.ppu_debug && func.size && func.toc != -1) { s_ppu_toc->emplace(func.addr, func.toc); - ppu_ref(func.addr) = ::narrow(reinterpret_cast(&ppu_check_toc)); + *reinterpret_cast(&ppu_ref(func.addr)) = ::narrow(reinterpret_cast(&ppu_check_toc)); } } @@ -1553,7 +1544,7 @@ extern void ppu_initialize(const ppu_module& info) #endif // Write version, hash, CPU, settings - fmt::append(obj_name, "v2-tane-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu)); + fmt::append(obj_name, "v3-tane-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu)); } if (Emu.IsStopped()) @@ -1652,7 +1643,7 @@ extern void ppu_initialize(const ppu_module& info) { const u64 addr = jit->get(fmt::format("__0x%x", block.first - reloc)); jit_mod.funcs.emplace_back(reinterpret_cast(addr)); - ppu_ref(block.first) = ::narrow(addr); + *reinterpret_cast(&ppu_ref(block.first)) = ::narrow(addr); } } } @@ -1683,7 +1674,7 @@ extern void ppu_initialize(const ppu_module& info) { if (block.second) { - ppu_ref(block.first) = ::narrow(reinterpret_cast(jit_mod.funcs[index++])); + *reinterpret_cast(&ppu_ref(block.first)) = ::narrow(reinterpret_cast(jit_mod.funcs[index++])); } } } diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp index 7650eb264f..f2ff8c1f64 100644 --- a/rpcs3/Emu/Cell/PPUTranslator.cpp +++ b/rpcs3/Emu/Cell/PPUTranslator.cpp @@ -49,7 +49,7 @@ PPUTranslator::PPUTranslator(LLVMContext& context, Module* module, const ppu_mod m_thread_type = StructType::create(m_context, thread_struct, "context_t"); // Callable - m_call = new GlobalVariable(*module, ArrayType::get(GetType(), 0x40000000)->getPointerTo(), true, GlobalValue::ExternalLinkage, 0, fmt::format("__cptr%x", gsuffix)); + m_call = new GlobalVariable(*module, ArrayType::get(GetType(), 0x80000000)->getPointerTo(), true, GlobalValue::ExternalLinkage, 0, fmt::format("__cptr%x", gsuffix)); m_call->setInitializer(ConstantPointerNull::get(cast(m_call->getType()->getPointerElementType()))); m_call->setExternallyInitialized(true); @@ -282,7 +282,7 @@ void PPUTranslator::CallFunction(u64 target, Value* indirect) } } - const auto pos = m_ir->CreateLShr(indirect, 2, "", true); + const auto pos = m_ir->CreateShl(m_ir->CreateLShr(indirect, 2, "", true), 1, "", true); const auto ptr = m_ir->CreateGEP(m_ir->CreateLoad(m_call), {m_ir->getInt64(0), pos}); indirect = m_ir->CreateIntToPtr(m_ir->CreateLoad(ptr), type->getPointerTo()); } diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp index c939a6f4c5..0e4fed623e 100644 --- a/rpcs3/Emu/Memory/vm.cpp +++ b/rpcs3/Emu/Memory/vm.cpp @@ -364,7 +364,7 @@ namespace vm if (flags & page_executable) { - utils::memory_commit(g_exec_addr + addr, size); + utils::memory_commit(g_exec_addr + addr * 2, size * 2); } if (g_cfg.core.ppu_debug) @@ -494,7 +494,7 @@ namespace vm if (is_exec) { - utils::memory_decommit(g_exec_addr + addr, size); + utils::memory_decommit(g_exec_addr + addr * 2, size * 2); } if (g_cfg.core.ppu_debug)