diff --git a/rpcs3/Emu/Cell/PPUAnalyser.cpp b/rpcs3/Emu/Cell/PPUAnalyser.cpp index 35253bc30b..836897eae6 100644 --- a/rpcs3/Emu/Cell/PPUAnalyser.cpp +++ b/rpcs3/Emu/Cell/PPUAnalyser.cpp @@ -1177,6 +1177,7 @@ bool ppu_module::analyse(u32 lib_toc, u32 entry, const u32 sec_end, con func.size = 0x1C; func.blocks.emplace(func.addr, func.size); func.attr += ppu_attr::known_size; + known_functions.emplace(func.addr); // Look for another imports to fill gaps (hack) auto _p2 = _ptr + 7; @@ -1195,6 +1196,7 @@ bool ppu_module::analyse(u32 lib_toc, u32 entry, const u32 sec_end, con next.size = 0x1C; next.blocks.emplace(next.addr, next.size); next.attr += ppu_attr::known_size; + known_functions.emplace(_p2.addr()); advance(_p2, p2, 7); } @@ -1213,9 +1215,8 @@ bool ppu_module::analyse(u32 lib_toc, u32 entry, const u32 sec_end, con // Trampoline with TOC const u32 target = (ptr[3] << 16) + s16(ptr[4]); const u32 toc_add = (ptr[1] << 16) + s16(ptr[2]); - constexpr u32 func_size = 0x1C; - if (target >= start && target < end && verify_ref((_ptr + 3).addr()) && target - func.addr >= func_size) + if (target >= start && target < end && verify_ref((_ptr + 3).addr())) { auto& new_func = add_func(target, 0, func.addr); @@ -1774,8 +1775,23 @@ bool ppu_module::analyse(u32 lib_toc, u32 entry, const u32 sec_end, con u32 per_instruction_bytes = 0; - for (auto&& [_, func] : fmap) + // Iterate by address (fmap may grow) + for (u32 addr_next = start; addr_next != end;) { + // Get next iterator + const auto it = fmap.lower_bound(addr_next); + + if (it == fmap.end()) + { + break; + } + + // Save next function address as is as of this moment (ignoring added functions) + const auto it_next = std::next(it); + addr_next = it_next == fmap.end() ? 
end : it_next->first; + + const ppu_function_ext& func = it->second; + if (func.attr & ppu_attr::no_size && entry) { // Disabled for PRX for now @@ -1793,6 +1809,7 @@ bool ppu_module::analyse(u32 lib_toc, u32 entry, const u32 sec_end, con } per_instruction_bytes += utils::sub_saturate(lim, func.addr); + addr_next = std::max(addr_next, lim); continue; } @@ -1814,7 +1831,7 @@ bool ppu_module::analyse(u32 lib_toc, u32 entry, const u32 sec_end, con block.addr = addr; block.size = size; block.toc = func.toc; - ppu_log.trace("Block __0x%x added (func=0x%x, size=0x%x, toc=0x%x)", block.addr, _, block.size, block.toc); + ppu_log.trace("Block __0x%x added (func=0x%x, size=0x%x, toc=0x%x)", block.addr, it->first, block.size, block.toc); if (!entry && !sec_end) { diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index d815c8e322..1bbccb2d87 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -1652,7 +1652,7 @@ public: u32 elements; u32 dwords; - if (m_use_avx512 && g_cfg.core.full_width_avx512) + if (m_use_avx512) { stride = 64; elements = 16; @@ -1677,94 +1677,175 @@ public: llvm::Value* acc = nullptr; - for (u32 j = starta; j < end; j += stride) + // Use a simple 512-bit checksum to verify integrity if the size is at least 3 * 512 bits (192 bytes) + // This code uses a 512-bit vector on all hardware to ensure behavior matches. + // The checksum path is still faster even on narrow hardware. 
+ if ((end - starta) >= 192 && !g_cfg.core.precise_spu_verification) { - int indices[16]; - bool holes = false; - bool data = false; - - for (u32 i = 0; i < elements; i++) + for (u32 j = starta; j < end; j += 64) { - const u32 k = j + i * 4; + int indices[16]; + bool holes = false; + bool data = false; - if (k < start || k >= end || !func.data[(k - start) / 4]) + for (u32 i = 0; i < 16; i++) { - indices[i] = elements; - holes = true; + const u32 k = j + i * 4; + + if (k < start || k >= end || !func.data[(k - start) / 4]) + { + indices[i] = 16; + holes = true; + } + else + { + indices[i] = i; + data = true; + } } - else + + if (!data) { - indices[i] = i; - data = true; + // Skip full-sized holes + continue; } - } - if (!data) - { - // Skip full-sized holes - continue; - } + llvm::Value* vls = nullptr; - llvm::Value* vls = nullptr; - - // Load unaligned code block from LS - if (m_use_avx512 && g_cfg.core.full_width_avx512) - { + // Load unaligned code block from LS vls = m_ir->CreateAlignedLoad(get_type(), _ptr(data_addr, j - starta), llvm::MaybeAlign{4}); + + // Mask if necessary + if (holes) + { + vls = m_ir->CreateShuffleVector(vls, ConstantAggregateZero::get(vls->getType()), llvm::ArrayRef(indices, 16)); + } + + acc = acc ? 
m_ir->CreateAdd(acc, vls) : vls; + check_iterations++; } - else if (m_use_avx) + + // Create the checksum + u32 checksum[16] = {0}; + + for (u32 j = 0; j < func.data.size(); j += 16) // Process 16 elements per iteration { - vls = m_ir->CreateAlignedLoad(get_type(), _ptr(data_addr, j - starta), llvm::MaybeAlign{4}); - } - else - { - vls = m_ir->CreateAlignedLoad(get_type(), _ptr(data_addr, j - starta), llvm::MaybeAlign{4}); + for (u32 i = 0; i < 16; i++) + { + if (j + i < func.data.size()) + { + checksum[i] += func.data[j + i]; + } + } } - // Mask if necessary - if (holes) - { - vls = m_ir->CreateShuffleVector(vls, ConstantAggregateZero::get(vls->getType()), llvm::ArrayRef(indices, elements)); - } + auto* const_vector = ConstantDataVector::get(m_context, llvm::ArrayRef(checksum, 16)); + acc = m_ir->CreateXor(acc, const_vector); - // Perform bitwise comparison and accumulate - u32 words[16]; - - for (u32 i = 0; i < elements; i++) - { - const u32 k = j + i * 4; - words[i] = k >= start && k < end ? func.data[(k - start) / 4] : 0; - } - - vls = m_ir->CreateXor(vls, ConstantDataVector::get(m_context, llvm::ArrayRef(words, elements))); - acc = acc ? 
m_ir->CreateOr(acc, vls) : vls; - check_iterations++; - } - - // Pattern for PTEST - if (m_use_avx512 && g_cfg.core.full_width_avx512) - { + // Pattern for PTEST acc = m_ir->CreateBitCast(acc, get_type()); - } - else if (m_use_avx) - { - acc = m_ir->CreateBitCast(acc, get_type()); + + llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0}); + + for (u32 i = 1; i < 8; i++) + { + elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, i)); + } + + // Compare result with zero + const auto cond = m_ir->CreateICmpNE(elem, m_ir->getInt64(0)); + m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely); } else { - acc = m_ir->CreateBitCast(acc, get_type()); + for (u32 j = starta; j < end; j += stride) + { + int indices[16]; + bool holes = false; + bool data = false; + + for (u32 i = 0; i < elements; i++) + { + const u32 k = j + i * 4; + + if (k < start || k >= end || !func.data[(k - start) / 4]) + { + indices[i] = elements; + holes = true; + } + else + { + indices[i] = i; + data = true; + } + } + + if (!data) + { + // Skip full-sized holes + continue; + } + + llvm::Value* vls = nullptr; + + // Load unaligned code block from LS + if (m_use_avx512) + { + vls = m_ir->CreateAlignedLoad(get_type(), _ptr(data_addr, j - starta), llvm::MaybeAlign{4}); + } + else if (m_use_avx) + { + vls = m_ir->CreateAlignedLoad(get_type(), _ptr(data_addr, j - starta), llvm::MaybeAlign{4}); + } + else + { + vls = m_ir->CreateAlignedLoad(get_type(), _ptr(data_addr, j - starta), llvm::MaybeAlign{4}); + } + + // Mask if necessary + if (holes) + { + vls = m_ir->CreateShuffleVector(vls, ConstantAggregateZero::get(vls->getType()), llvm::ArrayRef(indices, elements)); + } + + // Perform bitwise comparison and accumulate + u32 words[16]; + + for (u32 i = 0; i < elements; i++) + { + const u32 k = j + i * 4; + words[i] = k >= start && k < end ? func.data[(k - start) / 4] : 0; + } + + vls = m_ir->CreateXor(vls, ConstantDataVector::get(m_context, llvm::ArrayRef(words, elements))); + acc = acc ? 
m_ir->CreateOr(acc, vls) : vls; + check_iterations++; + } + // Pattern for PTEST + if (m_use_avx512) + { + acc = m_ir->CreateBitCast(acc, get_type()); + } + else if (m_use_avx) + { + acc = m_ir->CreateBitCast(acc, get_type()); + } + else + { + acc = m_ir->CreateBitCast(acc, get_type()); + } + + llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0}); + + for (u32 i = 1; i < dwords; i++) + { + elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, i)); + } + + // Compare result with zero + const auto cond = m_ir->CreateICmpNE(elem, m_ir->getInt64(0)); + m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely); } - - llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0}); - - for (u32 i = 1; i < dwords; i++) - { - elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, i)); - } - - // Compare result with zero - const auto cond = m_ir->CreateICmpNE(elem, m_ir->getInt64(0)); - m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely); } // Increase block counter with statistics diff --git a/rpcs3/Emu/Cell/lv2/sys_time.cpp b/rpcs3/Emu/Cell/lv2/sys_time.cpp index 92ac650f8d..b04be640bc 100644 --- a/rpcs3/Emu/Cell/lv2/sys_time.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_time.cpp @@ -147,7 +147,7 @@ u64 convert_to_timebased_time(u64 time) u64 get_timebased_time() { - if (0) if (u64 freq = utils::get_tsc_freq()) + if (u64 freq = utils::get_tsc_freq()) { const u64 tsc = utils::get_tsc(); @@ -207,7 +207,7 @@ void initialize_timebased_time(u64 timebased_init, bool reset) // Returns some relative time in microseconds, don't change this fact u64 get_system_time() { - if (0) if (u64 freq = utils::get_tsc_freq()) + if (u64 freq = utils::get_tsc_freq()) { const u64 tsc = utils::get_tsc(); diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h index d3dfab4ce8..0ede707dfc 100644 --- a/rpcs3/Emu/system_config.h +++ b/rpcs3/Emu/system_config.h @@ -68,7 +68,7 @@ struct cfg_root : cfg::node cfg::_enum spu_xfloat_accuracy{ this, "XFloat Accuracy", 
xfloat_accuracy::approximate, false }; cfg::_int<-1, 14> ppu_128_reservations_loop_max_length{ this, "Accurate PPU 128-byte Reservation Op Max Length", 0, true }; // -1: Always accurate, 0: Never accurate, 1-14: max accurate loop length cfg::_int<-64, 64> stub_ppu_traps{ this, "Stub PPU Traps", 0, true }; // Hack, skip PPU traps for rare cases where the trap is continueable (specify relative instructions to skip) - cfg::_bool full_width_avx512{ this, "Full Width AVX-512", true }; + cfg::_bool precise_spu_verification{ this, "Precise SPU Verification", false }; // Disables use of xorsum based spu verification if enabled. cfg::_bool ppu_llvm_nj_fixup{ this, "PPU LLVM Java Mode Handling", true }; // Partially respect current Java Mode for alti-vec ops by PPU LLVM cfg::_bool use_accurate_dfma{ this, "Use Accurate DFMA", true }; // Enable accurate double-precision FMA for CPUs which do not support it natively cfg::_bool ppu_set_sat_bit{ this, "PPU Set Saturation Bit", false }; // Accuracy. If unset, completely disable saturation flag handling. 
diff --git a/rpcs3/util/sysinfo.cpp b/rpcs3/util/sysinfo.cpp index d64a5e6553..8c958e389d 100755 --- a/rpcs3/util/sysinfo.cpp +++ b/rpcs3/util/sysinfo.cpp @@ -524,7 +524,7 @@ std::string utils::get_system_info() } else { - fmt::append(result, " | TSC: Bad"); + fmt::append(result, " | TSC: Disabled"); } if (has_avx()) @@ -772,15 +772,26 @@ static const bool s_tsc_freq_evaluated = []() -> bool #endif if (!utils::has_invariant_tsc()) + { return 0; + } + + if (utils::get_cpu_brand().find("Ryzen") != umax) + { + return 0; + } #ifdef _WIN32 LARGE_INTEGER freq; if (!QueryPerformanceFrequency(&freq)) + { return 0; + } if (freq.QuadPart <= 9'999'999) + { return 0; + } const ullong timer_freq = freq.QuadPart; #else @@ -880,7 +891,7 @@ static const bool s_tsc_freq_evaluated = []() -> bool return round_tsc(res, utils::mul_saturate(utils::add_saturate(rdtsc_diff[0], rdtsc_diff[1]), utils::aligned_div(timer_freq, timer_data[1] - timer_data[0]))); }(); - atomic_storage::release(utils::s_tsc_freq, cal_tsc); + atomic_storage::store(utils::s_tsc_freq, cal_tsc); return true; }();