diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index d815c8e322..f6a803dd3d 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -1652,7 +1652,7 @@ public: u32 elements; u32 dwords; - if (m_use_avx512 && g_cfg.core.full_width_avx512) + if (m_use_avx512) { stride = 64; elements = 16; @@ -1677,94 +1677,177 @@ public: llvm::Value* acc = nullptr; - for (u32 j = starta; j < end; j += stride) + // Use 512bit xorsum to verify integrity if size is at least 512 bits * 3 (192 bytes) + // This code uses a 512bit vector for all hardware to ensure behavior matches. + // The xorsum path is still faster even on narrow hardware. + if ((end - starta) >= 192 && !g_cfg.core.precise_spu_verification) { - int indices[16]; - bool holes = false; - bool data = false; - - for (u32 i = 0; i < elements; i++) + for (u32 j = starta; j < end; j += 64) { - const u32 k = j + i * 4; + int indices[16]; + bool holes = false; + bool data = false; - if (k < start || k >= end || !func.data[(k - start) / 4]) + for (u32 i = 0; i < 16; i++) { - indices[i] = elements; - holes = true; + const u32 k = j + i * 4; + + if (k < start || k >= end || !func.data[(k - start) / 4]) + { + indices[i] = 16; + holes = true; + } + else + { + indices[i] = i; + data = true; + } } - else + + if (!data) { - indices[i] = i; - data = true; + // Skip full-sized holes + continue; } - } - if (!data) - { - // Skip full-sized holes - continue; - } + llvm::Value* vls = nullptr; - llvm::Value* vls = nullptr; - - // Load unaligned code block from LS - if (m_use_avx512 && g_cfg.core.full_width_avx512) - { + // Load unaligned code block from LS vls = m_ir->CreateAlignedLoad(get_type(), _ptr(data_addr, j - starta), llvm::MaybeAlign{4}); + + // Mask if necessary + if (holes) + { + vls = m_ir->CreateShuffleVector(vls, ConstantAggregateZero::get(vls->getType()), llvm::ArrayRef(indices, 16)); + } + + acc = acc ? 
m_ir->CreateXor(acc, vls) : vls; + check_iterations++; } - else if (m_use_avx) + + // Create the Xorsum + u32 xorsum[16] = {0}; + + for (u32 j = 0; j < func.data.size(); j += 16) // Process 16 elements per iteration { - vls = m_ir->CreateAlignedLoad(get_type(), _ptr(data_addr, j - starta), llvm::MaybeAlign{4}); - } - else - { - vls = m_ir->CreateAlignedLoad(get_type(), _ptr(data_addr, j - starta), llvm::MaybeAlign{4}); + for (u32 i = 0; i < 16; i++) + { + if (j + i < func.data.size()) + { + xorsum[i] ^= func.data[j + i]; + } + } } - // Mask if necessary - if (holes) - { - vls = m_ir->CreateShuffleVector(vls, ConstantAggregateZero::get(vls->getType()), llvm::ArrayRef(indices, elements)); - } + auto* const_vector = ConstantDataVector::get(m_context, llvm::ArrayRef(xorsum, 16)); + acc = m_ir->CreateXor(acc, const_vector); - // Perform bitwise comparison and accumulate - u32 words[16]; - - for (u32 i = 0; i < elements; i++) - { - const u32 k = j + i * 4; - words[i] = k >= start && k < end ? func.data[(k - start) / 4] : 0; - } - - vls = m_ir->CreateXor(vls, ConstantDataVector::get(m_context, llvm::ArrayRef(words, elements))); - acc = acc ? 
m_ir->CreateOr(acc, vls) : vls; - check_iterations++; - } - - // Pattern for PTEST - if (m_use_avx512 && g_cfg.core.full_width_avx512) - { + // Pattern for PTEST acc = m_ir->CreateBitCast(acc, get_type()); - } - else if (m_use_avx) - { - acc = m_ir->CreateBitCast(acc, get_type()); + + llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0}); + + for (u32 i = 1; i < 8; i++) + { + elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, i)); + } + + spu_log.error("end"); + + // Compare result with zero + const auto cond = m_ir->CreateICmpNE(elem, m_ir->getInt64(0)); + m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely); } else { - acc = m_ir->CreateBitCast(acc, get_type()); + for (u32 j = starta; j < end; j += stride) + { + int indices[16]; + bool holes = false; + bool data = false; + + for (u32 i = 0; i < elements; i++) + { + const u32 k = j + i * 4; + + if (k < start || k >= end || !func.data[(k - start) / 4]) + { + indices[i] = elements; + holes = true; + } + else + { + indices[i] = i; + data = true; + } + } + + if (!data) + { + // Skip full-sized holes + continue; + } + + llvm::Value* vls = nullptr; + + // Load unaligned code block from LS + if (m_use_avx512) + { + vls = m_ir->CreateAlignedLoad(get_type(), _ptr(data_addr, j - starta), llvm::MaybeAlign{4}); + } + else if (m_use_avx) + { + vls = m_ir->CreateAlignedLoad(get_type(), _ptr(data_addr, j - starta), llvm::MaybeAlign{4}); + } + else + { + vls = m_ir->CreateAlignedLoad(get_type(), _ptr(data_addr, j - starta), llvm::MaybeAlign{4}); + } + + // Mask if necessary + if (holes) + { + vls = m_ir->CreateShuffleVector(vls, ConstantAggregateZero::get(vls->getType()), llvm::ArrayRef(indices, elements)); + } + + // Perform bitwise comparison and accumulate + u32 words[16]; + + for (u32 i = 0; i < elements; i++) + { + const u32 k = j + i * 4; + words[i] = k >= start && k < end ? 
func.data[(k - start) / 4] : 0; + } + + vls = m_ir->CreateXor(vls, ConstantDataVector::get(m_context, llvm::ArrayRef(words, elements))); + acc = acc ? m_ir->CreateOr(acc, vls) : vls; + check_iterations++; + } + // Pattern for PTEST + if (m_use_avx512) + { + acc = m_ir->CreateBitCast(acc, get_type()); + } + else if (m_use_avx) + { + acc = m_ir->CreateBitCast(acc, get_type()); + } + else + { + acc = m_ir->CreateBitCast(acc, get_type()); + } + + llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0}); + + for (u32 i = 1; i < dwords; i++) + { + elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, i)); + } + + // Compare result with zero + const auto cond = m_ir->CreateICmpNE(elem, m_ir->getInt64(0)); + m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely); } - - llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0}); - - for (u32 i = 1; i < dwords; i++) - { - elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, i)); - } - - // Compare result with zero - const auto cond = m_ir->CreateICmpNE(elem, m_ir->getInt64(0)); - m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely); } // Increase block counter with statistics diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h index d3dfab4ce8..0ede707dfc 100644 --- a/rpcs3/Emu/system_config.h +++ b/rpcs3/Emu/system_config.h @@ -68,7 +68,7 @@ struct cfg_root : cfg::node cfg::_enum spu_xfloat_accuracy{ this, "XFloat Accuracy", xfloat_accuracy::approximate, false }; cfg::_int<-1, 14> ppu_128_reservations_loop_max_length{ this, "Accurate PPU 128-byte Reservation Op Max Length", 0, true }; // -1: Always accurate, 0: Never accurate, 1-14: max accurate loop length cfg::_int<-64, 64> stub_ppu_traps{ this, "Stub PPU Traps", 0, true }; // Hack, skip PPU traps for rare cases where the trap is continueable (specify relative instructions to skip) - cfg::_bool full_width_avx512{ this, "Full Width AVX-512", true }; + cfg::_bool precise_spu_verification{ this, "Precise SPU Verification", 
false }; // When enabled, disables the faster xorsum-based SPU verification path. cfg::_bool ppu_llvm_nj_fixup{ this, "PPU LLVM Java Mode Handling", true }; // Partially respect current Java Mode for alti-vec ops by PPU LLVM cfg::_bool use_accurate_dfma{ this, "Use Accurate DFMA", true }; // Enable accurate double-precision FMA for CPUs which do not support it natively cfg::_bool ppu_set_sat_bit{ this, "PPU Set Saturation Bit", false }; // Accuracy. If unset, completely disable saturation flag handling.