From 8a12aee01f5f5ed1915f0ecbb09292a7ddc29587 Mon Sep 17 00:00:00 2001
From: digant
Date: Wed, 18 Dec 2024 20:33:34 +0100
Subject: [PATCH] UPDATE

---
 rpcs3/Emu/Cell/SPULLVMRecompiler.cpp       |  81 +++++++---
 .../RSXProg/RSXFragmentTextureOps.glsl     | 104 ++++++++++++-
 rpcs3/Emu/RSX/RSXThread.cpp                | 154 ++++++++++++++++--
 rpcs3/util/asm.hpp                         |   2 +-
 4 files changed, 301 insertions(+), 40 deletions(-)

diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
index fcc21c09ae..9387527c8f 100644
--- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
@@ -4666,35 +4666,44 @@ public:
 		return zshuffle(std::forward<T>(a), 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
 	}
 
+	template <typename T, typename U>
+	static llvm_calli<u8[16], T, U> rotqbybi(T&& a, U&& b)
+	{
+		return {"spu_rotqbybi", {std::forward<T>(a), std::forward<U>(b)}};
+	}
+
 	void ROTQBYBI(spu_opcode_t op)
 	{
-		const auto a = get_vr<u8[16]>(op.ra);
-
-		// Data with swapped endian from a load instruction
-		if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
+		register_intrinsic("spu_rotqbybi", [&](llvm::CallInst* ci)
 		{
-			const auto sc = build<u8[16]>(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-			const auto sh = sc + (splat_scalar(get_vr<u8[16]>(op.rb)) >> 3);
+			const auto a = value<u8[16]>(ci->getOperand(0));
+			const auto b = value<u8[16]>(ci->getOperand(1));
+
+			// Data with swapped endian from a load instruction
+			if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
+			{
+				const auto sc = build<u8[16]>(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+				const auto sh = sc + (splat_scalar(b) >> 3);
+
+				if (m_use_avx512_icl)
+				{
+					return eval(vpermb(as, sh));
+				}
+
+				return eval(pshufb(as, (sh & 0xf)));
+			}
+
+			const auto sc = build<u8[16]>(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+			const auto sh = sc - (splat_scalar(b) >> 3);
 
 			if (m_use_avx512_icl)
 			{
-				set_vr(op.rt, vpermb(as, sh));
-				return;
+				return eval(vpermb(a, sh));
 			}
 
-			set_vr(op.rt, pshufb(as, (sh & 0xf)));
-			return;
-		}
-
-		const auto sc = build<u8[16]>(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-		const auto sh = sc - (splat_scalar(get_vr<u8[16]>(op.rb)) >> 3);
-
-		if (m_use_avx512_icl)
-		{
-			set_vr(op.rt, vpermb(a, sh));
-			return;
-		}
-
-		set_vr(op.rt, pshufb(a, (sh & 0xf)));
+			return eval(pshufb(a, (sh & 0xf)));
+		});
+
+		set_vr(op.rt, rotqbybi(get_vr<u8[16]>(op.ra), get_vr<u8[16]>(op.rb)));
 	}
 
 	void ROTQMBYBI(spu_opcode_t op)
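A note on the hunk above: ROTQBYBI rotates the quadword in `ra` left by `(count >> 3) & 0xf` bytes, ignoring the low three bits of the count. Instead of choosing between the `vpermb`/`pshufb` lowerings inline, the handler now registers a named `spu_rotqbybi` intrinsic and emits a call to it, which gives later passes a stable node to pattern-match (the ROTQBI hunk below relies on this). A minimal scalar sketch of the rotation semantics; the reference function and the big-endian byte indexing are illustrative assumptions, not RPCS3 code:

```cpp
#include <array>
#include <cstdint>

using quadword = std::array<std::uint8_t, 16>;

// Scalar reference for ROTQBYBI: rotate a 16-byte quadword left by
// ((count >> 3) & 0xf) bytes. Byte 0 is taken as the most significant,
// matching the SPU's big-endian register layout.
quadword rotqbybi_ref(const quadword& a, std::uint32_t count)
{
    const std::uint32_t n = (count >> 3) & 0xf; // low 3 bits of the count are ignored
    quadword r{};
    for (std::uint32_t i = 0; i < 16; i++)
    {
        r[i] = a[(i + n) & 0xf]; // left rotate: source byte n lands in byte 0
    }
    return r;
}
```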
@@ -4813,6 +4822,38 @@ public:
 	void ROTQBI(spu_opcode_t op)
 	{
 		const auto a = get_vr<u32[4]>(op.ra);
+		const auto ax = get_vr<u8[16]>(op.ra);
+		const auto bx = get_vr<u8[16]>(op.rb);
+
+		// Combined byte and bit rotate (a rotqbybi result feeding ROTQBI with the same count)
+		if (auto [ok, v0, v1] = match_expr(ax, rotqbybi(match<u8[16]>(), match<u8[16]>())); ok && v1.eq(bx))
+		{
+			const auto b32 = get_vr<s32[4]>(op.rb);
+
+			// Is the rotate amount known to be less than 32 bits?
+			if (auto k = get_known_bits(b32); !!(k.Zero & 0x60))
+			{
+				const auto b = splat_scalar(get_vr<u32[4]>(op.rb));
+				set_vr(op.rt, fshl(bitcast<u32[4]>(v0), zshuffle(bitcast<u32[4]>(v0), 3, 0, 1, 2), b));
+				return;
+			}
+
+			// Inverted shift count
+			if (auto [ok1, v10, v11] = match_expr(b32, match<s32[4]>() - match<s32[4]>()); ok1)
+			{
+				if (auto [ok2, data] = get_const_vector(v10.value, m_pos); ok2)
+				{
+					if (data == v128::from32p(0x80))
+					{
+						if (auto k = get_known_bits(v11); !!(k.Zero & 0x60))
+						{
+							set_vr(op.rt, fshr(zshuffle(bitcast<u32[4]>(v0), 1, 2, 3, 0), bitcast<u32[4]>(v0), splat_scalar(bitcast<u32[4]>(v11))));
+							return;
+						}
+					}
+				}
+			}
+		}
+
 		const auto b = splat_scalar(get_vr<u32[4]>(op.rb) & 0x7);
 		set_vr(op.rt, fshl(a, zshuffle(a, 3, 0, 1, 2), b));
 	}
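The hunk above fuses the guest idiom `ROTQBYBI` followed by `ROTQBI` on the same count register, which together rotate the quadword left by `count & 0x7f` bits: the byte part comes from `count >> 3`, the bit part from `count & 7`. When known-bits analysis proves the count stays below 32 (`k.Zero & 0x60`), the pair collapses into one per-lane funnel shift over adjacent 32-bit words; the second branch recognizes counts written as `0x80 - n` and uses a right funnel shift instead. A scalar model of the `fshl` construction, assuming lane 0 is the least-significant word (as implied by the `zshuffle(a, 3, 0, 1, 2)` operand order):

```cpp
#include <array>
#include <cstdint>

using vec4u32 = std::array<std::uint32_t, 4>;

// fshl(hi, lo, n) == (hi << n) | (lo >> (32 - n)) for 0 < n < 32 -- the same
// funnel-shift primitive the recompiler emits per 32-bit lane.
std::uint32_t fshl32(std::uint32_t hi, std::uint32_t lo, std::uint32_t n)
{
    return n ? (hi << n) | (lo >> (32 - n)) : hi;
}

// 128-bit left rotate by n < 32 bits: each word keeps its own high bits and
// pulls its low bits from the next lower word, wrapping around -- exactly what
// fshl(a, zshuffle(a, 3, 0, 1, 2), n) computes lane by lane.
vec4u32 rotl128(const vec4u32& a, std::uint32_t n)
{
    vec4u32 r{};
    for (int i = 0; i < 4; i++)
    {
        r[i] = fshl32(a[i], a[(i + 3) & 3], n);
    }
    return r;
}
```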
diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureOps.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureOps.glsl
index 16e420d547..e03fd3ea17 100644
--- a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureOps.glsl
+++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureOps.glsl
@@ -175,10 +175,69 @@ vec4 _texcoord_xform_shadow(const in vec4 coord4, const in sampler_info params)
 vec4 _sext_unorm8x4(const in vec4 x)
 {
 	// TODO: Handle clamped sign-extension
-	const vec4 bits = floor(fma(x, vec4(255.f), vec4(0.5f)));
-	const bvec4 sign_check = lessThan(bits, vec4(128.f));
-	const vec4 ret = _select(bits - 256.f, bits, sign_check);
+	const vec4 bits = floor(fma(x, vec4(255.f), vec4(0.5f))); // round (x * 255) to recover the stored byte (0.0 -> 0, 1.0 -> 255)
+	const bvec4 sign_check = lessThan(bits, vec4(128.f));     // true where the byte is non-negative as s8 (127 -> true, 128 -> false)
+	const vec4 ret = _select(bits - 256.f, bits, sign_check); // _select(a, b, c) yields a where c is false, b where c is true
 	return ret / 127.f;
+
+	// Candidate for the clamped case (-128 -> -127), kept for reference:
+	// const bvec4 clamped_check = lessThan(ret, vec4(-127.f));
+	// const vec4 ret2 = _select(ret, vec4(-127.f), clamped_check);
+	// return ret2 / 127.f;
 }
 
@@ -208,7 +267,21 @@ vec4 _process_texel(in vec4 rgba, const in uint control_bits)
 	uvec4 mask;
 	vec4 convert;
 
-	uint op_mask = control_bits & uint(SIGN_EXPAND_MASK);
+	uint op_mask;
+
+	op_mask = control_bits & uint(SIGN_EXPAND_MASK);
 	if (op_mask != 0u)
 	{
 		// Expand to signed normalized by decompressing the signal
@@ -217,22 +290,37 @@ vec4 _process_texel(in vec4 rgba, const in uint control_bits)
 		rgba = _select(rgba, convert, notEqual(mask, uvec4(0)));
 	}
 
+	op_mask = control_bits & uint(SEXT_MASK);
 	if (op_mask != 0u)
 	{
 		// Sign-extend the input signal
 		mask = uvec4(op_mask) & uvec4(SEXT_R_MASK, SEXT_G_MASK, SEXT_B_MASK, SEXT_A_MASK);
 		convert = _sext_unorm8x4(rgba);
 		rgba = _select(rgba, convert, notEqual(mask, uvec4(0)));
 	}
 
+	op_mask = control_bits & uint(GAMMA_CTRL_MASK);
 	if (op_mask != 0u)
 	{
 		// Gamma correction
 		mask = uvec4(op_mask) & uvec4(GAMMA_R_MASK, GAMMA_G_MASK, GAMMA_B_MASK, GAMMA_A_MASK);
 		convert = srgb_to_linear(rgba);
-		return _select(rgba, convert, notEqual(mask, uvec4(0)));
+		rgba = _select(rgba, convert, notEqual(mask, uvec4(0)));
 	}
 
 	return rgba;
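`_sext_unorm8x4` reinterprets a UNORM8 sample as a signed byte: it recovers the integer that produced the sample, sign-extends values of 128 and above, and renormalizes by 1/127. The open TODO is the clamped case: byte 0x80 maps to -128/127, slightly below -1.0. A single-channel C++ mirror of the arithmetic, handy for sanity-checking the shader (the function name is illustrative):

```cpp
#include <cmath>
#include <cstdio>

// Scalar mirror of _sext_unorm8x4: x in [0, 1] was stored as round(x * 255).
float sext_unorm8(float x)
{
    const float bits = std::floor(x * 255.0f + 0.5f);       // recover the byte, 0..255
    const float s = (bits < 128.0f) ? bits : bits - 256.0f; // reinterpret as s8, -128..127
    return s / 127.0f;
}

int main()
{
    std::printf("%f\n", sext_unorm8(127.0f / 255.0f)); //  1.000000 (byte 127)
    std::printf("%f\n", sext_unorm8(128.0f / 255.0f)); // -1.007874 (byte 128 -> -128, the unclamped TODO case)
    std::printf("%f\n", sext_unorm8(255.0f / 255.0f)); // -0.007874 (byte 255 -> -1)
}
```

The `_process_texel` change itself is small: the gamma-correction branch no longer returns early, so the corrected value falls through to the common `return rgba;` and the three conversion stages stay uniform.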
diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp
index 4abc484b5e..14d47b2347 100644
--- a/rpcs3/Emu/RSX/RSXThread.cpp
+++ b/rpcs3/Emu/RSX/RSXThread.cpp
@@ -2634,24 +2634,145 @@ namespace rsx
 			// NOTE: The ARGB8_signed flag means to reinterpret the raw bytes as signed. This is different than unsigned_remap=bias which does range decompression.
 			// This is a separate method of setting the format to signed mode without doing so per-channel
 			// Precedence = SNORM > GAMMA > UNSIGNED_REMAP (See Resistance 3 for GAMMA/BX2 relationship, UE3 for BX2 effect)
 			const u32 argb8_signed = tex.argb_signed();    // _SNORM
 			const u32 gamma = tex.gamma() & ~argb8_signed; // _SRGB
 			const u32 unsigned_remap = (tex.unsigned_remap() == CELL_GCM_TEXTURE_UNSIGNED_REMAP_NORMAL) ? 0u : (~(gamma | argb8_signed) & 0xF); // _BX2
 			u32 argb8_convert = gamma;
 
-			// The options are mutually exclusive
-			ensure((argb8_signed & gamma) == 0);
-			ensure((argb8_signed & unsigned_remap) == 0);
-			ensure((gamma & unsigned_remap) == 0);
+			// NOTE: Alternative precedence orderings (gamma-first, remap-first) were tested
+			// against Killzone and Resistance with mixed results; the SNORM-first derivation
+			// above matches the documented precedence and makes the three masks mutually
+			// exclusive by construction, so the ensure() checks are redundant.
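The precedence chain is implemented purely by mask subtraction, so no runtime checks are needed: SNORM keeps its channels, GAMMA keeps whatever SNORM left, and BX2 takes the remainder. A standalone sketch of the derivation; the function and its inputs are hypothetical, not RSXThread code:

```cpp
#include <cstdint>
#include <cstdio>

using u32 = std::uint32_t;

// Derive the three per-channel conversion masks with SNORM > GAMMA > BX2
// precedence. Each later option is masked by the bits already claimed, so a
// channel is subject to at most one conversion.
void derive_masks(u32 snorm_bits, u32 gamma_bits, bool remap_bias)
{
    const u32 argb8_signed   = snorm_bits & 0xF;
    const u32 gamma          = gamma_bits & ~argb8_signed & 0xF;
    const u32 unsigned_remap = remap_bias ? (~(gamma | argb8_signed) & 0xF) : 0u;

    const u32 overlap = (argb8_signed & gamma) | (argb8_signed & unsigned_remap) | (gamma & unsigned_remap);
    std::printf("snorm=%x gamma=%x bx2=%x overlap=%x\n", argb8_signed, gamma, unsigned_remap, overlap);
}

int main()
{
    // gamma loses channels 0-1 to snorm; bx2 takes whatever is left; overlap is always 0
    derive_masks(0b0011, 0b0110, true); // prints: snorm=3 gamma=4 bx2=8 overlap=0
}
```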
 			// Helper function to apply a per-channel mask based on an input mask
-			const auto apply_sign_convert_mask = [&](u32 mask, u32 bit_offset)
+			const auto apply_sign_convert_mask = [&](u32 mask, u32 bit_offset, bool force_all_channels)
 			{
 				// TODO: Use actual remap mask to account for 0 and 1 overrides in default mapping
 				// TODO: Replace this clusterfuck of texture control with matrix transformation
 				const auto remap_ctrl = (tex.remap() >> 8) & 0xAA;
-				if (remap_ctrl == 0xAA)
+				if (remap_ctrl == 0xAA || force_all_channels)
 				{
 					argb8_convert |= (mask & 0xFu) << bit_offset;
 					return;
@@ -2661,20 +2782,31 @@ namespace rsx
 				if ((remap_ctrl & 0x0C) == 0x08) argb8_convert |= (mask & 0x2u) << bit_offset;
 				if ((remap_ctrl & 0x30) == 0x20) argb8_convert |= (mask & 0x4u) << bit_offset;
 				if ((remap_ctrl & 0xC0) == 0x80) argb8_convert |= (mask & 0x8u) << bit_offset;
 			};
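Restating the gating above: `tex.remap() >> 8` packs one 2-bit source selector per channel, and masking with `0xAA` keeps each selector's high bit, so a conversion flag survives only for channels whose selector passes texture data through (the per-channel equality tests). An equivalent loop form that is easier to unit-test in isolation; this follows the code's own tests, not the full NV40 remap specification:

```cpp
#include <cstdint>

using u32 = std::uint32_t;

// Apply a 4-bit channel mask at bit_offset, but only for channels whose 2-bit
// remap selector has its high bit set and low bit clear -- the same per-channel
// tests as the lambda above, written as a loop.
u32 gate_channels(u32 remap_ctrl, u32 mask, u32 bit_offset)
{
    if (remap_ctrl == 0xAA) // all channels pass through: apply the whole mask
    {
        return (mask & 0xFu) << bit_offset;
    }

    u32 out = 0;
    for (u32 ch = 0; ch < 4; ch++)
    {
        const u32 selector = (remap_ctrl >> (ch * 2)) & 0x3;
        if (selector == 0x2)
        {
            out |= (mask & (1u << ch)) << bit_offset;
        }
    }
    return out;
}
```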
 			if (argb8_signed)
 			{
 				// Apply integer sign extension from uint8 to sint8 and renormalize
-				apply_sign_convert_mask(argb8_signed, texture_control_bits::SEXT_OFFSET);
+				apply_sign_convert_mask(argb8_signed, texture_control_bits::SEXT_OFFSET, false);
 			}
 
 			if (unsigned_remap)
 			{
 				// Apply sign expansion, compressed normal-map style (2n - 1)
-				apply_sign_convert_mask(unsigned_remap, texture_control_bits::EXPAND_OFFSET);
+				apply_sign_convert_mask(unsigned_remap, texture_control_bits::EXPAND_OFFSET, false);
 			}
 
+			if (gamma)
+			{
+				// Apply gamma correction (sRGB -> linear decode) per channel
+				apply_sign_convert_mask(gamma, texture_control_bits::GAMMA_A, false);
+			}
+
 			texture_control |= argb8_convert;
 		}

diff --git a/rpcs3/util/asm.hpp b/rpcs3/util/asm.hpp
index bab63ccfc6..eee85ecc42 100644
--- a/rpcs3/util/asm.hpp
+++ b/rpcs3/util/asm.hpp
@@ -410,7 +410,7 @@ namespace utils
 			return static_cast<T>(value * u64{numerator} / u64{denominator});
 		}
 
-#if is_u128_emulated
+#ifdef _MSC_VER
 		if constexpr (sizeof(T) <= sizeof(u128) / 2)
 		{
 			return static_cast<T>(u128_from_mul(value, numerator) / u64{denominator});
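For context on the last hunk: the function scales `value` by `numerator / denominator` without overflowing 64 bits by widening the product to 128 bits before dividing; the patch only changes which compilers take the emulated `u128_from_mul` path (`#ifdef _MSC_VER`, since MSVC has no native 128-bit integer). A minimal sketch of the widening technique, assuming a compiler with `unsigned __int128` (GCC/Clang):

```cpp
#include <cstdint>

// Overflow-safe value * num / den: form the full 128-bit product first, then
// divide. On MSVC this intermediate has to be emulated, which is what the
// u128_from_mul branch above provides.
std::uint64_t mul_div_u64(std::uint64_t value, std::uint64_t num, std::uint64_t den)
{
    using u128 = unsigned __int128;
    return static_cast<std::uint64_t>(u128{value} * num / den);
}
```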