mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-04-21 03:55:32 +00:00
Improve approximate xfloat
- Disable denormals for SPU threads
- Add clamping helpers
This commit is contained in:
parent
3b46c9cb6a
commit
a36f0497ce
2 changed files with 59 additions and 35 deletions
|
@ -282,6 +282,21 @@ void cpu_thread::operator()()
|
|||
thread_ctrl::set_native_priority(-1);
|
||||
}
|
||||
|
||||
if (id_type() == 2)
{
	// Force input/output denormals to zero for SPU threads.
	// 0x8040 sets two MXCSR bits: FTZ (bit 15, 0x8000) and DAZ (bit 6, 0x0040).
	_mm_setcsr(_mm_getcsr() | 0x8040);

	// Sanity check that the setting took effect.
	// 0x1fc00000 is the bit pattern of 1.5 * 2^-64; its square (2.25 * 2^-128) lies
	// below the smallest normal float (2^-126), so the product must come out as
	// exactly zero when flush-to-zero is active.
	// The operand lives in a volatile object and is read directly (not through a
	// const_cast that strips the qualifier, which is undefined behaviour), so the
	// multiplication cannot be folded at compile time under the default FP mode.
	const volatile u32 a = 0x1fc00000;
	const __m128 b = _mm_castsi128_ps(_mm_set1_epi32(static_cast<int>(a)));
	const int c = _mm_cvtsi128_si32(_mm_castps_si128(_mm_mul_ps(b, b)));

	if (c != 0)
	{
		LOG_FATAL(GENERAL,"could not disable denormals");
	}
}
|
||||
|
||||
if (id_type() == 1 && false)
|
||||
{
|
||||
g_fxo->get<cpu_profiler>()->registered.push(id);
|
||||
|
|
|
@ -7186,6 +7186,32 @@ public:
|
|||
set_vr(op.rt, -(get_vr<f64[2]>(op.ra) * get_vr<f64[2]>(op.rb) + get_vr<f64[2]>(op.rt)));
|
||||
}
|
||||
|
||||
// clamping helpers
|
||||
// Upper clamp. Each lane is reinterpreted as a 32-bit signed integer and limited
// to 0x7f7fffff, the bit pattern of the largest finite positive float. Lanes with
// the sign bit set compare as negative integers and therefore pass through
// unchanged (assuming min() on s32[4] is a signed integer minimum).
value_t<f32[4]> clamp_positive_smax(value_t<f32[4]> v)
{
	const auto raw_bits = bitcast<s32[4]>(v);
	const auto max_bits = splat<s32[4]>(0x7f7fffff);

	return eval(bitcast<f32[4]>(min(raw_bits, max_bits)));
}
|
||||
|
||||
// Lower clamp. Each lane is reinterpreted as a 32-bit unsigned integer and limited
// to 0xff7fffff, the bit pattern of the most negative finite float. Every pattern
// with the sign bit clear is numerically smaller than that limit and therefore
// passes through unchanged (assuming min() on u32[4] is an unsigned minimum).
value_t<f32[4]> clamp_negative_smax(value_t<f32[4]> v)
{
	const auto raw_bits = bitcast<u32[4]>(v);
	const auto min_bits = splat<u32[4]>(0xff7fffff);

	return eval(bitcast<f32[4]>(min(raw_bits, min_bits)));
}
|
||||
|
||||
// Two-sided clamp: applies clamp_positive_smax first, then clamp_negative_smax
// to its result.
value_t<f32[4]> clamp_smax(value_t<f32[4]> v)
{
	const auto upper_clamped = clamp_positive_smax(v);

	return eval(clamp_negative_smax(upper_clamped));
}
|
||||
|
||||
// FMA favouring zeros
|
||||
// Computes fmuladd(a', b', c), where a' and b' are the multiplicands a and b with
// every lane cleared to all-zero bits whenever the *other* multiplicand compares
// not-equal-to-zero as false in that lane. In effect, a zero in either multiplicand
// forces both to +0 before the multiply-add.
// NOTE(review): presumably this keeps 0 * Inf/NaN from producing NaN - confirm.
value_t<f32[4]> xmuladd(value_t<f32[4]> a, value_t<f32[4]> b, value_t<f32[4]> c)
{
	const auto zero = fsplat<f32[4]>(0.);

	// Per-lane integer masks: sign-extended result of the "!= 0" comparison.
	const auto a_nonzero = eval(sext<s32[4]>(fcmp_uno(a != zero)));
	const auto b_nonzero = eval(sext<s32[4]>(fcmp_uno(b != zero)));

	// Each multiplicand is masked by the other one's non-zero mask.
	const auto a_masked = eval(bitcast<f32[4]>(bitcast<s32[4]>(a) & b_nonzero));
	const auto b_masked = eval(bitcast<f32[4]>(bitcast<s32[4]>(b) & a_nonzero));

	return eval(fmuladd(a_masked, b_masked, c));
}
|
||||
|
||||
void FREST(spu_opcode_t op)
|
||||
{
|
||||
// TODO
|
||||
|
@ -7215,17 +7241,11 @@ public:
|
|||
const auto a = get_vr<f32[4]>(op.ra);
|
||||
const auto b = get_vr<f32[4]>(op.rb);
|
||||
|
||||
// See FCMGT.
|
||||
if (g_cfg.core.spu_approx_xfloat)
|
||||
{
|
||||
const auto ia = bitcast<s32[4]>(fabs(a));
|
||||
const auto ib = bitcast<s32[4]>(fabs(b));
|
||||
const auto nz = eval((ia > 0x7fffff) | (ib > 0x7fffff));
|
||||
|
||||
// Use sign bits to invert abs values before comparison.
|
||||
const auto ca = eval(ia ^ (bitcast<s32[4]>(a) >> 31));
|
||||
const auto cb = eval(ib ^ (bitcast<s32[4]>(b) >> 31));
|
||||
set_vr(op.rt, sext<s32[4]>((ca > cb) & nz));
|
||||
const auto ca = eval(clamp_positive_smax(a));
|
||||
const auto cb = eval(clamp_negative_smax(b));
|
||||
set_vr(op.rt, sext<s32[4]>(fcmp_ord(ca > cb)));
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -7241,23 +7261,17 @@ public:
|
|||
return;
|
||||
}
|
||||
|
||||
const auto a = get_vr<f32[4]>(op.ra);
|
||||
const auto b = get_vr<f32[4]>(op.rb);
|
||||
const auto abs_a = fabs(a);
|
||||
const auto abs_b = fabs(b);
|
||||
const auto a = eval(fabs(get_vr<f32[4]>(op.ra)));
|
||||
const auto b = eval(fabs(get_vr<f32[4]>(op.rb)));
|
||||
|
||||
// Actually, it's accurate and can be used as an alternative path for accurate xfloat.
|
||||
if (g_cfg.core.spu_approx_xfloat)
|
||||
{
|
||||
// Compare abs values as integers, but return false if both are denormals or zeros.
|
||||
const auto ia = bitcast<s32[4]>(abs_a);
|
||||
const auto ib = bitcast<s32[4]>(abs_b);
|
||||
const auto nz = eval((ia > 0x7fffff) | (ib > 0x7fffff));
|
||||
set_vr(op.rt, sext<s32[4]>((ia > ib) & nz));
|
||||
const auto ca = eval(clamp_positive_smax(a));
|
||||
set_vr(op.rt, sext<s32[4]>(fcmp_ord(ca > b)));
|
||||
}
|
||||
else
|
||||
{
|
||||
set_vr(op.rt, sext<s32[4]>(fcmp_ord(abs_a > abs_b)));
|
||||
set_vr(op.rt, sext<s32[4]>(fcmp_ord(a > b)));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -7273,6 +7287,11 @@ public:
|
|||
{
|
||||
if (g_cfg.core.spu_accurate_xfloat)
|
||||
set_vr(op.rt, get_vr<f64[4]>(op.ra) - get_vr<f64[4]>(op.rb));
|
||||
else if (g_cfg.core.spu_approx_xfloat)
|
||||
{
|
||||
const auto b = eval(clamp_smax(get_vr<f32[4]>(op.rb))); // for #4478
|
||||
set_vr(op.rt, get_vr<f32[4]>(op.ra) - b);
|
||||
}
|
||||
else
|
||||
set_vr(op.rt, get_vr<f32[4]>(op.ra) - get_vr<f32[4]>(op.rb));
|
||||
}
|
||||
|
@ -7285,21 +7304,11 @@ public:
|
|||
{
|
||||
const auto a = get_vr<f32[4]>(op.ra);
|
||||
const auto b = get_vr<f32[4]>(op.rb);
|
||||
const auto m = eval(a * b);
|
||||
const auto abs_a = bitcast<s32[4]>(fabs(a));
|
||||
const auto abs_b = bitcast<s32[4]>(fabs(b));
|
||||
const auto abs_m = bitcast<s32[4]>(fabs(m));
|
||||
const auto sign_a = eval(bitcast<s32[4]>(a) & 0x80000000);
|
||||
const auto sign_b = eval(bitcast<s32[4]>(b) & 0x80000000);
|
||||
const auto smod_m = eval(bitcast<s32[4]>(m) & 0x7fffffff);
|
||||
const auto fmax_m = eval((sign_a ^ sign_b) | 0x7fffffff);
|
||||
const auto nzero = eval((abs_a > 0x7fffff) & (abs_b > 0x7fffff) & (abs_m > 0x7fffff));
|
||||
|
||||
// If m produces Inf or NaN, flush it to max xfloat with appropriate sign
|
||||
const auto clamp = select(smod_m > 0x7f7fffff, bitcast<f32[4]>(fmax_m), m);
|
||||
|
||||
// If a, b, or a * b is a denorm or zero, return zero
|
||||
set_vr(op.rt, select(nzero, clamp, fsplat<f32[4]>(0.)));
|
||||
const auto ma = eval(sext<s32[4]>(fcmp_uno(a != fsplat<f32[4]>(0.))));
|
||||
const auto mb = eval(sext<s32[4]>(fcmp_uno(b != fsplat<f32[4]>(0.))));
|
||||
const auto ca = eval(bitcast<f32[4]>(bitcast<s32[4]>(a) & mb));
|
||||
const auto cb = eval(bitcast<f32[4]>(bitcast<s32[4]>(b) & ma));
|
||||
set_vr(op.rt, ca * cb);
|
||||
}
|
||||
else
|
||||
set_vr(op.rt, get_vr<f32[4]>(op.ra) * get_vr<f32[4]>(op.rb));
|
||||
|
|
Loading…
Add table
Reference in a new issue