Improve approximate xfloat

- Disable denormals for SPU threads
- Add clamping helpers
This commit is contained in:
Markus Stockhausen 2019-10-26 22:51:38 +02:00 committed by Nekotekina
parent 3b46c9cb6a
commit a36f0497ce
2 changed files with 59 additions and 35 deletions

View file

@ -282,6 +282,21 @@ void cpu_thread::operator()()
thread_ctrl::set_native_priority(-1);
}
if (id_type() == 2)
{
    // Force input/output denormals to zero for SPU threads.
    // MXCSR bit 15 (0x8000) is FTZ (flush-to-zero), bit 6 (0x0040) is DAZ (denormals-are-zero).
    _mm_setcsr(_mm_getcsr() | 0x8040);

    // Self-test: 0x1fc00000 is the bit pattern of 1.5 * 2^-64, so its square
    // (2.25 * 2^-128) is below the smallest normal float and must come out as
    // exactly zero once FTZ is in effect.
    // The operand is volatile and is read as a volatile (no cast that strips the
    // qualifier), so the multiplication has to be carried out at run time under
    // the MXCSR value set above instead of being folded by the compiler.
    const volatile u32 a = 0x1fc00000;
    const __m128 b = _mm_castsi128_ps(_mm_set1_epi32(static_cast<int>(a)));
    const int c = _mm_cvtsi128_si32(_mm_castps_si128(_mm_mul_ps(b, b)));

    if (c != 0)
    {
        LOG_FATAL(GENERAL,"could not disable denormals");
    }
}
if (id_type() == 1 && false)
{
g_fxo->get<cpu_profiler>()->registered.push(id);

View file

@ -7186,6 +7186,32 @@ public:
set_vr(op.rt, -(get_vr<f64[2]>(op.ra) * get_vr<f64[2]>(op.rb) + get_vr<f64[2]>(op.rt)));
}
// clamping helpers
value_t<f32[4]> clamp_positive_smax(value_t<f32[4]> v)
{
    // Work on the raw lane bits as signed 32-bit integers and take the minimum
    // with 0x7f7fffff (bit pattern of the largest finite positive float).
    // Lanes whose pattern lies above that value are replaced by it; lanes with
    // the sign bit set are negative as s32 and therefore pass through unchanged.
    const auto limited = min(bitcast<s32[4]>(v), splat<s32[4]>(0x7f7fffff));

    return eval(bitcast<f32[4]>(limited));
}
value_t<f32[4]> clamp_negative_smax(value_t<f32[4]> v)
{
    // Work on the raw lane bits as unsigned 32-bit integers and take the minimum
    // with 0xff7fffff (bit pattern of the largest-magnitude finite negative float).
    // Lanes whose pattern lies above that value are replaced by it; every other
    // lane, including all lanes with a clear sign bit, passes through unchanged.
    const auto limited = min(bitcast<u32[4]>(v), splat<u32[4]>(0xff7fffff));

    return eval(bitcast<f32[4]>(limited));
}
value_t<f32[4]> clamp_smax(value_t<f32[4]> v)
{
    // Apply both one-sided clamps in sequence: first the positive bound,
    // then the negative bound on the result of the first step.
    const auto upper_bounded = clamp_positive_smax(v);

    return eval(clamp_negative_smax(upper_bounded));
}
// FMA favouring zeros
// Computes fmuladd(a', b', c), where a' and b' are the multiplicands with their
// bits cleared in every lane in which the *other* multiplicand is not flagged
// as non-zero. In such lanes both inputs to the multiply are therefore +0.0.
value_t<f32[4]> xmuladd(value_t<f32[4]> a, value_t<f32[4]> b, value_t<f32[4]> c)
{
    // Per-lane masks, sign-extended to all-ones where the comparison holds.
    // NOTE(review): fcmp_uno presumably selects the unordered predicate, so NaN
    // lanes would count as non-zero - confirm against the helper's definition.
    const auto a_nonzero_mask = eval(sext<s32[4]>(fcmp_uno(a != fsplat<f32[4]>(0.))));
    const auto b_nonzero_mask = eval(sext<s32[4]>(fcmp_uno(b != fsplat<f32[4]>(0.))));

    // Gate each multiplicand with the mask derived from the opposite one.
    const auto a_gated = eval(bitcast<f32[4]>(bitcast<s32[4]>(a) & b_nonzero_mask));
    const auto b_gated = eval(bitcast<f32[4]>(bitcast<s32[4]>(b) & a_nonzero_mask));

    return eval(fmuladd(a_gated, b_gated, c));
}
void FREST(spu_opcode_t op)
{
// TODO
@ -7215,17 +7241,11 @@ public:
const auto a = get_vr<f32[4]>(op.ra);
const auto b = get_vr<f32[4]>(op.rb);
// See FCMGT.
if (g_cfg.core.spu_approx_xfloat)
{
const auto ia = bitcast<s32[4]>(fabs(a));
const auto ib = bitcast<s32[4]>(fabs(b));
const auto nz = eval((ia > 0x7fffff) | (ib > 0x7fffff));
// Use sign bits to invert abs values before comparison.
const auto ca = eval(ia ^ (bitcast<s32[4]>(a) >> 31));
const auto cb = eval(ib ^ (bitcast<s32[4]>(b) >> 31));
set_vr(op.rt, sext<s32[4]>((ca > cb) & nz));
const auto ca = eval(clamp_positive_smax(a));
const auto cb = eval(clamp_negative_smax(b));
set_vr(op.rt, sext<s32[4]>(fcmp_ord(ca > cb)));
}
else
{
@ -7241,23 +7261,17 @@ public:
return;
}
const auto a = get_vr<f32[4]>(op.ra);
const auto b = get_vr<f32[4]>(op.rb);
const auto abs_a = fabs(a);
const auto abs_b = fabs(b);
const auto a = eval(fabs(get_vr<f32[4]>(op.ra)));
const auto b = eval(fabs(get_vr<f32[4]>(op.rb)));
// Actually, it's accurate and can be used as an alternative path for accurate xfloat.
if (g_cfg.core.spu_approx_xfloat)
{
// Compare abs values as integers, but return false if both are denormals or zeros.
const auto ia = bitcast<s32[4]>(abs_a);
const auto ib = bitcast<s32[4]>(abs_b);
const auto nz = eval((ia > 0x7fffff) | (ib > 0x7fffff));
set_vr(op.rt, sext<s32[4]>((ia > ib) & nz));
const auto ca = eval(clamp_positive_smax(a));
set_vr(op.rt, sext<s32[4]>(fcmp_ord(ca > b)));
}
else
{
set_vr(op.rt, sext<s32[4]>(fcmp_ord(abs_a > abs_b)));
set_vr(op.rt, sext<s32[4]>(fcmp_ord(a > b)));
}
}
@ -7273,6 +7287,11 @@ public:
{
if (g_cfg.core.spu_accurate_xfloat)
set_vr(op.rt, get_vr<f64[4]>(op.ra) - get_vr<f64[4]>(op.rb));
else if (g_cfg.core.spu_approx_xfloat)
{
const auto b = eval(clamp_smax(get_vr<f32[4]>(op.rb))); // for #4478
set_vr(op.rt, get_vr<f32[4]>(op.ra) - b);
}
else
set_vr(op.rt, get_vr<f32[4]>(op.ra) - get_vr<f32[4]>(op.rb));
}
@ -7285,21 +7304,11 @@ public:
{
const auto a = get_vr<f32[4]>(op.ra);
const auto b = get_vr<f32[4]>(op.rb);
const auto m = eval(a * b);
const auto abs_a = bitcast<s32[4]>(fabs(a));
const auto abs_b = bitcast<s32[4]>(fabs(b));
const auto abs_m = bitcast<s32[4]>(fabs(m));
const auto sign_a = eval(bitcast<s32[4]>(a) & 0x80000000);
const auto sign_b = eval(bitcast<s32[4]>(b) & 0x80000000);
const auto smod_m = eval(bitcast<s32[4]>(m) & 0x7fffffff);
const auto fmax_m = eval((sign_a ^ sign_b) | 0x7fffffff);
const auto nzero = eval((abs_a > 0x7fffff) & (abs_b > 0x7fffff) & (abs_m > 0x7fffff));
// If m produces Inf or NaN, flush it to max xfloat with appropriate sign
const auto clamp = select(smod_m > 0x7f7fffff, bitcast<f32[4]>(fmax_m), m);
// If a, b, or a * b is a denorm or zero, return zero
set_vr(op.rt, select(nzero, clamp, fsplat<f32[4]>(0.)));
const auto ma = eval(sext<s32[4]>(fcmp_uno(a != fsplat<f32[4]>(0.))));
const auto mb = eval(sext<s32[4]>(fcmp_uno(b != fsplat<f32[4]>(0.))));
const auto ca = eval(bitcast<f32[4]>(bitcast<s32[4]>(a) & mb));
const auto cb = eval(bitcast<f32[4]>(bitcast<s32[4]>(b) & ma));
set_vr(op.rt, ca * cb);
}
else
set_vr(op.rt, get_vr<f32[4]>(op.ra) * get_vr<f32[4]>(op.rb));