diff --git a/rpcs3/Emu/CPU/CPUThread.cpp b/rpcs3/Emu/CPU/CPUThread.cpp
index 254ada8636..35234b063b 100644
--- a/rpcs3/Emu/CPU/CPUThread.cpp
+++ b/rpcs3/Emu/CPU/CPUThread.cpp
@@ -282,6 +282,21 @@ void cpu_thread::operator()()
 		thread_ctrl::set_native_priority(-1);
 	}
 
+	if (id_type() == 2)
+	{
+		// Force input/output denormals to zero for SPU threads: MXCSR FTZ (0x8000) | DAZ (0x0040)
+		_mm_setcsr( _mm_getcsr() | 0x8040 );
+		// Self-test: 0x1fc00000 is 1.5 * 2^-64, so its square is subnormal and must come back as zero
+		volatile u32 a = 0x1fc00000;
+		__m128 b = _mm_castsi128_ps(_mm_set1_epi32(static_cast<int>(a))); // genuine volatile read: keeps the multiply at run time
+		int c = _mm_cvtsi128_si32(_mm_castps_si128(_mm_mul_ps(b,b)));
+
+		if (c != 0)
+		{
+			LOG_FATAL(GENERAL,"could not disable denormals");
+		}
+	}
+
 	if (id_type() == 1 && false)
 	{
 		g_fxo->get<cpu_profiler>()->registered.push(id);
diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp
index 9567b914e5..71fe86c19f 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@@ -7186,6 +7186,32 @@ public:
 		set_vr(op.rt, -(get_vr<f64[2]>(op.ra) * get_vr<f64[2]>(op.rb) + get_vr<f64[2]>(op.rt)));
 	}
 
+	// Clamping helpers: work on the raw lane bits so encodings above the largest finite float (Inf/NaN patterns) are capped
+	value_t<f32[4]> clamp_positive_smax(value_t<f32[4]> v)
+	{
+		return eval(bitcast<f32[4]>(min(bitcast<s32[4]>(v),splat<s32[4]>(0x7f7fffff)))); // s32 min vs 0x7f7fffff (FLT_MAX bits); presumably signed, so sign-bit-set lanes pass through
+	}
+
+	value_t<f32[4]> clamp_negative_smax(value_t<f32[4]> v) // caps lanes whose raw bits exceed 0xff7fffff (-FLT_MAX bits)
+	{
+		return eval(bitcast<f32[4]>(min(bitcast<u32[4]>(v),splat<u32[4]>(0xff7fffff)))); // u32 min; presumably unsigned, so sign-bit-clear lanes pass through
+	}
+
+	value_t<f32[4]> clamp_smax(value_t<f32[4]> v) // applies both one-sided clamps to v
+	{
+		return eval(clamp_negative_smax(clamp_positive_smax(v))); // positive-side clamp first, then negative-side clamp on its result
+	}
+
+	// FMA favouring zeros: computes fmuladd(a, b, c), but each factor is first zeroed in lanes where the other factor compares equal to zero
+	value_t<f32[4]> xmuladd(value_t<f32[4]> a, value_t<f32[4]> b, value_t<f32[4]> c)
+	{
+		const auto ma = eval(sext<s32[4]>(fcmp_uno(a != fsplat<f32[4]>(0.)))); // lane mask: set where a != 0 (presumably also where a is NaN, given the unordered compare)
+		const auto mb = eval(sext<s32[4]>(fcmp_uno(b != fsplat<f32[4]>(0.)))); // same mask built from b
+		const auto ca = eval(bitcast<f32[4]>(bitcast<s32[4]>(a) & mb)); // a with its bits cleared where b's mask is clear
+		const auto cb = eval(bitcast<f32[4]>(bitcast<s32[4]>(b) & ma)); // b with its bits cleared where a's mask is clear
+		return eval(fmuladd(ca, cb, c));
+	}
+
 	void FREST(spu_opcode_t op)
 	{
 		// TODO
@@ -7215,17 +7241,11 @@ public:
 		const auto a = get_vr<f32[4]>(op.ra);
 		const auto b = get_vr<f32[4]>(op.rb);
 
-		// See FCMGT.
 		if (g_cfg.core.spu_approx_xfloat)
 		{
-			const auto ia = bitcast<s32[4]>(fabs(a));
-			const auto ib = bitcast<s32[4]>(fabs(b));
-			const auto nz = eval((ia > 0x7fffff) | (ib > 0x7fffff));
-
-			// Use sign bits to invert abs values before comparison.
-			const auto ca = eval(ia ^ (bitcast<s32[4]>(a) >> 31));
-			const auto cb = eval(ib ^ (bitcast<s32[4]>(b) >> 31));
-			set_vr(op.rt, sext<s32[4]>((ca > cb) & nz));
+			const auto ca = eval(clamp_positive_smax(a));
+			const auto cb = eval(clamp_negative_smax(b));
+			set_vr(op.rt, sext<s32[4]>(fcmp_ord(ca > cb)));
 		}
 		else
 		{
@@ -7241,23 +7261,17 @@ public:
 			return;
 		}
 
-		const auto a = get_vr<f32[4]>(op.ra);
-		const auto b = get_vr<f32[4]>(op.rb);
-		const auto abs_a = fabs(a);
-		const auto abs_b = fabs(b);
+		const auto a = eval(fabs(get_vr<f32[4]>(op.ra)));
+		const auto b = eval(fabs(get_vr<f32[4]>(op.rb)));
 
-		// Actually, it's accurate and can be used as an alternative path for accurate xfloat.
 		if (g_cfg.core.spu_approx_xfloat)
 		{
-			// Compare abs values as integers, but return false if both are denormals or zeros.
-			const auto ia = bitcast<s32[4]>(abs_a);
-			const auto ib = bitcast<s32[4]>(abs_b);
-			const auto nz = eval((ia > 0x7fffff) | (ib > 0x7fffff));
-			set_vr(op.rt, sext<s32[4]>((ia > ib) & nz));
+			const auto ca = eval(clamp_positive_smax(a));
+			set_vr(op.rt, sext<s32[4]>(fcmp_ord(ca > b)));
 		}
 		else
 		{
-			set_vr(op.rt, sext<s32[4]>(fcmp_ord(abs_a > abs_b)));
+			set_vr(op.rt, sext<s32[4]>(fcmp_ord(a > b)));
 		}
 	}
 
@@ -7273,6 +7287,11 @@ public:
 	{
 		if (g_cfg.core.spu_accurate_xfloat)
 			set_vr(op.rt, get_vr<f64[4]>(op.ra) - get_vr<f64[4]>(op.rb));
+		else if (g_cfg.core.spu_approx_xfloat)
+		{
+			const auto b = eval(clamp_smax(get_vr<f32[4]>(op.rb))); // for #4478
+			set_vr(op.rt, get_vr<f32[4]>(op.ra) - b);
+		}
 		else
 			set_vr(op.rt, get_vr<f32[4]>(op.ra) - get_vr<f32[4]>(op.rb));
 	}
@@ -7285,21 +7304,11 @@ public:
 		{
 			const auto a = get_vr<f32[4]>(op.ra);
 			const auto b = get_vr<f32[4]>(op.rb);
-			const auto m = eval(a * b);
-			const auto abs_a = bitcast<s32[4]>(fabs(a));
-			const auto abs_b = bitcast<s32[4]>(fabs(b));
-			const auto abs_m = bitcast<s32[4]>(fabs(m));
-			const auto sign_a = eval(bitcast<s32[4]>(a) & 0x80000000);
-			const auto sign_b = eval(bitcast<s32[4]>(b) & 0x80000000);
-			const auto smod_m = eval(bitcast<s32[4]>(m) & 0x7fffffff);
-			const auto fmax_m = eval((sign_a ^ sign_b) | 0x7fffffff);
-			const auto nzero = eval((abs_a > 0x7fffff) & (abs_b > 0x7fffff) & (abs_m > 0x7fffff));
-
-			// If m produces Inf or NaN, flush it to max xfloat with appropriate sign
-			const auto clamp = select(smod_m > 0x7f7fffff, bitcast<f32[4]>(fmax_m), m);
-
-			// If a, b, or a * b is a denorm or zero, return zero
-			set_vr(op.rt, select(nzero, clamp, fsplat<f32[4]>(0.)));
+			const auto ma = eval(sext<s32[4]>(fcmp_uno(a != fsplat<f32[4]>(0.))));
+			const auto mb = eval(sext<s32[4]>(fcmp_uno(b != fsplat<f32[4]>(0.))));
+			const auto ca = eval(bitcast<f32[4]>(bitcast<s32[4]>(a) & mb));
+			const auto cb = eval(bitcast<f32[4]>(bitcast<s32[4]>(b) & ma));
+			set_vr(op.rt, ca * cb);
 		}
 		else
 			set_vr(op.rt, get_vr<f32[4]>(op.ra) * get_vr<f32[4]>(op.rb));