From f02539e59dc04fc654fd1b4d1bc839f47f142b42 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Sun, 23 Aug 2015 21:46:36 -0300
Subject: [PATCH 1/6] Shader JIT: Fix CMP NaN behavior to match hardware

---
 src/video_core/shader/shader_jit_x64.cpp | 31 ++++++++++++++++++------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index 6865c64e35..2a1e510133 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -578,27 +578,42 @@ void JitCompiler::Compile_CALLU(Instruction instr) {
 }
 
 void JitCompiler::Compile_CMP(Instruction instr) {
+    using Op = Instruction::Common::CompareOpType::Op;
+    Op op_x = instr.common.compare_op.x;
+    Op op_y = instr.common.compare_op.y;
+
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
 
-    static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_NLE, CMP_NLT };
+    // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to
+    // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here
+    // because they don't match when used with NaNs.
+    static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE };
 
-    if (instr.common.compare_op.x == instr.common.compare_op.y) {
+    bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual);
+    Gen::X64Reg lhs_x = invert_op_x ? SRC2 : SRC1;
+    Gen::X64Reg rhs_x = invert_op_x ? SRC1 : SRC2;
+
+    if (op_x == op_y) {
         // Compare X-component and Y-component together
-        CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.x]);
+        CMPPS(lhs_x, R(rhs_x), cmp[op_x]);
+        MOVQ_xmm(R(COND0), lhs_x);
 
-        MOVQ_xmm(R(COND0), SRC1);
         MOV(64, R(COND1), R(COND0));
     } else {
+        bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual);
+        Gen::X64Reg lhs_y = invert_op_y ? SRC2 : SRC1;
+        Gen::X64Reg rhs_y = invert_op_y ? SRC1 : SRC2;
+
         // Compare X-component
-        MOVAPS(SCRATCH, R(SRC1));
-        CMPSS(SCRATCH, R(SRC2), cmp[instr.common.compare_op.x]);
+        MOVAPS(SCRATCH, R(lhs_x));
+        CMPSS(SCRATCH, R(rhs_x), cmp[op_x]);
 
         // Compare Y-component
-        CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.y]);
+        CMPPS(lhs_y, R(rhs_y), cmp[op_y]);
 
         MOVQ_xmm(R(COND0), SCRATCH);
-        MOVQ_xmm(R(COND1), SRC1);
+        MOVQ_xmm(R(COND1), lhs_y);
     }
 
     SHR(32, R(COND0), Imm8(31));

From c0959ca635308c54f178e7b58e36fc887980f215 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Mon, 24 Aug 2015 01:46:10 -0300
Subject: [PATCH 2/6] Shader JIT: Add name to second scratch register (XMM4)

---
 src/video_core/shader/shader_jit_x64.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index 2a1e510133..e56bff2b8a 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -115,6 +115,8 @@ static const X64Reg SRC1 = XMM1;
 static const X64Reg SRC2 = XMM2;
 /// Loaded with the third swizzled source register, otherwise can be used as a scratch register
 static const X64Reg SRC3 = XMM3;
+/// Additional scratch register
+static const X64Reg SCRATCH2 = XMM4;
 /// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one
 static const X64Reg ONE = XMM14;
 /// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR
@@ -227,8 +229,8 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
             u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
             BLENDPS(SCRATCH, R(src), mask);
         } else {
-            MOVAPS(XMM4, R(src));
-            UNPCKHPS(XMM4, R(SCRATCH)); // Unpack X/Y components of source and destination
+            MOVAPS(SCRATCH2, R(src));
+            UNPCKHPS(SCRATCH2, R(SCRATCH)); // Unpack X/Y components of source and destination
             UNPCKLPS(SCRATCH, R(src)); // Unpack Z/W components of source and destination
 
             // Compute selector to selectively copy source components to destination for SHUFPS instruction
@@ -236,7 +238,7 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
                      ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) |
                      ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) |
                      ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6);
-            SHUFPS(SCRATCH, R(XMM4), sel);
+            SHUFPS(SCRATCH, R(SCRATCH2), sel);
         }
 
         // Store dest back to memory

From 8b0a7e7afe339d6186d1ca2c22a105b53ac4b4a8 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Mon, 24 Aug 2015 01:46:58 -0300
Subject: [PATCH 3/6] Shaders: Explicitly conform to PICA semantics in MAX/MIN

---
 src/video_core/shader/shader_interpreter.cpp | 10 ++++++++--
 src/video_core/shader/shader_jit_x64.cpp     |  2 ++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index ae5a304419..69e4efa689 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -177,7 +177,10 @@ void RunInterpreter(UnitState<Debug>& state) {
                     if (!swizzle.DestComponentEnabled(i))
                         continue;
 
-                    dest[i] = std::max(src1[i], src2[i]);
+                    // NOTE: Exact form required to match NaN semantics to hardware:
+                    //   max(0, NaN) -> NaN
+                    //   max(NaN, 0) -> 0
+                    dest[i] = (src1[i] > src2[i]) ? src1[i] : src2[i];
                 }
                 Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
                 break;
@@ -190,7 +193,10 @@ void RunInterpreter(UnitState<Debug>& state) {
                     if (!swizzle.DestComponentEnabled(i))
                         continue;
 
-                    dest[i] = std::min(src1[i], src2[i]);
+                    // NOTE: Exact form required to match NaN semantics to hardware:
+                    //   min(0, NaN) -> NaN
+                    //   min(NaN, 0) -> 0
+                    dest[i] = (src1[i] < src2[i]) ? src1[i] : src2[i];
                 }
                 Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
                 break;
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index e56bff2b8a..456c8567d5 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -467,6 +467,7 @@ void JitCompiler::Compile_FLR(Instruction instr) {
 void JitCompiler::Compile_MAX(Instruction instr) {
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+    // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
     MAXPS(SRC1, R(SRC2));
     Compile_DestEnable(instr, SRC1);
 }
@@ -474,6 +475,7 @@ void JitCompiler::Compile_MAX(Instruction instr) {
 void JitCompiler::Compile_MIN(Instruction instr) {
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+    // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
     MINPS(SRC1, R(SRC2));
     Compile_DestEnable(instr, SRC1);
 }

From 9a4a0cc8e052d1e1b0339b88818d2a9daeb75612 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Mon, 24 Aug 2015 01:48:15 -0300
Subject: [PATCH 4/6] Shaders: Fix multiplications between 0.0 and inf

The PICA200 semantics for multiplication are so that when multiplying
inf by exactly 0.0, the result is 0.0, instead of NaN, as defined by
IEEE. This is relied upon by games.

Fixes #1024 (missing OoT interface items)
---
 src/video_core/pica.h                    | 14 +++-
 src/video_core/shader/shader_jit_x64.cpp | 82 ++++++++++++------------
 src/video_core/shader/shader_jit_x64.h   |  6 ++
 3 files changed, 60 insertions(+), 42 deletions(-)

diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 58b924f9e1..cf148de500 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -1021,12 +1021,20 @@ struct float24 {
         return ret;
     }
 
+    static float24 Zero() {
+        return FromFloat32(0.f);
+    }
+
     // Not recommended for anything but logging
     float ToFloat32() const {
         return value;
     }
 
     float24 operator * (const float24& flt) const {
+        if ((this->value == 0.f && flt.value == flt.value) ||
+            (flt.value == 0.f && this->value == this->value))
+            // PICA gives 0 instead of NaN when multiplying by inf
+            return Zero();
         return float24::FromFloat32(ToFloat32() * flt.ToFloat32());
     }
 
@@ -1043,7 +1051,11 @@ struct float24 {
     }
 
     float24& operator *= (const float24& flt) {
-        value *= flt.ToFloat32();
+        if ((this->value == 0.f && flt.value == flt.value) ||
+            (flt.value == 0.f && this->value == this->value))
+            // PICA gives 0 instead of NaN when multiplying by inf
+            *this = Zero();
+        else value *= flt.ToFloat32();
         return *this;
     }
 
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index 456c8567d5..ddae61caeb 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -246,6 +246,19 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
     }
 }
 
+void JitCompiler::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch) {
+    MOVAPS(scratch, R(src1));
+    CMPPS(scratch, R(src2), CMP_ORD);
+
+    MULPS(src1, R(src2));
+
+    MOVAPS(src2, R(src1));
+    CMPPS(src2, R(src2), CMP_UNORD);
+
+    XORPS(scratch, R(src2));
+    ANDPS(src1, R(scratch));
+}
+
 void JitCompiler::Compile_EvaluateCondition(Instruction instr) {
     // Note: NXOR is used below to check for equality
     switch (instr.flow_control.op) {
@@ -309,21 +322,17 @@ void JitCompiler::Compile_DP3(Instruction instr) {
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
 
-    if (Common::GetCPUCaps().sse4_1) {
-        DPPS(SRC1, R(SRC2), 0x7f);
-    } else {
-        MULPS(SRC1, R(SRC2));
+    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
 
-        MOVAPS(SRC2, R(SRC1));
-        SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1));
+    MOVAPS(SRC2, R(SRC1));
+    SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1));
 
-        MOVAPS(SRC3, R(SRC1));
-        SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2));
+    MOVAPS(SRC3, R(SRC1));
+    SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2));
 
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0));
-        ADDPS(SRC1, R(SRC2));
-        ADDPS(SRC1, R(SRC3));
-    }
+    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0));
+    ADDPS(SRC1, R(SRC2));
+    ADDPS(SRC1, R(SRC3));
 
     Compile_DestEnable(instr, SRC1);
 }
@@ -332,19 +341,15 @@ void JitCompiler::Compile_DP4(Instruction instr) {
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
 
-    if (Common::GetCPUCaps().sse4_1) {
-        DPPS(SRC1, R(SRC2), 0xff);
-    } else {
-        MULPS(SRC1, R(SRC2));
+    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
 
-        MOVAPS(SRC2, R(SRC1));
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
-        ADDPS(SRC1, R(SRC2));
+    MOVAPS(SRC2, R(SRC1));
+    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
+    ADDPS(SRC1, R(SRC2));
 
-        MOVAPS(SRC2, R(SRC1));
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
-        ADDPS(SRC1, R(SRC2));
-    }
+    MOVAPS(SRC2, R(SRC1));
+    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
+    ADDPS(SRC1, R(SRC2));
 
     Compile_DestEnable(instr, SRC1);
 }
@@ -361,24 +366,23 @@ void JitCompiler::Compile_DPH(Instruction instr) {
     if (Common::GetCPUCaps().sse4_1) {
         // Set 4th component to 1.0
         BLENDPS(SRC1, R(ONE), 0x8); // 0b1000
-        DPPS(SRC1, R(SRC2), 0xff);
     } else {
         // Reverse to set the 4th component to 1.0
         SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3));
         MOVSS(SRC1, R(ONE));
         SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3));
-
-        MULPS(SRC1, R(SRC2));
-
-        MOVAPS(SRC2, R(SRC1));
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
-        ADDPS(SRC1, R(SRC2));
-
-        MOVAPS(SRC2, R(SRC1));
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
-        ADDPS(SRC1, R(SRC2));
     }
 
+    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
+
+    MOVAPS(SRC2, R(SRC1));
+    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
+    ADDPS(SRC1, R(SRC2));
+
+    MOVAPS(SRC2, R(SRC1));
+    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
+    ADDPS(SRC1, R(SRC2));
+
     Compile_DestEnable(instr, SRC1);
 }
 
@@ -417,7 +421,7 @@ void JitCompiler::Compile_LG2(Instruction instr) {
 void JitCompiler::Compile_MUL(Instruction instr) {
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
-    MULPS(SRC1, R(SRC2));
+    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
     Compile_DestEnable(instr, SRC1);
 }
 
@@ -635,12 +639,8 @@ void JitCompiler::Compile_MAD(Instruction instr) {
         Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3);
     }
 
-    if (Common::GetCPUCaps().fma) {
-        VFMADD213PS(SRC1, SRC2, R(SRC3));
-    } else {
-        MULPS(SRC1, R(SRC2));
-        ADDPS(SRC1, R(SRC3));
-    }
+    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
+    ADDPS(SRC1, R(SRC3));
 
     Compile_DestEnable(instr, SRC1);
 }
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h
index fbe19fe933..58828ecc8c 100644
--- a/src/video_core/shader/shader_jit_x64.h
+++ b/src/video_core/shader/shader_jit_x64.h
@@ -68,6 +68,12 @@ private:
     void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest);
     void Compile_DestEnable(Instruction instr, Gen::X64Reg dest);
 
+    /**
+     * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying
+     * zero by inf. Clobbers `src2` and `scratch`.
+     */
+    void Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch);
+
     void Compile_EvaluateCondition(Instruction instr);
     void Compile_UniformCondition(Instruction instr);
 

From 9023d7a1d467d7cfb50e1a2b165313f945512b27 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Mon, 24 Aug 2015 01:15:39 -0300
Subject: [PATCH 5/6] Shader JIT: Tiny micro-optimization in DPH

---
 src/video_core/shader/shader_jit_x64.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index ddae61caeb..c8a669b517 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -367,10 +367,10 @@ void JitCompiler::Compile_DPH(Instruction instr) {
         // Set 4th component to 1.0
         BLENDPS(SRC1, R(ONE), 0x8); // 0b1000
     } else {
-        // Reverse to set the 4th component to 1.0
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3));
-        MOVSS(SRC1, R(ONE));
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3));
+        // Set 4th component to 1.0
+        MOVAPS(SCRATCH, R(SRC1));
+        UNPCKHPS(SCRATCH, R(ONE));  // XYZW, 1111 -> Z1__
+        UNPCKLPD(SRC1, R(SCRATCH)); // XYZW, Z1__ -> XYZ1
     }
 
     Compile_SanitizedMul(SRC1, SRC2, SCRATCH);

From 08e13a10f7558aa0bad99c539bf6eca875deec7c Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Mon, 24 Aug 2015 02:10:11 -0300
Subject: [PATCH 6/6] fixup! Shaders: Fix multiplications between 0.0 and inf

---
 src/video_core/pica.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index cf148de500..bb689f2a99 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -1031,8 +1031,8 @@ struct float24 {
     }
 
     float24 operator * (const float24& flt) const {
-        if ((this->value == 0.f && flt.value == flt.value) ||
-            (flt.value == 0.f && this->value == this->value))
+        if ((this->value == 0.f && !std::isnan(flt.value)) ||
+            (flt.value == 0.f && !std::isnan(this->value)))
             // PICA gives 0 instead of NaN when multiplying by inf
             return Zero();
         return float24::FromFloat32(ToFloat32() * flt.ToFloat32());
@@ -1051,8 +1051,8 @@ struct float24 {
     }
 
     float24& operator *= (const float24& flt) {
-        if ((this->value == 0.f && flt.value == flt.value) ||
-            (flt.value == 0.f && this->value == this->value))
+        if ((this->value == 0.f && !std::isnan(flt.value)) ||
+            (flt.value == 0.f && !std::isnan(this->value)))
             // PICA gives 0 instead of NaN when multiplying by inf
             *this = Zero();
         else value *= flt.ToFloat32();