diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
index 87bc5897c9..98a5288f62 100644
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
@@ -2682,27 +2682,71 @@ void spu_recompiler::MPYA(spu_opcode_t op)
 
 void spu_recompiler::FNMS(spu_opcode_t op)
 {
-	const XmmLink& va = XmmGet(op.ra, XmmType::Float);
 	const XmmLink& vc = XmmGet(op.rc, XmmType::Float);
-	c->mulps(va, SPU_OFF_128(gpr[op.rb]));
-	c->subps(vc, va);
+
+	const auto mask = XmmConst(_mm_set1_epi32(0x7f800000));
+	const XmmLink& tmp_a = XmmAlloc();
+	const XmmLink& tmp_b = XmmAlloc();
+
+	c->movdqa(tmp_a, mask); //tmp_a = mask
+	c->andps(tmp_a, SPU_OFF_128(gpr[op.ra])); //tmp_a = a & mask
+	c->cmpps(tmp_a, mask, 4); //tmp_a = tmp_a != mask
+	c->andps(tmp_a, SPU_OFF_128(gpr[op.ra])); //tmp_a = mask_a & va
+
+	c->movdqa(tmp_b, mask); //tmp_b = mask
+	c->andps(tmp_b, SPU_OFF_128(gpr[op.rb])); //tmp_b = b & mask
+	c->cmpps(tmp_b, mask, 4); //tmp_b = tmp_b != mask
+	c->andps(tmp_b, SPU_OFF_128(gpr[op.rb])); //tmp_b = mask_b & vb
+
+	c->mulps(tmp_a, tmp_b);
+	c->subps(vc, tmp_a);
 	c->movaps(SPU_OFF_128(gpr[op.rt4]), vc);
 }
 
 void spu_recompiler::FMA(spu_opcode_t op)
 {
-	const XmmLink& va = XmmGet(op.ra, XmmType::Float);
-	c->mulps(va, SPU_OFF_128(gpr[op.rb]));
-	c->addps(va, SPU_OFF_128(gpr[op.rc]));
-	c->movaps(SPU_OFF_128(gpr[op.rt4]), va);
+	const XmmLink& vc = XmmGet(op.rc, XmmType::Float);
+
+	const auto mask = XmmConst(_mm_set1_epi32(0x7f800000));
+	const XmmLink& tmp_a = XmmAlloc();
+	const XmmLink& tmp_b = XmmAlloc();
+
+	c->movdqa(tmp_a, mask); //tmp_a = mask
+	c->andps(tmp_a, SPU_OFF_128(gpr[op.ra])); //tmp_a = a & mask
+	c->cmpps(tmp_a, mask, 4); //tmp_a = tmp_a != mask
+	c->andps(tmp_a, SPU_OFF_128(gpr[op.ra])); //tmp_a = mask_a & va
+
+	c->movdqa(tmp_b, mask); //tmp_b = mask
+	c->andps(tmp_b, SPU_OFF_128(gpr[op.rb])); //tmp_b = b & mask
+	c->cmpps(tmp_b, mask, 4); //tmp_b = tmp_b != mask
+	c->andps(tmp_b, SPU_OFF_128(gpr[op.rb])); //tmp_b = mask_b & vb
+
+	c->mulps(tmp_a, tmp_b);
+	c->addps(tmp_a, SPU_OFF_128(gpr[op.rc]));
+	c->movaps(SPU_OFF_128(gpr[op.rt4]), tmp_a);
 }
 
 void spu_recompiler::FMS(spu_opcode_t op)
 {
-	const XmmLink& va = XmmGet(op.ra, XmmType::Float);
-	c->mulps(va, SPU_OFF_128(gpr[op.rb]));
-	c->subps(va, SPU_OFF_128(gpr[op.rc]));
-	c->movaps(SPU_OFF_128(gpr[op.rt4]), va);
+	const XmmLink& vc = XmmGet(op.rc, XmmType::Float);
+
+	const auto mask = XmmConst(_mm_set1_epi32(0x7f800000));
+	const XmmLink& tmp_a = XmmAlloc();
+	const XmmLink& tmp_b = XmmAlloc();
+
+	c->movdqa(tmp_a, mask); //tmp_a = mask
+	c->andps(tmp_a, SPU_OFF_128(gpr[op.ra])); //tmp_a = a & mask
+	c->cmpps(tmp_a, mask, 4); //tmp_a = tmp_a != mask
+	c->andps(tmp_a, SPU_OFF_128(gpr[op.ra])); //tmp_a = mask_a & va
+
+	c->movdqa(tmp_b, mask); //tmp_b = mask
+	c->andps(tmp_b, SPU_OFF_128(gpr[op.rb])); //tmp_b = b & mask
+	c->cmpps(tmp_b, mask, 4); //tmp_b = tmp_b != mask
+	c->andps(tmp_b, SPU_OFF_128(gpr[op.rb])); //tmp_b = mask_b & vb
+
+	c->mulps(tmp_a, tmp_b);
+	c->subps(tmp_a, SPU_OFF_128(gpr[op.rc]));
+	c->movaps(SPU_OFF_128(gpr[op.rt4]), tmp_a);
 }
 
 void spu_recompiler::UNK(spu_opcode_t op)
diff --git a/rpcs3/Emu/Cell/SPUInterpreter.cpp b/rpcs3/Emu/Cell/SPUInterpreter.cpp
index 38aa2571ce..fdea67717d 100644
--- a/rpcs3/Emu/Cell/SPUInterpreter.cpp
+++ b/rpcs3/Emu/Cell/SPUInterpreter.cpp
@@ -1297,17 +1297,50 @@ void spu_interpreter::MPYA(SPUThread& spu, spu_opcode_t op)
 
 void spu_interpreter_fast::FNMS(SPUThread& spu, spu_opcode_t op)
 {
-	spu.gpr[op.rt4].vf = _mm_sub_ps(spu.gpr[op.rc].vf, _mm_mul_ps(spu.gpr[op.ra].vf, spu.gpr[op.rb].vf));
+	const u32 test_bits = 0x7f800000;
+	auto mask = _mm_set1_ps((f32&)test_bits);
+
+	auto test_a = _mm_and_ps(spu.gpr[op.ra].vf, mask);
+	auto mask_a = _mm_cmpneq_ps(test_a, mask);
+	auto test_b = _mm_and_ps(spu.gpr[op.rb].vf, mask);
+	auto mask_b = _mm_cmpneq_ps(test_b, mask);
+
+	auto a = _mm_and_ps(spu.gpr[op.ra].vf, mask_a);
+	auto b = _mm_and_ps(spu.gpr[op.rb].vf, mask_b);
+
+	spu.gpr[op.rt4].vf = _mm_sub_ps(spu.gpr[op.rc].vf, _mm_mul_ps(a, b));
 }
 
 void spu_interpreter_fast::FMA(SPUThread& spu, spu_opcode_t op)
 {
-	spu.gpr[op.rt4].vf = _mm_add_ps(_mm_mul_ps(spu.gpr[op.ra].vf, spu.gpr[op.rb].vf), spu.gpr[op.rc].vf);
+	const u32 test_bits = 0x7f800000;
+	auto mask = _mm_set1_ps((f32&)test_bits);
+
+	auto test_a = _mm_and_ps(spu.gpr[op.ra].vf, mask);
+	auto mask_a = _mm_cmpneq_ps(test_a, mask);
+	auto test_b = _mm_and_ps(spu.gpr[op.rb].vf, mask);
+	auto mask_b = _mm_cmpneq_ps(test_b, mask);
+
+	auto a = _mm_and_ps(spu.gpr[op.ra].vf, mask_a);
+	auto b = _mm_and_ps(spu.gpr[op.rb].vf, mask_b);
+
+	spu.gpr[op.rt4].vf = _mm_add_ps(_mm_mul_ps(a, b), spu.gpr[op.rc].vf);
 }
 
 void spu_interpreter_fast::FMS(SPUThread& spu, spu_opcode_t op)
 {
-	spu.gpr[op.rt4].vf = _mm_sub_ps(_mm_mul_ps(spu.gpr[op.ra].vf, spu.gpr[op.rb].vf), spu.gpr[op.rc].vf);
+	const u32 test_bits = 0x7f800000;
+	auto mask = _mm_set1_ps((f32&)test_bits);
+
+	auto test_a = _mm_and_ps(spu.gpr[op.ra].vf, mask);
+	auto mask_a = _mm_cmpneq_ps(test_a, mask);
+	auto test_b = _mm_and_ps(spu.gpr[op.rb].vf, mask);
+	auto mask_b = _mm_cmpneq_ps(test_b, mask);
+
+	auto a = _mm_and_ps(spu.gpr[op.ra].vf, mask_a);
+	auto b = _mm_and_ps(spu.gpr[op.rb].vf, mask_b);
+
+	spu.gpr[op.rt4].vf = _mm_sub_ps(_mm_mul_ps(a, b), spu.gpr[op.rc].vf);
 }
 
 static void SetHostRoundingMode(u32 rn)
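All six handlers apply the same pre-step before the multiply: lanes of ra and rb whose exponent field is all ones (the host's Inf/NaN encodings, selected by 0x7f800000) are compared against the mask with CMPNEQ and then ANDed back into the operand, so those lanes become zero and never reach the product; rc passes through unchanged. A minimal scalar sketch of that per-lane behaviour follows. The helper names (zero_if_ext, fma_lane) are illustrative only and do not exist in the RPCS3 source.

#include <cstdint>
#include <cstring>

// Zero a value whose exponent field is all ones, mirroring the
// andps/cmpneqps/andps sequence against the 0x7f800000 constant.
static float zero_if_ext(float x)
{
	std::uint32_t bits;
	std::memcpy(&bits, &x, sizeof(bits));
	return (bits & 0x7f800000u) == 0x7f800000u ? 0.0f : x;
}

// Per-lane result of the rewritten handlers, with a' = zero_if_ext(a),
// b' = zero_if_ext(b):
//   FMA:  rt = a' * b' + c
//   FMS:  rt = a' * b' - c
//   FNMS: rt = c - a' * b'
static float fma_lane(float a, float b, float c)
{
	return zero_if_ext(a) * zero_if_ext(b) + c;
}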