diff --git a/Utilities/BEType.h b/Utilities/BEType.h
index 9f4c96b5de..806efcbdb2 100644
--- a/Utilities/BEType.h
+++ b/Utilities/BEType.h
@@ -354,6 +354,16 @@ union _CRT_ALIGN(16) u128
 		return from64(~_u64[0], ~_u64[1]);
 	}
 
+	__forceinline bool test() const
+	{
+		return _u64[0] || _u64[1]; // true if at least one bit is set
+	}
+
+	__forceinline bool inv_test() const
+	{
+		return ~_u64[0] || ~_u64[1]; // true if at least one bit is clear
+	}
+
 	// result = (~left) & (right)
 	static __forceinline u128 andnot(const u128& left, const u128& right)
 	{
diff --git a/rpcs3/Emu/Cell/PPUInterpreter.cpp b/rpcs3/Emu/Cell/PPUInterpreter.cpp
index b12a22050e..887266218f 100644
--- a/rpcs3/Emu/Cell/PPUInterpreter.cpp
+++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp
@@ -225,32 +225,14 @@ void ppu_interpreter::VCMPBFP(PPUThread& CPU, ppu_opcode_t op)
 	const auto b = CPU.VPR[op.vb].vf;
 	const auto sign = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
 	const auto bneg = _mm_xor_ps(b, sign);
-	CPU.VPR[op.vd].vf = _mm_or_ps(_mm_and_ps(_mm_cmple_ps(a, b), sign), _mm_and_ps(_mm_cmpnlt_ps(a, bneg), _mm_castsi128_ps(_mm_set1_epi32(0x40000000))));
+	CPU.VPR[op.vd].vf = _mm_or_ps(_mm_and_ps(_mm_cmpnle_ps(a, b), sign), _mm_and_ps(_mm_cmpnge_ps(a, bneg), _mm_castsi128_ps(_mm_set1_epi32(0x40000000)))); // bit 31: !(a <= b), bit 30: !(a >= -b); an in-bounds lane is zero, a NaN lane gets both bits
 }
 
 void ppu_interpreter::VCMPBFP_(PPUThread& CPU, ppu_opcode_t op)
 {
-	bool allInBounds = true;
+	VCMPBFP(CPU, op);
 
-	for (uint w = 0; w < 4; w++)
-	{
-		u32 mask = 1 << 31 | 1 << 30;
-
-		const float a = CPU.VPR[op.va]._f[w];
-		const float b = CPU.VPR[op.vb]._f[w];
-
-		if (a <= b) mask &= ~(1 << 31);
-		if (a >= -b) mask &= ~(1 << 30);
-
-		CPU.VPR[op.vd]._u32[w] = mask;
-
-		if (mask)
-			allInBounds = false;
-	}
-
-	// Bit n°2 of CR6
-	CPU.SetCR(6, 0);
-	CPU.SetCRBit(6, 0x2, allInBounds);
+	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? 0 : 2; // set 2 if all in bounds
 }
 
 void ppu_interpreter::VCMPEQFP(PPUThread& CPU, ppu_opcode_t op)
@@ -260,24 +242,9 @@ void ppu_interpreter::VCMPEQFP(PPUThread& CPU, ppu_opcode_t op)
 
 void ppu_interpreter::VCMPEQFP_(PPUThread& CPU, ppu_opcode_t op)
 {
-	int all_equal = 0x8;
-	int none_equal = 0x2;
+	VCMPEQFP(CPU, op);
 
-	for (uint w = 0; w < 4; w++)
-	{
-		if (CPU.VPR[op.va]._f[w] == CPU.VPR[op.vb]._f[w])
-		{
-			CPU.VPR[op.vd]._u32[w] = 0xffffffff;
-			none_equal = 0;
-		}
-		else
-		{
-			CPU.VPR[op.vd]._u32[w] = 0;
-			all_equal = 0;
-		}
-	}
-
-	CPU.CR.cr6 = all_equal | none_equal;
+	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; // set 2 if none equal, 8 if all equal
 }
 
 void ppu_interpreter::VCMPEQUB(PPUThread& CPU, ppu_opcode_t op)
@@ -287,24 +254,9 @@ void ppu_interpreter::VCMPEQUB(PPUThread& CPU, ppu_opcode_t op)
 
 void ppu_interpreter::VCMPEQUB_(PPUThread& CPU, ppu_opcode_t op)
 {
-	int all_equal = 0x8;
-	int none_equal = 0x2;
+	VCMPEQUB(CPU, op);
 
-	for (uint b = 0; b < 16; b++)
-	{
-		if (CPU.VPR[op.va]._u8[b] == CPU.VPR[op.vb]._u8[b])
-		{
-			CPU.VPR[op.vd]._u8[b] = 0xff;
-			none_equal = 0;
-		}
-		else
-		{
-			CPU.VPR[op.vd]._u8[b] = 0;
-			all_equal = 0;
-		}
-	}
-
-	CPU.CR.cr6 = all_equal | none_equal;
+	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; // set 2 if none equal, 8 if all equal
 }
 
 void ppu_interpreter::VCMPEQUH(PPUThread& CPU, ppu_opcode_t op)
@@ -314,24 +266,9 @@ void ppu_interpreter::VCMPEQUH(PPUThread& CPU, ppu_opcode_t op)
 
 void ppu_interpreter::VCMPEQUH_(PPUThread& CPU, ppu_opcode_t op)
 {
-	int all_equal = 0x8;
-	int none_equal = 0x2;
+	VCMPEQUH(CPU, op);
 
-	for (uint h = 0; h < 8; h++)
-	{
-		if (CPU.VPR[op.va]._u16[h] == CPU.VPR[op.vb]._u16[h])
-		{
-			CPU.VPR[op.vd]._u16[h] = 0xffff;
-			none_equal = 0;
-		}
-		else
-		{
-			CPU.VPR[op.vd]._u16[h] = 0;
-			all_equal = 0;
-		}
-	}
-
-	CPU.CR.cr6 = all_equal | none_equal;
+	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; // set 2 if none equal, 8 if all equal
 }
 
 void ppu_interpreter::VCMPEQUW(PPUThread& CPU, ppu_opcode_t op)
@@ -341,24 +278,9 @@ void ppu_interpreter::VCMPEQUW(PPUThread& CPU, ppu_opcode_t op)
 
 void ppu_interpreter::VCMPEQUW_(PPUThread& CPU, ppu_opcode_t op)
 {
-	int all_equal = 0x8;
-	int none_equal = 0x2;
+	VCMPEQUW(CPU, op);
 
-	for (uint w = 0; w < 4; w++)
-	{
-		if (CPU.VPR[op.va]._u32[w] == CPU.VPR[op.vb]._u32[w])
-		{
-			CPU.VPR[op.vd]._u32[w] = 0xffffffff;
-			none_equal = 0;
-		}
-		else
-		{
-			CPU.VPR[op.vd]._u32[w] = 0;
-			all_equal = 0;
-		}
-	}
-
-	CPU.CR.cr6 = all_equal | none_equal;
+	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; // set 2 if none equal, 8 if all equal
 }
 
 void ppu_interpreter::VCMPGEFP(PPUThread& CPU, ppu_opcode_t op)
@@ -368,24 +290,9 @@ void ppu_interpreter::VCMPGEFP(PPUThread& CPU, ppu_opcode_t op)
 
 void ppu_interpreter::VCMPGEFP_(PPUThread& CPU, ppu_opcode_t op)
 {
-	int all_ge = 0x8;
-	int none_ge = 0x2;
+	VCMPGEFP(CPU, op);
 
-	for (uint w = 0; w < 4; w++)
-	{
-		if (CPU.VPR[op.va]._f[w] >= CPU.VPR[op.vb]._f[w])
-		{
-			CPU.VPR[op.vd]._u32[w] = 0xffffffff;
-			none_ge = 0;
-		}
-		else
-		{
-			CPU.VPR[op.vd]._u32[w] = 0;
-			all_ge = 0;
-		}
-	}
-
-	CPU.CR.cr6 = all_ge | none_ge;
+	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; // set 2 if none ge, 8 if all ge
 }
 
 void ppu_interpreter::VCMPGTFP(PPUThread& CPU, ppu_opcode_t op)
@@ -395,24 +302,9 @@ void ppu_interpreter::VCMPGTFP(PPUThread& CPU, ppu_opcode_t op)
 
 void ppu_interpreter::VCMPGTFP_(PPUThread& CPU, ppu_opcode_t op)
 {
-	int all_ge = 0x8;
-	int none_ge = 0x2;
+	VCMPGTFP(CPU, op);
 
-	for (uint w = 0; w < 4; w++)
-	{
-		if (CPU.VPR[op.va]._f[w] > CPU.VPR[op.vb]._f[w])
-		{
-			CPU.VPR[op.vd]._u32[w] = 0xffffffff;
-			none_ge = 0;
-		}
-		else
-		{
-			CPU.VPR[op.vd]._u32[w] = 0;
-			all_ge = 0;
-		}
-	}
-
-	CPU.CR.cr6 = all_ge | none_ge;
+	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; // set 2 if none gt, 8 if all gt
 }
 
 void ppu_interpreter::VCMPGTSB(PPUThread& CPU, ppu_opcode_t op)
@@ -422,24 +314,9 @@ void ppu_interpreter::VCMPGTSB(PPUThread& CPU, ppu_opcode_t op)
 
 void ppu_interpreter::VCMPGTSB_(PPUThread& CPU, ppu_opcode_t op)
 {
-	int all_gt = 0x8;
-	int none_gt = 0x2;
+	VCMPGTSB(CPU, op);
 
-	for (uint b = 0; b < 16; b++)
-	{
-		if (CPU.VPR[op.va]._s8[b] > CPU.VPR[op.vb]._s8[b])
-		{
-			CPU.VPR[op.vd]._u8[b] = 0xff;
-			none_gt = 0;
-		}
-		else
-		{
-			CPU.VPR[op.vd]._u8[b] = 0;
-			all_gt = 0;
-		}
-	}
-
-	CPU.CR.cr6 = all_gt | none_gt;
+	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; // set 2 if none gt, 8 if all gt
 }
 
 void ppu_interpreter::VCMPGTSH(PPUThread& CPU, ppu_opcode_t op)
@@ -449,24 +326,9 @@ void ppu_interpreter::VCMPGTSH(PPUThread& CPU, ppu_opcode_t op)
 
 void ppu_interpreter::VCMPGTSH_(PPUThread& CPU, ppu_opcode_t op)
 {
-	int all_gt = 0x8;
-	int none_gt = 0x2;
+	VCMPGTSH(CPU, op);
 
-	for (uint h = 0; h < 8; h++)
-	{
-		if (CPU.VPR[op.va]._s16[h] > CPU.VPR[op.vb]._s16[h])
-		{
-			CPU.VPR[op.vd]._u16[h] = 0xffff;
-			none_gt = 0;
-		}
-		else
-		{
-			CPU.VPR[op.vd]._u16[h] = 0;
-			all_gt = 0;
-		}
-	}
-
-	CPU.CR.cr6 = all_gt | none_gt;
+	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; // set 2 if none gt, 8 if all gt
 }
 
 void ppu_interpreter::VCMPGTSW(PPUThread& CPU, ppu_opcode_t op)
@@ -476,24 +338,9 @@ void ppu_interpreter::VCMPGTSW(PPUThread& CPU, ppu_opcode_t op)
 
 void ppu_interpreter::VCMPGTSW_(PPUThread& CPU, ppu_opcode_t op)
 {
-	int all_gt = 0x8;
-	int none_gt = 0x2;
+	VCMPGTSW(CPU, op);
 
-	for (uint w = 0; w < 4; w++)
-	{
-		if (CPU.VPR[op.va]._s32[w] > CPU.VPR[op.vb]._s32[w])
-		{
-			CPU.VPR[op.vd]._u32[w] = 0xffffffff;
-			none_gt = 0;
-		}
-		else
-		{
-			CPU.VPR[op.vd]._u32[w] = 0;
-			all_gt = 0;
-		}
-	}
-
-	CPU.CR.cr6 = all_gt | none_gt;
+	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; // set 2 if none gt, 8 if all gt
 }
 
 void ppu_interpreter::VCMPGTUB(PPUThread& CPU, ppu_opcode_t op)
@@ -503,24 +350,9 @@ void ppu_interpreter::VCMPGTUB(PPUThread& CPU, ppu_opcode_t op)
 
 void ppu_interpreter::VCMPGTUB_(PPUThread& CPU, ppu_opcode_t op)
 {
-	int all_gt = 0x8;
-	int none_gt = 0x2;
+	VCMPGTUB(CPU, op);
 
-	for (uint b = 0; b < 16; b++)
-	{
-		if (CPU.VPR[op.va]._u8[b] > CPU.VPR[op.vb]._u8[b])
-		{
-			CPU.VPR[op.vd]._u8[b] = 0xff;
-			none_gt = 0;
-		}
-		else
-		{
-			CPU.VPR[op.vd]._u8[b] = 0;
-			all_gt = 0;
-		}
-	}
-
-	CPU.CR.cr6 = all_gt | none_gt;
+	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; // set 2 if none gt, 8 if all gt
 }
 
 void ppu_interpreter::VCMPGTUH(PPUThread& CPU, ppu_opcode_t op)
@@ -530,24 +362,9 @@ void ppu_interpreter::VCMPGTUH(PPUThread& CPU, ppu_opcode_t op)
 
 void ppu_interpreter::VCMPGTUH_(PPUThread& CPU, ppu_opcode_t op)
 {
-	int all_gt = 0x8;
-	int none_gt = 0x2;
+	VCMPGTUH(CPU, op);
 
-	for (uint h = 0; h < 8; h++)
-	{
-		if (CPU.VPR[op.va]._u16[h] > CPU.VPR[op.vb]._u16[h])
-		{
-			CPU.VPR[op.vd]._u16[h] = 0xffff;
-			none_gt = 0;
-		}
-		else
-		{
-			CPU.VPR[op.vd]._u16[h] = 0;
-			all_gt = 0;
-		}
-	}
-
-	CPU.CR.cr6 = all_gt | none_gt;
+	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; // set 2 if none gt, 8 if all gt
 }
 
 void ppu_interpreter::VCMPGTUW(PPUThread& CPU, ppu_opcode_t op)
@@ -557,24 +374,9 @@ void ppu_interpreter::VCMPGTUW(PPUThread& CPU, ppu_opcode_t op)
 
 void ppu_interpreter::VCMPGTUW_(PPUThread& CPU, ppu_opcode_t op)
 {
-	int all_gt = 0x8;
-	int none_gt = 0x2;
+	VCMPGTUW(CPU, op);
 
-	for (uint w = 0; w < 4; w++)
-	{
-		if (CPU.VPR[op.va]._u32[w] > CPU.VPR[op.vb]._u32[w])
-		{
-			CPU.VPR[op.vd]._u32[w] = 0xffffffff;
-			none_gt = 0;
-		}
-		else
-		{
-			CPU.VPR[op.vd]._u32[w] = 0;
-			all_gt = 0;
-		}
-	}
-
-	CPU.CR.cr6 = all_gt | none_gt;
+	CPU.CR.cr6 = CPU.VPR[op.vd].test() ? (CPU.VPR[op.vd].inv_test() ? 0 : 8) : 2; // set 2 if none gt, 8 if all gt
 }
 
 void ppu_interpreter::VCTSXS(PPUThread& CPU, ppu_opcode_t op)
@@ -652,42 +454,19 @@ void ppu_interpreter::VMAXUW(PPUThread& CPU, ppu_opcode_t op)
 
 void ppu_interpreter::VMHADDSHS(PPUThread& CPU, ppu_opcode_t op)
 {
-	for (uint h = 0; h < 8; h++)
-	{
-		s32 result = (s32)CPU.VPR[op.va]._s16[h] * (s32)CPU.VPR[op.vb]._s16[h];
-		result = (result >> 15) + (s32)CPU.VPR[op.vc]._s16[h];
-
-		if (result > INT16_MAX)
-		{
-			CPU.VPR[op.vd]._s16[h] = (s16)INT16_MAX;
-		}
-		else if (result < INT16_MIN)
-		{
-			CPU.VPR[op.vd]._s16[h] = (s16)INT16_MIN;
-		}
-		else
-			CPU.VPR[op.vd]._s16[h] = (s16)result;
-	}
+	const auto a = CPU.VPR[op.va].vi;
+	const auto b = CPU.VPR[op.vb].vi;
+	const auto c = CPU.VPR[op.vc].vi;
+	const auto m = _mm_or_si128(_mm_srli_epi16(_mm_mullo_epi16(a, b), 15), _mm_slli_epi16(_mm_mulhi_epi16(a, b), 1)); // low 16 bits of (a * b) >> 15
+	const auto s = _mm_cmpeq_epi16(m, _mm_set1_epi16(-0x8000)); // only -0x8000 * -0x8000 produces 0x8000 here, and it stands for +0x8000
+	CPU.VPR[op.vd].vi = _mm_adds_epi16(_mm_adds_epi16(_mm_xor_si128(m, s), c), _mm_srli_epi16(s, 15)); // flagged lanes: (0x7fff + c) + 1, saturating at each step
 }
 
 void ppu_interpreter::VMHRADDSHS(PPUThread& CPU, ppu_opcode_t op)
 {
-	for (uint h = 0; h < 8; h++)
-	{
-		s32 result = ((s32)CPU.VPR[op.va]._s16[h] * (s32)CPU.VPR[op.vb]._s16[h]) + 0x4000;
-		result = (result >> 15) + (s32)CPU.VPR[op.vc]._s16[h];
-
-		if (result > INT16_MAX)
-		{
-			CPU.VPR[op.vd]._s16[h] = (s16)INT16_MAX;
-		}
-		else if (result < INT16_MIN)
-		{
-			CPU.VPR[op.vd]._s16[h] = (s16)INT16_MIN;
-		}
-		else
-			CPU.VPR[op.vd]._s16[h] = (s16)result;
-	}
+	const auto m = _mm_mulhrs_epi16(CPU.VPR[op.va].vi, CPU.VPR[op.vb].vi); // low 16 bits of (a * b + 0x4000) >> 15
+	const auto s = _mm_cmpeq_epi16(m, _mm_set1_epi16(-0x8000)); // only -0x8000 * -0x8000 produces 0x8000 here, and it stands for +0x8000
+	CPU.VPR[op.vd].vi = _mm_adds_epi16(_mm_adds_epi16(_mm_xor_si128(m, s), CPU.VPR[op.vc].vi), _mm_srli_epi16(s, 15)); // flagged lanes: (0x7fff + c) + 1, saturating at each step
 }
 
 void ppu_interpreter::VMINFP(PPUThread& CPU, ppu_opcode_t op)
@@ -737,10 +516,7 @@ void ppu_interpreter::VMINUW(PPUThread& CPU, ppu_opcode_t op)
 
 void ppu_interpreter::VMLADDUHM(PPUThread& CPU, ppu_opcode_t op)
 {
-	for (uint h = 0; h < 8; h++)
-	{
-		CPU.VPR[op.vd]._u16[h] = CPU.VPR[op.va]._u16[h] * CPU.VPR[op.vb]._u16[h] + CPU.VPR[op.vc]._u16[h];
-	}
+	CPU.VPR[op.vd].vi = _mm_add_epi16(_mm_mullo_epi16(CPU.VPR[op.va].vi, CPU.VPR[op.vb].vi), CPU.VPR[op.vc].vi); // 16-bit modular multiply-add, no saturation
 }
 
 void ppu_interpreter::VMRGHB(PPUThread& CPU, ppu_opcode_t op)