diff --git a/rpcs3/Emu/Cell/PPUInterpreter.cpp b/rpcs3/Emu/Cell/PPUInterpreter.cpp index bf6cc54709..bb909d6cf9 100644 --- a/rpcs3/Emu/Cell/PPUInterpreter.cpp +++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp @@ -1878,7 +1878,7 @@ auto VMSUMSHS() static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& c, auto&& sat) { - auto r = gv_dots_s16x2(std::move(a), std::move(b), std::move(c)); + auto r = gv_dots_s16x2(a, b, c); auto s = gv_dots16x2(std::move(a), std::move(b), std::move(c)); if constexpr (((Flags == set_sat) || ...)) sat = gv_or32(gv_xor32(std::move(s), r), std::move(sat)); @@ -2162,26 +2162,18 @@ auto VPKPX() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - v128 VA = ppu.vr[op.va]; - v128 VB = ppu.vr[op.vb]; - for (uint h = 0; h < 4; h++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - u16 bb7 = VB._u8[15 - (h * 4 + 0)] & 0x1; - u16 bb8 = VB._u8[15 - (h * 4 + 1)] >> 3; - u16 bb16 = VB._u8[15 - (h * 4 + 2)] >> 3; - u16 bb24 = VB._u8[15 - (h * 4 + 3)] >> 3; - u16 ab7 = VA._u8[15 - (h * 4 + 0)] & 0x1; - u16 ab8 = VA._u8[15 - (h * 4 + 1)] >> 3; - u16 ab16 = VA._u8[15 - (h * 4 + 2)] >> 3; - u16 ab24 = VA._u8[15 - (h * 4 + 3)] >> 3; - - d._u16[3 - h] = (bb7 << 15) | (bb8 << 10) | (bb16 << 5) | bb24; - d._u16[4 + (3 - h)] = (ab7 << 15) | (ab8 << 10) | (ab16 << 5) | ab24; - } + auto a1 = gv_sar32(gv_shl32(a, 7), 7 + 9); + auto b1 = gv_sar32(gv_shl32(b, 7), 7 + 9); + auto a2 = gv_sar32(gv_shl32(a, 16), 16 + 3); + auto b2 = gv_sar32(gv_shl32(b, 16), 16 + 3); + auto p1 = gv_packss_s32(b1, a1); + auto p2 = gv_packss_s32(b2, a2); + d = gv_or32(gv_or32(gv_and32(p1, gv_bcst16(0xfc00)), gv_shl16(gv_and32(p1, gv_bcst16(0x7c)), 3)), gv_and32(p2, gv_bcst16(0x1f))); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2190,14 +2182,14 @@ auto VPKSHSS() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - const auto a = ppu.vr[op.va]; - const auto b = ppu.vr[op.vb]; - ppu.vr[op.vd] = _mm_packs_epi16(b, a); - if constexpr (((Flags == set_sat) || ...)) - ppu.sat = ppu.sat | gv_shr16(gv_add16(a, gv_bcst16(0x80)) | gv_add16(b, gv_bcst16(0x80)), 8); + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) + { + if constexpr (((Flags == set_sat) || ...)) + sat = gv_or32(gv_shr16(gv_add16(a, gv_bcst16(0x80)) | gv_add16(b, gv_bcst16(0x80)), 8), std::move(sat)); + d = gv_packss_s16(std::move(b), std::move(a)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); } template @@ -2206,14 +2198,14 @@ auto VPKSHUS() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - const auto a = ppu.vr[op.va]; - const auto b = ppu.vr[op.vb]; - ppu.vr[op.vd] = _mm_packus_epi16(b, a); - if constexpr (((Flags == set_sat) || ...)) - ppu.sat = ppu.sat | gv_shr16(a | b, 8); + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) + { + if constexpr (((Flags == set_sat) || ...)) + sat = gv_or32(gv_shr16(a | b, 8), std::move(sat)); + d = gv_packus_s16(std::move(b), std::move(a)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); } template @@ -2222,14 +2214,14 @@ auto VPKSWSS() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - const auto a = ppu.vr[op.va]; - const auto b = ppu.vr[op.vb]; - ppu.vr[op.vd] = _mm_packs_epi32(b, a); - if constexpr (((Flags == set_sat) || ...)) - ppu.sat = ppu.sat | gv_shr32(gv_add32(a, gv_bcst32(0x8000)) | gv_add32(b, gv_bcst32(0x8000)), 16); + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) + { + if constexpr (((Flags == set_sat) || ...)) + sat = gv_or32(gv_shr32(gv_add32(a, gv_bcst32(0x8000)) | gv_add32(b, gv_bcst32(0x8000)), 16), std::move(sat)); + d = gv_packss_s32(std::move(b), std::move(a)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); } template @@ -2238,20 +2230,14 @@ auto VPKSWUS() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - const auto a = ppu.vr[op.va]; - const auto b = ppu.vr[op.vb]; -#if defined(__SSE4_1__) || defined(ARCH_ARM64) - ppu.vr[op.vd] = _mm_packus_epi32(b, a); -#else - const auto s = _mm_srai_epi16(_mm_packs_epi32(b, a), 15); - const auto r = gv_add16(_mm_packs_epi32(gv_sub32(b, gv_bcst32(0x8000)), gv_sub32(a, gv_bcst32(0x8000))), gv_bcst16(0x8000)); - ppu.vr[op.vd] = gv_andn(s, r); -#endif - if constexpr (((Flags == set_sat) || ...)) - ppu.sat = ppu.sat | gv_shr32(a | b, 16); + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) + { + if constexpr (((Flags == set_sat) || ...)) + sat = gv_or32(gv_shr32(a | b, 16), std::move(sat)); + d = gv_packus_s32(std::move(b), std::move(a)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); } template @@ -2260,12 +2246,12 @@ auto VPKUHUM() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - const auto a = ppu.vr[op.va]; - const auto b = ppu.vr[op.vb]; - ppu.vr[op.vd] = _mm_packus_epi16(b & _mm_set1_epi16(0xff), a & _mm_set1_epi16(0xff)); + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_packtu16(std::move(b), std::move(a)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2274,16 +2260,13 @@ auto VPKUHUS() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - const auto a = ppu.vr[op.va]; - const auto b = ppu.vr[op.vb]; - const v128 s = _mm_cmpeq_epi8(_mm_packs_epi16(_mm_srai_epi16(b, 8), _mm_srai_epi16(a, 8)), _mm_setzero_si128()); - const v128 r = _mm_packus_epi16(b & _mm_set1_epi16(0xff), a & _mm_set1_epi16(0xff)); - ppu.vr[op.vd] = r | ~s; - if constexpr (((Flags == set_sat) || ...)) - ppu.sat = ppu.sat | gv_shr16(a | b, 8); + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) + { + if constexpr (((Flags == set_sat) || ...)) + sat = gv_or32(gv_shr16(a | b, 8), std::move(sat)); + d = gv_packus_u16(std::move(b), std::move(a)); }; - RETURN_(ppu, op); + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); } template @@ -2292,17 +2275,12 @@ auto VPKUWUM() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - const auto a = ppu.vr[op.va]; - const auto b = ppu.vr[op.vb]; -#if defined(__SSE4_1__) || defined(ARCH_ARM64) - const auto r = _mm_packus_epi32(b & _mm_set1_epi32(0xffff), a & _mm_set1_epi32(0xffff)); -#else - const auto r = _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(b, 16), 16), _mm_srai_epi32(_mm_slli_epi32(a, 16), 16)); -#endif - ppu.vr[op.vd] = r; + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_packtu32(std::move(b), std::move(a)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2311,20 +2289,14 @@ auto VPKUWUS() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - const auto a = ppu.vr[op.va]; - const auto b = ppu.vr[op.vb]; - const v128 s = _mm_cmpeq_epi16(_mm_packs_epi32(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)), _mm_setzero_si128()); -#if defined(__SSE4_1__) || defined(ARCH_ARM64) - const v128 r = _mm_packus_epi32(b & _mm_set1_epi32(0xffff), a & _mm_set1_epi32(0xffff)); -#else - const v128 r = _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(b, 16), 16), _mm_srai_epi32(_mm_slli_epi32(a, 16), 16)); -#endif - ppu.vr[op.vd] = r | ~s; - if constexpr (((Flags == set_sat) || ...)) - ppu.sat = ppu.sat | gv_shr32(a | b, 16); + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) + { + if constexpr (((Flags == set_sat) || ...)) + sat = gv_or32(gv_shr32(a | b, 16), std::move(sat)); + d = gv_packus_u32(std::move(b), std::move(a)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); } template @@ -3170,7 +3142,7 @@ auto VSUM2SWS() r._u64[1] = y._s64[1] > INT32_MAX ? INT32_MAX : y._s64[1] < INT32_MIN ? u32(INT32_MIN) : static_cast(y._s64[1]); #endif if constexpr (((Flags == set_sat) || ...)) - sat = sat | gv_shr64(gv_add64(y, gv_bcst64(0x80000000u)), 32); + sat = gv_or32(gv_shr64(gv_add64(y, gv_bcst64(0x80000000u)), 32), std::move(sat)); d = r; }; @@ -3187,12 +3159,11 @@ auto VSUM4SBS() { //const auto r = _mm_dpbusds_epi32(b, _mm_set1_epi8(1), a); //const auto s = _mm_dpbusd_epi32(b, _mm_set1_epi8(1), a); - const auto x = gv_hadds8x4(a); - const auto r = gv_adds_s32(x, b); - const auto s = gv_add32(x, b); + auto x = gv_hadds8x4(a); + auto r = gv_adds_s32(x, b); if constexpr (((Flags == set_sat) || ...)) - sat = sat | (r ^ s); - d = r; + sat = gv_or32(gv_xor32(gv_add32(std::move(x), std::move(b)), r), std::move(sat)); + d = std::move(r); }; RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); @@ -3208,12 +3179,11 @@ auto VSUM4SHS() { //const auto r = _mm_dpwssds_epi32(b, a, _mm_set1_epi16(1)); //const auto s = _mm_dpwssd_epi32(b, a, _mm_set1_epi16(1)); - const auto x = gv_hadds16x2(a); - const auto r = gv_adds_s32(x, b); - const auto s = gv_add32(x, b); + auto x = gv_hadds16x2(a); + auto r = gv_adds_s32(x, b); if constexpr (((Flags == set_sat) || ...)) - sat = sat | (r ^ s); - d = r; + sat = gv_or32(gv_xor32(gv_add32(std::move(x), std::move(b)), r), std::move(sat)); + d = std::move(r); }; RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); @@ -3227,11 +3197,11 @@ auto VSUM4UBS() static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& sat) { - const auto x = gv_haddu8x4(a); - const auto r = gv_addus_u32(x, b); + auto x = gv_haddu8x4(a); + auto r = gv_addus_u32(x, b); if constexpr (((Flags == set_sat) || ...)) - sat = sat | (r ^ gv_add32(x, b)); - d = r; + sat = gv_or32(gv_xor32(gv_add32(std::move(x), std::move(b)), r), std::move(sat)); + d = std::move(r); }; RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); diff --git a/rpcs3/util/simd.hpp b/rpcs3/util/simd.hpp index 116749512d..36dfde87dd 100644 --- a/rpcs3/util/simd.hpp +++ b/rpcs3/util/simd.hpp @@ -288,6 +288,7 @@ namespace asmjit if constexpr (arg_classify == arg_class::reg_rv) { g_vc->vec_dealloc(vec_type{b.id()}); + //b = Operand(); } } else if (utils::has_avx() && avx_op && (arg_classify == arg_class::reg_lv || arg_classify == arg_class::mem_lv)) @@ -317,6 +318,7 @@ namespace asmjit if constexpr (arg_classify == arg_class::reg_rv) { g_vc->vec_dealloc(vec_type{b.id()}); + //b = Operand(); } } @@ -334,6 +336,7 @@ namespace asmjit if constexpr (arg_classify == arg_class::reg_rv) { g_vc->vec_dealloc(vec_type{b.id()}); + //b = Operand(); } if (arg_classify == arg_class::mem_rv && a.isReg()) @@ -2020,6 +2023,90 @@ inline v128 gv_selectfs(const v128& _cmp, const v128& _true, const v128& _false) #endif } +inline v128 gv_packss_s16(const v128& low, const v128& high) +{ +#if defined(ARCH_X64) + return _mm_packs_epi16(low, high); +#elif defined(ARCH_ARM64) + return vcombine_s8(vqmovn_s16(low), vqmovn_s16(high)); +#endif +} + +inline v128 gv_packus_s16(const v128& low, const v128& high) +{ +#if defined(ARCH_X64) + return _mm_packus_epi16(low, high); +#elif defined(ARCH_ARM64) + return vcombine_u8(vqmovun_s16(low), vqmovun_s16(high)); +#endif +} + +inline v128 gv_packus_u16(const v128& low, const v128& high) +{ +#if defined(__SSE4_1__) + return _mm_packus_epi16(_mm_min_epu16(low, _mm_set1_epi16(0xff)), _mm_min_epu16(high, _mm_set1_epi16(0xff))); +#elif defined(ARCH_X64) + return _mm_packus_epi16(_mm_sub_epi16(low, _mm_subs_epu16(low, _mm_set1_epi16(0xff))), _mm_sub_epi16(high, _mm_subs_epu16(high, _mm_set1_epi16(0xff)))); +#elif defined(ARCH_ARM64) + return vcombine_u8(vqmovn_u16(low), vqmovn_u16(high)); +#endif +} + +inline v128 gv_packtu16(const v128& low, const v128& high) +{ +#if defined(ARCH_X64) + return _mm_packus_epi16(low & _mm_set1_epi16(0xff), high & _mm_set1_epi16(0xff)); +#elif defined(ARCH_ARM64) + return vuzp1q_s8(low, high); +#endif +} + +inline v128 gv_packss_s32(const v128& low, const v128& high) +{ +#if defined(ARCH_X64) + return _mm_packs_epi32(low, high); +#elif defined(ARCH_ARM64) + return vcombine_s16(vqmovn_s32(low), vqmovn_s32(high)); +#endif +} + +inline v128 gv_packus_s32(const v128& low, const v128& high) +{ +#if defined(__SSE4_1__) + return _mm_packus_epi32(low, high); +#elif defined(ARCH_X64) + const auto s = _mm_srai_epi16(_mm_packs_epi32(low, high), 15); + const auto r = gv_add16(_mm_packs_epi32(gv_sub32(low, gv_bcst32(0x8000)), gv_sub32(high, gv_bcst32(0x8000))), gv_bcst16(0x8000)); + return gv_andn(s, r); +#elif defined(ARCH_ARM64) + return vcombine_u16(vqmovun_s32(low), vqmovun_s32(high)); +#endif +} + +inline v128 gv_packus_u32(const v128& low, const v128& high) +{ +#if defined(__SSE4_1__) + return _mm_packus_epi32(_mm_min_epu32(low, _mm_set1_epi32(0xffff)), _mm_min_epu32(high, _mm_set1_epi32(0xffff))); +#elif defined(ARCH_X64) + const v128 s = _mm_cmpgt_epi16(_mm_packs_epi32(_mm_srli_epi32(low, 16), _mm_srli_epi32(high, 16)), _mm_setzero_si128()); + const v128 r = _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(low, 16), 16), _mm_srai_epi32(_mm_slli_epi32(high, 16), 16)); + return _mm_or_si128(r, s); +#elif defined(ARCH_ARM64) + return vcombine_u16(vqmovn_u32(low), vqmovn_u32(high)); +#endif +} + +inline v128 gv_packtu32(const v128& low, const v128& high) +{ +#if defined(__SSE4_1__) + return _mm_packus_epi32(low & _mm_set1_epi32(0xffff), high & _mm_set1_epi32(0xffff)); +#elif defined(ARCH_X64) + return _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(low, 16), 16), _mm_srai_epi32(_mm_slli_epi32(high, 16), 16)); +#elif defined(ARCH_ARM64) + return vuzp1q_s16(low, high); +#endif +} + inline v128 gv_unpacklo8(const v128& lows, const v128& highs) { #if defined(ARCH_X64)