From 4aee4ed6d7a110faa1d4d80a1166d1fbc71a3008 Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Sat, 9 Dec 2017 17:57:43 +0300 Subject: [PATCH] SPU: remove SSSE3 dependency --- rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp | 444 +++++++++++++++++++++---- rpcs3/Emu/Cell/SPUASMJITRecompiler.h | 1 + rpcs3/Emu/Cell/SPUInterpreter.cpp | 174 ++++++++-- rpcs3/Emu/Cell/SPUInterpreter.h | 34 +- rpcs3/Emu/Cell/SPUOpcodes.h | 8 +- rpcs3/Emu/Cell/SPUThread.cpp | 91 ++++- 6 files changed, 639 insertions(+), 113 deletions(-) diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp index 6f907cd50a..6a1228466c 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp @@ -6,6 +6,7 @@ #include "SPUThread.h" #include "SPUInterpreter.h" #include "SPUASMJITRecompiler.h" +#include "Utilities/sysinfo.h" #include @@ -20,7 +21,7 @@ #define SPU_OFF_16(x, ...) asmjit::x86::word_ptr(*cpu, offset32(&SPUThread::x, ##__VA_ARGS__)) #define SPU_OFF_8(x, ...) asmjit::x86::byte_ptr(*cpu, offset32(&SPUThread::x, ##__VA_ARGS__)) -const spu_decoder s_spu_interpreter; // TODO: remove +extern const spu_decoder g_spu_interpreter_fast; // TODO: avoid const spu_decoder s_spu_decoder; spu_recompiler::spu_recompiler() @@ -101,6 +102,8 @@ void spu_recompiler::compile(spu_function_t& f) this->qw1 = &qw1_var; X86Gp qw2_var = compiler.newUInt64("qw2"); this->qw2 = &qw2_var; + X86Gp qw3_var = compiler.newUInt64("qw3"); + this->qw3 = &qw3_var; std::array vec_vars; @@ -236,7 +239,7 @@ void spu_recompiler::compile(spu_function_t& f) m_jit->add(&fn, codeHolder); f.compiled = asmjit::Internal::ptr_cast(fn); - + if (g_cfg.core.spu_debug) { // Add ASMJIT logs @@ -351,7 +354,7 @@ void spu_recompiler::InterpreterCall(spu_opcode_t op) asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast(gate)), asmjit::FuncSignature3(asmjit::CallConv::kIdHost)); call->setArg(0, *cpu); call->setArg(1, asmjit::imm_u(op.opcode)); - call->setArg(2, asmjit::imm_ptr(asmjit::Internal::ptr_cast(s_spu_interpreter.decode(op.opcode)))); + call->setArg(2, asmjit::imm_ptr(asmjit::Internal::ptr_cast(g_spu_interpreter_fast.decode(op.opcode)))); call->setRet(0, *addr); // return immediately if an error occured @@ -408,7 +411,7 @@ void spu_recompiler::FunctionCall() if (_spu->pc == link) { _spu->recursion_level--; - return 0; // Successfully returned + return 0; // Successfully returned } } @@ -1029,9 +1032,24 @@ void spu_recompiler::STQX(spu_opcode_t op) c->add(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); c->and_(*addr, 0x3fff0); - const XmmLink& vt = XmmGet(op.rt, XmmType::Int); - c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); - c->movdqa(asmjit::x86::oword_ptr(*ls, *addr), vt); + if (utils::has_ssse3()) + { + const XmmLink& vt = XmmGet(op.rt, XmmType::Int); + c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); + c->movdqa(asmjit::x86::oword_ptr(*ls, *addr), vt); + } + else + { + c->mov(*qw0, SPU_OFF_64(gpr, op.rt, &v128::_u64, 0)); + c->mov(*qw1, SPU_OFF_64(gpr, op.rt, &v128::_u64, 1)); + c->bswap(*qw0); + c->bswap(*qw1); + c->mov(asmjit::x86::qword_ptr(*ls, *addr, 0, 0), *qw1); + c->mov(asmjit::x86::qword_ptr(*ls, *addr, 0, 8), *qw0); + c->unuse(*qw0); + c->unuse(*qw1); + } + c->unuse(*addr); } @@ -1079,9 +1097,8 @@ void spu_recompiler::HBR(spu_opcode_t op) void spu_recompiler::GB(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pshufb(va, XmmConst(_mm_set_epi8(-1, -1, -1, -1, -1, -1, 
-1, -1, -1, -1, -1, -1, 12, 8, 4, 0))); - c->psllq(va, 7); - c->pmovmskb(*addr, va); + c->pslld(va, 31); + c->movmskps(*addr, va); c->pxor(va, va); c->pinsrw(va, *addr, 6); c->movdqa(SPU_OFF_128(gpr, op.rt), va); @@ -1091,8 +1108,8 @@ void spu_recompiler::GB(spu_opcode_t op) void spu_recompiler::GBH(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->pshufb(va, XmmConst(_mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 14, 12, 10, 8, 6, 4, 2, 0))); - c->psllq(va, 7); + c->psllw(va, 15); + c->packsswb(va, XmmConst(_mm_setzero_si128())); c->pmovmskb(*addr, va); c->pxor(va, va); c->pinsrw(va, *addr, 6); @@ -1171,21 +1188,54 @@ void spu_recompiler::LQX(spu_opcode_t op) c->add(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); c->and_(*addr, 0x3fff0); - const XmmLink& vt = XmmAlloc(); - c->movdqa(vt, asmjit::x86::oword_ptr(*ls, *addr)); - c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + if (utils::has_ssse3()) + { + const XmmLink& vt = XmmAlloc(); + c->movdqa(vt, asmjit::x86::oword_ptr(*ls, *addr)); + c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); + c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + } + else + { + c->mov(*qw0, asmjit::x86::qword_ptr(*ls, *addr, 0, 0)); + c->mov(*qw1, asmjit::x86::qword_ptr(*ls, *addr, 0, 8)); + c->bswap(*qw0); + c->bswap(*qw1); + c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw1); + c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw0); + c->unuse(*qw0); + c->unuse(*qw1); + } + c->unuse(*addr); } void spu_recompiler::ROTQBYBI(spu_opcode_t op) { + auto body = [](u8* t, const u8* _a, u32 v) noexcept + { + const auto a = *(__m128i*)_a; + alignas(32) const __m128i buf[2]{a, a}; + *(__m128i*)t = _mm_loadu_si128((__m128i*)((u8*)buf + (16 - (v >> 3 & 0xf)))); + }; + + if (!utils::has_ssse3()) + { + c->lea(*qw0, SPU_OFF_128(gpr, op.rt)); + c->lea(*qw1, SPU_OFF_128(gpr, op.ra)); + c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast(body)), asmjit::FuncSignature3(asmjit::CallConv::kIdHost)); + call->setArg(0, *qw0); + call->setArg(1, *qw1); + call->setArg(2, *addr); + return; + } + const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.rldq_pshufb)); c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); c->and_(*addr, 0xf << 3); - c->shl(*addr, 1); - c->pshufb(va, asmjit::x86::oword_ptr(*qw0, *addr)); + c->pshufb(va, asmjit::x86::oword_ptr(*qw0, *addr, 1)); c->movdqa(SPU_OFF_128(gpr, op.rt), va); c->unuse(*addr); c->unuse(*qw0); @@ -1193,14 +1243,30 @@ void spu_recompiler::ROTQBYBI(spu_opcode_t op) void spu_recompiler::ROTQMBYBI(spu_opcode_t op) { + auto body = [](u8* t, const u8* _a, u32 v) noexcept + { + const auto a = *(__m128i*)_a; + alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()}; + *(__m128i*)t = _mm_loadu_si128((__m128i*)((u8*)buf + (v >> 3 & 0x1f))); + }; + + if (!utils::has_ssse3()) + { + c->lea(*qw0, SPU_OFF_128(gpr, op.rt)); + c->lea(*qw1, SPU_OFF_128(gpr, op.ra)); + c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast(body)), asmjit::FuncSignature3(asmjit::CallConv::kIdHost)); + call->setArg(0, *qw0); + call->setArg(1, *qw1); + call->setArg(2, *addr); + return; + } + const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.srdq_pshufb)); c->mov(*addr, 
SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); - c->shr(*addr, 3); - c->neg(*addr); - c->and_(*addr, 0x1f); - c->shl(*addr, 4); - c->pshufb(va, asmjit::x86::oword_ptr(*qw0, *addr)); + c->and_(*addr, 0x1f << 3); + c->pshufb(va, asmjit::x86::oword_ptr(*qw0, *addr, 1)); c->movdqa(SPU_OFF_128(gpr, op.rt), va); c->unuse(*addr); c->unuse(*qw0); @@ -1208,12 +1274,30 @@ void spu_recompiler::ROTQMBYBI(spu_opcode_t op) void spu_recompiler::SHLQBYBI(spu_opcode_t op) { + auto body = [](u8* t, const u8* _a, u32 v) noexcept + { + const auto a = *(__m128i*)_a; + alignas(64) const __m128i buf[3]{_mm_setzero_si128(), _mm_setzero_si128(), a}; + *(__m128i*)t = _mm_loadu_si128((__m128i*)((u8*)buf + (32 - (v >> 3 & 0x1f)))); + }; + + if (!utils::has_ssse3()) + { + c->lea(*qw0, SPU_OFF_128(gpr, op.rt)); + c->lea(*qw1, SPU_OFF_128(gpr, op.ra)); + c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast(body)), asmjit::FuncSignature3(asmjit::CallConv::kIdHost)); + call->setArg(0, *qw0); + call->setArg(1, *qw1); + call->setArg(2, *addr); + return; + } + const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.sldq_pshufb)); c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); c->and_(*addr, 0x1f << 3); - c->shl(*addr, 1); - c->pshufb(va, asmjit::x86::oword_ptr(*qw0, *addr)); + c->pshufb(va, asmjit::x86::oword_ptr(*qw0, *addr, 1)); c->movdqa(SPU_OFF_128(gpr, op.rt), va); c->unuse(*addr); c->unuse(*qw0); @@ -1327,6 +1411,25 @@ void spu_recompiler::SHLQBI(spu_opcode_t op) void spu_recompiler::ROTQBY(spu_opcode_t op) { + auto body = [](u8* t, const u8* _a, u32 v) noexcept + { + const auto a = *(__m128i*)_a; + alignas(32) const __m128i buf[2]{a, a}; + *(__m128i*)t = _mm_loadu_si128((__m128i*)((u8*)buf + (16 - (v & 0xf)))); + }; + + if (!utils::has_ssse3()) + { + c->lea(*qw0, SPU_OFF_128(gpr, op.rt)); + c->lea(*qw1, SPU_OFF_128(gpr, op.ra)); + c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast(body)), asmjit::FuncSignature3(asmjit::CallConv::kIdHost)); + call->setArg(0, *qw0); + call->setArg(1, *qw1); + call->setArg(2, *addr); + return; + } + const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.rldq_pshufb)); c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); @@ -1340,10 +1443,28 @@ void spu_recompiler::ROTQBY(spu_opcode_t op) void spu_recompiler::ROTQMBY(spu_opcode_t op) { + auto body = [](u8* t, const u8* _a, u32 v) noexcept + { + const auto a = *(__m128i*)_a; + alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()}; + *(__m128i*)t = _mm_loadu_si128((__m128i*)((u8*)buf + (v & 0x1f))); + }; + + if (!utils::has_ssse3()) + { + c->lea(*qw0, SPU_OFF_128(gpr, op.rt)); + c->lea(*qw1, SPU_OFF_128(gpr, op.ra)); + c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast(body)), asmjit::FuncSignature3(asmjit::CallConv::kIdHost)); + call->setArg(0, *qw0); + call->setArg(1, *qw1); + call->setArg(2, *addr); + return; + } + const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.srdq_pshufb)); c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); - c->neg(*addr); c->and_(*addr, 0x1f); c->shl(*addr, 4); c->pshufb(va, asmjit::x86::oword_ptr(*qw0, *addr)); @@ -1354,6 +1475,25 @@ void spu_recompiler::ROTQMBY(spu_opcode_t op) void 
spu_recompiler::SHLQBY(spu_opcode_t op) { + auto body = [](u8* t, const u8* _a, u32 v) noexcept + { + const auto a = *(__m128i*)_a; + alignas(64) const __m128i buf[3]{_mm_setzero_si128(), _mm_setzero_si128(), a}; + *(__m128i*)t = _mm_loadu_si128((__m128i*)((u8*)buf + (32 - (v & 0x1f)))); + }; + + if (!utils::has_ssse3()) + { + c->lea(*qw0, SPU_OFF_128(gpr, op.rt)); + c->lea(*qw1, SPU_OFF_128(gpr, op.ra)); + c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); + asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast(body)), asmjit::FuncSignature3(asmjit::CallConv::kIdHost)); + call->setArg(0, *qw0); + call->setArg(1, *qw1); + call->setArg(2, *addr); + return; + } + const XmmLink& va = XmmGet(op.ra, XmmType::Int); c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.sldq_pshufb)); c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); @@ -1523,7 +1663,27 @@ void spu_recompiler::ROTQBYI(spu_opcode_t op) { const int s = op.i7 & 0xf; const XmmLink& va = XmmGet(op.ra, XmmType::Int); - c->palignr(va, va, 16 - s); + const XmmLink& v2 = XmmAlloc(); + + if (s == 0) + { + } + else if (s == 4 || s == 8 || s == 12) + { + c->pshufd(va, va, ::rol8(0xE4, s / 2)); + } + else if (utils::has_ssse3()) + { + c->palignr(va, va, 16 - s); + } + else + { + c->movdqa(v2, va); + c->psrldq(va, 16 - s); + c->pslldq(v2, s); + c->por(va, v2); + } + c->movdqa(SPU_OFF_128(gpr, op.rt), va); } @@ -1588,12 +1748,25 @@ void spu_recompiler::SUMB(spu_opcode_t op) { const XmmLink& va = XmmGet(op.ra, XmmType::Int); const XmmLink& vb = XmmGet(op.rb, XmmType::Int); - const XmmLink& vi = XmmAlloc(); - c->movdqa(vi, XmmConst(_mm_set1_epi8(1))); - c->pmaddubsw(va, vi); - c->pmaddubsw(vb, vi); - c->phaddw(va, vb); - c->pshufb(va, XmmConst(_mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0))); + const XmmLink& v1 = XmmAlloc(); + const XmmLink& v2 = XmmAlloc(); + c->movdqa(v2, XmmConst(_mm_set1_epi16(0xff))); + c->movdqa(v1, va); + c->psrlw(va, 8); + c->pand(v1, v2); + c->pand(v2, vb); + c->psrlw(vb, 8); + c->paddw(va, v1); + c->paddw(vb, v2); + c->movdqa(v2, XmmConst(_mm_set1_epi32(0xffff))); + c->movdqa(v1, va); + c->psrld(va, 16); + c->pand(v1, v2); + c->pandn(v2, vb); + c->pslld(vb, 16); + c->paddw(va, v1); + c->paddw(vb, v2); + c->por(va, vb); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } @@ -1657,16 +1830,24 @@ void spu_recompiler::CNTB(spu_opcode_t op) const XmmLink& va = XmmGet(op.ra, XmmType::Int); const XmmLink& v1 = XmmAlloc(); const XmmLink& vm = XmmAlloc(); + c->movdqa(vm, XmmConst(_mm_set1_epi8(0x55))); c->movdqa(v1, va); - c->psrlq(v1, 4); - c->movdqa(vm, XmmConst(_mm_set1_epi8(0xf))); c->pand(va, vm); + c->psrlq(v1, 1); c->pand(v1, vm); - c->movdqa(vm, XmmConst(_mm_set_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0))); - c->pshufb(vm, va); - c->movdqa(va, XmmConst(_mm_set_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0))); - c->pshufb(va, v1); - c->paddb(va, vm); + c->paddb(va, v1); + c->movdqa(vm, XmmConst(_mm_set1_epi8(0x33))); + c->movdqa(v1, va); + c->pand(va, vm); + c->psrlq(v1, 2); + c->pand(v1, vm); + c->paddb(va, v1); + c->movdqa(vm, XmmConst(_mm_set1_epi8(0x0f))); + c->movdqa(v1, va); + c->pand(va, vm); + c->psrlq(v1, 4); + c->pand(v1, vm); + c->paddb(va, v1); c->movdqa(SPU_OFF_128(gpr, op.rt), va); } @@ -2319,9 +2500,23 @@ void spu_recompiler::BRZ(spu_opcode_t op) void spu_recompiler::STQA(spu_opcode_t op) { - const XmmLink& vt = XmmGet(op.rt, XmmType::Int); - c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); - 
c->movdqa(asmjit::x86::oword_ptr(*ls, spu_ls_target(0, op.i16)), vt); + if (utils::has_ssse3()) + { + const XmmLink& vt = XmmGet(op.rt, XmmType::Int); + c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); + c->movdqa(asmjit::x86::oword_ptr(*ls, spu_ls_target(0, op.i16)), vt); + } + else + { + c->mov(*qw0, SPU_OFF_64(gpr, op.rt, &v128::_u64, 0)); + c->mov(*qw1, SPU_OFF_64(gpr, op.rt, &v128::_u64, 1)); + c->bswap(*qw0); + c->bswap(*qw1); + c->mov(asmjit::x86::qword_ptr(*ls, spu_ls_target(0, op.i16) + 0), *qw1); + c->mov(asmjit::x86::qword_ptr(*ls, spu_ls_target(0, op.i16) + 8), *qw0); + c->unuse(*qw0); + c->unuse(*qw1); + } } void spu_recompiler::BRNZ(spu_opcode_t op) @@ -2401,9 +2596,23 @@ void spu_recompiler::BRHNZ(spu_opcode_t op) void spu_recompiler::STQR(spu_opcode_t op) { - const XmmLink& vt = XmmGet(op.rt, XmmType::Int); - c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); - c->movdqa(asmjit::x86::oword_ptr(*ls, spu_ls_target(m_pos, op.i16)), vt); + if (utils::has_ssse3()) + { + const XmmLink& vt = XmmGet(op.rt, XmmType::Int); + c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); + c->movdqa(asmjit::x86::oword_ptr(*ls, spu_ls_target(m_pos, op.i16)), vt); + } + else + { + c->mov(*qw0, SPU_OFF_64(gpr, op.rt, &v128::_u64, 0)); + c->mov(*qw1, SPU_OFF_64(gpr, op.rt, &v128::_u64, 1)); + c->bswap(*qw0); + c->bswap(*qw1); + c->mov(asmjit::x86::qword_ptr(*ls, spu_ls_target(m_pos, op.i16) + 0), *qw1); + c->mov(asmjit::x86::qword_ptr(*ls, spu_ls_target(m_pos, op.i16) + 8), *qw0); + c->unuse(*qw0); + c->unuse(*qw1); + } } void spu_recompiler::BRA(spu_opcode_t op) @@ -2431,10 +2640,24 @@ void spu_recompiler::BRA(spu_opcode_t op) void spu_recompiler::LQA(spu_opcode_t op) { - const XmmLink& vt = XmmAlloc(); - c->movdqa(vt, asmjit::x86::oword_ptr(*ls, spu_ls_target(0, op.i16))); - c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + if (utils::has_ssse3()) + { + const XmmLink& vt = XmmAlloc(); + c->movdqa(vt, asmjit::x86::oword_ptr(*ls, spu_ls_target(0, op.i16))); + c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); + c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + } + else + { + c->mov(*qw0, asmjit::x86::qword_ptr(*ls, spu_ls_target(0, op.i16) + 0)); + c->mov(*qw1, asmjit::x86::qword_ptr(*ls, spu_ls_target(0, op.i16) + 8)); + c->bswap(*qw0); + c->bswap(*qw1); + c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw1); + c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw0); + c->unuse(*qw0); + c->unuse(*qw1); + } } void spu_recompiler::BRASL(spu_opcode_t op) @@ -2516,10 +2739,24 @@ void spu_recompiler::BRSL(spu_opcode_t op) void spu_recompiler::LQR(spu_opcode_t op) { - const XmmLink& vt = XmmAlloc(); - c->movdqa(vt, asmjit::x86::oword_ptr(*ls, spu_ls_target(m_pos, op.i16))); - c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + if (utils::has_ssse3()) + { + const XmmLink& vt = XmmAlloc(); + c->movdqa(vt, asmjit::x86::oword_ptr(*ls, spu_ls_target(m_pos, op.i16))); + c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); + c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + } + else + { + c->mov(*qw0, asmjit::x86::qword_ptr(*ls, spu_ls_target(m_pos, op.i16) + 0)); + c->mov(*qw1, asmjit::x86::qword_ptr(*ls, spu_ls_target(m_pos, op.i16) + 8)); + c->bswap(*qw0); + c->bswap(*qw1); + 
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw1); + c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw0); + c->unuse(*qw0); + c->unuse(*qw1); + } } void spu_recompiler::IL(spu_opcode_t op) @@ -2630,9 +2867,24 @@ void spu_recompiler::STQD(spu_opcode_t op) if (op.si10) c->add(*addr, op.si10 << 4); c->and_(*addr, 0x3fff0); - const XmmLink& vt = XmmGet(op.rt, XmmType::Int); - c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); - c->movdqa(asmjit::x86::oword_ptr(*ls, *addr), vt); + if (utils::has_ssse3()) + { + const XmmLink& vt = XmmGet(op.rt, XmmType::Int); + c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); + c->movdqa(asmjit::x86::oword_ptr(*ls, *addr), vt); + } + else + { + c->mov(*qw0, SPU_OFF_64(gpr, op.rt, &v128::_u64, 0)); + c->mov(*qw1, SPU_OFF_64(gpr, op.rt, &v128::_u64, 1)); + c->bswap(*qw0); + c->bswap(*qw1); + c->mov(asmjit::x86::qword_ptr(*ls, *addr, 0, 0), *qw1); + c->mov(asmjit::x86::qword_ptr(*ls, *addr, 0, 8), *qw0); + c->unuse(*qw0); + c->unuse(*qw1); + } + c->unuse(*addr); } @@ -2642,10 +2894,25 @@ void spu_recompiler::LQD(spu_opcode_t op) if (op.si10) c->add(*addr, op.si10 << 4); c->and_(*addr, 0x3fff0); - const XmmLink& vt = XmmAlloc(); - c->movdqa(vt, asmjit::x86::oword_ptr(*ls, *addr)); - c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); - c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + if (utils::has_ssse3()) + { + const XmmLink& vt = XmmAlloc(); + c->movdqa(vt, asmjit::x86::oword_ptr(*ls, *addr)); + c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f))); + c->movdqa(SPU_OFF_128(gpr, op.rt), vt); + } + else + { + c->mov(*qw0, asmjit::x86::qword_ptr(*ls, *addr, 0, 0)); + c->mov(*qw1, asmjit::x86::qword_ptr(*ls, *addr, 0, 8)); + c->bswap(*qw0); + c->bswap(*qw1); + c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw1); + c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw0); + c->unuse(*qw0); + c->unuse(*qw1); + } + c->unuse(*addr); } @@ -2814,6 +3081,61 @@ void spu_recompiler::SELB(spu_opcode_t op) void spu_recompiler::SHUFB(spu_opcode_t op) { + alignas(16) static thread_local u8 s_lut[256] + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + }; + + auto body = [](u8* t, const u8* a, const u8* b, const u8* c) noexcept + { + __m128i _a = *(__m128i*)a; + __m128i _b = *(__m128i*)b; + _mm_store_si128((__m128i*)(s_lut + 0x00), _a); + _mm_store_si128((__m128i*)(s_lut + 0x10), _b); + _mm_store_si128((__m128i*)(s_lut + 0x20), _a); + _mm_store_si128((__m128i*)(s_lut + 0x30), _b); + _mm_store_si128((__m128i*)(s_lut 
+ 0x40), _a); + _mm_store_si128((__m128i*)(s_lut + 0x50), _b); + _mm_store_si128((__m128i*)(s_lut + 0x60), _a); + _mm_store_si128((__m128i*)(s_lut + 0x70), _b); + v128 mask = v128::fromV(_mm_xor_si128(*(__m128i*)c, _mm_set1_epi8(0xf))); + + for (int i = 0; i < 16; i++) + { + t[i] = s_lut[mask._u8[i]]; + } + }; + + if (!utils::has_ssse3()) + { + c->lea(*qw0, SPU_OFF_128(gpr, op.rt4)); + c->lea(*qw1, SPU_OFF_128(gpr, op.ra)); + c->lea(*qw2, SPU_OFF_128(gpr, op.rb)); + c->lea(*qw3, SPU_OFF_128(gpr, op.rc)); + asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast(body)), asmjit::FuncSignature4(asmjit::CallConv::kIdHost)); + call->setArg(0, *qw0); + call->setArg(1, *qw1); + call->setArg(2, *qw2); + call->setArg(3, *qw3); + return; + } + const XmmLink& v0 = XmmGet(op.rc, XmmType::Int); // v0 = mask const XmmLink& v1 = XmmAlloc(); const XmmLink& v2 = XmmAlloc(); diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h index c3e879cbf8..d79d5c1360 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h @@ -37,6 +37,7 @@ private: asmjit::X86Gp* qw0; asmjit::X86Gp* qw1; asmjit::X86Gp* qw2; + asmjit::X86Gp* qw3; std::array vec; // labels: diff --git a/rpcs3/Emu/Cell/SPUInterpreter.cpp b/rpcs3/Emu/Cell/SPUInterpreter.cpp index f7e5a13949..58b41c6084 100644 --- a/rpcs3/Emu/Cell/SPUInterpreter.cpp +++ b/rpcs3/Emu/Cell/SPUInterpreter.cpp @@ -8,6 +8,10 @@ #include #include +#if !defined(_MSC_VER) && !defined(__SSSE3__) +#define _mm_shuffle_epi8 +#endif + // Compare 16 packed unsigned bytes (greater than) inline __m128i sse_cmpgt_epu8(__m128i A, __m128i B) { @@ -73,7 +77,7 @@ void spu_interpreter::LNOP(SPUThread& spu, spu_opcode_t op) // This instruction must be used following a store instruction that modifies the instruction stream. void spu_interpreter::SYNC(SPUThread& spu, spu_opcode_t op) { - _mm_mfence(); + _mm_mfence(); } // This instruction forces all earlier load, store, and channel instructions to complete before proceeding. 
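The interpreter hunks below drop pshufb in the byte rotate/shift paths in favour of a small aligned buffer plus a single unaligned load. A minimal standalone sketch of that trick, using SSE2 only (the helper names here are illustrative and are not part of the patch):

#include <emmintrin.h> // SSE2 only; no SSSE3/pshufb required
#include <cstdint>
#include <cstdio>

// Hypothetical helpers mirroring the spu_interpreter_precise handlers below:
// put the source vector next to a copy of itself (or next to zeros) in an
// aligned buffer, then load 16 bytes at the computed byte offset.
static inline __m128i rotl_bytes(__m128i a, uint32_t n)
{
    alignas(32) const __m128i buf[2]{a, a};
    // With buf = {a, a}, loading at (16 - n) uses the same doubled-buffer
    // indexing as the ROTQBYBI/ROTQBY handlers below.
    return _mm_loadu_si128(reinterpret_cast<const __m128i*>(
        reinterpret_cast<const uint8_t*>(buf) + (16 - (n & 0xf))));
}

static inline __m128i shr_bytes(__m128i a, uint32_t n)
{
    // Zero padding after the value means counts of 16..31 naturally yield
    // zero; this is the same buffer layout the ROTQMBY/SHLQBY-family
    // handlers below rely on.
    alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()};
    return _mm_loadu_si128(reinterpret_cast<const __m128i*>(
        reinterpret_cast<const uint8_t*>(buf) + (n & 0x1f)));
}

int main()
{
    alignas(16) uint8_t in[16], out[16];
    for (int i = 0; i < 16; i++) in[i] = uint8_t(i);
    const __m128i r = rotl_bytes(_mm_load_si128(reinterpret_cast<const __m128i*>(in)), 3);
    _mm_store_si128(reinterpret_cast<__m128i*>(out), r);
    for (int i = 0; i < 16; i++) std::printf("%02x ", out[i]);
    std::putchar('\n');
    (void)shr_bytes(r, 4);
}

Because the padded copies live in fully initialized 32- and 48-byte buffers, the single _mm_loadu_si128 never reads outside the object and no per-count shuffle mask table is needed.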
@@ -398,12 +402,12 @@ void spu_interpreter::HBR(SPUThread& spu, spu_opcode_t op) void spu_interpreter::GB(SPUThread& spu, spu_opcode_t op) { - spu.gpr[op.rt] = v128::from32r(_mm_movemask_epi8(_mm_slli_epi64(_mm_shuffle_epi8(spu.gpr[op.ra].vi, _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0)), 7))); + spu.gpr[op.rt] = v128::from32r(_mm_movemask_ps(_mm_castsi128_ps(_mm_slli_epi32(spu.gpr[op.ra].vi, 31)))); } void spu_interpreter::GBH(SPUThread& spu, spu_opcode_t op) { - spu.gpr[op.rt] = v128::from32r(_mm_movemask_epi8(_mm_slli_epi64(_mm_shuffle_epi8(spu.gpr[op.ra].vi, _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 14, 12, 10, 8, 6, 4, 2, 0)), 7))); + spu.gpr[op.rt] = v128::from32r(_mm_movemask_epi8(_mm_packs_epi16(_mm_slli_epi16(spu.gpr[op.ra].vi, 15), _mm_setzero_si128()))); } void spu_interpreter::GBB(SPUThread& spu, spu_opcode_t op) @@ -442,17 +446,38 @@ void spu_interpreter::LQX(SPUThread& spu, spu_opcode_t op) spu.gpr[op.rt] = spu._ref((spu.gpr[op.ra]._u32[3] + spu.gpr[op.rb]._u32[3]) & 0x3fff0); } -void spu_interpreter::ROTQBYBI(SPUThread& spu, spu_opcode_t op) +void spu_interpreter_precise::ROTQBYBI(SPUThread& spu, spu_opcode_t op) +{ + const auto a = spu.gpr[op.ra].vi; + alignas(32) const __m128i buf[2]{a, a}; + spu.gpr[op.rt].vi = _mm_loadu_si128((__m128i*)((u8*)buf + (16 - (spu.gpr[op.rb]._u32[3] >> 3 & 0xf)))); +} + +void spu_interpreter_fast::ROTQBYBI(SPUThread& spu, spu_opcode_t op) { spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.rldq_pshufb[spu.gpr[op.rb]._u32[3] >> 3 & 0xf].vi); } -void spu_interpreter::ROTQMBYBI(SPUThread& spu, spu_opcode_t op) +void spu_interpreter_precise::ROTQMBYBI(SPUThread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.srdq_pshufb[-(spu.gpr[op.rb]._s32[3] >> 3) & 0x1f].vi); + const auto a = spu.gpr[op.ra].vi; + alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()}; + spu.gpr[op.rt].vi = _mm_loadu_si128((__m128i*)((u8*)buf + (spu.gpr[op.rb]._u32[3] >> 3 & 0x1f))); } -void spu_interpreter::SHLQBYBI(SPUThread& spu, spu_opcode_t op) +void spu_interpreter_fast::ROTQMBYBI(SPUThread& spu, spu_opcode_t op) +{ + spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.srdq_pshufb[spu.gpr[op.rb]._s32[3] >> 3 & 0x1f].vi); +} + +void spu_interpreter_precise::SHLQBYBI(SPUThread& spu, spu_opcode_t op) +{ + const auto a = spu.gpr[op.ra].vi; + alignas(64) const __m128i buf[3]{_mm_setzero_si128(), _mm_setzero_si128(), a}; + spu.gpr[op.rt].vi = _mm_loadu_si128((__m128i*)((u8*)buf + (32 - (spu.gpr[op.rb]._u32[3] >> 3 & 0x1f)))); +} + +void spu_interpreter_fast::SHLQBYBI(SPUThread& spu, spu_opcode_t op) { spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.sldq_pshufb[spu.gpr[op.rb]._u32[3] >> 3 & 0x1f].vi); } @@ -509,7 +534,7 @@ void spu_interpreter::ROTQBI(SPUThread& spu, spu_opcode_t op) { const auto a = spu.gpr[op.ra].vi; const s32 n = spu.gpr[op.rb]._s32[3] & 0x7; - spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_alignr_epi8(a, a, 8), 64 - n)); + spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_shuffle_epi32(a, 0x4E), 64 - n)); } void spu_interpreter::ROTQMBI(SPUThread& spu, spu_opcode_t op) @@ -526,17 +551,38 @@ void spu_interpreter::SHLQBI(SPUThread& spu, spu_opcode_t op) spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_slli_si128(a, 8), 64 - n)); } -void spu_interpreter::ROTQBY(SPUThread& spu, spu_opcode_t op) +void spu_interpreter_precise::ROTQBY(SPUThread& 
spu, spu_opcode_t op) +{ + const auto a = spu.gpr[op.ra].vi; + alignas(32) const __m128i buf[2]{a, a}; + spu.gpr[op.rt].vi = _mm_loadu_si128((__m128i*)((u8*)buf + (16 - (spu.gpr[op.rb]._u32[3] & 0xf)))); +} + +void spu_interpreter_fast::ROTQBY(SPUThread& spu, spu_opcode_t op) { spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.rldq_pshufb[spu.gpr[op.rb]._u32[3] & 0xf].vi); } -void spu_interpreter::ROTQMBY(SPUThread& spu, spu_opcode_t op) +void spu_interpreter_precise::ROTQMBY(SPUThread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.srdq_pshufb[-spu.gpr[op.rb]._s32[3] & 0x1f].vi); + const auto a = spu.gpr[op.ra].vi; + alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()}; + spu.gpr[op.rt].vi = _mm_loadu_si128((__m128i*)((u8*)buf + (spu.gpr[op.rb]._u32[3] & 0x1f))); } -void spu_interpreter::SHLQBY(SPUThread& spu, spu_opcode_t op) +void spu_interpreter_fast::ROTQMBY(SPUThread& spu, spu_opcode_t op) +{ + spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.srdq_pshufb[spu.gpr[op.rb]._s32[3] & 0x1f].vi); +} + +void spu_interpreter_precise::SHLQBY(SPUThread& spu, spu_opcode_t op) +{ + const auto a = spu.gpr[op.ra].vi; + alignas(64) const __m128i buf[3]{_mm_setzero_si128(), _mm_setzero_si128(), a}; + spu.gpr[op.rt].vi = _mm_loadu_si128((__m128i*)((u8*)buf + (32 - (spu.gpr[op.rb]._u32[3] & 0x1f)))); +} + +void spu_interpreter_fast::SHLQBY(SPUThread& spu, spu_opcode_t op) { spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.sldq_pshufb[spu.gpr[op.rb]._u32[3] & 0x1f].vi); } @@ -598,7 +644,7 @@ void spu_interpreter::ROTQBII(SPUThread& spu, spu_opcode_t op) { const auto a = spu.gpr[op.ra].vi; const s32 n = op.i7 & 0x7; - spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_alignr_epi8(a, a, 8), 64 - n)); + spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_shuffle_epi32(a, 0x4E), 64 - n)); } void spu_interpreter::ROTQMBII(SPUThread& spu, spu_opcode_t op) @@ -615,17 +661,38 @@ void spu_interpreter::SHLQBII(SPUThread& spu, spu_opcode_t op) spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_slli_si128(a, 8), 64 - n)); } -void spu_interpreter::ROTQBYI(SPUThread& spu, spu_opcode_t op) +void spu_interpreter_precise::ROTQBYI(SPUThread& spu, spu_opcode_t op) +{ + const auto a = spu.gpr[op.ra].vi; + alignas(32) const __m128i buf[2]{a, a}; + spu.gpr[op.rt].vi = _mm_loadu_si128((__m128i*)((u8*)buf + (16 - (op.i7 & 0xf)))); +} + +void spu_interpreter_fast::ROTQBYI(SPUThread& spu, spu_opcode_t op) { spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.rldq_pshufb[op.i7 & 0xf].vi); } -void spu_interpreter::ROTQMBYI(SPUThread& spu, spu_opcode_t op) +void spu_interpreter_precise::ROTQMBYI(SPUThread& spu, spu_opcode_t op) { - spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.srdq_pshufb[0-op.i7 & 0x1f].vi); + const auto a = spu.gpr[op.ra].vi; + alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()}; + spu.gpr[op.rt].vi = _mm_loadu_si128((__m128i*)((u8*)buf + (op.i7 & 0x1f))); } -void spu_interpreter::SHLQBYI(SPUThread& spu, spu_opcode_t op) +void spu_interpreter_fast::ROTQMBYI(SPUThread& spu, spu_opcode_t op) +{ + spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.srdq_pshufb[op.i7 & 0x1f].vi); +} + +void spu_interpreter_precise::SHLQBYI(SPUThread& spu, spu_opcode_t op) +{ + const auto a = spu.gpr[op.ra].vi; + alignas(64) const __m128i buf[3]{_mm_setzero_si128(), 
_mm_setzero_si128(), a}; + spu.gpr[op.rt].vi = _mm_loadu_si128((__m128i*)((u8*)buf + (32 - (op.i7 & 0x1f)))); +} + +void spu_interpreter_fast::SHLQBYI(SPUThread& spu, spu_opcode_t op) { spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.sldq_pshufb[op.i7 & 0x1f].vi); } @@ -661,10 +728,21 @@ void spu_interpreter::CGTB(SPUThread& spu, spu_opcode_t op) void spu_interpreter::SUMB(SPUThread& spu, spu_opcode_t op) { - const auto ones = _mm_set1_epi8(1); - const auto a = _mm_maddubs_epi16(spu.gpr[op.ra].vi, ones); - const auto b = _mm_maddubs_epi16(spu.gpr[op.rb].vi, ones); - spu.gpr[op.rt].vi = _mm_shuffle_epi8(_mm_hadd_epi16(a, b), _mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0)); + const auto m1 = _mm_set1_epi16(0xff); + const auto m2 = _mm_set1_epi32(0xffff); + const auto a = spu.gpr[op.ra].vi; + const auto b = spu.gpr[op.rb].vi; + const auto a1 = _mm_srli_epi16(a, 8); + const auto a2 = _mm_and_si128(a, m1); + const auto b1 = _mm_srli_epi16(b, 8); + const auto b2 = _mm_and_si128(b, m1); + const auto sa = _mm_add_epi16(a1, a2); + const auto sb = _mm_add_epi16(b1, b2); + const auto s2 = _mm_and_si128(sa, m2); + const auto s1 = _mm_srli_epi32(sa, 16); + const auto s4 = _mm_andnot_si128(m2, sb); + const auto s3 = _mm_slli_epi32(sb, 16); + spu.gpr[op.rt].vi = _mm_or_si128(_mm_add_epi16(s1, s2), _mm_add_epi16(s3, s4)); } void spu_interpreter::HGT(SPUThread& spu, spu_opcode_t op) @@ -696,10 +774,14 @@ void spu_interpreter::XSHW(SPUThread& spu, spu_opcode_t op) void spu_interpreter::CNTB(SPUThread& spu, spu_opcode_t op) { - const auto counts = _mm_set_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0); - const auto mask = _mm_set1_epi8(0xf); const auto a = spu.gpr[op.ra].vi; - spu.gpr[op.rt].vi = _mm_add_epi8(_mm_shuffle_epi8(counts, _mm_and_si128(a, mask)), _mm_shuffle_epi8(counts, _mm_and_si128(_mm_srli_epi64(a, 4), mask))); + const auto mask1 = _mm_set1_epi8(0x55); + const auto sum1 = _mm_add_epi8(_mm_and_si128(_mm_srli_epi64(a, 1), mask1), _mm_and_si128(a, mask1)); + const auto mask2 = _mm_set1_epi8(0x33); + const auto sum2 = _mm_add_epi8(_mm_and_si128(_mm_srli_epi64(sum1, 2), mask2), _mm_and_si128(sum1, mask2)); + const auto mask3 = _mm_set1_epi8(0x0f); + const auto sum3 = _mm_add_epi8(_mm_and_si128(_mm_srli_epi64(sum2, 4), mask3), _mm_and_si128(sum2, mask3)); + spu.gpr[op.rt].vi = sum3; } void spu_interpreter::XSBH(SPUThread& spu, spu_opcode_t op) @@ -1354,7 +1436,49 @@ void spu_interpreter::SELB(SPUThread& spu, spu_opcode_t op) spu.gpr[op.rt4] = (spu.gpr[op.rc] & spu.gpr[op.rb]) | v128::andnot(spu.gpr[op.rc], spu.gpr[op.ra]); } -void spu_interpreter::SHUFB(SPUThread& spu, spu_opcode_t op) +void spu_interpreter_precise::SHUFB(SPUThread& spu, spu_opcode_t op) +{ + alignas(16) static thread_local u8 s_lut[256] + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + }; + + const auto _a = spu.gpr[op.ra].vi; + const auto _b = spu.gpr[op.rb].vi; + _mm_store_si128((__m128i*)(s_lut + 0x00), _a); + _mm_store_si128((__m128i*)(s_lut + 0x10), _b); + _mm_store_si128((__m128i*)(s_lut + 0x20), _a); + _mm_store_si128((__m128i*)(s_lut + 0x30), _b); + _mm_store_si128((__m128i*)(s_lut + 0x40), _a); + _mm_store_si128((__m128i*)(s_lut + 0x50), _b); + _mm_store_si128((__m128i*)(s_lut + 0x60), _a); + _mm_store_si128((__m128i*)(s_lut + 0x70), _b); + v128 mask = v128::fromV(_mm_xor_si128(spu.gpr[op.rc].vi, _mm_set1_epi8(0xf))); + auto& t = spu.gpr[op.rt4]; + + for (int i = 0; i < 16; i++) + { + t._u8[i] = s_lut[mask._u8[i]]; + } +} + +void spu_interpreter_fast::SHUFB(SPUThread& spu, spu_opcode_t op) { const auto index = _mm_xor_si128(spu.gpr[op.rc].vi, _mm_set1_epi32(0x0f0f0f0f)); const auto res1 = _mm_shuffle_epi8(spu.gpr[op.ra].vi, index); diff --git a/rpcs3/Emu/Cell/SPUInterpreter.h b/rpcs3/Emu/Cell/SPUInterpreter.h index 68c0d4f9f9..55779683ea 100644 --- a/rpcs3/Emu/Cell/SPUInterpreter.h +++ b/rpcs3/Emu/Cell/SPUInterpreter.h @@ -66,9 +66,6 @@ struct spu_interpreter static void FSMH(SPUThread&, spu_opcode_t); static void FSMB(SPUThread&, spu_opcode_t); static void LQX(SPUThread&, spu_opcode_t); - static void ROTQBYBI(SPUThread&, spu_opcode_t); - static void ROTQMBYBI(SPUThread&, spu_opcode_t); - static void SHLQBYBI(SPUThread&, spu_opcode_t); static void CBX(SPUThread&, spu_opcode_t); static void CHX(SPUThread&, spu_opcode_t); static void CWX(SPUThread&, spu_opcode_t); @@ -76,9 +73,6 @@ struct spu_interpreter static void ROTQBI(SPUThread&, spu_opcode_t); static void ROTQMBI(SPUThread&, spu_opcode_t); static void SHLQBI(SPUThread&, spu_opcode_t); - static void ROTQBY(SPUThread&, spu_opcode_t); - static void ROTQMBY(SPUThread&, spu_opcode_t); - static void SHLQBY(SPUThread&, spu_opcode_t); static void ORX(SPUThread&, spu_opcode_t); static void CBD(SPUThread&, spu_opcode_t); static void CHD(SPUThread&, spu_opcode_t); @@ -87,9 +81,6 @@ struct spu_interpreter static void ROTQBII(SPUThread&, spu_opcode_t); static void ROTQMBII(SPUThread&, spu_opcode_t); static void SHLQBII(SPUThread&, spu_opcode_t); - static void ROTQBYI(SPUThread&, spu_opcode_t); - static void ROTQMBYI(SPUThread&, spu_opcode_t); - static void SHLQBYI(SPUThread&, spu_opcode_t); static void NOP(SPUThread&, spu_opcode_t); static void CGT(SPUThread&, spu_opcode_t); static void XOR(SPUThread&, spu_opcode_t); @@ -175,7 +166,6 @@ struct spu_interpreter static void HBRR(SPUThread&, spu_opcode_t); static void ILA(SPUThread&, spu_opcode_t); static void SELB(SPUThread&, spu_opcode_t); - static void SHUFB(SPUThread&, spu_opcode_t); static void MPYA(SPUThread&, spu_opcode_t); static void DFCGT(SPUThread&, spu_opcode_t); static void DFCMGT(SPUThread&, spu_opcode_t); @@ -186,6 +176,17 @@ struct spu_interpreter struct spu_interpreter_fast final : spu_interpreter { + static void ROTQBYBI(SPUThread&, spu_opcode_t); + static void ROTQMBYBI(SPUThread&, spu_opcode_t); + static void SHLQBYBI(SPUThread&, spu_opcode_t); + static void ROTQBY(SPUThread&, spu_opcode_t); + static void ROTQMBY(SPUThread&, spu_opcode_t); + static void SHLQBY(SPUThread&, spu_opcode_t); + static void ROTQBYI(SPUThread&, spu_opcode_t); + static void ROTQMBYI(SPUThread&, spu_opcode_t); + static void SHLQBYI(SPUThread&, 
spu_opcode_t); + static void SHUFB(SPUThread&, spu_opcode_t); + static void FREST(SPUThread&, spu_opcode_t); static void FRSQEST(SPUThread&, spu_opcode_t); static void FCGT(SPUThread&, spu_opcode_t); @@ -218,6 +219,17 @@ struct spu_interpreter_fast final : spu_interpreter struct spu_interpreter_precise final : spu_interpreter { + static void ROTQBYBI(SPUThread&, spu_opcode_t); + static void ROTQMBYBI(SPUThread&, spu_opcode_t); + static void SHLQBYBI(SPUThread&, spu_opcode_t); + static void ROTQBY(SPUThread&, spu_opcode_t); + static void ROTQMBY(SPUThread&, spu_opcode_t); + static void SHLQBY(SPUThread&, spu_opcode_t); + static void ROTQBYI(SPUThread&, spu_opcode_t); + static void ROTQMBYI(SPUThread&, spu_opcode_t); + static void SHLQBYI(SPUThread&, spu_opcode_t); + static void SHUFB(SPUThread&, spu_opcode_t); + static void FREST(SPUThread&, spu_opcode_t); static void FRSQEST(SPUThread&, spu_opcode_t); static void FCGT(SPUThread&, spu_opcode_t); @@ -246,4 +258,4 @@ struct spu_interpreter_precise final : spu_interpreter static void FNMS(SPUThread&, spu_opcode_t); static void FMA(SPUThread&, spu_opcode_t); static void FMS(SPUThread&, spu_opcode_t); -}; +}; \ No newline at end of file diff --git a/rpcs3/Emu/Cell/SPUOpcodes.h b/rpcs3/Emu/Cell/SPUOpcodes.h index e3f3618f71..f4194a3f66 100644 --- a/rpcs3/Emu/Cell/SPUOpcodes.h +++ b/rpcs3/Emu/Cell/SPUOpcodes.h @@ -41,7 +41,7 @@ static u32 spu_decode(u32 inst) } // SPU decoder object. D provides functions. T is function pointer type returned. -template +template class spu_decoder { // Fast lookup table @@ -271,6 +271,12 @@ public: } } + template + spu_decoder(F&& init) : spu_decoder() + { + init(m_table); + } + const std::array& get_table() const { return m_table; diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index 9c419735bc..666586d8e9 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -25,6 +25,15 @@ const bool s_use_rtm = utils::has_rtm(); +const bool s_use_ssse3 = +#ifdef _MSC_VER + utils::has_ssse3(); +#elif __SSSE3__ + true; +#else + false; +#endif + #ifdef _MSC_VER bool operator ==(const u128& lhs, const u128& rhs) { @@ -37,10 +46,60 @@ extern u64 get_system_time(); extern thread_local u64 g_tls_fault_spu; -const spu_decoder s_spu_interpreter_precise; -const spu_decoder s_spu_interpreter_fast; +// Table of identical interpreter functions when precise contains SSE2 version, and fast contains SSSE3 functions +const std::pair s_spu_dispatch_table[] +{ +#define FUNC(x) {&spu_interpreter_precise::x, &spu_interpreter_fast::x} + FUNC(ROTQBYBI), + FUNC(ROTQMBYBI), + FUNC(SHLQBYBI), + FUNC(ROTQBY), + FUNC(ROTQMBY), + FUNC(SHLQBY), + FUNC(ROTQBYI), + FUNC(ROTQMBYI), + FUNC(SHLQBYI), + FUNC(SHUFB), +#undef FUNC +}; -std::atomic g_num_spu_threads = { 0ull }; +extern const spu_decoder g_spu_interpreter_precise([](auto& table) +{ + if (s_use_ssse3) + { + for (auto& func : table) + { + for (const auto& pair : s_spu_dispatch_table) + { + if (pair.first == func) + { + func = pair.second; + break; + } + } + } + } +}); + +extern const spu_decoder g_spu_interpreter_fast([](auto& table) +{ + if (!s_use_ssse3) + { + for (auto& func : table) + { + for (const auto& pair : s_spu_dispatch_table) + { + if (pair.second == func) + { + func = pair.first; + break; + } + } + } + } +}); + +std::atomic g_num_spu_threads{0ull}; template <> void fmt_class_string::format(std::string& out, u64 arg) @@ -200,9 +259,11 @@ spu_imm_table_t::spu_imm_table_t() for (u32 i = 0; i < sizeof(srdq_pshufb) / sizeof(srdq_pshufb[0]); i++) 
{ + const u32 im = (0u - i) & 0x1f; + for (u32 j = 0; j < 16; j++) { - srdq_pshufb[i]._u8[j] = (j + i > 15) ? 0xff : static_cast(j + i); + srdq_pshufb[i]._u8[j] = (j + im > 15) ? 0xff : static_cast(j + im); } } @@ -314,7 +375,7 @@ extern thread_local std::string(*g_tls_log_prefix)(); void SPUThread::cpu_task() { std::fesetround(FE_TOWARDZERO); - + if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit) { if (!spu_db) spu_db = fxm::get_always(); @@ -330,8 +391,8 @@ void SPUThread::cpu_task() // Select opcode table const auto& table = *( - g_cfg.core.spu_decoder == spu_decoder_type::precise ? &s_spu_interpreter_precise.get_table() : - g_cfg.core.spu_decoder == spu_decoder_type::fast ? &s_spu_interpreter_fast.get_table() : + g_cfg.core.spu_decoder == spu_decoder_type::precise ? &g_spu_interpreter_precise.get_table() : + g_cfg.core.spu_decoder == spu_decoder_type::fast ? &g_spu_interpreter_fast.get_table() : (fmt::throw_exception("Invalid SPU decoder"), nullptr)); // LS base address @@ -803,7 +864,7 @@ void SPUThread::process_mfc_cmd() do_dma_transfer(ch_mfc_cmd, false); return; } - + break; } case MFC_PUTL_CMD: @@ -831,7 +892,7 @@ void SPUThread::process_mfc_cmd() be_t ts; be_t ea; }; - + u32 total_size = 0; while (ch_mfc_cmd.size && total_size <= max_imm_dma_size) @@ -1156,7 +1217,7 @@ bool SPUThread::get_ch_value(u32 ch, u32& out) thread_ctrl::wait_for(100); } - + out = res; return true; } @@ -1184,7 +1245,7 @@ bool SPUThread::set_ch_value(u32 ch, u32 value) srr0 = value; break; } - + case SPU_WrOutIntrMbox: { if (offset >= RAW_SPU_BASE_ADDR) @@ -1202,7 +1263,7 @@ bool SPUThread::set_ch_value(u32 ch, u32 value) int_ctrl[2].set(SPU_INT2_STAT_MAILBOX_INT); return true; } - + const u32 code = value >> 24; { if (code < 64) @@ -1392,7 +1453,7 @@ bool SPUThread::set_ch_value(u32 ch, u32 value) else { auto mfc = fxm::check_unlocked(); - + //if (test(mfc->state, cpu_flag::is_waiting)) { mfc->notify(); @@ -1447,7 +1508,7 @@ bool SPUThread::set_ch_value(u32 ch, u32 value) if (atomic_storage::btr(ch_stall_mask.raw(), value)) { auto mfc = fxm::check_unlocked(); - + //if (test(mfc->state, cpu_flag::is_waiting)) { mfc->notify(); @@ -1687,7 +1748,7 @@ bool SPUThread::stop_and_signal(u32 code) } semaphore_lock lock(group->mutex); - + if (group->run_state == SPU_THREAD_GROUP_STATUS_WAITING) { group->run_state = SPU_THREAD_GROUP_STATUS_RUNNING;
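The decoder-table rework in SPUThread.cpp above amounts to a one-time CPU feature check followed by pointer swapping between paired handlers. A simplified, self-contained sketch of that pattern (every name below is a stand-in for illustration, not an actual RPCS3 declaration):

#include <array>
#include <cstdio>
#include <utility>

struct cpu_state { int dummy; };
using handler_t = void(*)(cpu_state&);

static void op_sse2(cpu_state&)  { std::puts("SSE2 fallback"); }
static void op_ssse3(cpu_state&) { std::puts("SSSE3 version"); }

// Stand-in for utils::has_ssse3(); a real build would query cpuid or rely on
// compile-time macros such as __SSSE3__.
static bool detect_ssse3() { return false; }

// Pairs of (portable handler, SSSE3 handler) for the opcodes that differ,
// playing the role of s_spu_dispatch_table.
static const std::pair<handler_t, handler_t> dispatch_pairs[]{
    {&op_sse2, &op_ssse3},
};

// Fill the opcode table with the portable handlers, then upgrade any entry
// that has an SSSE3 variant if the feature is detected at runtime.
static std::array<handler_t, 16> build_table()
{
    std::array<handler_t, 16> table{};
    table.fill(&op_sse2);

    if (detect_ssse3())
    {
        for (auto& func : table)
        {
            for (const auto& pair : dispatch_pairs)
            {
                if (pair.first == func)
                {
                    func = pair.second;
                    break;
                }
            }
        }
    }
    return table;
}

int main()
{
    cpu_state spu{};
    build_table()[0](spu); // prints "SSE2 fallback" unless detect_ssse3() is changed
}

In the patch itself the same swap runs in both directions: g_spu_interpreter_precise upgrades its table to the SSSE3 functions when utils::has_ssse3() reports support, while g_spu_interpreter_fast downgrades the affected entries to their SSE2 counterparts when it does not.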