From f50e3cf5fe145d2ee3fac77d481a6e7d182c80c0 Mon Sep 17 00:00:00 2001 From: hrydgard Date: Sun, 17 Jan 2010 11:47:35 +0000 Subject: [PATCH] Add code for frsqrtex to JIT. Disable the table-based implementation in the interpreter until we find something that it actually fixes, so far it seems like it breaks stuff. Assorted cleanup around the JIT of flags that we don't need anymore. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@4867 8ced0084-cf51-0410-be5f-012b33b47a6e --- .../Interpreter/Interpreter_FloatingPoint.cpp | 8 +++++ Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp | 1 - Source/Core/Core/Src/PowerPC/Jit64/Jit.h | 2 +- .../Src/PowerPC/Jit64/Jit_FloatingPoint.cpp | 21 ++++++++++++- .../PowerPC/Jit64/Jit_LoadStoreFloating.cpp | 4 +-- .../Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp | 22 ++++++++----- .../Src/PowerPC/Jit64/Jit_SystemRegisters.cpp | 1 - Source/Core/Core/Src/PowerPC/Jit64IL/Jit.cpp | 3 -- Source/Core/Core/Src/PowerPC/Jit64IL/Jit.h | 2 -- .../Src/PowerPC/JitCommon/JitAsmCommon.cpp | 9 +++--- .../Core/Src/PowerPC/JitCommon/JitCache.h | 7 ----- .../Core/Src/PowerPC/JitCommon/Jit_Util.cpp | 31 +++++++++++++++++++ .../Core/Src/PowerPC/JitCommon/Jit_Util.h | 3 ++ 13 files changed, 83 insertions(+), 31 deletions(-) diff --git a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp index 3b9339bcf0..120e718d6b 100644 --- a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp +++ b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp @@ -405,6 +405,8 @@ void fresx(UGeckoInstruction _inst) if (_inst.Rc) Helper_UpdateCR1(rPS0(_inst.FD)); } +// #define USE_ACCURATE_FRSQRTEX + void frsqrtex(UGeckoInstruction _inst) { double b = rPS0(_inst.FB); @@ -415,6 +417,7 @@ void frsqrtex(UGeckoInstruction _inst) } else { +#ifdef USE_ACCURATE_FRSQRTEX if (b == 0.0) { SetFPException(FPSCR_ZX); riPS0(_inst.FD) = 0x7ff0000000000000; @@ -436,6 +439,11 @@ void frsqrtex(UGeckoInstruction _inst) outa |= frsqrtex_lut[idx] >> 12; riPS0(_inst.FD) = ((u64)outa << 32) + (u64)outb; } +#else + if (b == 0.0) + SetFPException(FPSCR_ZX); + rPS0(_inst.FD) = ForceDouble(1.0 / sqrt(b)); +#endif } UpdateFPRF(rPS0(_inst.FD)); if (_inst.Rc) Helper_UpdateCR1(rPS0(_inst.FD)); diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp index c6c82fc08c..374e23bef8 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp @@ -443,7 +443,6 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc js.blockStart = em_address; js.fifoBytesThisBlock = 0; js.curBlock = b; - js.blockSetsQuantizers = false; js.block_flags = 0; js.cancel = false; diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit.h b/Source/Core/Core/Src/PowerPC/Jit64/Jit.h index 40d68e0655..f79c3510bd 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit.h @@ -110,7 +110,6 @@ private: int block_flags; bool isLastInstruction; - bool blockSetsQuantizers; int fifoBytesThisBlock; @@ -247,6 +246,7 @@ public: void ps_muls(UGeckoInstruction inst); void fp_arith_s(UGeckoInstruction inst); + void frsqrtex(UGeckoInstruction inst); void fcmpx(UGeckoInstruction inst); void fmrx(UGeckoInstruction inst); diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp index 95b06e85b9..663fc67891 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -71,6 +71,9 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEm fpr.UnlockAll(); } + +static const double one_const = 1.0f; + void Jit64::fp_arith_s(UGeckoInstruction inst) { INSTRUCTION_START @@ -79,9 +82,23 @@ void Jit64::fp_arith_s(UGeckoInstruction inst) Default(inst); return; } + if (inst.SUBOP5 == 26) { + // frsqrtex + int d = inst.FD; + int b = inst.FB; + fpr.Lock(b, d); + fpr.LoadToX64(d, true, true); + MOVSD(XMM0, M((void *)&one_const)); + SQRTSD(XMM1, fpr.R(b)); + DIVSD(XMM0, R(XMM1)); + MOVSD(fpr.R(d), XMM0); + fpr.UnlockAll(); + return; + } + if (inst.SUBOP5 != 18 && inst.SUBOP5 != 20 && inst.SUBOP5 != 21 && inst.SUBOP5 != 25) { - Default(inst); return; + Default(inst); return; } // Only the interpreter has "proper" support for (some) FP flags @@ -253,3 +270,5 @@ void Jit64::fcmpx(UGeckoInstruction inst) SetJumpTarget(continue3); fpr.UnlockAll(); } + + diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStoreFloating.cpp index ae3d091720..793163cca3 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStoreFloating.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStoreFloating.cpp @@ -275,9 +275,7 @@ void Jit64::stfs(UGeckoInstruction inst) MOV(32, gpr.R(a), R(ABI_PARAM2)); } CVTSD2SS(XMM0, fpr.R(s)); - MOVSS(M(&temp32), XMM0); - MOV(32, R(ABI_PARAM1), M(&temp32)); - SafeWriteRegToReg(ABI_PARAM1, ABI_PARAM2, 32, 0); + SafeWriteFloatToReg(XMM0, ABI_PARAM2); gpr.UnlockAll(); gpr.UnlockAllX(); fpr.UnlockAll(); diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp index 2238cd627e..650f4d1b86 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp @@ -51,9 +51,8 @@ void Jit64::psq_st(UGeckoInstruction inst) { INSTRUCTION_START JITDISABLE(LoadStorePaired) - js.block_flags |= BLOCK_USE_GQR0 << inst.I; - if (js.blockSetsQuantizers || !inst.RA) + if (!inst.RA) { // TODO: Support these cases if it becomes necessary. Default(inst); @@ -105,12 +104,13 @@ void Jit64::psq_st(UGeckoInstruction inst) MOV(32, gpr.R(a), R(ECX)); MOVZX(32, 16, EAX, M(&PowerPC::ppcState.spr[SPR_GQR0 + inst.I])); MOVZX(32, 8, EDX, R(AL)); - // FIXME: Fix ModR/M encoding to allow [EDX*4+disp32]! + // FIXME: Fix ModR/M encoding to allow [EDX*4+disp32] without a base register! #ifdef _M_IX86 - SHL(32, R(EDX), Imm8(2)); + int addr_shift = 2; #else - SHL(32, R(EDX), Imm8(3)); + int addr_shift = 3; #endif + SHL(32, R(EDX), Imm8(addr_shift)); if (inst.W) { // One value XORPS(XMM0, R(XMM0)); // TODO: See if we can get rid of this cheaply by tweaking the code in the singleStore* functions. @@ -130,14 +130,20 @@ void Jit64::psq_l(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(LoadStorePaired) - js.block_flags |= BLOCK_USE_GQR0 << inst.I; - - if (js.blockSetsQuantizers || !inst.RA || inst.W) + if (!inst.RA) { Default(inst); return; } + const UGQR gqr(rSPR(SPR_GQR0 + inst.I)); + + if (inst.W) { + // PanicAlert("Single ps load: %i %i", gqr.ST_TYPE, gqr.ST_SCALE); + Default(inst); + return; + } + bool update = inst.OPCD == 57; int offset = inst.SIMM_12; diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_SystemRegisters.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_SystemRegisters.cpp index 8db22dff9e..40990a7f52 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_SystemRegisters.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_SystemRegisters.cpp @@ -52,7 +52,6 @@ void Jit64::mtspr(UGeckoInstruction inst) case SPR_GQR0 + 5: case SPR_GQR0 + 6: case SPR_GQR0 + 7: - js.blockSetsQuantizers = true; // Prevent recompiler from compiling in old quantizer values. // If the value changed, destroy all blocks using this quantizer // This will create a little bit of block churn, but hopefully not too bad. diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.cpp index 4f69c56c8f..c8ea9ad9a3 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.cpp @@ -397,8 +397,6 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buffer, JitB js.blockStart = em_address; js.fifoBytesThisBlock = 0; js.curBlock = b; - js.blockSetsQuantizers = false; - js.block_flags = 0; js.cancel = false; //Analyze the block, collect all instructions it is made of (including inlining, @@ -464,7 +462,6 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buffer, JitB // Perform actual code generation WriteCode(); - b->flags = js.block_flags; b->codeSize = (u32)(GetCodePtr() - normalEntry); b->originalSize = size; return normalEntry; diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.h b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.h index b6b163a113..de85869993 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.h +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.h @@ -98,10 +98,8 @@ private: UGeckoInstruction next_inst; // for easy peephole opt. int instructionNumber; int downcountAmount; - int block_flags; bool isLastInstruction; - bool blockSetsQuantizers; bool forceUnsafeLoad; int fifoBytesThisBlock; diff --git a/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.cpp b/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.cpp index 2ffd19100a..3d34917bdd 100644 --- a/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.cpp +++ b/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.cpp @@ -299,6 +299,9 @@ void CommonAsmRoutines::GenQuantizedSingleStores() { // Easy! const u8* storeSingleFloat = AlignCode4(); + SafeWriteFloatToReg(XMM0, ECX); + RET(); + /* if (cpu_info.bSSSE3) { PSHUFB(XMM0, M((void *)pbswapShuffle2x4)); // TODO: SafeWriteFloat @@ -309,8 +312,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() { MOVSS(M(&psTemp[0]), XMM0); MOV(32, R(EAX), M(&psTemp[0])); SafeWriteRegToReg(EAX, ECX, 32, 0, true); - } - RET(); + }*/ const u8* storeSingleU8 = AlignCode4(); // Used by MKWii SHR(32, R(EAX), Imm8(6)); @@ -336,8 +338,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() { const u8* storeSingleU16 = AlignCode4(); // Used by MKWii SHR(32, R(EAX), Imm8(6)); MOVSS(XMM1, MDisp(EAX, (u32)(u64)m_quantizeTableS)); - PUNPCKLDQ(XMM1, R(XMM1)); - MULPS(XMM0, R(XMM1)); + MULSS(XMM0, R(XMM1)); PXOR(XMM1, R(XMM1)); MAXSS(XMM0, R(XMM1)); MINSS(XMM0, M((void *)&m_65535)); diff --git a/Source/Core/Core/Src/PowerPC/JitCommon/JitCache.h b/Source/Core/Core/Src/PowerPC/JitCommon/JitCache.h index 20f9d759bb..76c2e08290 100644 --- a/Source/Core/Core/Src/PowerPC/JitCommon/JitCache.h +++ b/Source/Core/Core/Src/PowerPC/JitCommon/JitCache.h @@ -42,13 +42,6 @@ #define JIT_ICACHE_INVALID_BYTE 0x14 #define JIT_ICACHE_INVALID_WORD 0x14141414 - -enum BlockFlag -{ - BLOCK_USE_GQR0 = 0x1, BLOCK_USE_GQR1 = 0x2, BLOCK_USE_GQR2 = 0x4, BLOCK_USE_GQR3 = 0x8, - BLOCK_USE_GQR4 = 0x10, BLOCK_USE_GQR5 = 0x20, BLOCK_USE_GQR6 = 0x40, BLOCK_USE_GQR7 = 0x80, -}; - // TODO(ector) - optimize this struct for size struct JitBlock { diff --git a/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.cpp index 5f9251ece1..265c121fc7 100644 --- a/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.cpp @@ -18,6 +18,7 @@ #include "Common.h" #include "Thunk.h" +#include "CPUDetect.h" #include "../PowerPC.h" #include "../../Core.h" #include "../../HW/GPFifo.h" @@ -139,6 +140,36 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce SetJumpTarget(arg2); } +static const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; +static u32 GC_ALIGNED16(float_buffer); + +void EmuCodeBlock::SafeWriteFloatToReg(X64Reg xmm_value, X64Reg reg_addr) +{ + TEST(32, R(reg_addr), Imm32(0x0C000000)); + if (false && cpu_info.bSSSE3) { + // This path should be faster but for some reason it causes errors so I've disabled it. + FixupBranch argh = J_CC(CC_Z); + MOVSS(M(&float_buffer), xmm_value); + MOV(32, R(EAX), M(&float_buffer)); + BSWAP(32, EAX); + ABI_CallFunctionRR(thunks.ProtectFunction(((void *)&Memory::Write_U32), 2), EAX, reg_addr); + FixupBranch arg2 = J(); + SetJumpTarget(argh); + PSHUFB(xmm_value, M((void *)pbswapShuffle1x4)); + #ifdef _M_IX86 + AND(32, R(reg_addr), Imm32(Memory::MEMVIEW32_MASK)); + MOVD_xmm(MDisp(reg_addr, (u32)Memory::base), xmm_value); + #else + MOVD_xmm(MComplex(RBX, reg_addr, SCALE_1, 0), xmm_value); + #endif + SetJumpTarget(arg2); + } else { + MOVSS(M(&float_buffer), xmm_value); + MOV(32, R(EAX), M(&float_buffer)); + SafeWriteRegToReg(EAX, reg_addr, 32, 0, true); + } +} + void EmuCodeBlock::WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address) { #ifdef _M_X64 diff --git a/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.h index 4fad3db64a..dbc730df17 100644 --- a/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.h +++ b/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.h @@ -29,6 +29,9 @@ public: void SafeLoadRegToEAX(Gen::X64Reg reg, int accessSize, s32 offset, bool signExtend = false); void SafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset, bool swap = true); + // Trashes both inputs and EAX. + void SafeWriteFloatToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr); + void WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address); void WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address); void JitClearCA();