Merge pull request #1307 from comex/bitset

Higher level bitset wrapper
This commit is contained in:
comex 2014-10-28 23:39:35 -04:00
commit 089e32ba7d
31 changed files with 492 additions and 307 deletions

166
Source/Core/Common/BitSet.h Normal file
View file

@ -0,0 +1,166 @@
// This file is under the public domain.
#pragma once
#include <initializer_list>
#include <type_traits>
#include "CommonTypes.h"
// Helper functions:
#ifdef _WIN32
// Portable population count: returns how many bits are set in v.
// T is expected to be an unsigned integer type.
template <typename T>
static inline int CountSetBits(T v)
{
	// Parallel bit-summing technique from
	// https://graphics.stanford.edu/~seander/bithacks.html
	// GCC has a builtin for this, but MSVC's intrinsic will only emit the actual
	// POPCNT instruction, which we're not depending on.
	const T all_ones     = (T)~(T)0;
	const T pair_mask    = (T)(all_ones / 3);        // 0x5555...
	const T nibble_mask  = (T)(all_ones / 15 * 3);   // 0x3333...
	const T byte_mask    = (T)(all_ones / 255 * 15); // 0x0f0f...
	const T byte_lsbs    = (T)(all_ones / 255);      // 0x0101...
	const int top_byte_shift = (sizeof(T) - 1) * 8;

	// Sum adjacent bits into 2-bit fields, then 4-bit fields, then bytes.
	v = v - ((v >> 1) & pair_mask);
	v = (v & nibble_mask) + ((v >> 2) & nibble_mask);
	v = (v + (v >> 4)) & byte_mask;
	// The multiply accumulates every byte's count into the top byte.
	return (T)(v * byte_lsbs) >> top_byte_shift;
}
// Returns the zero-based index of the lowest set bit in a 32-bit value.
// NOTE(review): the result of _BitScanForward is not checked here, so the
// return value for val == 0 is presumably meaningless - confirm callers
// never pass zero.
static inline int LeastSignificantSetBit(u32 val)
{
	unsigned long lowest_index;
	_BitScanForward(&lowest_index, val);
	return static_cast<int>(lowest_index);
}
// Returns the zero-based index of the lowest set bit in a 64-bit value.
// NOTE(review): the result of _BitScanForward64 is not checked here, so the
// return value for val == 0 is presumably meaningless - confirm callers
// never pass zero.
static inline int LeastSignificantSetBit(u64 val)
{
	unsigned long lowest_index;
	_BitScanForward64(&lowest_index, val);
	return static_cast<int>(lowest_index);
}
#else
// Non-MSVC compilers: forward directly to the compiler builtins.

// Number of set bits in a 32-bit value.
static inline int CountSetBits(u32 val)
{
	return __builtin_popcount(val);
}

// Number of set bits in a 64-bit value.
static inline int CountSetBits(u64 val)
{
	return __builtin_popcountll(val);
}

// Zero-based index of the lowest set bit (count of trailing zeros).
// NOTE(review): presumably requires val != 0 - confirm against the
// compiler's documentation for __builtin_ctz.
static inline int LeastSignificantSetBit(u32 val)
{
	return __builtin_ctz(val);
}

// 64-bit variant of the above; same presumed non-zero requirement.
static inline int LeastSignificantSetBit(u64 val)
{
	return __builtin_ctzll(val);
}
#endif
// namespace avoids conflict with OS X Carbon; don't use BitSet<T> directly
namespace BS
{
// Similar to std::bitset, this is a class which encapsulates a bitset, i.e.
// using the set bits of an integer to represent a set of integers. Like that
// class, it acts like an array of bools:
// BitSet32 bs;
// bs[1] = true;
// but also like the underlying integer ([0] = least significant bit):
// BitSet32 bs2 = ...;
// bs = (bs ^ bs2) & BitSet32(0xffff);
// The following additional functionality is provided:
// - Construction using an initializer list.
// BitSet32 bs { 1, 2, 4, 8 };
// - Efficiently iterating through the set bits:
// for (int i : bs)
// [i is the *index* of a set bit]
// (This uses the appropriate CPU instruction to find the next set bit in one
// operation.)
// - Counting set bits using .Count() - see comment on that method.
// TODO: use constexpr when MSVC gets out of the Dark Ages
// Wraps an unsigned integer of type IntTy and exposes it as a set of bit
// indices (bit 0 = least significant bit).
template <typename IntTy>
class BitSet
{
	static_assert(!std::is_signed<IntTy>::value, "BitSet should not be used with signed types");
public:
	// A reference to a particular bit, returned from operator[].
	// Reads convert to bool; assigning a bool sets or clears the bit in the
	// BitSet the reference was created from.
	class Ref
	{
	public:
		Ref(Ref&& other) : m_bs(other.m_bs), m_mask(other.m_mask) {}
		Ref(BitSet* bs, IntTy mask) : m_bs(bs), m_mask(mask) {}
		operator bool() const { return (m_bs->m_val & m_mask) != 0; }
		bool operator=(bool set)
		{
			// Clear the bit first, then OR it back in if requested.
			m_bs->m_val = (m_bs->m_val & ~m_mask) | (set ? m_mask : 0);
			return set;
		}
	private:
		BitSet* m_bs;
		IntTy m_mask;
	};

	// A STL-like iterator is required to be able to use range-based for loops.
	// m_val holds the bits that have not been visited yet; m_bit is the index
	// of the bit the iterator currently points at, or -1 for the end iterator.
	class Iterator
	{
	public:
		Iterator(const Iterator& other) : m_val(other.m_val), m_bit(other.m_bit) {}
		Iterator(IntTy val, int bit) : m_val(val), m_bit(bit) {}
		Iterator& operator=(Iterator other)
		{
			// Plain member-wise copy; no need to re-construct in place.
			m_val = other.m_val;
			m_bit = other.m_bit;
			return *this;
		}
		// Yields the index of the current set bit.
		int operator*() const { return m_bit; }
		// Advances to the next (higher) set bit, or to the end state (-1)
		// when no set bits remain.
		Iterator& operator++()
		{
			if (m_val == 0)
			{
				m_bit = -1;
			}
			else
			{
				int bit = LeastSignificantSetBit(m_val);
				// The mask must be built in IntTy: shifting a plain int would
				// be wrong for bit indices >= 31 when IntTy is 64 bits wide.
				m_val &= ~((IntTy)1 << bit);
				m_bit = bit;
			}
			return *this;
		}
		Iterator operator++(int _)
		{
			Iterator other(*this);
			++*this;
			return other;
		}
		// Iterators compare by current bit index only, which is sufficient
		// for the begin()/end() comparison done by range-based for.
		bool operator==(Iterator other) const { return m_bit == other.m_bit; }
		bool operator!=(Iterator other) const { return m_bit != other.m_bit; }
	private:
		IntTy m_val;
		int m_bit;
	};

	// Empty set.
	BitSet() : m_val(0) {}
	// Set whose members are the set bits of val.
	explicit BitSet(IntTy val) : m_val(val) {}
	// Set containing exactly the listed bit indices.
	BitSet(std::initializer_list<int> init)
	{
		m_val = 0;
		for (int bit : init)
			m_val |= (IntTy)1 << bit;
	}

	// Returns a set with the lowest `count` bits set. A count equal to the
	// full width of IntTy is handled separately because shifting by the full
	// width is not a valid shift.
	static BitSet AllTrue(size_t count)
	{
		return BitSet(count == sizeof(IntTy)*8 ? ~(IntTy)0 : (((IntTy)1 << count) - 1));
	}

	// Array-of-bools style access to a single bit.
	Ref operator[](size_t bit) { return Ref(this, (IntTy)1 << bit); }
	// The const overload returns a const Ref, which can be read but not assigned.
	const Ref operator[](size_t bit) const { return (*const_cast<BitSet*>(this))[bit]; }

	bool operator==(BitSet other) const { return m_val == other.m_val; }
	bool operator!=(BitSet other) const { return m_val != other.m_val; }

	// Set algebra, mirroring the bitwise operators of the underlying integer.
	BitSet operator|(BitSet other) const { return BitSet(m_val | other.m_val); }
	BitSet operator&(BitSet other) const { return BitSet(m_val & other.m_val); }
	BitSet operator^(BitSet other) const { return BitSet(m_val ^ other.m_val); }
	BitSet operator~() const { return BitSet(~m_val); }

	BitSet& operator|=(BitSet other) { return *this = *this | other; }
	BitSet& operator&=(BitSet other) { return *this = *this & other; }
	BitSet& operator^=(BitSet other) { return *this = *this ^ other; }

	// Conversion to u32 is deleted so a BitSet cannot silently be used as a
	// raw integer. It is const-qualified to match operator bool below, so
	// that const objects cannot reach u32 through the bool conversion.
	operator u32() const = delete;
	// True if any bit is set.
	operator bool() const { return m_val != 0; }

	// Warning: Even though on modern CPUs this is a single fast instruction,
	// Dolphin's official builds do not currently assume POPCNT support on x86,
	// so slower explicit bit twiddling is generated. Still should generally
	// be faster than a loop.
	unsigned int Count() const { return CountSetBits(m_val); }

	// begin() starts from a dummy position and advances once so that it
	// points at the first set bit (or equals end() for an empty set).
	Iterator begin() const { Iterator it(m_val, 0); return ++it; }
	Iterator end() const { return Iterator(m_val, -1); }

	// The underlying integer; bit i is set iff i is a member of the set.
	IntTy m_val;
};
}
// Public names for the supported widths; use these rather than BS::BitSet<T>.
using BitSet32 = BS::BitSet<u32>;
using BitSet64 = BS::BitSet<u64>;

View file

@ -39,6 +39,7 @@
<ClInclude Include="Atomic_GCC.h" /> <ClInclude Include="Atomic_GCC.h" />
<ClInclude Include="Atomic_Win32.h" /> <ClInclude Include="Atomic_Win32.h" />
<ClInclude Include="BitField.h" /> <ClInclude Include="BitField.h" />
<ClInclude Include="BitSet.h" />
<ClInclude Include="BreakPoints.h" /> <ClInclude Include="BreakPoints.h" />
<ClInclude Include="CDUtils.h" /> <ClInclude Include="CDUtils.h" />
<ClInclude Include="ChunkFile.h" /> <ClInclude Include="ChunkFile.h" />

View file

@ -13,6 +13,7 @@
<ClInclude Include="Atomic_GCC.h" /> <ClInclude Include="Atomic_GCC.h" />
<ClInclude Include="Atomic_Win32.h" /> <ClInclude Include="Atomic_Win32.h" />
<ClInclude Include="BitField.h" /> <ClInclude Include="BitField.h" />
<ClInclude Include="BitSet.h" />
<ClInclude Include="BreakPoints.h" /> <ClInclude Include="BreakPoints.h" />
<ClInclude Include="CDUtils.h" /> <ClInclude Include="CDUtils.h" />
<ClInclude Include="ChunkFile.h" /> <ClInclude Include="ChunkFile.h" />

View file

@ -10,31 +10,23 @@ using namespace Gen;
// Shared code between Win64 and Unix64 // Shared code between Win64 and Unix64
void XEmitter::ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp) void XEmitter::ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp)
{ {
size_t shadow = 0; size_t shadow = 0;
#if defined(_WIN32) #if defined(_WIN32)
shadow = 0x20; shadow = 0x20;
#endif #endif
int count = 0; int count = (mask & ABI_ALL_GPRS).Count();
for (int r = 0; r < 16; r++)
{
if (mask & (1 << r))
count++;
}
rsp_alignment -= count * 8; rsp_alignment -= count * 8;
size_t subtraction = 0; size_t subtraction = 0;
if (mask & 0xffff0000) int fpr_count = (mask & ABI_ALL_FPRS).Count();
if (fpr_count)
{ {
// If we have any XMMs to save, we must align the stack here. // If we have any XMMs to save, we must align the stack here.
subtraction = rsp_alignment & 0xf; subtraction = rsp_alignment & 0xf;
} }
for (int x = 0; x < 16; x++) subtraction += 16 * fpr_count;
{
if (mask & (1 << (16 + x)))
subtraction += 16;
}
size_t xmm_base_subtraction = subtraction; size_t xmm_base_subtraction = subtraction;
subtraction += needed_frame_size; subtraction += needed_frame_size;
subtraction += shadow; subtraction += shadow;
@ -47,57 +39,46 @@ void XEmitter::ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t nee
*xmm_offsetp = subtraction - xmm_base_subtraction; *xmm_offsetp = subtraction - xmm_base_subtraction;
} }
size_t XEmitter::ABI_PushRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size) size_t XEmitter::ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size)
{ {
size_t shadow, subtraction, xmm_offset; size_t shadow, subtraction, xmm_offset;
ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, &xmm_offset); ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, &xmm_offset);
for (int r = 0; r < 16; r++) for (int r : mask & ABI_ALL_GPRS)
{
if (mask & (1 << r))
PUSH((X64Reg) r); PUSH((X64Reg) r);
}
if (subtraction) if (subtraction)
SUB(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction)); SUB(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction));
for (int x = 0; x < 16; x++) for (int x : mask & ABI_ALL_FPRS)
{ {
if (mask & (1 << (16 + x))) MOVAPD(MDisp(RSP, (int)xmm_offset), (X64Reg) (x - 16));
{
MOVAPD(MDisp(RSP, (int)xmm_offset), (X64Reg) x);
xmm_offset += 16; xmm_offset += 16;
} }
}
return shadow; return shadow;
} }
void XEmitter::ABI_PopRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size) void XEmitter::ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size)
{ {
size_t shadow, subtraction, xmm_offset; size_t shadow, subtraction, xmm_offset;
ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, &xmm_offset); ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, &xmm_offset);
for (int x = 0; x < 16; x++) for (int x : mask & ABI_ALL_FPRS)
{ {
if (mask & (1 << (16 + x))) MOVAPD((X64Reg) (x - 16), MDisp(RSP, (int)xmm_offset));
{
MOVAPD((X64Reg) x, MDisp(RSP, (int)xmm_offset));
xmm_offset += 16; xmm_offset += 16;
} }
}
if (subtraction) if (subtraction)
ADD(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction)); ADD(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction));
for (int r = 15; r >= 0; r--) for (int r = 15; r >= 0; r--)
{ {
if (mask & (1 << r)) if (mask[r])
{
POP((X64Reg) r); POP((X64Reg) r);
} }
} }
}
// Common functions // Common functions
void XEmitter::ABI_CallFunction(const void *func) void XEmitter::ABI_CallFunction(const void *func)

View file

@ -4,6 +4,7 @@
#pragma once #pragma once
#include "Common/BitSet.h"
#include "Common/x64Emitter.h" #include "Common/x64Emitter.h"
// x64 ABI:s, and helpers to help follow them when JIT-ing code. // x64 ABI:s, and helpers to help follow them when JIT-ing code.
@ -23,6 +24,9 @@
// Callee-save: RBX RBP R12 R13 R14 R15 // Callee-save: RBX RBP R12 R13 R14 R15
// Parameters: RDI RSI RDX RCX R8 R9 // Parameters: RDI RSI RDX RCX R8 R9
#define ABI_ALL_FPRS BitSet32(0xffff0000)
#define ABI_ALL_GPRS BitSet32(0x0000ffff)
#ifdef _WIN32 // 64-bit Windows - the really exotic calling convention #ifdef _WIN32 // 64-bit Windows - the really exotic calling convention
#define ABI_PARAM1 RCX #define ABI_PARAM1 RCX
@ -31,11 +35,9 @@
#define ABI_PARAM4 R9 #define ABI_PARAM4 R9
// xmm0-xmm15 use the upper 16 bits in the functions that push/pop registers. // xmm0-xmm15 use the upper 16 bits in the functions that push/pop registers.
#define ABI_ALL_CALLER_SAVED ((1 << RAX) | (1 << RCX) | (1 << RDX) | (1 << R8) | \ #define ABI_ALL_CALLER_SAVED \
(1 << R9) | (1 << R10) | (1 << R11) | \ (BitSet32 { RAX, RCX, RDX, R8, R9, R10, R11, \
(1 << (XMM0+16)) | (1 << (XMM1+16)) | (1 << (XMM2+16)) | (1 << (XMM3+16)) | \ XMM0+16, XMM1+16, XMM2+16, XMM3+16, XMM4+16, XMM5+16 })
(1 << (XMM4+16)) | (1 << (XMM5+16)))
#else //64-bit Unix / OS X #else //64-bit Unix / OS X
#define ABI_PARAM1 RDI #define ABI_PARAM1 RDI
@ -47,13 +49,12 @@
// FIXME: avoid pushing all 16 XMM registers when possible? most functions we call probably // FIXME: avoid pushing all 16 XMM registers when possible? most functions we call probably
// don't actually clobber them. // don't actually clobber them.
#define ABI_ALL_CALLER_SAVED ((1 << RAX) | (1 << RCX) | (1 << RDX) | (1 << RDI) | \ #define ABI_ALL_CALLER_SAVED \
(1 << RSI) | (1 << R8) | (1 << R9) | (1 << R10) | (1 << R11) | \ (BitSet32 { RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11 } | \
0xffff0000 /* xmm0..15 */) ABI_ALL_FPRS)
#endif // WIN32 #endif // WIN32
#define ABI_ALL_CALLEE_SAVED ((u32) ~ABI_ALL_CALLER_SAVED) #define ABI_ALL_CALLEE_SAVED (~ABI_ALL_CALLER_SAVED)
#define ABI_RETURN RAX #define ABI_RETURN RAX

View file

@ -10,6 +10,7 @@
#include <cstring> #include <cstring>
#include <functional> #include <functional>
#include "Common/BitSet.h"
#include "Common/CodeBlock.h" #include "Common/CodeBlock.h"
#include "Common/CommonTypes.h" #include "Common/CommonTypes.h"
@ -302,7 +303,7 @@ private:
void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg); void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg);
void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2); void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2);
void ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp); void ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp);
protected: protected:
inline void Write8(u8 value) {*code++ = value;} inline void Write8(u8 value) {*code++ = value;}
@ -883,8 +884,8 @@ public:
// Saves/restores the registers and adjusts the stack to be aligned as // Saves/restores the registers and adjusts the stack to be aligned as
// required by the ABI, where the previous alignment was as specified. // required by the ABI, where the previous alignment was as specified.
// Push returns the size of the shadow space, i.e. the offset of the frame. // Push returns the size of the shadow space, i.e. the offset of the frame.
size_t ABI_PushRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size = 0); size_t ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0);
void ABI_PopRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size = 0); void ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0);
inline int ABI_GetNumXMMRegs() { return 16; } inline int ABI_GetNumXMMRegs() { return 16; }

View file

@ -385,7 +385,7 @@ void DSPEmitter::CompileDispatcher()
{ {
enterDispatcher = AlignCode16(); enterDispatcher = AlignCode16();
// We don't use floating point (high 16 bits). // We don't use floating point (high 16 bits).
u32 registers_used = ABI_ALL_CALLEE_SAVED & 0xffff; BitSet32 registers_used = ABI_ALL_CALLEE_SAVED & BitSet32(0xffff);
ABI_PushRegistersAndAdjustStack(registers_used, 8); ABI_PushRegistersAndAdjustStack(registers_used, 8);
const u8 *dispatcherLoop = GetCodePtr(); const u8 *dispatcherLoop = GetCodePtr();

View file

@ -241,9 +241,9 @@ void Jit64::WriteCallInterpreter(UGeckoInstruction inst)
MOV(32, PPCSTATE(npc), Imm32(js.compilerPC + 4)); MOV(32, PPCSTATE(npc), Imm32(js.compilerPC + 4));
} }
Interpreter::_interpreterInstruction instr = GetInterpreterOp(inst); Interpreter::_interpreterInstruction instr = GetInterpreterOp(inst);
ABI_PushRegistersAndAdjustStack(0, 0); ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunctionC((void*)instr, inst.hex); ABI_CallFunctionC((void*)instr, inst.hex);
ABI_PopRegistersAndAdjustStack(0, 0); ABI_PopRegistersAndAdjustStack({}, 0);
} }
void Jit64::unknown_instruction(UGeckoInstruction inst) void Jit64::unknown_instruction(UGeckoInstruction inst)
@ -260,9 +260,9 @@ void Jit64::HLEFunction(UGeckoInstruction _inst)
{ {
gpr.Flush(); gpr.Flush();
fpr.Flush(); fpr.Flush();
ABI_PushRegistersAndAdjustStack(0, 0); ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunctionCC((void*)&HLE::Execute, js.compilerPC, _inst.hex); ABI_CallFunctionCC((void*)&HLE::Execute, js.compilerPC, _inst.hex);
ABI_PopRegistersAndAdjustStack(0, 0); ABI_PopRegistersAndAdjustStack({}, 0);
} }
void Jit64::DoNothing(UGeckoInstruction _inst) void Jit64::DoNothing(UGeckoInstruction _inst)
@ -300,18 +300,18 @@ bool Jit64::Cleanup()
if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0) if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0)
{ {
ABI_PushRegistersAndAdjustStack(0, 0); ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunction((void *)&GPFifo::CheckGatherPipe); ABI_CallFunction((void *)&GPFifo::CheckGatherPipe);
ABI_PopRegistersAndAdjustStack(0, 0); ABI_PopRegistersAndAdjustStack({}, 0);
did_something = true; did_something = true;
} }
// SPEED HACK: MMCR0/MMCR1 should be checked at run-time, not at compile time. // SPEED HACK: MMCR0/MMCR1 should be checked at run-time, not at compile time.
if (MMCR0.Hex || MMCR1.Hex) if (MMCR0.Hex || MMCR1.Hex)
{ {
ABI_PushRegistersAndAdjustStack(0, 0); ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunctionCCC((void *)&PowerPC::UpdatePerformanceMonitor, js.downcountAmount, jit->js.numLoadStoreInst, jit->js.numFloatingPointInst); ABI_CallFunctionCCC((void *)&PowerPC::UpdatePerformanceMonitor, js.downcountAmount, jit->js.numLoadStoreInst, jit->js.numFloatingPointInst);
ABI_PopRegistersAndAdjustStack(0, 0); ABI_PopRegistersAndAdjustStack({}, 0);
did_something = true; did_something = true;
} }
@ -426,9 +426,9 @@ void Jit64::WriteRfiExitDestInRSCRATCH()
MOV(32, PPCSTATE(pc), R(RSCRATCH)); MOV(32, PPCSTATE(pc), R(RSCRATCH));
MOV(32, PPCSTATE(npc), R(RSCRATCH)); MOV(32, PPCSTATE(npc), R(RSCRATCH));
Cleanup(); Cleanup();
ABI_PushRegistersAndAdjustStack(0, 0); ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckExceptions)); ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckExceptions));
ABI_PopRegistersAndAdjustStack(0, 0); ABI_PopRegistersAndAdjustStack({}, 0);
SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount)); SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount));
JMP(asm_routines.dispatcher, true); JMP(asm_routines.dispatcher, true);
} }
@ -438,9 +438,9 @@ void Jit64::WriteExceptionExit()
Cleanup(); Cleanup();
MOV(32, R(RSCRATCH), PPCSTATE(pc)); MOV(32, R(RSCRATCH), PPCSTATE(pc));
MOV(32, PPCSTATE(npc), R(RSCRATCH)); MOV(32, PPCSTATE(npc), R(RSCRATCH));
ABI_PushRegistersAndAdjustStack(0, 0); ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckExceptions)); ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckExceptions));
ABI_PopRegistersAndAdjustStack(0, 0); ABI_PopRegistersAndAdjustStack({}, 0);
SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount)); SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount));
JMP(asm_routines.dispatcher, true); JMP(asm_routines.dispatcher, true);
} }
@ -450,9 +450,9 @@ void Jit64::WriteExternalExceptionExit()
Cleanup(); Cleanup();
MOV(32, R(RSCRATCH), PPCSTATE(pc)); MOV(32, R(RSCRATCH), PPCSTATE(pc));
MOV(32, PPCSTATE(npc), R(RSCRATCH)); MOV(32, PPCSTATE(npc), R(RSCRATCH));
ABI_PushRegistersAndAdjustStack(0, 0); ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckExternalExceptions)); ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckExternalExceptions));
ABI_PopRegistersAndAdjustStack(0, 0); ABI_PopRegistersAndAdjustStack({}, 0);
SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount)); SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount));
JMP(asm_routines.dispatcher, true); JMP(asm_routines.dispatcher, true);
} }
@ -565,9 +565,9 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
if (ImHereDebug) if (ImHereDebug)
{ {
ABI_PushRegistersAndAdjustStack(0, 0); ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunction((void *)&ImHere); //Used to get a trace of the last few blocks before a crash, sometimes VERY useful ABI_CallFunction((void *)&ImHere); //Used to get a trace of the last few blocks before a crash, sometimes VERY useful
ABI_PopRegistersAndAdjustStack(0, 0); ABI_PopRegistersAndAdjustStack({}, 0);
} }
// Conditionally add profiling code. // Conditionally add profiling code.
@ -642,7 +642,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
{ {
js.fifoBytesThisBlock -= 32; js.fifoBytesThisBlock -= 32;
MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC)); // Helps external systems know which instruction triggered the write MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC)); // Helps external systems know which instruction triggered the write
u32 registersInUse = CallerSavedRegistersInUse(); BitSet32 registersInUse = CallerSavedRegistersInUse();
ABI_PushRegistersAndAdjustStack(registersInUse, 0); ABI_PushRegistersAndAdjustStack(registersInUse, 0);
ABI_CallFunction((void *)&GPFifo::CheckGatherPipe); ABI_CallFunction((void *)&GPFifo::CheckGatherPipe);
ABI_PopRegistersAndAdjustStack(registersInUse, 0); ABI_PopRegistersAndAdjustStack(registersInUse, 0);
@ -727,9 +727,9 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
fpr.Flush(); fpr.Flush();
MOV(32, PPCSTATE(pc), Imm32(ops[i].address)); MOV(32, PPCSTATE(pc), Imm32(ops[i].address));
ABI_PushRegistersAndAdjustStack(0, 0); ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckBreakPoints)); ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckBreakPoints));
ABI_PopRegistersAndAdjustStack(0, 0); ABI_PopRegistersAndAdjustStack({}, 0);
TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(0xFFFFFFFF)); TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(0xFFFFFFFF));
FixupBranch noBreakpoint = J_CC(CC_Z); FixupBranch noBreakpoint = J_CC(CC_Z);
@ -744,29 +744,28 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
// output, which needs to be bound in the actual instruction compilation. // output, which needs to be bound in the actual instruction compilation.
// TODO: make this smarter in the case that we're actually register-starved, i.e. // TODO: make this smarter in the case that we're actually register-starved, i.e.
// prioritize the more important registers. // prioritize the more important registers.
for (int k = 0; k < 3 && gpr.NumFreeRegisters() >= 2; k++) for (int reg : ops[i].regsIn)
{ {
int reg = ops[i].regsIn[k]; if (gpr.NumFreeRegisters() < 2)
if (reg >= 0 && (ops[i].gprInReg & (1 << reg)) && !gpr.R(reg).IsImm()) break;
if (ops[i].gprInReg[reg] && !gpr.R(reg).IsImm())
gpr.BindToRegister(reg, true, false); gpr.BindToRegister(reg, true, false);
} }
for (int k = 0; k < 4 && fpr.NumFreeRegisters() >= 2; k++) for (int reg : ops[i].regsOut)
{ {
int reg = ops[i].fregsIn[k]; if (fpr.NumFreeRegisters() < 2)
if (reg >= 0 && (ops[i].fprInXmm & (1 << reg))) break;
fpr.BindToRegister(reg, true, false); if (ops[i].fprInXmm[reg])
gpr.BindToRegister(reg, true, false);
} }
Jit64Tables::CompileInstruction(ops[i]); Jit64Tables::CompileInstruction(ops[i]);
// If we have a register that will never be used again, flush it. // If we have a register that will never be used again, flush it.
for (int j = 0; j < 32; j++) for (int j : ~ops[i].gprInUse)
{
if (!(ops[i].gprInUse & (1 << j)))
gpr.StoreFromRegister(j); gpr.StoreFromRegister(j);
if (!(ops[i].fprInUse & (1 << j))) for (int j : ~ops[i].fprInUse)
fpr.StoreFromRegister(j); fpr.StoreFromRegister(j);
}
if (js.memcheck && (opinfo->flags & FL_LOADSTORE)) if (js.memcheck && (opinfo->flags & FL_LOADSTORE))
{ {
@ -852,15 +851,15 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
return normalEntry; return normalEntry;
} }
u32 Jit64::CallerSavedRegistersInUse() BitSet32 Jit64::CallerSavedRegistersInUse()
{ {
u32 result = 0; BitSet32 result;
for (int i = 0; i < NUMXREGS; i++) for (int i = 0; i < NUMXREGS; i++)
{ {
if (!gpr.IsFreeX(i)) if (!gpr.IsFreeX(i))
result |= (1 << i); result[i] = true;
if (!fpr.IsFreeX(i)) if (!fpr.IsFreeX(i))
result |= (1 << (16 + i)); result[16 + i] = true;
} }
return result & ABI_ALL_CALLER_SAVED; return result & ABI_ALL_CALLER_SAVED;
} }

View file

@ -78,7 +78,7 @@ public:
void Jit(u32 em_address) override; void Jit(u32 em_address) override;
const u8* DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buffer, JitBlock *b); const u8* DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buffer, JitBlock *b);
u32 CallerSavedRegistersInUse(); BitSet32 CallerSavedRegistersInUse();
JitBlockCache *GetBlockCache() override { return &blocks; } JitBlockCache *GetBlockCache() override { return &blocks; }

View file

@ -43,9 +43,9 @@ void Jit64AsmRoutineManager::Generate()
MOV(64, R(RPPCSTATE), Imm64((u64)&PowerPC::ppcState + 0x80)); MOV(64, R(RPPCSTATE), Imm64((u64)&PowerPC::ppcState + 0x80));
const u8* outerLoop = GetCodePtr(); const u8* outerLoop = GetCodePtr();
ABI_PushRegistersAndAdjustStack(0, 0); ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunction(reinterpret_cast<void *>(&CoreTiming::Advance)); ABI_CallFunction(reinterpret_cast<void *>(&CoreTiming::Advance));
ABI_PopRegistersAndAdjustStack(0, 0); ABI_PopRegistersAndAdjustStack({}, 0);
FixupBranch skipToRealDispatch = J(SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging); //skip the sync and compare first time FixupBranch skipToRealDispatch = J(SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging); //skip the sync and compare first time
dispatcherMispredictedBLR = GetCodePtr(); dispatcherMispredictedBLR = GetCodePtr();
@ -71,9 +71,9 @@ void Jit64AsmRoutineManager::Generate()
{ {
TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(PowerPC::CPU_STEPPING)); TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(PowerPC::CPU_STEPPING));
FixupBranch notStepping = J_CC(CC_Z); FixupBranch notStepping = J_CC(CC_Z);
ABI_PushRegistersAndAdjustStack(0, 0); ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckBreakPoints)); ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckBreakPoints));
ABI_PopRegistersAndAdjustStack(0, 0); ABI_PopRegistersAndAdjustStack({}, 0);
TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(0xFFFFFFFF)); TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(0xFFFFFFFF));
dbg_exit = J_CC(CC_NZ, true); dbg_exit = J_CC(CC_NZ, true);
SetJumpTarget(notStepping); SetJumpTarget(notStepping);
@ -129,9 +129,9 @@ void Jit64AsmRoutineManager::Generate()
SetJumpTarget(notfound); SetJumpTarget(notfound);
//Ok, no block, let's jit //Ok, no block, let's jit
ABI_PushRegistersAndAdjustStack(0, 0); ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunctionA((void *)&Jit, PPCSTATE(pc)); ABI_CallFunctionA((void *)&Jit, PPCSTATE(pc));
ABI_PopRegistersAndAdjustStack(0, 0); ABI_PopRegistersAndAdjustStack({}, 0);
// Jit might have cleared the code cache // Jit might have cleared the code cache
ResetStack(); ResetStack();
@ -146,9 +146,9 @@ void Jit64AsmRoutineManager::Generate()
FixupBranch noExtException = J_CC(CC_Z); FixupBranch noExtException = J_CC(CC_Z);
MOV(32, R(RSCRATCH), PPCSTATE(pc)); MOV(32, R(RSCRATCH), PPCSTATE(pc));
MOV(32, PPCSTATE(npc), R(RSCRATCH)); MOV(32, PPCSTATE(npc), R(RSCRATCH));
ABI_PushRegistersAndAdjustStack(0, 0); ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckExternalExceptions)); ABI_CallFunction(reinterpret_cast<void *>(&PowerPC::CheckExternalExceptions));
ABI_PopRegistersAndAdjustStack(0, 0); ABI_PopRegistersAndAdjustStack({}, 0);
SetJumpTarget(noExtException); SetJumpTarget(noExtException);
TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(0xFFFFFFFF)); TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(0xFFFFFFFF));

View file

@ -95,41 +95,37 @@ void RegCache::UnlockAllX()
xreg.locked = false; xreg.locked = false;
} }
u32 GPRRegCache::GetRegUtilization() BitSet32 GPRRegCache::GetRegUtilization()
{ {
return jit->js.op->gprInReg; return jit->js.op->gprInReg;
} }
u32 FPURegCache::GetRegUtilization() BitSet32 FPURegCache::GetRegUtilization()
{ {
return jit->js.op->gprInReg; return jit->js.op->gprInReg;
} }
u32 GPRRegCache::CountRegsIn(size_t preg, u32 lookahead) BitSet32 GPRRegCache::CountRegsIn(size_t preg, u32 lookahead)
{ {
u32 regsUsed = 0; BitSet32 regsUsed;
for (u32 i = 1; i < lookahead; i++) for (u32 i = 1; i < lookahead; i++)
{ {
for (int j = 0; j < 3; j++) BitSet32 regsIn = jit->js.op[i].regsIn;
if (jit->js.op[i].regsIn[j] >= 0) regsUsed |= regsIn;
regsUsed |= 1 << jit->js.op[i].regsIn[j]; if (regsIn[preg])
for (int j = 0; j < 3; j++)
if ((size_t)jit->js.op[i].regsIn[j] == preg)
return regsUsed; return regsUsed;
} }
return regsUsed; return regsUsed;
} }
u32 FPURegCache::CountRegsIn(size_t preg, u32 lookahead) BitSet32 FPURegCache::CountRegsIn(size_t preg, u32 lookahead)
{ {
u32 regsUsed = 0; BitSet32 regsUsed;
for (u32 i = 1; i < lookahead; i++) for (u32 i = 1; i < lookahead; i++)
{ {
for (int j = 0; j < 4; j++) BitSet32 regsIn = jit->js.op[i].fregsIn;
if (jit->js.op[i].fregsIn[j] >= 0) regsUsed |= regsIn;
regsUsed |= 1 << jit->js.op[i].fregsIn[j]; if (regsIn[preg])
for (int j = 0; j < 4; j++)
if ((size_t)jit->js.op[i].fregsIn[j] == preg)
return regsUsed; return regsUsed;
} }
return regsUsed; return regsUsed;
@ -151,17 +147,14 @@ float RegCache::ScoreRegister(X64Reg xr)
// If the register isn't actually needed in a physical register for a later instruction, // If the register isn't actually needed in a physical register for a later instruction,
// writing it back to the register file isn't quite as bad. // writing it back to the register file isn't quite as bad.
if (GetRegUtilization() & (1 << preg)) if (GetRegUtilization()[preg])
{ {
// Don't look too far ahead; we don't want to have quadratic compilation times for // Don't look too far ahead; we don't want to have quadratic compilation times for
// enormous block sizes! // enormous block sizes!
// This actually improves register allocation a tiny bit; I'm not sure why. // This actually improves register allocation a tiny bit; I'm not sure why.
u32 lookahead = std::min(jit->js.instructionsLeft, 64); u32 lookahead = std::min(jit->js.instructionsLeft, 64);
// Count how many other registers are going to be used before we need this one again. // Count how many other registers are going to be used before we need this one again.
u32 regs_in = CountRegsIn(preg, lookahead); u32 regs_in_count = CountRegsIn(preg, lookahead).Count();
u32 regs_in_count = 0;
for (int i = 0; i < 32; i++)
regs_in_count += !!(regs_in & (1 << i));
// Totally ad-hoc heuristic to bias based on how many other registers we'll need // Totally ad-hoc heuristic to bias based on how many other registers we'll need
// before this one gets used again. // before this one gets used again.
score += 1 + 2 * (5 - log2f(1 + (float)regs_in_count)); score += 1 + 2 * (5 - log2f(1 + (float)regs_in_count));

View file

@ -44,8 +44,8 @@ protected:
virtual const int *GetAllocationOrder(size_t& count) = 0; virtual const int *GetAllocationOrder(size_t& count) = 0;
virtual u32 GetRegUtilization() = 0; virtual BitSet32 GetRegUtilization() = 0;
virtual u32 CountRegsIn(size_t preg, u32 lookahead) = 0; virtual BitSet32 CountRegsIn(size_t preg, u32 lookahead) = 0;
Gen::XEmitter *emit; Gen::XEmitter *emit;
@ -137,8 +137,8 @@ public:
Gen::OpArg GetDefaultLocation(size_t reg) const override; Gen::OpArg GetDefaultLocation(size_t reg) const override;
const int* GetAllocationOrder(size_t& count) override; const int* GetAllocationOrder(size_t& count) override;
void SetImmediate32(size_t preg, u32 immValue); void SetImmediate32(size_t preg, u32 immValue);
u32 GetRegUtilization(); BitSet32 GetRegUtilization() override;
u32 CountRegsIn(size_t preg, u32 lookahead); BitSet32 CountRegsIn(size_t preg, u32 lookahead) override;
}; };
@ -149,6 +149,6 @@ public:
void LoadRegister(size_t preg, Gen::X64Reg newLoc) override; void LoadRegister(size_t preg, Gen::X64Reg newLoc) override;
const int* GetAllocationOrder(size_t& count) override; const int* GetAllocationOrder(size_t& count) override;
Gen::OpArg GetDefaultLocation(size_t reg) const override; Gen::OpArg GetDefaultLocation(size_t reg) const override;
u32 GetRegUtilization(); BitSet32 GetRegUtilization() override;
u32 CountRegsIn(size_t preg, u32 lookahead); BitSet32 CountRegsIn(size_t preg, u32 lookahead) override;
}; };

View file

@ -134,7 +134,7 @@ void Jit64::lXXx(UGeckoInstruction inst)
TEST(32, gpr.R(d), gpr.R(d)); TEST(32, gpr.R(d), gpr.R(d));
FixupBranch noIdle = J_CC(CC_NZ); FixupBranch noIdle = J_CC(CC_NZ);
u32 registersInUse = CallerSavedRegistersInUse(); BitSet32 registersInUse = CallerSavedRegistersInUse();
ABI_PushRegistersAndAdjustStack(registersInUse, 0); ABI_PushRegistersAndAdjustStack(registersInUse, 0);
ABI_CallFunctionC((void *)&PowerPC::OnIdle, PowerPC::ppcState.gpr[a] + (s32)(s16)inst.SIMM_16); ABI_CallFunctionC((void *)&PowerPC::OnIdle, PowerPC::ppcState.gpr[a] + (s32)(s16)inst.SIMM_16);
@ -246,11 +246,11 @@ void Jit64::lXXx(UGeckoInstruction inst)
gpr.Lock(a, b, d); gpr.Lock(a, b, d);
gpr.BindToRegister(d, js.memcheck, true); gpr.BindToRegister(d, js.memcheck, true);
u32 registersInUse = CallerSavedRegistersInUse(); BitSet32 registersInUse = CallerSavedRegistersInUse();
if (update && storeAddress) if (update && storeAddress)
{ {
// We need to save the (usually scratch) address register for the update. // We need to save the (usually scratch) address register for the update.
registersInUse |= (1 << RSCRATCH2); registersInUse[RSCRATCH2] = true;
} }
SafeLoadToReg(gpr.RX(d), opAddress, accessSize, loadOffset, registersInUse, signExtend); SafeLoadToReg(gpr.RX(d), opAddress, accessSize, loadOffset, registersInUse, signExtend);
@ -314,7 +314,7 @@ void Jit64::dcbz(UGeckoInstruction inst)
SwitchToFarCode(); SwitchToFarCode();
SetJumpTarget(slow); SetJumpTarget(slow);
MOV(32, M(&PC), Imm32(jit->js.compilerPC)); MOV(32, M(&PC), Imm32(jit->js.compilerPC));
u32 registersInUse = CallerSavedRegistersInUse(); BitSet32 registersInUse = CallerSavedRegistersInUse();
ABI_PushRegistersAndAdjustStack(registersInUse, 0); ABI_PushRegistersAndAdjustStack(registersInUse, 0);
ABI_CallFunctionR((void *)&Memory::ClearCacheLine, RSCRATCH); ABI_CallFunctionR((void *)&Memory::ClearCacheLine, RSCRATCH);
ABI_PopRegistersAndAdjustStack(registersInUse, 0); ABI_PopRegistersAndAdjustStack(registersInUse, 0);
@ -403,7 +403,7 @@ void Jit64::stX(UGeckoInstruction inst)
// Helps external systems know which instruction triggered the write // Helps external systems know which instruction triggered the write
MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC)); MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC));
u32 registersInUse = CallerSavedRegistersInUse(); BitSet32 registersInUse = CallerSavedRegistersInUse();
ABI_PushRegistersAndAdjustStack(registersInUse, 0); ABI_PushRegistersAndAdjustStack(registersInUse, 0);
switch (accessSize) switch (accessSize)
{ {
@ -555,7 +555,7 @@ void Jit64::lmw(UGeckoInstruction inst)
ADD(32, R(RSCRATCH2), gpr.R(inst.RA)); ADD(32, R(RSCRATCH2), gpr.R(inst.RA));
for (int i = inst.RD; i < 32; i++) for (int i = inst.RD; i < 32; i++)
{ {
SafeLoadToReg(RSCRATCH, R(RSCRATCH2), 32, (i - inst.RD) * 4, CallerSavedRegistersInUse() | (1 << RSCRATCH_EXTRA), false); SafeLoadToReg(RSCRATCH, R(RSCRATCH2), 32, (i - inst.RD) * 4, CallerSavedRegistersInUse() | BitSet32 { RSCRATCH_EXTRA }, false);
gpr.BindToRegister(i, false, true); gpr.BindToRegister(i, false, true);
MOV(32, gpr.R(i), R(RSCRATCH)); MOV(32, gpr.R(i), R(RSCRATCH));
} }

View file

@ -65,9 +65,9 @@ void Jit64::lfXXX(UGeckoInstruction inst)
offset = (s16)inst.SIMM_16; offset = (s16)inst.SIMM_16;
} }
u32 registersInUse = CallerSavedRegistersInUse(); BitSet32 registersInUse = CallerSavedRegistersInUse();
if (update && js.memcheck) if (update && js.memcheck)
registersInUse |= (1 << RSCRATCH2); registersInUse[RSCRATCH2] = true;
SafeLoadToReg(RSCRATCH, addr, single ? 32 : 64, offset, registersInUse, false); SafeLoadToReg(RSCRATCH, addr, single ? 32 : 64, offset, registersInUse, false);
fpr.Lock(d); fpr.Lock(d);
fpr.BindToRegister(d, js.memcheck || !single); fpr.BindToRegister(d, js.memcheck || !single);

View file

@ -26,6 +26,7 @@ The register allocation is linear scan allocation.
#include <algorithm> #include <algorithm>
#include "Common/BitSet.h"
#include "Common/CPUDetect.h" #include "Common/CPUDetect.h"
#include "Common/MathUtil.h" #include "Common/MathUtil.h"
#include "Core/HW/ProcessorInterface.h" #include "Core/HW/ProcessorInterface.h"
@ -60,15 +61,15 @@ struct RegInfo
RegInfo(RegInfo&); // DO NOT IMPLEMENT RegInfo(RegInfo&); // DO NOT IMPLEMENT
}; };
// Builds the set of register slots of R that currently hold a value:
// index i is set when R.regs[i] is non-null, and index 16 + i is set when
// R.fregs[i] is non-null. (Left column: old u32 mask; right column: new
// BitSet32 form with the same indices.)
// NOTE(review): the two index ranges only stay disjoint, and within 32 bits,
// if MAX_NUMBER_OF_REGS <= 16 -- its value is not visible here; confirm.
static u32 regsInUse(RegInfo& R) static BitSet32 regsInUse(RegInfo& R)
{ {
u32 result = 0; BitSet32 result;
for (unsigned i = 0; i < MAX_NUMBER_OF_REGS; i++) for (unsigned i = 0; i < MAX_NUMBER_OF_REGS; i++)
{ {
if (R.regs[i] != nullptr) if (R.regs[i] != nullptr)
result |= (1 << i); result[i] = true;
if (R.fregs[i] != nullptr) if (R.fregs[i] != nullptr)
result |= (1 << (16 + i)); result[16 + i] = true;
} }
return result; return result;
} }

View file

@ -10,14 +10,11 @@
#include "Core/PowerPC/JitCommon/JitBase.h" #include "Core/PowerPC/JitCommon/JitBase.h"
// Register set saved around quantized load/store helpers: every register in
// ABI_ALL_CALLER_SAVED except RSCRATCH, RSCRATCH2, RSCRATCH_EXTRA, XMM0 and
// XMM1. XMM entries are written as the register number plus 16.
#define QUANTIZED_REGS_TO_SAVE \ #define QUANTIZED_REGS_TO_SAVE \
(ABI_ALL_CALLER_SAVED & ~(\ (ABI_ALL_CALLER_SAVED & ~BitSet32 { \
(1 << RSCRATCH) | \ RSCRATCH, RSCRATCH2, RSCRATCH_EXTRA, XMM0+16, XMM1+16 \
(1 << RSCRATCH2) | \ })
(1 << RSCRATCH_EXTRA)| \
(1 << (XMM0+16)) | \
(1 << (XMM1+16))))
// Same set as QUANTIZED_REGS_TO_SAVE, with RSCRATCH2 added back in.
#define QUANTIZED_REGS_TO_SAVE_LOAD (QUANTIZED_REGS_TO_SAVE | (1 << RSCRATCH2)) #define QUANTIZED_REGS_TO_SAVE_LOAD (QUANTIZED_REGS_TO_SAVE | BitSet32 { RSCRATCH2 })
using namespace Gen; using namespace Gen;

View file

@ -72,7 +72,7 @@ bool Jitx86Base::BackPatch(u32 emAddress, SContext* ctx)
return false; return false;
} }
u32 registersInUse = it->second; BitSet32 registersInUse = it->second;
if (!info.isMemoryWrite) if (!info.isMemoryWrite)
{ {
@ -98,14 +98,14 @@ bool Jitx86Base::BackPatch(u32 emAddress, SContext* ctx)
else else
{ {
// TODO: special case FIFO writes. Also, support 32-bit mode. // TODO: special case FIFO writes. Also, support 32-bit mode.
// Look up the PC recorded for this code address in pcAtLoc. The new (right)
// column uses a separate iterator, it2, instead of reusing `it`; presumably
// because `it` now refers to a map whose mapped type is BitSet32 rather than
// u32 -- confirm against the declaration of `it` above this hunk.
it = pcAtLoc.find(codePtr); auto it2 = pcAtLoc.find(codePtr);
if (it == pcAtLoc.end()) if (it2 == pcAtLoc.end())
{ {
PanicAlert("BackPatch: no pc entry for address %p", codePtr); PanicAlert("BackPatch: no pc entry for address %p", codePtr);
// NOTE(review): BackPatch is declared as returning bool and its earlier
// failure path uses `return false`; returning nullptr here relies on a
// nullptr-to-bool conversion. `return false` looks intended -- confirm.
return nullptr; return nullptr;
} }
u32 pc = it->second; u32 pc = it2->second;
u8 *start; u8 *start;
if (info.byteSwap || info.hasImmediate) if (info.byteSwap || info.hasImmediate)

View file

@ -137,7 +137,7 @@ template <typename T>
class MMIOReadCodeGenerator : public MMIO::ReadHandlingMethodVisitor<T> class MMIOReadCodeGenerator : public MMIO::ReadHandlingMethodVisitor<T>
{ {
public: public:
MMIOReadCodeGenerator(Gen::X64CodeBlock* code, u32 registers_in_use, MMIOReadCodeGenerator(Gen::X64CodeBlock* code, BitSet32 registers_in_use,
Gen::X64Reg dst_reg, u32 address, bool sign_extend) Gen::X64Reg dst_reg, u32 address, bool sign_extend)
: m_code(code), m_registers_in_use(registers_in_use), m_dst_reg(dst_reg), : m_code(code), m_registers_in_use(registers_in_use), m_dst_reg(dst_reg),
m_address(address), m_sign_extend(sign_extend) m_address(address), m_sign_extend(sign_extend)
@ -214,14 +214,14 @@ private:
} }
Gen::X64CodeBlock* m_code; Gen::X64CodeBlock* m_code;
u32 m_registers_in_use; BitSet32 m_registers_in_use;
Gen::X64Reg m_dst_reg; Gen::X64Reg m_dst_reg;
u32 m_address; u32 m_address;
bool m_sign_extend; bool m_sign_extend;
}; };
void EmuCodeBlock::MMIOLoadToReg(MMIO::Mapping* mmio, Gen::X64Reg reg_value, void EmuCodeBlock::MMIOLoadToReg(MMIO::Mapping* mmio, Gen::X64Reg reg_value,
u32 registers_in_use, u32 address, BitSet32 registers_in_use, u32 address,
int access_size, bool sign_extend) int access_size, bool sign_extend)
{ {
switch (access_size) switch (access_size)
@ -250,17 +250,17 @@ void EmuCodeBlock::MMIOLoadToReg(MMIO::Mapping* mmio, Gen::X64Reg reg_value,
} }
} }
// Opening part of CheckIfSafeAddress (the rest of the body lies outside this
// hunk). registers_in_use is taken by value, so the additions below only
// affect this call's local copy.
FixupBranch EmuCodeBlock::CheckIfSafeAddress(OpArg reg_value, X64Reg reg_addr, u32 registers_in_use, u32 mem_mask) FixupBranch EmuCodeBlock::CheckIfSafeAddress(OpArg reg_value, X64Reg reg_addr, BitSet32 registers_in_use, u32 mem_mask)
{ {
// Mark the address register, and the value register when reg_value is a
// plain register, as in use before choosing a scratch register.
registers_in_use |= (1 << reg_addr); registers_in_use[reg_addr] = true;
if (reg_value.IsSimpleReg()) if (reg_value.IsSimpleReg())
registers_in_use |= (1 << reg_value.GetSimpleReg()); registers_in_use[reg_value.GetSimpleReg()] = true;
// Get ourselves a free register; try to pick one that doesn't involve pushing, if we can. // Get ourselves a free register; try to pick one that doesn't involve pushing, if we can.
// Preference order: RSCRATCH if unused, else RSCRATCH_EXTRA if unused,
// else fall back to reg_addr itself.
// NOTE(review): the first branch re-assigns the value `scratch` was just
// initialised with, so it has no effect beyond short-circuiting the chain.
X64Reg scratch = RSCRATCH; X64Reg scratch = RSCRATCH;
if (!(registers_in_use & (1 << RSCRATCH))) if (!registers_in_use[RSCRATCH])
scratch = RSCRATCH; scratch = RSCRATCH;
else if (!(registers_in_use & (1 << RSCRATCH_EXTRA))) else if (!registers_in_use[RSCRATCH_EXTRA])
scratch = RSCRATCH_EXTRA; scratch = RSCRATCH_EXTRA;
else else
scratch = reg_addr; scratch = reg_addr;
@ -290,11 +290,11 @@ FixupBranch EmuCodeBlock::CheckIfSafeAddress(OpArg reg_value, X64Reg reg_addr, u
} }
} }
void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, u32 registersInUse, bool signExtend, int flags) void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, BitSet32 registersInUse, bool signExtend, int flags)
{ {
if (!jit->js.memcheck) if (!jit->js.memcheck)
{ {
registersInUse &= ~(1 << reg_value); registersInUse[reg_value] = false;
} }
if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bMMU && if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bMMU &&
SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem && SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem &&
@ -461,7 +461,7 @@ u8 *EmuCodeBlock::UnsafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int acce
return result; return result;
} }
void EmuCodeBlock::SafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int accessSize, s32 offset, u32 registersInUse, int flags) void EmuCodeBlock::SafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int accessSize, s32 offset, BitSet32 registersInUse, int flags)
{ {
// set the correct immediate format // set the correct immediate format
if (reg_value.IsImm()) if (reg_value.IsImm())
@ -566,7 +566,7 @@ void EmuCodeBlock::SafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int acces
} }
// Destroys the same as SafeWrite plus RSCRATCH. TODO: see if we can avoid temporaries here // Destroys the same as SafeWrite plus RSCRATCH. TODO: see if we can avoid temporaries here
void EmuCodeBlock::SafeWriteF32ToReg(X64Reg xmm_value, X64Reg reg_addr, s32 offset, u32 registersInUse, int flags) void EmuCodeBlock::SafeWriteF32ToReg(X64Reg xmm_value, X64Reg reg_addr, s32 offset, BitSet32 registersInUse, int flags)
{ {
// TODO: PSHUFB might be faster if fastmem supported MOVSS. // TODO: PSHUFB might be faster if fastmem supported MOVSS.
MOVD_xmm(R(RSCRATCH), xmm_value); MOVD_xmm(R(RSCRATCH), xmm_value);

View file

@ -6,6 +6,7 @@
#include <unordered_map> #include <unordered_map>
#include "Common/BitSet.h"
#include "Common/CPUDetect.h" #include "Common/CPUDetect.h"
#include "Common/x64Emitter.h" #include "Common/x64Emitter.h"
@ -76,7 +77,7 @@ public:
void LoadAndSwap(int size, Gen::X64Reg dst, const Gen::OpArg& src); void LoadAndSwap(int size, Gen::X64Reg dst, const Gen::OpArg& src);
void SwapAndStore(int size, const Gen::OpArg& dst, Gen::X64Reg src); void SwapAndStore(int size, const Gen::OpArg& dst, Gen::X64Reg src);
Gen::FixupBranch CheckIfSafeAddress(Gen::OpArg reg_value, Gen::X64Reg reg_addr, u32 registers_in_use, u32 mem_mask); Gen::FixupBranch CheckIfSafeAddress(Gen::OpArg reg_value, Gen::X64Reg reg_addr, BitSet32 registers_in_use, u32 mem_mask);
void UnsafeLoadRegToReg(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset = 0, bool signExtend = false); void UnsafeLoadRegToReg(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset = 0, bool signExtend = false);
void UnsafeLoadRegToRegNoSwap(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset, bool signExtend = false); void UnsafeLoadRegToRegNoSwap(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset, bool signExtend = false);
// these return the address of the MOV, for backpatching // these return the address of the MOV, for backpatching
@ -89,7 +90,7 @@ public:
// Generate a load/write from the MMIO handler for a given address. Only // Generate a load/write from the MMIO handler for a given address. Only
// call for known addresses in MMIO range (MMIO::IsMMIOAddress). // call for known addresses in MMIO range (MMIO::IsMMIOAddress).
void MMIOLoadToReg(MMIO::Mapping* mmio, Gen::X64Reg reg_value, u32 registers_in_use, u32 address, int access_size, bool sign_extend); void MMIOLoadToReg(MMIO::Mapping* mmio, Gen::X64Reg reg_value, BitSet32 registers_in_use, u32 address, int access_size, bool sign_extend);
enum SafeLoadStoreFlags enum SafeLoadStoreFlags
{ {
@ -99,12 +100,12 @@ public:
SAFE_LOADSTORE_CLOBBER_RSCRATCH_INSTEAD_OF_ADDR = 8 SAFE_LOADSTORE_CLOBBER_RSCRATCH_INSTEAD_OF_ADDR = 8
}; };
void SafeLoadToReg(Gen::X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, u32 registersInUse, bool signExtend, int flags = 0); void SafeLoadToReg(Gen::X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, BitSet32 registersInUse, bool signExtend, int flags = 0);
// Clobbers RSCRATCH or reg_addr depending on the relevant flag. Preserves // Clobbers RSCRATCH or reg_addr depending on the relevant flag. Preserves
// reg_value if the load fails and js.memcheck is enabled. // reg_value if the load fails and js.memcheck is enabled.
// Works with immediate inputs and simple registers only. // Works with immediate inputs and simple registers only.
void SafeWriteRegToReg(Gen::OpArg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset, u32 registersInUse, int flags = 0); void SafeWriteRegToReg(Gen::OpArg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset, BitSet32 registersInUse, int flags = 0);
// Convenience overload for a value held in a register: wraps reg_value with
// R(...) and forwards every other argument unchanged to the Gen::OpArg
// overload declared above.
void SafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset, u32 registersInUse, int flags = 0) void SafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset, BitSet32 registersInUse, int flags = 0)
{ {
SafeWriteRegToReg(R(reg_value), reg_addr, accessSize, offset, registersInUse, flags); SafeWriteRegToReg(R(reg_value), reg_addr, accessSize, offset, registersInUse, flags);
} }
@ -115,7 +116,7 @@ public:
return swap && !cpu_info.bMOVBE && accessSize > 8; return swap && !cpu_info.bMOVBE && accessSize > 8;
} }
void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, u32 registersInUse, int flags = 0); void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, BitSet32 registersInUse, int flags = 0);
void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false); void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false);
void JitGetAndClearCAOV(bool oe); void JitGetAndClearCAOV(bool oe);
@ -137,6 +138,6 @@ public:
void ConvertDoubleToSingle(Gen::X64Reg dst, Gen::X64Reg src); void ConvertDoubleToSingle(Gen::X64Reg dst, Gen::X64Reg src);
void SetFPRF(Gen::X64Reg xmm); void SetFPRF(Gen::X64Reg xmm);
protected: protected:
std::unordered_map<u8 *, u32> registersInUseAtLoc; std::unordered_map<u8 *, BitSet32> registersInUseAtLoc;
std::unordered_map<u8 *, u32> pcAtLoc; std::unordered_map<u8 *, u32> pcAtLoc;
}; };

View file

@ -36,7 +36,7 @@ void TrampolineCache::Shutdown()
cachedTrampolines.clear(); cachedTrampolines.clear();
} }
const u8* TrampolineCache::GetReadTrampoline(const InstructionInfo &info, u32 registersInUse) const u8* TrampolineCache::GetReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse)
{ {
TrampolineCacheKey key = { registersInUse, 0, info }; TrampolineCacheKey key = { registersInUse, 0, info };
@ -49,7 +49,7 @@ const u8* TrampolineCache::GetReadTrampoline(const InstructionInfo &info, u32 re
return trampoline; return trampoline;
} }
const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, u32 registersInUse) const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse)
{ {
if (GetSpaceLeft() < 1024) if (GetSpaceLeft() < 1024)
PanicAlert("Trampoline cache full"); PanicAlert("Trampoline cache full");
@ -97,7 +97,7 @@ const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, u
return trampoline; return trampoline;
} }
const u8* TrampolineCache::GetWriteTrampoline(const InstructionInfo &info, u32 registersInUse, u32 pc) const u8* TrampolineCache::GetWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u32 pc)
{ {
TrampolineCacheKey key = { registersInUse, pc, info }; TrampolineCacheKey key = { registersInUse, pc, info };
@ -110,7 +110,7 @@ const u8* TrampolineCache::GetWriteTrampoline(const InstructionInfo &info, u32 r
return trampoline; return trampoline;
} }
const u8* TrampolineCache::GenerateWriteTrampoline(const InstructionInfo &info, u32 registersInUse, u32 pc) const u8* TrampolineCache::GenerateWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u32 pc)
{ {
if (GetSpaceLeft() < 1024) if (GetSpaceLeft() < 1024)
PanicAlert("Trampoline cache full"); PanicAlert("Trampoline cache full");
@ -184,7 +184,7 @@ const u8* TrampolineCache::GenerateWriteTrampoline(const InstructionInfo &info,
// Hash for TrampolineCacheKey (only the first few fields are visible in this
// hunk). Each field is hashed with std::hash<int> and XOR-ed into `res`, with
// each later field's hash shifted right by one more bit than the previous.
// In the new (right) column the register set is hashed through its raw
// m_val member rather than the BitSet32 object itself.
size_t TrampolineCacheKeyHasher::operator()(const TrampolineCacheKey& k) const size_t TrampolineCacheKeyHasher::operator()(const TrampolineCacheKey& k) const
{ {
size_t res = std::hash<int>()(k.registersInUse); size_t res = std::hash<int>()(k.registersInUse.m_val);
res ^= std::hash<int>()(k.info.operandSize) >> 1; res ^= std::hash<int>()(k.info.operandSize) >> 1;
res ^= std::hash<int>()(k.info.regOperandReg) >> 2; res ^= std::hash<int>()(k.info.regOperandReg) >> 2;
res ^= std::hash<int>()(k.info.scaledReg) >> 3; res ^= std::hash<int>()(k.info.scaledReg) >> 3;

View file

@ -6,6 +6,7 @@
#include <unordered_map> #include <unordered_map>
#include "Common/BitSet.h"
#include "Common/CommonTypes.h" #include "Common/CommonTypes.h"
#include "Common/x64Analyzer.h" #include "Common/x64Analyzer.h"
#include "Common/x64Emitter.h" #include "Common/x64Emitter.h"
@ -15,7 +16,7 @@ const int BACKPATCH_SIZE = 5;
struct TrampolineCacheKey struct TrampolineCacheKey
{ {
u32 registersInUse; BitSet32 registersInUse;
u32 pc; u32 pc;
InstructionInfo info; InstructionInfo info;
@ -33,13 +34,13 @@ public:
void Init(); void Init();
void Shutdown(); void Shutdown();
const u8* GetReadTrampoline(const InstructionInfo &info, u32 registersInUse); const u8* GetReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse);
const u8* GetWriteTrampoline(const InstructionInfo &info, u32 registersInUse, u32 pc); const u8* GetWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u32 pc);
void ClearCodeSpace(); void ClearCodeSpace();
private: private:
const u8* GenerateReadTrampoline(const InstructionInfo &info, u32 registersInUse); const u8* GenerateReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse);
const u8* GenerateWriteTrampoline(const InstructionInfo &info, u32 registersInUse, u32 pc); const u8* GenerateWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u32 pc);
std::unordered_map<TrampolineCacheKey, const u8*, TrampolineCacheKeyHasher> cachedTrampolines; std::unordered_map<TrampolineCacheKey, const u8*, TrampolineCacheKeyHasher> cachedTrampolines;
}; };

View file

@ -249,21 +249,15 @@ static bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b)
// That is, check that none of b's outputs matches any of a's inputs, // That is, check that none of b's outputs matches any of a's inputs,
// and that none of a's outputs matches any of b's inputs. // and that none of a's outputs matches any of b's inputs.
// The latter does not apply if a is a cmp, of course, but doesn't hurt to check. // The latter does not apply if a is a cmp, of course, but doesn't hurt to check.
for (int j = 0; j < 3; j++)
{
int regInA = a.regsIn[j];
int regInB = b.regsIn[j];
// register collision: b outputs to one of a's inputs // register collision: b outputs to one of a's inputs
if (regInA >= 0 && (b.regsOut[0] == regInA || b.regsOut[1] == regInA)) if (b.regsOut & a.regsIn)
return false; return false;
// register collision: a outputs to one of b's inputs // register collision: a outputs to one of b's inputs
if (regInB >= 0 && (a.regsOut[0] == regInB || a.regsOut[1] == regInB)) if (a.regsOut & b.regsIn)
return false; return false;
// register collision: b outputs to one of a's outputs (overwriting it) // register collision: b outputs to one of a's outputs (overwriting it)
for (int k = 0; k < 2; k++) if (b.regsOut & a.regsOut)
if (b.regsOut[k] >= 0 && (b.regsOut[k] == a.regsOut[0] || b.regsOut[k] == a.regsOut[1]))
return false; return false;
}
return true; return true;
} }
@ -520,42 +514,41 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 467) // mtspr if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 467) // mtspr
code->outputCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER; code->outputCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER;
int numOut = 0; code->regsIn = BitSet32(0);
int numIn = 0; code->regsOut = BitSet32(0);
int numFloatIn = 0;
if (opinfo->flags & FL_OUT_A) if (opinfo->flags & FL_OUT_A)
{ {
code->regsOut[numOut++] = code->inst.RA; code->regsOut[code->inst.RA] = true;
block->m_gpa->SetOutputRegister(code->inst.RA, index); block->m_gpa->SetOutputRegister(code->inst.RA, index);
} }
if (opinfo->flags & FL_OUT_D) if (opinfo->flags & FL_OUT_D)
{ {
code->regsOut[numOut++] = code->inst.RD; code->regsOut[code->inst.RD] = true;
block->m_gpa->SetOutputRegister(code->inst.RD, index); block->m_gpa->SetOutputRegister(code->inst.RD, index);
} }
if (opinfo->flags & FL_OUT_S) if (opinfo->flags & FL_OUT_S)
{ {
code->regsOut[numOut++] = code->inst.RS; code->regsOut[code->inst.RS] = true;
block->m_gpa->SetOutputRegister(code->inst.RS, index); block->m_gpa->SetOutputRegister(code->inst.RS, index);
} }
if ((opinfo->flags & FL_IN_A) || ((opinfo->flags & FL_IN_A0) && code->inst.RA != 0)) if ((opinfo->flags & FL_IN_A) || ((opinfo->flags & FL_IN_A0) && code->inst.RA != 0))
{ {
code->regsIn[numIn++] = code->inst.RA; code->regsIn[code->inst.RA] = true;
block->m_gpa->SetInputRegister(code->inst.RA, index); block->m_gpa->SetInputRegister(code->inst.RA, index);
} }
if (opinfo->flags & FL_IN_B) if (opinfo->flags & FL_IN_B)
{ {
code->regsIn[numIn++] = code->inst.RB; code->regsIn[code->inst.RB] = true;
block->m_gpa->SetInputRegister(code->inst.RB, index); block->m_gpa->SetInputRegister(code->inst.RB, index);
} }
if (opinfo->flags & FL_IN_C) if (opinfo->flags & FL_IN_C)
{ {
code->regsIn[numIn++] = code->inst.RC; code->regsIn[code->inst.RC] = true;
block->m_gpa->SetInputRegister(code->inst.RC, index); block->m_gpa->SetInputRegister(code->inst.RC, index);
} }
if (opinfo->flags & FL_IN_S) if (opinfo->flags & FL_IN_S)
{ {
code->regsIn[numIn++] = code->inst.RS; code->regsIn[code->inst.RS] = true;
block->m_gpa->SetInputRegister(code->inst.RS, index); block->m_gpa->SetInputRegister(code->inst.RS, index);
} }
@ -564,24 +557,17 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
code->fregOut = code->inst.FD; code->fregOut = code->inst.FD;
else if (opinfo->flags & FL_OUT_FLOAT_S) else if (opinfo->flags & FL_OUT_FLOAT_S)
code->fregOut = code->inst.FS; code->fregOut = code->inst.FS;
code->fregsIn = BitSet32(0);
if (opinfo->flags & FL_IN_FLOAT_A) if (opinfo->flags & FL_IN_FLOAT_A)
code->fregsIn[numFloatIn++] = code->inst.FA; code->fregsIn[code->inst.FA] = true;
if (opinfo->flags & FL_IN_FLOAT_B) if (opinfo->flags & FL_IN_FLOAT_B)
code->fregsIn[numFloatIn++] = code->inst.FB; code->fregsIn[code->inst.FB] = true;
if (opinfo->flags & FL_IN_FLOAT_C) if (opinfo->flags & FL_IN_FLOAT_C)
code->fregsIn[numFloatIn++] = code->inst.FC; code->fregsIn[code->inst.FC] = true;
if (opinfo->flags & FL_IN_FLOAT_D) if (opinfo->flags & FL_IN_FLOAT_D)
code->fregsIn[numFloatIn++] = code->inst.FD; code->fregsIn[code->inst.FD] = true;
if (opinfo->flags & FL_IN_FLOAT_S) if (opinfo->flags & FL_IN_FLOAT_S)
code->fregsIn[numFloatIn++] = code->inst.FS; code->fregsIn[code->inst.FS] = true;
// Set remaining register slots as unused (-1)
for (int j = numIn; j < 3; j++)
code->regsIn[j] = -1;
for (int j = numOut; j < 2; j++)
code->regsOut[j] = -1;
for (int j = numFloatIn; j < 4; j++)
code->fregsIn[j] = -1;
switch (opinfo->type) switch (opinfo->type)
{ {
@ -797,7 +783,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
// Scan for flag dependencies; assume the next block (or any branch that can leave the block) // Scan for flag dependencies; assume the next block (or any branch that can leave the block)
// wants flags, to be safe. // wants flags, to be safe.
bool wantsCR0 = true, wantsCR1 = true, wantsFPRF = true, wantsCA = true; bool wantsCR0 = true, wantsCR1 = true, wantsFPRF = true, wantsCA = true;
u32 fprInUse = 0, gprInUse = 0, gprInReg = 0, fprInXmm = 0; BitSet32 fprInUse, gprInUse, gprInReg, fprInXmm;
for (int i = block->m_num_instructions - 1; i >= 0; i--) for (int i = block->m_num_instructions - 1; i >= 0; i--)
{ {
bool opWantsCR0 = code[i].wantsCR0; bool opWantsCR0 = code[i].wantsCR0;
@ -822,30 +808,20 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
code[i].fprInXmm = fprInXmm; code[i].fprInXmm = fprInXmm;
// TODO: if there's no possible endblocks or exceptions in between, tell the regcache // TODO: if there's no possible endblocks or exceptions in between, tell the regcache
// we can throw away a register if it's going to be overwritten later. // we can throw away a register if it's going to be overwritten later.
for (int j = 0; j < 3; j++) gprInUse |= code[i].regsIn;
if (code[i].regsIn[j] >= 0) gprInReg |= code[i].regsIn;
{ fprInUse |= code[i].fregsIn;
gprInUse |= 1 << code[i].regsIn[j];
gprInReg |= 1 << code[i].regsIn[j];
}
for (int j = 0; j < 4; j++)
if (code[i].fregsIn[j] >= 0)
{
fprInUse |= 1 << code[i].fregsIn[j];
if (strncmp(code[i].opinfo->opname, "stfd", 4)) if (strncmp(code[i].opinfo->opname, "stfd", 4))
fprInXmm |= 1 << code[i].fregsIn[j]; fprInXmm |= code[i].fregsIn;
}
// For now, we need to count output registers as "used" though; otherwise the flush // For now, we need to count output registers as "used" though; otherwise the flush
// will result in a redundant store (e.g. store to regcache, then store again to // will result in a redundant store (e.g. store to regcache, then store again to
// the same location later). // the same location later).
for (int j = 0; j < 2; j++) gprInUse |= code[i].regsOut;
if (code[i].regsOut[j] >= 0)
gprInUse |= 1 << code[i].regsOut[j];
if (code[i].fregOut >= 0) if (code[i].fregOut >= 0)
{ {
fprInUse |= 1 << code[i].fregOut; fprInUse[code[i].fregOut] = true;
if (strncmp(code[i].opinfo->opname, "stfd", 4)) if (strncmp(code[i].opinfo->opname, "stfd", 4))
fprInXmm |= 1 << code[i].fregOut; fprInXmm[code[i].fregOut] = true;
} }
} }
return address; return address;

View file

@ -10,6 +10,7 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "Common/BitSet.h"
#include "Common/CommonTypes.h" #include "Common/CommonTypes.h"
#include "Core/PowerPC/PPCTables.h" #include "Core/PowerPC/PPCTables.h"
@ -26,10 +27,10 @@ struct CodeOp //16B
u32 address; u32 address;
u32 branchTo; //if 0, not a branch u32 branchTo; //if 0, not a branch
int branchToIndex; //index of target block int branchToIndex; //index of target block
s8 regsOut[2]; BitSet32 regsOut;
s8 regsIn[3]; BitSet32 regsIn;
BitSet32 fregsIn;
s8 fregOut; s8 fregOut;
s8 fregsIn[4];
bool isBranchTarget; bool isBranchTarget;
bool wantsCR0; bool wantsCR0;
bool wantsCR1; bool wantsCR1;
@ -43,13 +44,13 @@ struct CodeOp //16B
bool canEndBlock; bool canEndBlock;
bool skip; // followed BL-s for example bool skip; // followed BL-s for example
// which registers are still needed after this instruction in this block // which registers are still needed after this instruction in this block
u32 fprInUse; BitSet32 fprInUse;
u32 gprInUse; BitSet32 gprInUse;
// just because a register is in use doesn't mean we actually need or want it in an x86 register. // just because a register is in use doesn't mean we actually need or want it in an x86 register.
u32 gprInReg; BitSet32 gprInReg;
// we do double stores from GPRs, so we don't want to load a PowerPC floating point register into // we do double stores from GPRs, so we don't want to load a PowerPC floating point register into
// an XMM only to move it again to a GPR afterwards. // an XMM only to move it again to a GPR afterwards.
u32 fprInXmm; BitSet32 fprInXmm;
}; };
struct BlockStats struct BlockStats

View file

@ -23,7 +23,7 @@
MOV(64, M(pdt), R(RSCRATCH)); MOV(64, M(pdt), R(RSCRATCH));
#define PROFILER_VPUSH \ #define PROFILER_VPUSH \
u32 registersInUse = CallerSavedRegistersInUse(); \ BitSet32 registersInUse = CallerSavedRegistersInUse(); \
ABI_PushRegistersAndAdjustStack(registersInUse, 0); ABI_PushRegistersAndAdjustStack(registersInUse, 0);
#define PROFILER_VPOP \ #define PROFILER_VPOP \

View file

@ -4,6 +4,7 @@
#pragma once #pragma once
#include "Common/BitSet.h"
#include "Common/CommonTypes.h" #include "Common/CommonTypes.h"
// Vertex array numbers // Vertex array numbers
@ -252,7 +253,7 @@ struct CPState final
VAT vtx_attr[8]; VAT vtx_attr[8];
// Attributes that actually belong to VertexLoaderManager: // Attributes that actually belong to VertexLoaderManager:
int attr_dirty; // bitfield BitSet32 attr_dirty;
VertexLoader* vertex_loaders[8]; VertexLoader* vertex_loaders[8];
}; };

View file

@ -141,7 +141,7 @@ void VertexLoader::CompileVertexTranslator()
m_compiledCode = GetCodePtr(); m_compiledCode = GetCodePtr();
// We only use RAX (caller saved) and RBX (callee saved). // We only use RAX (caller saved) and RBX (callee saved).
ABI_PushRegistersAndAdjustStack(1 << RBX, 8); ABI_PushRegistersAndAdjustStack({RBX}, 8);
// save count // save count
MOV(64, R(RBX), R(ABI_PARAM1)); MOV(64, R(RBX), R(ABI_PARAM1));
@ -402,7 +402,7 @@ void VertexLoader::CompileVertexTranslator()
SUB(64, R(RBX), Imm8(1)); SUB(64, R(RBX), Imm8(1));
J_CC(CC_NZ, loop_start); J_CC(CC_NZ, loop_start);
ABI_PopRegistersAndAdjustStack(1 << RBX, 8); ABI_PopRegistersAndAdjustStack({RBX}, 8);
RET(); RET();
#endif #endif
} }

View file

@ -100,14 +100,14 @@ void AppendListToString(std::string *dest)
// Resets attr_dirty on both g_main_cp_state and g_preprocess_cp_state so that
// the low eight entries are set (old column: 0xff).
// NOTE(review): BitSet32::AllTrue(8) is presumably the BitSet32 equivalent of
// 0xff -- its definition is not visible here; confirm.
void MarkAllDirty() void MarkAllDirty()
{ {
g_main_cp_state.attr_dirty = 0xff; g_main_cp_state.attr_dirty = BitSet32::AllTrue(8);
g_preprocess_cp_state.attr_dirty = 0xff; g_preprocess_cp_state.attr_dirty = BitSet32::AllTrue(8);
} }
static VertexLoader* RefreshLoader(int vtx_attr_group, CPState* state) static VertexLoader* RefreshLoader(int vtx_attr_group, CPState* state)
{ {
VertexLoader* loader; VertexLoader* loader;
if ((state->attr_dirty >> vtx_attr_group) & 1) if (state->attr_dirty[vtx_attr_group])
{ {
VertexLoaderUID uid(state->vtx_desc, state->vtx_attr[vtx_attr_group]); VertexLoaderUID uid(state->vtx_desc, state->vtx_attr[vtx_attr_group]);
std::lock_guard<std::mutex> lk(s_vertex_loader_map_lock); std::lock_guard<std::mutex> lk(s_vertex_loader_map_lock);
@ -123,7 +123,7 @@ static VertexLoader* RefreshLoader(int vtx_attr_group, CPState* state)
INCSTAT(stats.numVertexLoaders); INCSTAT(stats.numVertexLoaders);
} }
state->vertex_loaders[vtx_attr_group] = loader; state->vertex_loaders[vtx_attr_group] = loader;
state->attr_dirty &= ~(1 << vtx_attr_group); state->attr_dirty[vtx_attr_group] = false;
} else { } else {
loader = state->vertex_loaders[vtx_attr_group]; loader = state->vertex_loaders[vtx_attr_group];
} }
@ -200,31 +200,31 @@ void LoadCPReg(u32 sub_cmd, u32 value, bool is_preprocess)
case 0x50: case 0x50:
state->vtx_desc.Hex &= ~0x1FFFF; // keep the Upper bits state->vtx_desc.Hex &= ~0x1FFFF; // keep the Upper bits
state->vtx_desc.Hex |= value; state->vtx_desc.Hex |= value;
state->attr_dirty = 0xFF; state->attr_dirty = BitSet32::AllTrue(8);
break; break;
case 0x60: case 0x60:
state->vtx_desc.Hex &= 0x1FFFF; // keep the lower 17Bits state->vtx_desc.Hex &= 0x1FFFF; // keep the lower 17Bits
state->vtx_desc.Hex |= (u64)value << 17; state->vtx_desc.Hex |= (u64)value << 17;
state->attr_dirty = 0xFF; state->attr_dirty = BitSet32::AllTrue(8);
break; break;
case 0x70: case 0x70:
_assert_((sub_cmd & 0x0F) < 8); _assert_((sub_cmd & 0x0F) < 8);
state->vtx_attr[sub_cmd & 7].g0.Hex = value; state->vtx_attr[sub_cmd & 7].g0.Hex = value;
state->attr_dirty |= 1 << (sub_cmd & 7); state->attr_dirty[sub_cmd & 7] = true;
break; break;
case 0x80: case 0x80:
_assert_((sub_cmd & 0x0F) < 8); _assert_((sub_cmd & 0x0F) < 8);
state->vtx_attr[sub_cmd & 7].g1.Hex = value; state->vtx_attr[sub_cmd & 7].g1.Hex = value;
state->attr_dirty |= 1 << (sub_cmd & 7); state->attr_dirty[sub_cmd & 7] = true;
break; break;
case 0x90: case 0x90:
_assert_((sub_cmd & 0x0F) < 8); _assert_((sub_cmd & 0x0F) < 8);
state->vtx_attr[sub_cmd & 7].g2.Hex = value; state->vtx_attr[sub_cmd & 7].g2.Hex = value;
state->attr_dirty |= 1 << (sub_cmd & 7); state->attr_dirty[sub_cmd & 7] = true;
break; break;
// Pointers to vertex arrays in GC RAM // Pointers to vertex arrays in GC RAM

View file

@ -180,19 +180,17 @@ void VertexManager::Flush()
(int)bpmem.genMode.numtexgens, (u32)bpmem.dstalpha.enable, (bpmem.alpha_test.hex>>16)&0xff); (int)bpmem.genMode.numtexgens, (u32)bpmem.dstalpha.enable, (bpmem.alpha_test.hex>>16)&0xff);
#endif #endif
u32 usedtextures = 0; BitSet32 usedtextures;
for (u32 i = 0; i < bpmem.genMode.numtevstages + 1u; ++i) for (u32 i = 0; i < bpmem.genMode.numtevstages + 1u; ++i)
if (bpmem.tevorders[i / 2].getEnable(i & 1)) if (bpmem.tevorders[i / 2].getEnable(i & 1))
usedtextures |= 1 << bpmem.tevorders[i/2].getTexMap(i & 1); usedtextures[bpmem.tevorders[i/2].getTexMap(i & 1)] = true;
if (bpmem.genMode.numindstages > 0) if (bpmem.genMode.numindstages > 0)
for (unsigned int i = 0; i < bpmem.genMode.numtevstages + 1u; ++i) for (unsigned int i = 0; i < bpmem.genMode.numtevstages + 1u; ++i)
if (bpmem.tevind[i].IsActive() && bpmem.tevind[i].bt < bpmem.genMode.numindstages) if (bpmem.tevind[i].IsActive() && bpmem.tevind[i].bt < bpmem.genMode.numindstages)
usedtextures |= 1 << bpmem.tevindref.getTexMap(bpmem.tevind[i].bt); usedtextures[bpmem.tevindref.getTexMap(bpmem.tevind[i].bt)] = true;
for (unsigned int i = 0; i < 8; i++) for (unsigned int i : usedtextures)
{
if (usedtextures & (1 << i))
{ {
g_renderer->SetSamplerState(i & 3, i >> 2); g_renderer->SetSamplerState(i & 3, i >> 2);
const FourTexUnits &tex = bpmem.tex[i >> 2]; const FourTexUnits &tex = bpmem.tex[i >> 2];
@ -213,7 +211,6 @@ void VertexManager::Flush()
else else
ERROR_LOG(VIDEO, "error loading texture"); ERROR_LOG(VIDEO, "error loading texture");
} }
}
// set global constants // set global constants
VertexShaderManager::SetConstants(); VertexShaderManager::SetConstants();

View file

@ -5,6 +5,7 @@
#include <cmath> #include <cmath>
#include <sstream> #include <sstream>
#include "Common/BitSet.h"
#include "Common/CommonTypes.h" #include "Common/CommonTypes.h"
#include "Common/MathUtil.h" #include "Common/MathUtil.h"
#include "VideoCommon/BPMemory.h" #include "VideoCommon/BPMemory.h"
@ -22,7 +23,7 @@ static float GC_ALIGNED16(g_fProjectionMatrix[16]);
// track changes // track changes
static bool bTexMatricesChanged[2], bPosNormalMatrixChanged, bProjectionChanged, bViewportChanged; static bool bTexMatricesChanged[2], bPosNormalMatrixChanged, bProjectionChanged, bViewportChanged;
static int nMaterialsChanged; static BitSet32 nMaterialsChanged;
static int nTransformMatricesChanged[2]; // min,max static int nTransformMatricesChanged[2]; // min,max
static int nNormalMatricesChanged[2]; // min,max static int nNormalMatricesChanged[2]; // min,max
static int nPostTransformMatricesChanged[2]; // min,max static int nPostTransformMatricesChanged[2]; // min,max
@ -202,7 +203,7 @@ void VertexShaderManager::Dirty()
bProjectionChanged = true; bProjectionChanged = true;
nMaterialsChanged = 15; nMaterialsChanged = BitSet32::AllTrue(4);
dirty = true; dirty = true;
} }
@ -295,35 +296,16 @@ void VertexShaderManager::SetConstants()
nLightsChanged[0] = nLightsChanged[1] = -1; nLightsChanged[0] = nLightsChanged[1] = -1;
} }
if (nMaterialsChanged) for (int i : nMaterialsChanged)
{ {
for (int i = 0; i < 2; ++i) u32 data = i >= 2 ? xfmem.matColor[i - 2] : xfmem.ambColor[i];
{
if (nMaterialsChanged & (1 << i))
{
u32 data = xfmem.ambColor[i];
constants.materials[i][0] = (data >> 24) & 0xFF; constants.materials[i][0] = (data >> 24) & 0xFF;
constants.materials[i][1] = (data >> 16) & 0xFF; constants.materials[i][1] = (data >> 16) & 0xFF;
constants.materials[i][2] = (data >> 8) & 0xFF; constants.materials[i][2] = (data >> 8) & 0xFF;
constants.materials[i][3] = data & 0xFF; constants.materials[i][3] = data & 0xFF;
}
}
for (int i = 0; i < 2; ++i)
{
if (nMaterialsChanged & (1 << (i + 2)))
{
u32 data = xfmem.matColor[i];
constants.materials[i+2][0] = (data >> 24) & 0xFF;
constants.materials[i+2][1] = (data >> 16) & 0xFF;
constants.materials[i+2][2] = (data >> 8) & 0xFF;
constants.materials[i+2][3] = data & 0xFF;
}
}
dirty = true; dirty = true;
nMaterialsChanged = 0;
} }
nMaterialsChanged = BitSet32(0);
if (bPosNormalMatrixChanged) if (bPosNormalMatrixChanged)
{ {
@ -660,7 +642,7 @@ void VertexShaderManager::SetProjectionChanged()
void VertexShaderManager::SetMaterialColorChanged(int index, u32 color) void VertexShaderManager::SetMaterialColorChanged(int index, u32 color)
{ {
nMaterialsChanged |= (1 << index); nMaterialsChanged[index] = true;
} }
void VertexShaderManager::TranslateView(float x, float y, float z) void VertexShaderManager::TranslateView(float x, float y, float z)

View file

@ -0,0 +1,84 @@
// Copyright 2014 Dolphin Emulator Project
// Licensed under GPLv2
// Refer to the license.txt file included.
#include <vector>
#include <gtest/gtest.h>
#include "Common/BitSet.h"
// Checks truthiness, equality/inequality and the AllTrue() factory for both
// the 32-bit and the 64-bit bitset types.
TEST(BitSet, Basics)
{
	BitSet32 empty_set;
	BitSet64 only_bit0(1);
	BitSet64 only_bit1(2);

	// Truthiness: a set built from a non-zero mask is truthy, a
	// default-constructed one is expected to be falsy.
	EXPECT_TRUE(!!only_bit0);
	EXPECT_FALSE(!!empty_set);

	// Comparison operators.
	EXPECT_EQ(only_bit0, only_bit0);
	EXPECT_NE(only_bit0, only_bit1);

	// AllTrue(n) is expected to equal a mask with the n lowest bits set,
	// including the full-width case.
	EXPECT_EQ(BitSet32(0xfff), BitSet32::AllTrue(12));
	EXPECT_EQ(BitSet64(0xffffffffffffffffULL), BitSet64::AllTrue(64));
}
// Sets individual bits through operator[] and verifies both the per-bit reads
// and the resulting raw value.
TEST(BitSet, BitGetSet)
{
	BitSet32 bits;

	// The chained form is intentional: it also exercises the value returned by
	// assigning through the operator[] proxy.
	bits[3] = bits[8] = bits[11] = true;

	EXPECT_EQ(true, bits[3]);
	EXPECT_EQ(false, bits[4]);

	const u32 expected_raw = (u32)((1 << 3) | (1 << 8) | (1 << 11));
	EXPECT_EQ(expected_raw, bits.m_val);
}
// Verifies Count() against precomputed population counts for a table of
// 32-bit and 64-bit sample values.
TEST(BitSet, Count)
{
	const u32 random_numbers[] = {
		0x2cb0b5f3, 0x81ab32a6, 0xd9030dc5, 0x325ffe26, 0xb2fcaee3,
		0x4ccf188a, 0xf8be36dc, 0xb2fcecd5, 0xb750c2e5, 0x31d19074,
		0xf267644a, 0xac00a719, 0x6d45f19b, 0xf7e91c5b, 0xf687e694,
		0x9057c24e, 0x5eb65c39, 0x85d3038b, 0x101f4e66, 0xc202d136
	};
	const u32 counts[] = {
		17, 14, 14, 19, 20, 14, 20, 20, 16, 13, 16, 12, 18, 20, 18, 14, 18, 14, 14, 12
	};
	// Derive the loop bound from the tables instead of hard-coding it, and make
	// sure the input and expectation tables cannot drift apart.
	static_assert(sizeof(random_numbers) / sizeof(random_numbers[0]) ==
	              sizeof(counts) / sizeof(counts[0]),
	              "32-bit inputs and expected counts must have the same length");
	const size_t num_samples_32 = sizeof(random_numbers) / sizeof(random_numbers[0]);
	for (size_t i = 0; i < num_samples_32; i++)
	{
		EXPECT_EQ(counts[i], BitSet32(random_numbers[i]).Count()) << "32-bit sample index " << i;
	}

	const u64 random_numbers_64[] = {
		0xf86cd6f6ef09d7d4ULL, 0x6f2d8533255ead3cULL, 0x9da7941e0e52b345ULL,
		0x06e4189be67d2b17ULL, 0x3eb0681f65cb6d25ULL, 0xccab8a7c74a51203ULL,
		0x09d470516694c64bULL, 0x38cd077e075c778fULL, 0xd69ebfa6355ebfdeULL
	};
	const u32 counts_64[] = {
		39, 34, 31, 32, 33, 29, 27, 35, 43
	};
	static_assert(sizeof(random_numbers_64) / sizeof(random_numbers_64[0]) ==
	              sizeof(counts_64) / sizeof(counts_64[0]),
	              "64-bit inputs and expected counts must have the same length");
	const size_t num_samples_64 = sizeof(random_numbers_64) / sizeof(random_numbers_64[0]);
	for (size_t i = 0; i < num_samples_64; i++)
	{
		EXPECT_EQ(counts_64[i], BitSet64(random_numbers_64[i]).Count()) << "64-bit sample index " << i;
	}
}
// Exercises the binary bitwise operators, bitwise NOT, and the compound
// assignment forms, using the masks 0b011 and 0b101 as operands.
TEST(BitSet, BitOps)
{
	BitSet32 lhs(3);
	BitSet32 rhs(5);

	// Non-mutating operators.
	EXPECT_EQ(BitSet32(7), lhs | rhs);
	EXPECT_EQ(BitSet32(6), lhs ^ rhs);
	EXPECT_EQ(BitSet32(1), lhs & rhs);
	EXPECT_EQ(BitSet32(0xfffffffc), ~lhs);

	// Compound assignment; the accumulator is reset from lhs before each operator.
	BitSet32 acc;

	acc = lhs;
	acc |= rhs;
	EXPECT_EQ(BitSet32(7), acc);

	acc = lhs;
	acc ^= rhs;
	EXPECT_EQ(BitSet32(6), acc);

	acc = lhs;
	acc &= rhs;
	EXPECT_EQ(BitSet32(1), acc);
}
// Builds a bitset from an initializer list of bit indices and checks that
// range-based iteration yields exactly those indices, in the listed
// (ascending) order.
TEST(BitSet, InitializerListsAndIteration)
{
	const std::vector<int> expected_bits { 1, 10, 15, 17, 20, 30 };
	BitSet32 bs { 1, 10, 15, 17, 20, 30 };

	auto vit = expected_bits.begin();
	for (auto i : bs)
	{
		// Must be fatal: if the bitset yields more elements than expected,
		// continuing would dereference end() below.
		ASSERT_NE(vit, expected_bits.end());
		EXPECT_EQ(*vit, i);
		++vit;
	}

	// Every expected index must have been consumed by the iteration.
	EXPECT_EQ(vit, expected_bits.end());
}

View file

@ -1,4 +1,5 @@
add_dolphin_test(BitFieldTest BitFieldTest.cpp) add_dolphin_test(BitFieldTest BitFieldTest.cpp)
add_dolphin_test(BitSetTest BitSetTest.cpp)
add_dolphin_test(CommonFuncsTest CommonFuncsTest.cpp) add_dolphin_test(CommonFuncsTest CommonFuncsTest.cpp)
add_dolphin_test(EventTest EventTest.cpp) add_dolphin_test(EventTest EventTest.cpp)
add_dolphin_test(FifoQueueTest FifoQueueTest.cpp) add_dolphin_test(FifoQueueTest FifoQueueTest.cpp)