SPU/PPU atomics performance and LR event fixes (#5435)

* Fix SPU LR event setting in atomic commands according to hw test
* MFC: increment timestamp for PUT cmd in non-TSX path
* MFC: fix reservation-lost test on non-TSX path with regard to the lock bit
* Reservation notification moved out of writer_lock scope to reduce the lock's lifetime
* Use passive_lock/unlock in PPU atomic instructions to reduce redundancy
* Lock only once for DMA transfers (non-TSX)
* Don't use RDTSC in reservation update logic
* Remove MFC cmd args passing to process_mfc_cmd
* Reorder check_state cpu_flag::memory check for faster unlocking
* Specialization for 128-byte data copy in SPU DMA transfers
* Implement memory range locks and isolate PPU and SPU passive lock logic
elad 2019-01-15 17:31:21 +02:00 committed by Ivan
parent f19fd23227
commit fc92ae4085
9 changed files with 344 additions and 235 deletions
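
Most of the hunks below share one convention for the 64-bit reservation value: bit 0 is a lock flag set by vm::reservation_lock, and the remaining bits hold the timestamp, which now advances by 2 per successful store instead of being reloaded from RDTSC. Readers therefore mask the low bit before comparing (the new "& ~1ull" checks), and writers either do "res++" from the locked value or add 2 inside the TSX transaction. A minimal sketch of that convention, with illustrative bodies only (the real helpers live in vm.h/vm.cpp):

// --- illustrative sketch, not part of the diff ---
#include <atomic>
#include <cstdint>

// Model of the reservation word as used throughout this commit:
// bit 0 = lock flag, bits 1.. = timestamp (advances by 2 per store).
struct reservation_model
{
	std::atomic<std::uint64_t> word{0};

	// What callers of vm::reservation_acquire now compare against rtime.
	std::uint64_t time() const { return word.load() & ~1ull; }

	// Non-TSX writer: set the lock bit if the timestamp still matches.
	bool try_lock(std::uint64_t expected) { return word.compare_exchange_strong(expected, expected | 1); }

	// Successful store: from the locked (odd) value, +1 lands on the next
	// even timestamp - a net +2, matching "res++" and the "add ..., 2" stubs.
	void commit() { word.fetch_add(1); }

	// Failed store: drop the lock bit, timestamp unchanged ("res &= ~1ull").
	void release() { word.fetch_and(~1ull); }
};
// --- end sketch ---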

View file

@ -119,15 +119,16 @@ bool cpu_thread::check_state()
while (true)
{
if (state & cpu_flag::memory && state.test_and_reset(cpu_flag::memory))
if (state & cpu_flag::memory)
{
cpu_flag_memory = true;
if (auto& ptr = vm::g_tls_locked)
{
ptr->compare_and_swap(this, nullptr);
ptr = nullptr;
}
cpu_flag_memory = true;
state -= cpu_flag::memory;
}
if (state & cpu_flag::exit + cpu_flag::dbg_global_stop)

View file

@ -977,7 +977,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
}
}
vm::temporary_unlock(ppu);
vm::passive_unlock(ppu);
for (u64 i = 0;; i++)
{
@ -1003,8 +1003,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
}
}
ppu.cpu_mem();
vm::passive_lock(ppu);
return static_cast<T>(ppu.rdata << data_off >> size_off);
}
@ -1044,7 +1043,7 @@ const auto ppu_stwcx_tx = build_function_asm<bool(*)(u32 raddr, u64 rtime, u64 r
c.cmp(x86::dword_ptr(x86::r11), args[2].r32());
c.jne(fail);
c.mov(x86::dword_ptr(x86::r11), args[3].r32());
c.add(x86::qword_ptr(x86::r10), 1);
c.add(x86::qword_ptr(x86::r10), 2);
c.xend();
c.mov(x86::eax, 1);
c.ret();
@ -1070,7 +1069,7 @@ extern bool ppu_stwcx(ppu_thread& ppu, u32 addr, u32 reg_value)
auto& data = vm::_ref<atomic_be_t<u32>>(addr & -4);
const u32 old_data = static_cast<u32>(ppu.rdata << ((addr & 7) * 8) >> 32);
if (ppu.raddr != addr || addr & 3 || old_data != data.load() || ppu.rtime != vm::reservation_acquire(addr, sizeof(u32)))
if (ppu.raddr != addr || addr & 3 || old_data != data.load() || ppu.rtime != (vm::reservation_acquire(addr, sizeof(u32)) & ~1ull))
{
ppu.raddr = 0;
return false;
@ -1090,7 +1089,7 @@ extern bool ppu_stwcx(ppu_thread& ppu, u32 addr, u32 reg_value)
return false;
}
vm::temporary_unlock(ppu);
vm::passive_unlock(ppu);
auto& res = vm::reservation_lock(addr, sizeof(u32));
@ -1098,7 +1097,7 @@ extern bool ppu_stwcx(ppu_thread& ppu, u32 addr, u32 reg_value)
if (result)
{
vm::reservation_update(addr, sizeof(u32));
res++;
vm::reservation_notifier(addr, sizeof(u32)).notify_all();
}
else
@ -1106,7 +1105,7 @@ extern bool ppu_stwcx(ppu_thread& ppu, u32 addr, u32 reg_value)
res &= ~1ull;
}
ppu.cpu_mem();
vm::passive_lock(ppu);
ppu.raddr = 0;
return result;
}
@ -1137,7 +1136,7 @@ const auto ppu_stdcx_tx = build_function_asm<bool(*)(u32 raddr, u64 rtime, u64 r
c.cmp(x86::qword_ptr(x86::r11), args[2]);
c.jne(fail);
c.mov(x86::qword_ptr(x86::r11), args[3]);
c.add(x86::qword_ptr(x86::r10), 1);
c.add(x86::qword_ptr(x86::r10), 2);
c.xend();
c.mov(x86::eax, 1);
c.ret();
@ -1163,7 +1162,7 @@ extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value)
auto& data = vm::_ref<atomic_be_t<u64>>(addr & -8);
const u64 old_data = ppu.rdata << ((addr & 7) * 8);
if (ppu.raddr != addr || addr & 7 || old_data != data.load() || ppu.rtime != vm::reservation_acquire(addr, sizeof(u64)))
if (ppu.raddr != addr || addr & 7 || old_data != data.load() || ppu.rtime != (vm::reservation_acquire(addr, sizeof(u64)) & ~1ull))
{
ppu.raddr = 0;
return false;
@ -1183,7 +1182,7 @@ extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value)
return false;
}
vm::temporary_unlock(ppu);
vm::passive_unlock(ppu);
auto& res = vm::reservation_lock(addr, sizeof(u64));
@ -1191,7 +1190,7 @@ extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value)
if (result)
{
vm::reservation_update(addr, sizeof(u64));
res++;
vm::reservation_notifier(addr, sizeof(u64)).notify_all();
}
else
@ -1199,7 +1198,7 @@ extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value)
res &= ~1ull;
}
ppu.cpu_mem();
vm::passive_lock(ppu);
ppu.raddr = 0;
return result;
}
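
Taken together, the PPUThread.cpp hunks give the non-TSX store-conditional path the shape sketched below. The compare-and-swap of the data word itself sits outside the shown context, so it is reconstructed here as an assumption; the rest mirrors the lines above.

// --- illustrative sketch, not part of the diff ---
// Reworked non-TSX ppu_stwcx flow (ppu_stdcx is analogous with u64).
bool stwcx_fallback_sketch(ppu_thread& ppu, u32 addr, u32 old_data, u32 reg_value)
{
	vm::passive_unlock(ppu);                              // leave the passive reader set

	auto& res = vm::reservation_lock(addr, sizeof(u32));  // sets the lock bit (odd timestamp)
	auto& data = vm::_ref<atomic_be_t<u32>>(addr & -4);

	// Assumed step: conditional store of the new value (elided from the hunk).
	const bool result = data.compare_and_swap_test(old_data, reg_value);

	if (result)
	{
		res++;                                            // net +2: unlock and advance timestamp
		vm::reservation_notifier(addr, sizeof(u32)).notify_all();
	}
	else
	{
		res &= ~1ull;                                     // just drop the lock bit
	}

	vm::passive_lock(ppu);                                // rejoin the passive reader set
	ppu.raddr = 0;
	return result;
}
// --- end sketch ---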

View file

@ -1436,6 +1436,7 @@ void spu_recompiler::get_events()
c->mov(*qw0, imm_ptr(vm::g_reservations));
c->shr(qw1->r32(), 4);
c->mov(*qw0, x86::qword_ptr(*qw0, *qw1));
c->and_(qw0->r64(), (u64)(~1ull));
c->cmp(*qw0, SPU_OFF_64(rtime));
c->jne(fail);
c->mov(*qw0, imm_ptr(vm::g_base_addr));
@ -2596,7 +2597,7 @@ static void spu_wrch(spu_thread* _spu, u32 ch, u32 value, spu_function_t _ret)
static void spu_wrch_mfc(spu_thread* _spu, spu_function_t _ret)
{
if (!_spu->process_mfc_cmd(_spu->ch_mfc_cmd))
if (!_spu->process_mfc_cmd())
{
_ret = &spu_wrch_ret;
}

View file

@ -3362,7 +3362,7 @@ public:
static bool exec_mfc_cmd(spu_thread* _spu)
{
return _spu->process_mfc_cmd(_spu->ch_mfc_cmd);
return _spu->process_mfc_cmd();
}
void WRCH(spu_opcode_t op) //
@ -3541,9 +3541,9 @@ public:
csize = ci->getZExtValue();
}
if (cmd >= MFC_SNDSIG_CMD)
if (cmd >= MFC_SNDSIG_CMD && csize != 4)
{
csize = 4;
csize = -1;
}
llvm::Value* src = m_ir->CreateGEP(m_lsptr, zext<u64>(lsa).value);

View file

@ -40,6 +40,34 @@ bool operator ==(const u128& lhs, const u128& rhs)
}
#endif
static FORCE_INLINE void mov_rdata(u128* const dst, const u128* const src)
{
{
const u128 data0 = src[0];
const u128 data1 = src[1];
const u128 data2 = src[2];
dst[0] = data0;
dst[1] = data1;
dst[2] = data2;
}
{
const u128 data0 = src[3];
const u128 data1 = src[4];
const u128 data2 = src[5];
dst[3] = data0;
dst[4] = data1;
dst[5] = data2;
}
{
const u128 data0 = src[6];
const u128 data1 = src[7];
dst[6] = data0;
dst[7] = data1;
}
};
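
mov_rdata copies one 128-byte reservation line in three load/store groups (3+3+2 u128 values), so each group's loads are issued before its stores. The DMA paths below use it for whole lines and fall back to plain u128 moves for any 16-byte tail, roughly as in this sketch:

// --- illustrative sketch, not part of the diff ---
// Copy pattern built on mov_rdata: whole 128-byte lines through mov_rdata,
// 16-byte-aligned remainder as plain u128 moves (size assumed multiple of 16).
static void dma_copy_sketch(u8* dst, const u8* src, u32 size)
{
	while (size >= 128)
	{
		mov_rdata(reinterpret_cast<u128*>(dst), reinterpret_cast<const u128*>(src));
		dst += 128; src += 128; size -= 128;
	}
	while (size)
	{
		*reinterpret_cast<u128*>(dst) = *reinterpret_cast<const u128*>(src);
		dst += 16; src += 16; size -= 16;
	}
}
// --- end sketch ---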
extern u64 get_timebased_time();
extern u64 get_system_time();
@ -158,12 +186,13 @@ namespace spu
}
}
const auto spu_putllc_tx = build_function_asm<bool(*)(u32 raddr, u64 rtime, const void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
Label fall = c.newLabel();
Label fail = c.newLabel();
Label retry = c.newLabel();
// Prepare registers
c.mov(x86::rax, imm_ptr(&vm::g_reservations));
@ -216,7 +245,7 @@ const auto spu_putllc_tx = build_function_asm<bool(*)(u32 raddr, u64 rtime, cons
c.vmovaps(x86::yword_ptr(x86::r11, 64), x86::ymm8);
c.vmovaps(x86::yword_ptr(x86::r11, 96), x86::ymm9);
#endif
c.add(x86::qword_ptr(x86::r10), 1);
c.add(x86::qword_ptr(x86::r10), 2);
c.xend();
c.vzeroupper();
c.mov(x86::eax, 1);
@ -224,10 +253,10 @@ const auto spu_putllc_tx = build_function_asm<bool(*)(u32 raddr, u64 rtime, cons
// Touch memory after transaction failure
c.bind(fall);
c.sub(args[0].r32(), 1);
c.jz(fail);
c.sar(x86::eax, 24);
c.js(fail);
c.sub(args[0].r32(), 1);
c.jz(retry);
c.lock().add(x86::qword_ptr(x86::r11), 0);
c.lock().add(x86::qword_ptr(x86::r10), 0);
#ifdef _WIN32
@ -240,9 +269,12 @@ const auto spu_putllc_tx = build_function_asm<bool(*)(u32 raddr, u64 rtime, cons
build_transaction_abort(c, 0xff);
c.xor_(x86::eax, x86::eax);
c.ret();
c.bind(retry);
c.mov(x86::eax, 2);
c.ret();
});
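
With the return type widened to u32, the stub above distinguishes three outcomes, which the PUTLLC handler later in this diff interprets as follows; a caller-side sketch (values taken from the assembly above, helper name hypothetical):

// --- illustrative sketch, not part of the diff ---
// spu_putllc_tx outcomes:
//   0 - reservation mismatch, the conditional store failed
//   1 - the 128-byte line was stored and the timestamp advanced
//   2 - the transaction kept aborting without a verdict: back off and retry
u32 putllc_rtm_sketch(u32 addr, u64 rtime, const void* old_data, const void* new_data)
{
	u32 result;
	while ((result = spu_putllc_tx(addr, rtime, old_data, new_data)) == 2)
	{
		std::this_thread::yield(); // no fallback to the heavy lock, just retry
	}
	return result; // 0 -> MFC_PUTLLC_FAILURE, 1 -> MFC_PUTLLC_SUCCESS
}
// --- end sketch ---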
const auto spu_getll_tx = build_function_asm<bool(*)(u32 raddr, void* rdata, u64* out_rtime)>([](asmjit::X86Assembler& c, auto& args)
const auto spu_getll_tx = build_function_asm<u64(*)(u32 raddr, void* rdata)>([](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
@ -271,8 +303,6 @@ const auto spu_getll_tx = build_function_asm<bool(*)(u32 raddr, void* rdata, u64
c.vmovups(x86::yword_ptr(args[1], 64), x86::ymm2);
c.vmovups(x86::yword_ptr(args[1], 96), x86::ymm3);
c.vzeroupper();
c.mov(x86::qword_ptr(args[2]), x86::rax);
c.mov(x86::eax, 1);
c.ret();
// Touch memory after transaction failure
@ -282,7 +312,7 @@ const auto spu_getll_tx = build_function_asm<bool(*)(u32 raddr, void* rdata, u64
c.mov(x86::rax, x86::qword_ptr(x86::r10));
c.sub(args[0], 1);
c.jnz(begin);
c.xor_(x86::eax, x86::eax);
c.mov(x86::eax, 1);
c.ret();
});
@ -314,7 +344,7 @@ const auto spu_putlluc_tx = build_function_asm<bool(*)(u32 raddr, const void* rd
c.vmovaps(x86::yword_ptr(x86::r11, 32), x86::ymm1);
c.vmovaps(x86::yword_ptr(x86::r11, 64), x86::ymm2);
c.vmovaps(x86::yword_ptr(x86::r11, 96), x86::ymm3);
c.add(x86::qword_ptr(x86::r10), 1);
c.add(x86::qword_ptr(x86::r10), 2);
c.xend();
c.vzeroupper();
c.mov(x86::eax, 1);
@ -767,8 +797,8 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
}
}
void* dst = vm::base(eal);
void* src = vm::base(offset + lsa);
u8* dst = (u8*)vm::base(eal);
u8* src = (u8*)vm::base(offset + lsa);
if (UNLIKELY(!is_get && !g_use_rtm))
{
@ -777,65 +807,72 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
case 1:
{
auto& res = vm::reservation_lock(eal, 1);
*static_cast<u8*>(dst) = *static_cast<const u8*>(src);
res &= ~1ull;
*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
res++;
break;
}
case 2:
{
auto& res = vm::reservation_lock(eal, 2);
*static_cast<u16*>(dst) = *static_cast<const u16*>(src);
res &= ~1ull;
*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
res++;
break;
}
case 4:
{
auto& res = vm::reservation_lock(eal, 4);
*static_cast<u32*>(dst) = *static_cast<const u32*>(src);
res &= ~1ull;
*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
res++;
break;
}
case 8:
{
auto& res = vm::reservation_lock(eal, 8);
*static_cast<u64*>(dst) = *static_cast<const u64*>(src);
res &= ~1ull;
break;
}
case 16:
{
auto& res = vm::reservation_lock(eal, 16);
_mm_store_si128(static_cast<__m128i*>(dst), _mm_load_si128(static_cast<const __m128i*>(src)));
res &= ~1ull;
*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
res++;
break;
}
default:
{
auto* res = &vm::reservation_lock(eal, 16);
auto vdst = static_cast<__m128i*>(dst);
auto vsrc = static_cast<const __m128i*>(src);
for (u32 addr = eal, end = eal + size;; vdst++, vsrc++)
if (((eal & 127) + size) <= 128)
{
_mm_store_si128(vdst, _mm_load_si128(vsrc));
// Lock one cache line
auto& res = vm::reservation_lock(eal, 128);
addr += 16;
if (addr == end)
while (size)
{
break;
*reinterpret_cast<u128*>(dst) = *reinterpret_cast<const u128*>(src);
dst += 16;
src += 16;
size -= 16;
}
if (addr % 128)
{
continue;
}
res->fetch_and(~1ull);
res = &vm::reservation_lock(addr, 16);
res++;
break;
}
res->fetch_and(~1ull);
auto lock = vm::passive_lock(eal & -128u, ::align(eal + size, 128));
while (size >= 128)
{
mov_rdata(reinterpret_cast<u128*>(dst), reinterpret_cast<const u128*>(src));
dst += 128;
src += 128;
size -= 128;
}
while (size)
{
*reinterpret_cast<u128*>(dst) = *reinterpret_cast<const u128*>(src);
dst += 16;
src += 16;
size -= 16;
}
*lock = 0;
break;
}
}
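
The new default case picks between two locking strategies: if ((eal & 127) + size) <= 128 the whole transfer sits inside one reservation line and a single vm::reservation_lock suffices; otherwise it registers a [begin, end) range via the new vm::passive_lock and releases it by zeroing the returned slot. A quick check of the single-line predicate (addresses chosen for illustration):

// --- illustrative sketch, not part of the diff ---
// A transfer stays inside one 128-byte reservation line iff its offset
// within the line plus its size does not exceed 128.
static_assert(((0x10000 & 127) + 128) <= 128, "a full aligned line is a single-line put");
static_assert(!(((0x10070 & 127) + 0x20) <= 128), "0x20 bytes at offset 0x70 spill into the next line");
// --- end sketch ---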
@ -852,67 +889,44 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
{
case 1:
{
*static_cast<u8*>(dst) = *static_cast<const u8*>(src);
*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
break;
}
case 2:
{
*static_cast<u16*>(dst) = *static_cast<const u16*>(src);
*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
break;
}
case 4:
{
*static_cast<u32*>(dst) = *static_cast<const u32*>(src);
*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
break;
}
case 8:
{
*static_cast<u64*>(dst) = *static_cast<const u64*>(src);
break;
}
case 16:
{
_mm_store_si128(static_cast<__m128i*>(dst), _mm_load_si128(static_cast<const __m128i*>(src)));
*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
break;
}
default:
{
auto vdst = static_cast<__m128i*>(dst);
auto vsrc = static_cast<const __m128i*>(src);
auto vcnt = size / sizeof(__m128i);
while (vcnt >= 8)
while (size >= 128)
{
const __m128i data[]
{
_mm_load_si128(vsrc + 0),
_mm_load_si128(vsrc + 1),
_mm_load_si128(vsrc + 2),
_mm_load_si128(vsrc + 3),
_mm_load_si128(vsrc + 4),
_mm_load_si128(vsrc + 5),
_mm_load_si128(vsrc + 6),
_mm_load_si128(vsrc + 7),
};
mov_rdata(reinterpret_cast<u128*>(dst), reinterpret_cast<const u128*>(src));
_mm_store_si128(vdst + 0, data[0]);
_mm_store_si128(vdst + 1, data[1]);
_mm_store_si128(vdst + 2, data[2]);
_mm_store_si128(vdst + 3, data[3]);
_mm_store_si128(vdst + 4, data[4]);
_mm_store_si128(vdst + 5, data[5]);
_mm_store_si128(vdst + 6, data[6]);
_mm_store_si128(vdst + 7, data[7]);
vcnt -= 8;
vsrc += 8;
vdst += 8;
dst += 128;
src += 128;
size -= 128;
}
while (vcnt--)
while (size)
{
_mm_store_si128(vdst++, _mm_load_si128(vsrc++));
*reinterpret_cast<u128*>(dst) = *reinterpret_cast<const u128*>(src);
dst += 16;
src += 16;
size -= 16;
}
break;
}
}
@ -1030,7 +1044,12 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args)
if (raddr && addr == raddr)
{
ch_event_stat |= SPU_EVENT_LR;
// Last check for event before we clear the reservation
if ((vm::reservation_acquire(addr, 128) & ~1ull) != rtime || rdata != vm::_ref<decltype(rdata)>(addr))
{
ch_event_stat |= SPU_EVENT_LR;
}
raddr = 0;
}
@ -1057,20 +1076,20 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args)
auto& data = vm::_ref<decltype(rdata)>(addr);
auto& res = vm::reservation_lock(addr, 128);
vm::_ref<atomic_t<u32>>(addr) += 0;
*reinterpret_cast<atomic_t<u32>*>(&data) += 0;
if (g_cfg.core.spu_accurate_putlluc)
{
// Full lock (heavyweight)
// TODO: vm::check_addr
vm::writer_lock lock(1);
data = to_write;
vm::reservation_update(addr, 128);
vm::writer_lock lock(addr);
mov_rdata(data.data(), to_write.data());
res++;
}
else
{
data = to_write;
vm::reservation_update(addr, 128);
mov_rdata(data.data(), to_write.data());
res++;
}
}
@ -1140,11 +1159,7 @@ void spu_thread::do_mfc(bool wait)
return false;
}
if (args.size)
{
do_dma_transfer(args);
}
else if (args.cmd == MFC_PUTQLLUC_CMD)
if (args.cmd == MFC_PUTQLLUC_CMD)
{
if (fence & mask)
{
@ -1153,6 +1168,10 @@ void spu_thread::do_mfc(bool wait)
do_putlluc(args);
}
else if (args.size)
{
do_dma_transfer(args);
}
removed++;
return true;
@ -1184,7 +1203,7 @@ u32 spu_thread::get_mfc_completed()
return ch_tag_mask & ~mfc_fence;
}
bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
bool spu_thread::process_mfc_cmd()
{
// Stall infinitely if MFC queue is full
while (UNLIKELY(mfc_size >= 16))
@ -1198,29 +1217,24 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
}
spu::scheduler::concurrent_execution_watchdog watchdog(*this);
LOG_TRACE(SPU, "DMAC: cmd=%s, lsa=0x%x, ea=0x%llx, tag=0x%x, size=0x%x", args.cmd, args.lsa, args.eal, args.tag, args.size);
LOG_TRACE(SPU, "DMAC: cmd=%s, lsa=0x%x, ea=0x%llx, tag=0x%x, size=0x%x", ch_mfc_cmd.cmd, ch_mfc_cmd.lsa, ch_mfc_cmd.eal, ch_mfc_cmd.tag, ch_mfc_cmd.size);
switch (args.cmd)
switch (ch_mfc_cmd.cmd)
{
case MFC_GETLLAR_CMD:
{
const u32 addr = args.eal & -128u;
const u32 addr = ch_mfc_cmd.eal & -128u;
auto& data = vm::_ref<decltype(rdata)>(addr);
if (raddr && raddr != addr)
{
ch_event_stat |= SPU_EVENT_LR;
}
raddr = addr;
auto& dst = _ref<decltype(rdata)>(ch_mfc_cmd.lsa & 0x3ff80);
u64 ntime;
const bool is_polling = false; // TODO
if (is_polling)
{
rtime = vm::reservation_acquire(raddr, 128);
rtime = vm::reservation_acquire(addr, 128);
while (rdata == data && vm::reservation_acquire(raddr, 128) == rtime)
while (rdata == data && vm::reservation_acquire(addr, 128) == rtime)
{
if (is_stopped())
{
@ -1235,57 +1249,78 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
{
u64 count = 1;
while (g_cfg.core.spu_accurate_getllar && !spu_getll_tx(raddr, rdata.data(), &rtime))
if (g_cfg.core.spu_accurate_getllar)
{
std::this_thread::yield();
count += 2;
while ((ntime = spu_getll_tx(addr, dst.data())) & 1)
{
std::this_thread::yield();
count += 2;
}
}
if (!g_cfg.core.spu_accurate_getllar)
else
{
for (;; count++, busy_wait(300))
{
rtime = vm::reservation_acquire(raddr, 128);
rdata = data;
ntime = vm::reservation_acquire(addr, 128);
dst = data;
if (LIKELY(vm::reservation_acquire(raddr, 128) == rtime))
if (LIKELY(vm::reservation_acquire(addr, 128) == ntime))
{
break;
}
}
}
if (count > 9)
if (count > 15)
{
LOG_ERROR(SPU, "%s took too long: %u", args.cmd, count);
LOG_ERROR(SPU, "%s took too long: %u", ch_mfc_cmd.cmd, count);
}
}
else
{
auto& res = vm::reservation_lock(raddr, 128);
auto& res = vm::reservation_lock(addr, 128);
if (g_cfg.core.spu_accurate_getllar)
{
vm::_ref<atomic_t<u32>>(raddr) += 0;
*reinterpret_cast<atomic_t<u32>*>(&data) += 0;
// Full lock (heavyweight)
// TODO: vm::check_addr
vm::writer_lock lock(1);
vm::writer_lock lock(addr);
rtime = res & ~1ull;
rdata = data;
ntime = res & ~1ull;
mov_rdata(dst.data(), data.data());
res &= ~1ull;
}
else
{
rtime = res & ~1ull;
rdata = data;
ntime = res & ~1ull;
mov_rdata(dst.data(), data.data());
res &= ~1ull;
}
}
// Copy to LS
_ref<decltype(rdata)>(args.lsa & 0x3ff80) = rdata;
if (const u32 _addr = raddr)
{
// Last check for event before we replace the reservation with a new one
if ((vm::reservation_acquire(_addr, 128) & ~1ull) != rtime || rdata != vm::_ref<decltype(rdata)>(_addr))
{
ch_event_stat |= SPU_EVENT_LR;
if (_addr == addr)
{
// Lost current reservation
raddr = 0;
ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS);
return true;
}
}
}
raddr = addr;
rtime = ntime;
mov_rdata(rdata.data(), dst.data());
ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS);
return true;
}
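
Per the hw-test bullet in the commit message, GETLLAR no longer raises SPU_EVENT_LR just because a new reservation replaces an old one; the event fires only when the previously reserved line actually changed (masked timestamp or data), and a lost same-address reservation is simply dropped. The same test reappears in the PUTLLC and PUTLLUC paths; a hedged helper-style sketch of it (name and template are illustrative):

// --- illustrative sketch, not part of the diff ---
// True if the reservation captured as (raddr, rtime, rdata) was lost, i.e.
// the line's masked timestamp or its 128-byte payload no longer match.
template <typename RData>
bool reservation_lost_sketch(u32 raddr, u64 rtime, const RData& rdata)
{
	return (vm::reservation_acquire(raddr, 128) & ~1ull) != rtime
		|| rdata != vm::_ref<RData>(raddr);
}
// --- end sketch ---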
@ -1293,40 +1328,50 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
case MFC_PUTLLC_CMD:
{
// Store conditionally
const u32 addr = args.eal & -128u;
const u32 addr = ch_mfc_cmd.eal & -128u;
u32 result = 0;
bool result = false;
if (raddr == addr && rtime == vm::reservation_acquire(raddr, 128))
if (raddr == addr && rtime == (vm::reservation_acquire(raddr, 128) & ~1ull))
{
const auto& to_write = _ref<decltype(rdata)>(args.lsa & 0x3ff80);
const auto& to_write = _ref<decltype(rdata)>(ch_mfc_cmd.lsa & 0x3ff80);
if (LIKELY(g_use_rtm))
{
if (spu_putllc_tx(raddr, rtime, rdata.data(), to_write.data()))
while (true)
{
vm::reservation_notifier(raddr, 128).notify_all();
result = true;
}
result = spu_putllc_tx(addr, rtime, rdata.data(), to_write.data());
if (result < 2)
{
break;
}
// Don't fallback to heavyweight lock, just give up
// Retry
std::this_thread::yield();
}
}
else if (auto& data = vm::_ref<decltype(rdata)>(addr); rdata == data)
{
auto& res = vm::reservation_lock(raddr, 128);
vm::_ref<atomic_t<u32>>(raddr) += 0;
// Full lock (heavyweight)
// TODO: vm::check_addr
vm::writer_lock lock(1);
if (rtime == (res & ~1ull) && rdata == data)
if (rtime == (res & ~1ull))
{
data = to_write;
vm::reservation_update(raddr, 128);
vm::reservation_notifier(raddr, 128).notify_all();
result = true;
*reinterpret_cast<atomic_t<u32>*>(&data) += 0;
// Full lock (heavyweight)
// TODO: vm::check_addr
vm::writer_lock lock(addr);
if (rdata == data)
{
mov_rdata(data.data(), to_write.data());
res++;
result = 1;
}
else
{
res &= ~1ull;
}
}
else
{
@ -1337,16 +1382,21 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
if (result)
{
vm::reservation_notifier(addr, 128).notify_all();
ch_atomic_stat.set_value(MFC_PUTLLC_SUCCESS);
}
else
{
ch_atomic_stat.set_value(MFC_PUTLLC_FAILURE);
}
if (raddr)
{
// Last check for event before we clear the reservation
if (raddr == addr || rtime != (vm::reservation_acquire(raddr, 128) & ~1ull) || rdata != vm::_ref<decltype(rdata)>(raddr))
{
ch_event_stat |= SPU_EVENT_LR;
}
}
if (raddr && !result)
{
ch_event_stat |= SPU_EVENT_LR;
ch_atomic_stat.set_value(MFC_PUTLLC_FAILURE);
}
raddr = 0;
@ -1354,23 +1404,22 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
}
case MFC_PUTLLUC_CMD:
{
do_putlluc(args);
do_putlluc(ch_mfc_cmd);
ch_atomic_stat.set_value(MFC_PUTLLUC_SUCCESS);
return true;
}
case MFC_PUTQLLUC_CMD:
{
const u32 mask = utils::rol32(1, args.tag);
const u32 mask = utils::rol32(1, ch_mfc_cmd.tag);
if (UNLIKELY((mfc_barrier | mfc_fence) & mask))
{
args.size = 0;
mfc_queue[mfc_size++] = args;
mfc_queue[mfc_size++] = ch_mfc_cmd;
mfc_fence |= mask;
}
else
{
do_putlluc(args);
do_putlluc(ch_mfc_cmd);
}
return true;
@ -1379,7 +1428,11 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
case MFC_SNDSIGB_CMD:
case MFC_SNDSIGF_CMD:
{
args.size = 4;
if (ch_mfc_cmd.size != 4)
{
break;
}
// Fallthrough
}
case MFC_PUT_CMD:
@ -1392,24 +1445,24 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
case MFC_GETB_CMD:
case MFC_GETF_CMD:
{
if (LIKELY(args.size <= 0x4000))
if (LIKELY(ch_mfc_cmd.size <= 0x4000))
{
if (LIKELY(do_dma_check(args)))
if (LIKELY(do_dma_check(ch_mfc_cmd)))
{
if (LIKELY(args.size))
if (ch_mfc_cmd.size)
{
do_dma_transfer(args);
do_dma_transfer(ch_mfc_cmd);
}
return true;
}
mfc_queue[mfc_size++] = args;
mfc_fence |= utils::rol32(1, args.tag);
mfc_queue[mfc_size++] = ch_mfc_cmd;
mfc_fence |= utils::rol32(1, ch_mfc_cmd.tag);
if (args.cmd & MFC_BARRIER_MASK)
if (ch_mfc_cmd.cmd & MFC_BARRIER_MASK)
{
mfc_barrier |= utils::rol32(1, args.tag);
mfc_barrier |= utils::rol32(1, ch_mfc_cmd.tag);
}
return true;
@ -1427,22 +1480,25 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
case MFC_GETLB_CMD:
case MFC_GETLF_CMD:
{
if (LIKELY(args.size <= 0x4000))
if (LIKELY(ch_mfc_cmd.size <= 0x4000))
{
if (LIKELY(do_dma_check(args)))
auto& cmd = mfc_queue[mfc_size];
cmd = ch_mfc_cmd;
if (LIKELY(do_dma_check(cmd)))
{
if (LIKELY(do_list_transfer(args)))
if (LIKELY(do_list_transfer(cmd)))
{
return true;
}
}
mfc_queue[mfc_size++] = args;
mfc_fence |= utils::rol32(1, args.tag);
mfc_size++;
mfc_fence |= utils::rol32(1, cmd.tag);
if (args.cmd & MFC_BARRIER_MASK)
if (cmd.cmd & MFC_BARRIER_MASK)
{
mfc_barrier |= utils::rol32(1, args.tag);
mfc_barrier |= utils::rol32(1, cmd.tag);
}
return true;
@ -1460,7 +1516,7 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
}
else
{
mfc_queue[mfc_size++] = args;
mfc_queue[mfc_size++] = ch_mfc_cmd;
mfc_barrier |= -1;
}
@ -1473,7 +1529,7 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
}
fmt::throw_exception("Unknown command (cmd=%s, lsa=0x%x, ea=0x%llx, tag=0x%x, size=0x%x)" HERE,
args.cmd, args.lsa, args.eal, args.tag, args.size);
ch_mfc_cmd.cmd, ch_mfc_cmd.lsa, ch_mfc_cmd.eal, ch_mfc_cmd.tag, ch_mfc_cmd.size);
}
u32 spu_thread::get_events(bool waiting)
@ -1486,7 +1542,7 @@ u32 spu_thread::get_events(bool waiting)
}
// Check reservation status and set SPU_EVENT_LR if lost
if (raddr && (vm::reservation_acquire(raddr, sizeof(rdata)) != rtime || rdata != vm::_ref<decltype(rdata)>(raddr)))
if (raddr && ((vm::reservation_acquire(raddr, sizeof(rdata)) & ~1ull) != rtime || rdata != vm::_ref<decltype(rdata)>(raddr)))
{
ch_event_stat |= SPU_EVENT_LR;
raddr = 0;
@ -2026,7 +2082,7 @@ bool spu_thread::set_ch_value(u32 ch, u32 value)
case MFC_Cmd:
{
ch_mfc_cmd.cmd = MFC(value & 0xff);
return process_mfc_cmd(ch_mfc_cmd);
return process_mfc_cmd();
}
case MFC_WrListStallAck:

View file

@ -1,8 +1,9 @@
#pragma once
#pragma once
#include "Emu/Cell/Common.h"
#include "Emu/CPU/CPUThread.h"
#include "Emu/Cell/SPUInterpreter.h"
#include "Emu/Memory/vm.h"
#include "MFC.h"
#include <map>
@ -595,7 +596,7 @@ public:
void do_mfc(bool wait = true);
u32 get_mfc_completed();
bool process_mfc_cmd(spu_mfc_cmd args);
bool process_mfc_cmd();
u32 get_events(bool waiting = false);
void set_events(u32 mask);
void set_interrupt_status(bool enable);

View file

@ -57,8 +57,12 @@ namespace vm
// Memory mutex acknowledgement
thread_local atomic_t<cpu_thread*>* g_tls_locked = nullptr;
// Currently locked address
atomic_t<u32> g_addr_lock = 0;
// Memory mutex: passive locks
std::array<atomic_t<cpu_thread*>, 32> g_locks;
std::array<atomic_t<cpu_thread*>, 4> g_locks{};
std::array<atomic_t<u64>, 6> g_range_locks{};
static void _register_lock(cpu_thread* _cpu)
{
@ -72,11 +76,25 @@ namespace vm
}
}
bool passive_lock(cpu_thread& cpu, bool wait)
static atomic_t<u64>* _register_range_lock(const u64 lock_info)
{
while (true)
{
for (auto& lock : g_range_locks)
{
if (!lock && lock.compare_and_swap_test(0, lock_info))
{
return &lock;
}
}
}
}
void passive_lock(cpu_thread& cpu)
{
if (UNLIKELY(g_tls_locked && *g_tls_locked == &cpu))
{
return true;
return;
}
if (LIKELY(g_mutex.is_lockable()))
@ -84,31 +102,46 @@ namespace vm
// Optimistic path (hope that mutex is not exclusively locked)
_register_lock(&cpu);
if (UNLIKELY(!g_mutex.is_lockable()))
if (LIKELY(g_mutex.is_lockable()))
{
passive_unlock(cpu);
if (!wait)
{
return false;
}
::reader_lock lock(g_mutex);
_register_lock(&cpu);
return;
}
passive_unlock(cpu);
}
else
::reader_lock lock(g_mutex);
_register_lock(&cpu);
}
atomic_t<u64>* passive_lock(const u32 addr, const u32 end)
{
static const auto test_addr = [](const u32 target, const u32 addr, const u32 end)
{
if (!wait)
return addr > target || end <= target;
};
atomic_t<u64>* _ret;
if (LIKELY(test_addr(g_addr_lock.load(), addr, end)))
{
// Optimistic path (hope that address range is not locked)
_ret = _register_range_lock((u64)end << 32 | addr);
if (LIKELY(test_addr(g_addr_lock.load(), addr, end)))
{
return false;
return _ret;
}
::reader_lock lock(g_mutex);
_register_lock(&cpu);
*_ret = 0;
}
return true;
{
::reader_lock lock(g_mutex);
_ret = _register_range_lock((u64)end << 32 | addr);
}
return _ret;
}
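
The range variant of passive_lock follows the same optimistic double-check as the per-thread variant: publish (end << 32 | begin) in a free slot of g_range_locks, re-test against g_addr_lock, and retract and fall back to the reader lock if a writer appeared in between. A usage sketch for callers (addresses illustrative):

// --- illustrative sketch, not part of the diff ---
// Protect a multi-line store against vm::writer_lock(addr) holders.
void write_span_sketch(u32 begin, u32 size)
{
	atomic_t<u64>* lock = vm::passive_lock(begin & -128u, ::align(begin + size, 128));

	// ... perform the stores into [begin, begin + size) here ...

	*lock = 0; // clear the slot so pending writers stop waiting on this range
}
// --- end sketch ---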
void passive_unlock(cpu_thread& cpu)
@ -194,8 +227,7 @@ namespace vm
m_upgraded = true;
}
writer_lock::writer_lock(int full)
: locked(true)
writer_lock::writer_lock(u32 addr)
{
auto cpu = get_current_cpu_thread();
@ -206,7 +238,7 @@ namespace vm
g_mutex.lock();
if (full)
if (addr)
{
for (auto& lock : g_locks)
{
@ -216,6 +248,30 @@ namespace vm
}
}
g_addr_lock = addr;
for (auto& lock : g_range_locks)
{
while (true)
{
const u64 value = lock;
// Test beginning address
if (static_cast<u32>(value) > addr)
{
break;
}
// Test end address
if (static_cast<u32>(value >> 32) <= addr)
{
break;
}
_mm_pause();
}
}
for (auto& lock : g_locks)
{
while (cpu_thread* ptr = lock)
@ -225,7 +281,7 @@ namespace vm
break;
}
busy_wait();
_mm_pause();
}
}
}
@ -239,10 +295,8 @@ namespace vm
writer_lock::~writer_lock()
{
if (locked)
{
g_mutex.unlock();
}
g_addr_lock.raw() = 0;
g_mutex.unlock();
}
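
On the writer side, writer_lock(addr) now publishes the address in g_addr_lock and then drains both the range locks and the per-thread passive locks; its wait condition is the mirror image of test_addr in passive_lock, so reader and writer agree on one overlap predicate:

// --- illustrative sketch, not part of the diff ---
// Shared predicate: a range [begin, end) conflicts with a locked address
// iff begin <= target < end. passive_lock proceeds when it does not hold;
// writer_lock spins on a slot while it does.
constexpr bool conflicts(u32 begin, u32 end, u32 target)
{
	return begin <= target && target < end;
}

static_assert(!conflicts(0x10000, 0x10080, 0x0ff80), "writer below the range: no conflict");
static_assert(conflicts(0x10000, 0x10080, 0x10000), "writer inside the range: must wait");
// --- end sketch ---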
void reservation_lock_internal(atomic_t<u64>& res)

View file

@ -53,7 +53,8 @@ namespace vm
extern thread_local atomic_t<cpu_thread*>* g_tls_locked;
// Register reader
bool passive_lock(cpu_thread& cpu, bool wait = true);
void passive_lock(cpu_thread& cpu);
atomic_t<u64>* passive_lock(const u32 begin, const u32 end);
// Unregister reader
void passive_unlock(cpu_thread& cpu);
@ -80,14 +81,10 @@ namespace vm
struct writer_lock final
{
const bool locked;
writer_lock(const writer_lock&) = delete;
writer_lock& operator=(const writer_lock&) = delete;
writer_lock(int full);
writer_lock(u32 addr = 0);
~writer_lock();
explicit operator bool() const { return locked; }
};
// Get reservation status for further atomic update: last update timestamp
@ -101,7 +98,7 @@ namespace vm
inline void reservation_update(u32 addr, u32 size, bool lsb = false)
{
// Update reservation info with new timestamp
reservation_acquire(addr, size) = (__rdtsc() << 1) | u64{lsb};
reservation_acquire(addr, size) += 2;
}
// Get reservation sync variable

View file

@ -354,7 +354,7 @@ struct cfg_root : cfg::node
node_core(cfg::node* _this) : cfg::node(_this, "Core") {}
cfg::_enum<ppu_decoder_type> ppu_decoder{this, "PPU Decoder", ppu_decoder_type::llvm};
cfg::_int<1, 16> ppu_threads{this, "PPU Threads", 2}; // Amount of PPU threads running simultaneously (must be 2)
cfg::_int<1, 4> ppu_threads{this, "PPU Threads", 2}; // Amount of PPU threads running simultaneously (must be 2)
cfg::_bool ppu_debug{this, "PPU Debug"};
cfg::_bool llvm_logs{this, "Save LLVM logs"};
cfg::string llvm_cpu{this, "Use LLVM CPU"};