SPU: use reservation spinlocks on writes (non-TSX)

This should decrease contention by avoiding the global lock.
Nekotekina 2018-05-21 20:25:05 +03:00
parent 182259e4a5
commit 72574b11ff
5 changed files with 209 additions and 134 deletions
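
The non-TSX write path previously serialized on a single global vm::writer_lock; this commit instead spins on the reservation value itself. Bit 0 of the 64-bit reservation counter doubles as a "store in progress" flag: a writer sets it with a locked BTS instruction (vm::reservation_lock below), performs the store, then clears the bit or publishes a new counter value, while readers treat an odd counter as a signal to retry. A minimal sketch of this seqlock-style pattern follows; the names and layout are illustrative assumptions, not rpcs3's actual API:

#include <atomic>
#include <cstdint>

// One 64-bit counter guarding one 128-byte block: bit 0 is the lock bit,
// the remaining bits act as a version stamp bumped on every committed store.
static std::atomic<std::uint64_t> counter{0};
static std::uint32_t payload[32]; // the guarded 128 bytes

// Writer: fetch_or(1) has BTS semantics - the bit was either clear
// (lock acquired) or already set (another store in progress, so spin).
void locked_write(std::uint32_t value)
{
    while (counter.fetch_or(1, std::memory_order_acquire) & 1)
    {
        // a real implementation would busy_wait() here, then yield
    }

    payload[0] = value; // the protected store

    // The counter is odd while locked; +1 clears bit 0 and advances the
    // stamp in a single atomic step, so old stamps can never match again.
    counter.fetch_add(1, std::memory_order_release);
}

// Reader: sample the stamp, skip odd values, copy the data, and accept the
// copy only if the stamp did not move (the loops in
// ppu_load_acquire_reservation and GETLLAR below follow this shape).
std::uint32_t seq_read()
{
    for (;;)
    {
        const std::uint64_t stamp = counter.load(std::memory_order_acquire);

        if (stamp & 1)
        {
            continue; // LSB set: atomic store may be in progress
        }

        const std::uint32_t copy = payload[0];
        std::atomic_thread_fence(std::memory_order_acquire);

        if (counter.load(std::memory_order_relaxed) == stamp)
        {
            return copy;
        }
    }
}

Because every committed store changes the counter, a PUTLLC/STWCX. that captured the old value fails its comparison and the conditional store is rejected; only threads touching the same 128-byte line ever contend, which is where the reduced contention comes from. (The usual seqlock caveat applies: the reader's plain copy races with the writer in strict C++ terms and relies on platform behavior.)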

rpcs3/Emu/Cell/PPUThread.cpp

@@ -944,7 +944,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
 	ppu.raddr = addr;
 
-	while (g_use_rtm)
+	while (LIKELY(g_use_rtm))
 	{
 		ppu.rtime = vm::reservation_acquire(addr, sizeof(T));
 		ppu.rdata = data;
@@ -959,30 +959,46 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
 		}
 	}
 
-	// Do several attemps
-	for (uint i = 0; i < 5; i++)
+	ppu.rtime = vm::reservation_acquire(addr, sizeof(T));
+
+	if (LIKELY((ppu.rtime & 1) == 0))
+	{
+		ppu.rdata = data;
+
+		if (LIKELY(vm::reservation_acquire(addr, sizeof(T)) == ppu.rtime))
+		{
+			return static_cast<T>(ppu.rdata);
+		}
+	}
+
+	vm::temporary_unlock(ppu);
+
+	for (u64 i = 0;; i++)
 	{
 		ppu.rtime = vm::reservation_acquire(addr, sizeof(T));
-		_mm_lfence();
 
 		// Check LSB: atomic store may be in progress
 		if (LIKELY((ppu.rtime & 1) == 0))
 		{
 			ppu.rdata = data;
-			_mm_lfence();
 
 			if (LIKELY(vm::reservation_acquire(addr, sizeof(T)) == ppu.rtime))
 			{
-				return static_cast<T>(ppu.rdata);
+				break;
 			}
 		}
 
-		busy_wait(300);
+		if (i < 20)
+		{
+			busy_wait(300);
+		}
+		else
+		{
+			std::this_thread::yield();
+		}
 	}
 
-	vm::reader_lock lock;
-
-	ppu.rtime = vm::reservation_acquire(addr, sizeof(T));
-	ppu.rdata = data;
-
+	ppu.cpu_mem();
 	return static_cast<T>(ppu.rdata);
 }
@@ -1053,7 +1069,7 @@ extern bool ppu_stwcx(ppu_thread& ppu, u32 addr, u32 reg_value)
 		return false;
 	}
 
-	if (g_use_rtm)
+	if (LIKELY(g_use_rtm))
 	{
 		if (ppu_stwcx_tx(addr, ppu.rtime, ppu.rdata, reg_value))
 		{
@@ -1067,16 +1083,23 @@ extern bool ppu_stwcx(ppu_thread& ppu, u32 addr, u32 reg_value)
 		return false;
 	}
 
-	vm::writer_lock lock(0);
+	vm::temporary_unlock(ppu);
 
-	const bool result = ppu.rtime == vm::reservation_acquire(addr, sizeof(u32)) && data.compare_and_swap_test(static_cast<u32>(ppu.rdata), reg_value);
+	auto& res = vm::reservation_lock(addr, sizeof(u32));
+
+	const bool result = ppu.rtime == (res & ~1ull) && data.compare_and_swap_test(static_cast<u32>(ppu.rdata), reg_value);
 
 	if (result)
 	{
 		vm::reservation_update(addr, sizeof(u32));
 		vm::reservation_notifier(addr, sizeof(u32)).notify_all();
 	}
+	else
+	{
+		res &= ~1ull;
+	}
 
+	ppu.cpu_mem();
 	ppu.raddr = 0;
 	return result;
 }
@@ -1138,7 +1161,7 @@ extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value)
 		return false;
 	}
 
-	if (g_use_rtm)
+	if (LIKELY(g_use_rtm))
 	{
 		if (ppu_stdcx_tx(addr, ppu.rtime, ppu.rdata, reg_value))
 		{
@@ -1152,16 +1175,23 @@ extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value)
 		return false;
 	}
 
-	vm::writer_lock lock(0);
+	vm::temporary_unlock(ppu);
 
-	const bool result = ppu.rtime == vm::reservation_acquire(addr, sizeof(u64)) && data.compare_and_swap_test(ppu.rdata, reg_value);
+	auto& res = vm::reservation_lock(addr, sizeof(u64));
+
+	const bool result = ppu.rtime == (res & ~1ull) && data.compare_and_swap_test(ppu.rdata, reg_value);
 
 	if (result)
 	{
 		vm::reservation_update(addr, sizeof(u64));
 		vm::reservation_notifier(addr, sizeof(u64)).notify_all();
 	}
+	else
+	{
+		res &= ~1ull;
+	}
 
+	ppu.cpu_mem();
 	ppu.raddr = 0;
 	return result;
 }

rpcs3/Emu/Cell/SPUThread.cpp

@@ -806,6 +806,79 @@ void SPUThread::do_dma_transfer(const spu_mfc_cmd& args)
 	void* dst = vm::base(eal);
 	void* src = vm::base(offset + lsa);
 
+	if (UNLIKELY(!is_get && !g_use_rtm))
+	{
+		switch (u32 size = args.size)
+		{
+		case 1:
+		{
+			auto& res = vm::reservation_lock(eal, 1);
+			*static_cast<u8*>(dst) = *static_cast<const u8*>(src);
+			res &= ~1ull;
+			break;
+		}
+		case 2:
+		{
+			auto& res = vm::reservation_lock(eal, 2);
+			*static_cast<u16*>(dst) = *static_cast<const u16*>(src);
+			res &= ~1ull;
+			break;
+		}
+		case 4:
+		{
+			auto& res = vm::reservation_lock(eal, 4);
+			*static_cast<u32*>(dst) = *static_cast<const u32*>(src);
+			res &= ~1ull;
+			break;
+		}
+		case 8:
+		{
+			auto& res = vm::reservation_lock(eal, 8);
+			*static_cast<u64*>(dst) = *static_cast<const u64*>(src);
+			res &= ~1ull;
+			break;
+		}
+		case 16:
+		{
+			auto& res = vm::reservation_lock(eal, 16);
+			_mm_store_si128(static_cast<__m128i*>(dst), _mm_load_si128(static_cast<const __m128i*>(src)));
+			res &= ~1ull;
+			break;
+		}
+		default:
+		{
+			auto* res = &vm::reservation_lock(eal, 16);
+			auto vdst = static_cast<__m128i*>(dst);
+			auto vsrc = static_cast<const __m128i*>(src);
+
+			for (u32 addr = eal, end = eal + size;; vdst++, vsrc++)
+			{
+				_mm_store_si128(vdst, _mm_load_si128(vsrc));
+
+				addr += 16;
+
+				if (addr == end)
+				{
+					break;
+				}
+
+				if (addr % 128)
+				{
+					continue;
+				}
+
+				res->fetch_and(~1ull);
+				res = &vm::reservation_lock(addr, 16);
+			}
+
+			res->fetch_and(~1ull);
+			break;
+		}
+		}
+
+		return;
+	}
+
 	if (is_get)
 	{
 		std::swap(dst, src);
@@ -825,24 +898,17 @@ void SPUThread::do_dma_transfer(const spu_mfc_cmd& args)
 	}
 	case 4:
 	{
-		//if (is_get && !from_mfc)
-		{
-			*static_cast<u32*>(dst) = *static_cast<const u32*>(src);
-			break;
-		}
-
-		//_mm_stream_si32(static_cast<s32*>(dst), *static_cast<const s32*>(src));
+		*static_cast<u32*>(dst) = *static_cast<const u32*>(src);
 		break;
 	}
 	case 8:
 	{
-		//if (is_get && !from_mfc)
-		{
-			*static_cast<u64*>(dst) = *static_cast<const u64*>(src);
-			break;
-		}
-
-		//_mm_stream_si64(static_cast<s64*>(dst), *static_cast<const s64*>(src));
+		*static_cast<u64*>(dst) = *static_cast<const u64*>(src);
 		break;
 	}
 	case 16:
 	{
 		_mm_store_si128(static_cast<__m128i*>(dst), _mm_load_si128(static_cast<const __m128i*>(src)));
 		break;
 	}
 	default:
@@ -851,45 +917,6 @@ void SPUThread::do_dma_transfer(const spu_mfc_cmd& args)
 		auto vsrc = static_cast<const __m128i*>(src);
 		auto vcnt = size / sizeof(__m128i);
 
-		//if (is_get && !from_mfc)
-		{
-			while (vcnt >= 8)
-			{
-				const __m128i data[]
-				{
-					_mm_load_si128(vsrc + 0),
-					_mm_load_si128(vsrc + 1),
-					_mm_load_si128(vsrc + 2),
-					_mm_load_si128(vsrc + 3),
-					_mm_load_si128(vsrc + 4),
-					_mm_load_si128(vsrc + 5),
-					_mm_load_si128(vsrc + 6),
-					_mm_load_si128(vsrc + 7),
-				};
-
-				_mm_store_si128(vdst + 0, data[0]);
-				_mm_store_si128(vdst + 1, data[1]);
-				_mm_store_si128(vdst + 2, data[2]);
-				_mm_store_si128(vdst + 3, data[3]);
-				_mm_store_si128(vdst + 4, data[4]);
-				_mm_store_si128(vdst + 5, data[5]);
-				_mm_store_si128(vdst + 6, data[6]);
-				_mm_store_si128(vdst + 7, data[7]);
-
-				vcnt -= 8;
-				vsrc += 8;
-				vdst += 8;
-			}
-
-			while (vcnt--)
-			{
-				_mm_store_si128(vdst++, _mm_load_si128(vsrc++));
-			}
-
-			break;
-		}
-
-		// Disabled
 		while (vcnt >= 8)
 		{
 			const __m128i data[]
@@ -904,14 +931,14 @@ void SPUThread::do_dma_transfer(const spu_mfc_cmd& args)
 				_mm_load_si128(vsrc + 7),
 			};
 
-			_mm_stream_si128(vdst + 0, data[0]);
-			_mm_stream_si128(vdst + 1, data[1]);
-			_mm_stream_si128(vdst + 2, data[2]);
-			_mm_stream_si128(vdst + 3, data[3]);
-			_mm_stream_si128(vdst + 4, data[4]);
-			_mm_stream_si128(vdst + 5, data[5]);
-			_mm_stream_si128(vdst + 6, data[6]);
-			_mm_stream_si128(vdst + 7, data[7]);
+			_mm_store_si128(vdst + 0, data[0]);
+			_mm_store_si128(vdst + 1, data[1]);
+			_mm_store_si128(vdst + 2, data[2]);
+			_mm_store_si128(vdst + 3, data[3]);
+			_mm_store_si128(vdst + 4, data[4]);
+			_mm_store_si128(vdst + 5, data[5]);
+			_mm_store_si128(vdst + 6, data[6]);
+			_mm_store_si128(vdst + 7, data[7]);
 
 			vcnt -= 8;
 			vsrc += 8;
@@ -920,15 +947,11 @@ void SPUThread::do_dma_transfer(const spu_mfc_cmd& args)
 		while (vcnt--)
 		{
-			_mm_stream_si128(vdst++, _mm_load_si128(vsrc++));
+			_mm_store_si128(vdst++, _mm_load_si128(vsrc++));
 		}
 		break;
 	}
 	}
-
-	if (is_get)
-	{
-		//_mm_sfence();
-	}
 }
 
 bool SPUThread::do_dma_check(const spu_mfc_cmd& args)
@@ -1048,7 +1071,7 @@ void SPUThread::do_putlluc(const spu_mfc_cmd& args)
 	const auto to_write = _ref<decltype(rdata)>(args.lsa & 0x3ffff);
 
 	// Store unconditionally
-	if (g_use_rtm)
+	if (LIKELY(g_use_rtm))
 	{
 		const u64 count = spu_putlluc_tx(addr, to_write.data());
@@ -1056,17 +1079,14 @@ void SPUThread::do_putlluc(const spu_mfc_cmd& args)
 		{
 			LOG_ERROR(SPU, "%s took too long: %u", args.cmd, count);
 		}
-
-		vm::reservation_notifier(addr, 128).notify_all();
-		return;
 	}
+	else
+	{
+		auto& res = vm::reservation_lock(addr, 128);
+		data = to_write;
+		vm::reservation_update(addr, 128);
+	}
 
-	vm::writer_lock lock(0);
-	vm::reservation_update(addr, 128, true);
-	_mm_sfence();
-	data = to_write;
-	_mm_sfence();
-	vm::reservation_update(addr, 128);
 	vm::reservation_notifier(addr, 128).notify_all();
 }
@@ -1251,7 +1271,7 @@ bool SPUThread::process_mfc_cmd(spu_mfc_cmd args)
 		}
 	}
 
-	if (g_use_rtm)
+	if (LIKELY(g_use_rtm))
 	{
 		const u64 count = spu_getll_tx(raddr, rdata.data(), &rtime);
@@ -1260,36 +1280,12 @@ bool SPUThread::process_mfc_cmd(spu_mfc_cmd args)
 			LOG_ERROR(SPU, "%s took too long: %u", args.cmd, count);
 		}
 	}
-
-	// Do several attemps
-	for (uint i = 0; !g_use_rtm && i < 5; i++)
+	else
 	{
-		rtime = vm::reservation_acquire(raddr, 128);
-		_mm_lfence();
-
-		// Check LSB: atomic store may be in progress
-		if (LIKELY((rtime & 1) == 0))
-		{
-			rdata = data;
-			_mm_lfence();
-
-			if (LIKELY(vm::reservation_acquire(raddr, 128) == rtime))
-			{
-				// Copy to LS
-				_ref<decltype(rdata)>(args.lsa & 0x3ffff) = rdata;
-
-				ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS);
-				return true;
-			}
-		}
-
-		busy_wait(300);
-	}
-
-	if (!g_use_rtm)
-	{
-		vm::reader_lock lock;
-		rtime = vm::reservation_acquire(raddr, 128);
+		auto& res = vm::reservation_lock(raddr, 128);
+		rtime = res & ~1ull;
 		rdata = data;
+		res &= ~1ull;
 	}
 
 	// Copy to LS
@@ -1308,7 +1304,7 @@ bool SPUThread::process_mfc_cmd(spu_mfc_cmd args)
 	if (raddr == args.eal && rtime == vm::reservation_acquire(raddr, 128))
 	{
-		if (g_use_rtm)
+		if (LIKELY(g_use_rtm))
 		{
 			if (spu_putllc_tx(raddr, rtime, rdata.data(), to_write.data()))
 			{
@@ -1320,20 +1316,24 @@ bool SPUThread::process_mfc_cmd(spu_mfc_cmd args)
 		}
 		else if (rdata == data)
 		{
-			// Full lock (heavyweight)
-			// TODO: vm::check_addr
-			vm::writer_lock lock(1);
+			auto& res = vm::reservation_lock(raddr, 128);
+			vm::_ref<atomic_t<u32>>(raddr) += 0;
 
-			if (rtime == vm::reservation_acquire(raddr, 128) && rdata == data)
+			if (rtime == (res & ~1ull) && rdata == data)
 			{
-				vm::reservation_update(raddr, 128, true);
-				_mm_sfence();
 				data = to_write;
-				_mm_sfence();
-				result = true;
 				vm::reservation_update(raddr, 128);
 				vm::reservation_notifier(raddr, 128).notify_all();
+				result = true;
 			}
+			else
+			{
+				res &= ~1ull;
+			}
 		}
 	}

rpcs3/Emu/Memory/vm.cpp

@@ -9,6 +9,7 @@
 #include "Emu/Cell/lv2/sys_memory.h"
 #include "Emu/RSX/GSRender.h"
 #include <atomic>
+#include <thread>
 #include <deque>
 
 static_assert(sizeof(notifier) == 8, "Unexpected size of notifier");
@@ -232,6 +233,26 @@ namespace vm
 		}
 	}
 
+	void reservation_lock_internal(atomic_t<u64>& res)
+	{
+		for (u64 i = 0;; i++)
+		{
+			if (LIKELY(!atomic_storage<u64>::bts(res.raw(), 0)))
+			{
+				break;
+			}
+
+			if (i < 15)
+			{
+				busy_wait(500);
+			}
+			else
+			{
+				std::this_thread::yield();
+			}
+		}
+	}
+
 	// Page information
 	struct memory_page
 	{
@@ -479,12 +500,21 @@ namespace vm
 		, size(size)
 		, flags(flags)
 	{
-		// Allocate compressed reservation info area (avoid RSX and SPU areas)
-		if (addr != 0xc0000000 && addr != 0xe0000000)
+		// Allocate compressed reservation info area (avoid SPU MMIO area)
+		if (addr != 0xe0000000)
 		{
 			utils::memory_commit(g_reservations + addr / 16, size / 16);
 			utils::memory_commit(g_reservations2 + addr / 16, size / 16);
 		}
+		else
+		{
+			// RawSPU LS
+			for (u32 i = 0; i < 6; i++)
+			{
+				utils::memory_commit(g_reservations + addr / 16 + i * 0x10000, 0x4000);
+				utils::memory_commit(g_reservations2 + addr / 16 + i * 0x10000, 0x4000);
+			}
+		}
 	}
 
 	block_t::~block_t()

rpcs3/Emu/Memory/vm.h

@@ -105,6 +105,20 @@ namespace vm
 		return *reinterpret_cast<notifier*>(g_reservations2 + addr / 128 * 8);
 	}
 
+	void reservation_lock_internal(atomic_t<u64>&);
+
+	inline atomic_t<u64>& reservation_lock(u32 addr, u32 size)
+	{
+		auto& res = vm::reservation_acquire(addr, size);
+
+		if (UNLIKELY(atomic_storage<u64>::bts(res.raw(), 0)))
+		{
+			reservation_lock_internal(res);
+		}
+
+		return res;
+	}
+
 	// Change memory protection of specified memory region
 	bool page_protect(u32 addr, u32 size, u8 flags_test = 0, u8 flags_set = 0, u8 flags_clear = 0);
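
Taken together, the vm.h fast path (inline BTS) and the vm.cpp slow path (spin with backoff, then yield) give writers a per-line lock that call sites must pair manually: acquire with vm::reservation_lock, mutate the guarded memory, then either clear bit 0 (res &= ~1ull) to release without a version change, or call vm::reservation_update to publish a new version. A hypothetical RAII wrapper could express that pairing; rpcs3 itself writes the lock and release out by hand at each call site, as the hunks above show:

// Hypothetical helper, not part of this commit: releases the reservation
// lock bit on scope exit unless the caller already published an update.
struct reservation_guard
{
    atomic_t<u64>& res;
    bool published = false;

    reservation_guard(u32 addr, u32 size)
        : res(vm::reservation_lock(addr, size)) // sets bit 0 (BTS fast path)
    {
    }

    ~reservation_guard()
    {
        if (!published)
        {
            res &= ~1ull; // clear bit 0 without touching the version stamp
        }
    }
};

// Usage mirroring the rsx_methods.cpp hunk below:
// {
//     reservation_guard lock(addr, 4);
//     vm::write32(addr, arg);
// }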

rpcs3/Emu/RSX/rsx_methods.cpp

@@ -118,14 +118,15 @@ namespace rsx
 		rsx->sync_point_request = true;
 		const u32 addr = get_address(method_registers.semaphore_offset_406e(), method_registers.semaphore_context_dma_406e());
 
-		if (g_use_rtm || addr >> 28 == 0x4)
+		if (LIKELY(g_use_rtm))
 		{
 			vm::write32(addr, arg);
 		}
 		else
 		{
-			vm::reader_lock lock;
+			auto& res = vm::reservation_lock(addr, 4);
 			vm::write32(addr, arg);
+			res &= ~1ull;
 		}
 
 		if (addr >> 28 != 0x4)