From 5bd5a382c0e52dafa34474ab4e345f11b57bd13c Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Sun, 11 Oct 2020 17:32:00 +0300 Subject: [PATCH] PPU: fix LDARX/LWARX in accurate mode (closes #9058) Fixup after #9048 Use SSE intrinsics in mov_rdata. --- rpcs3/Emu/Cell/PPUThread.cpp | 11 ++++++++++- rpcs3/Emu/Cell/SPUThread.cpp | 21 +++++++++++++++++++-- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index 6fcded1cc5..63a36ad4b8 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -1176,7 +1176,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr) continue; } - const be_t<u64> rdata = data.load(); + be_t<u64> rdata; if (ppu.use_full_rdata) { @@ -1187,6 +1187,10 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr) mov_rdata(ppu.rdata, vm::_ref<spu_rdata_t>(addr & -128)); } + else + { + rdata = data.load(); + } if (vm::reservation_acquire(addr, sizeof(T)) == ppu.rtime) [[likely]] { @@ -1212,6 +1216,11 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr) // Store only 64 bits of reservation data std::memcpy(&ppu.rdata[addr & 0x78], &rdata, 8); } + else + { + // Load relevant 64 bits of reservation data + std::memcpy(&rdata, &ppu.rdata[addr & 0x78], 8); + } return static_cast<T>(rdata << data_off >> size_off); } diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index 5f5654da92..e3bc4a20ca 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -197,8 +197,25 @@ extern void mov_rdata(spu_rdata_t& _dst, const spu_rdata_t& _src) return; } - // TODO: use std::assume_aligned - std::memcpy(reinterpret_cast(_dst), reinterpret_cast(_src), 128); + { + const __m128i v0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 0)); + const __m128i v1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 16)); + const __m128i v2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 32)); + const __m128i v3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 48)); + 
_mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 0), v0); + _mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 16), v1); + _mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 32), v2); + _mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 48), v3); + } + + const __m128i v0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 64)); + const __m128i v1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 80)); + const __m128i v2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 96)); + const __m128i v3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 112)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 64), v0); + _mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 80), v1); + _mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 96), v2); + _mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 112), v3); } extern u64 get_timebased_time();