From 4ff77a8555b3878d9d91dd17c16d3ba1a4149569 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Wed, 19 Jun 2019 22:01:48 +0300 Subject: [PATCH] rsx: Improve balancing of the offloader thread - Use two counters to avoid atomic operations - Yield instead of sleeping because some games are very sensitive to timing --- rpcs3/Emu/Cell/SPUThread.cpp | 2 +- rpcs3/Emu/RSX/RSXOffload.cpp | 32 ++++++++++++++++++++++---------- rpcs3/Emu/RSX/RSXOffload.h | 5 +++-- rpcs3/Emu/RSX/RSXThread.cpp | 3 +++ 4 files changed, 29 insertions(+), 13 deletions(-) diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index 184122ba5e..d0c2b8ef37 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -1797,7 +1797,7 @@ bool spu_thread::process_mfc_cmd() break; } - thread_ctrl::wait_for(100); + thread_ctrl::wait_for(500); } if (test_stopped()) diff --git a/rpcs3/Emu/RSX/RSXOffload.cpp b/rpcs3/Emu/RSX/RSXOffload.cpp index 84cb060a7e..66347510b3 100644 --- a/rpcs3/Emu/RSX/RSXOffload.cpp +++ b/rpcs3/Emu/RSX/RSXOffload.cpp @@ -4,12 +4,20 @@ #include "Emu/System.h" #include "RSXOffload.h" +#include <thread> + namespace rsx { // initialization void dma_manager::init() { m_worker_state = thread_state::created; + m_enqueued_count.store(0); + m_processed_count = 0; + + // Empty work queue in case of stale contents + m_work_queue.pop_all(); + thread_ctrl::spawn("RSX offloader", [this]() { if (!g_cfg.video.multithreaded_rsx) @@ -25,7 +33,7 @@ namespace rsx while (m_worker_state != thread_state::finished) { - if (m_jobs_count) + if (m_enqueued_count.load() != m_processed_count) { for (auto slice = m_work_queue.pop_all(); slice; slice.pop_front()) { @@ -49,16 +57,17 @@ namespace rsx fmt::throw_exception("Unreachable" HERE); } - m_jobs_count--; + ++m_processed_count; } } else { - thread_ctrl::wait_for(500); + // Yield + std::this_thread::yield(); } } - m_jobs_count.store(0); + m_processed_count = m_enqueued_count.load(); }); } @@ -71,7 +80,7 @@ namespace rsx } else { - 
++m_jobs_count; + ++m_enqueued_count; m_work_queue.push(dst, src, length); } } @@ -84,7 +93,7 @@ namespace rsx } else { - ++m_jobs_count; + ++m_enqueued_count; m_work_queue.push(dst, src, length); } } @@ -99,7 +108,7 @@ namespace rsx } else { - ++m_jobs_count; + ++m_enqueued_count; m_work_queue.push(dst, primitive, count); } } @@ -107,11 +116,14 @@ namespace rsx // Synchronization void dma_manager::sync() { - if (g_cfg.video.multithreaded_rsx) + if (LIKELY(m_enqueued_count.load() == m_processed_count)) { - while (m_jobs_count) - _mm_lfence(); + // Nothing to do + return; } + + while (m_enqueued_count.load() != m_processed_count) + _mm_lfence(); } void dma_manager::join() diff --git a/rpcs3/Emu/RSX/RSXOffload.h b/rpcs3/Emu/RSX/RSXOffload.h index a99fe6786c..7f1b4e34ee 100644 --- a/rpcs3/Emu/RSX/RSXOffload.h +++ b/rpcs3/Emu/RSX/RSXOffload.h @@ -42,8 +42,9 @@ namespace rsx }; lf_queue m_work_queue; - atomic_t m_jobs_count; - thread_state m_worker_state; + atomic_t<u64> m_enqueued_count{ 0 }; + volatile u64 m_processed_count = 0; + thread_state m_worker_state = thread_state::detached; // TODO: Improved benchmarks here; value determined by profiling on a Ryzen CPU, rounded to the nearest 512 bytes const u32 max_immediate_transfer_size = 3584; diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index 3e7ac167ca..5f87e8aafc 100644 --- a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -2241,6 +2241,9 @@ namespace rsx // Fragment constants may have been updated m_graphics_state |= rsx::pipeline_state::fragment_constants_dirty; + // DMA sync; if you need this, don't use MTRSX + // g_dma_manager.sync(); + //TODO: On sync every sub-unit should finish any pending tasks //Might cause zcull lockup due to zombie 'unclaimed reports' which are not forcefully removed currently //verify (HERE), async_tasks_pending.load() == 0;