From cbd1b28d0dfcc0a1da72cf454deef1d00e0cb02d Mon Sep 17 00:00:00 2001 From: kd-11 Date: Fri, 14 Jul 2017 17:00:49 +0300 Subject: [PATCH] spu: Add polling loop detection and clean up channel access contention code - spus run a tight gpu-style kernel with no multitasking on the cores themselves -- this does not map well to PC processor cores because they never sleep even when doing nothing -- the poll detection hack tries to find a good place to insert a scheduler yield -- RdDec is a good spot as it signifies the spu kernel is waiting on a timer --- rpcs3/Emu/Cell/SPUThread.cpp | 36 ++++++++++++++++++++++++++++-------- rpcs3/Emu/System.h | 3 +++ 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index 21bdde58ae..156b582fcc 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -72,17 +72,26 @@ namespace spu //TODO: Only initialize loaded memory blocks to save RAM //TODO: Concurrent spu thread limit can be configurable std::array atomic_instruction_table; - constexpr u8 max_concurrent_instructions = 1; + constexpr u32 native_jiffy_duration_us = 2000000; void acquire_pc_address(u32 pc, u32 timeout_ms = 3) { + const u8 max_concurrent_instructions = (u8)g_cfg.core.preferred_spu_threads; + const u32 block = pc >> 12; const u32 offset = (pc & 0xFFF) >> 2; - while (timeout_ms--) + if (timeout_ms > 0) { - if (atomic_instruction_table[block][offset].load(std::memory_order_consume) >= max_concurrent_instructions) - std::this_thread::sleep_for(1ms); + while (timeout_ms--) + { + if (atomic_instruction_table[block][offset].load(std::memory_order_consume) >= max_concurrent_instructions) + std::this_thread::sleep_for(1ms); + } + } + else + { + std::this_thread::yield(); } atomic_instruction_table[block][offset]++; @@ -99,16 +108,22 @@ namespace spu struct concurrent_execution_watchdog { u32 pc = 0; + bool active = false; concurrent_execution_watchdog(SPUThread& spu) :pc(spu.pc) { - 
acquire_pc_address(pc); + if (g_cfg.core.preferred_spu_threads > 0) + { + acquire_pc_address(pc, (u32)g_cfg.core.spu_delay_penalty); + active = true; + } } ~concurrent_execution_watchdog() { - release_pc_address(pc); + if (active) + release_pc_address(pc); } }; } @@ -559,7 +574,7 @@ void SPUThread::process_mfc_cmd() } // TODO: investigate lost notifications - std::this_thread::sleep_for(0us); + std::this_thread::yield(); _mm_lfence(); } }; @@ -957,7 +972,7 @@ bool SPUThread::get_ch_value(u32 ch, u32& out) if (ctr > 10000) { ctr = 0; - std::this_thread::sleep_for(0us); + std::this_thread::yield(); } else { @@ -1038,6 +1053,11 @@ bool SPUThread::get_ch_value(u32 ch, u32& out) case SPU_RdDec: { out = ch_dec_value - (u32)(get_timebased_time() - ch_dec_start_timestamp); + + //Polling: We might as well hint to the scheduler to slot in another thread since this one is counting down + if (g_cfg.core.spu_loop_detection && out > spu::scheduler::native_jiffy_duration_us) + std::this_thread::yield(); + return true; } diff --git a/rpcs3/Emu/System.h b/rpcs3/Emu/System.h index f8a01640c3..d951b534b5 100644 --- a/rpcs3/Emu/System.h +++ b/rpcs3/Emu/System.h @@ -272,6 +272,9 @@ struct cfg_root : cfg::node cfg::_bool lower_spu_priority{this, "Lower SPU thread priority"}; cfg::_bool spu_debug{this, "SPU Debug"}; cfg::_int<32, 16384> max_spu_immediate_write_size{this, "Maximum immediate DMA write size", 16384}; // Maximum size that an SPU thread can write directly without posting to MFC + cfg::_int<0, 6> preferred_spu_threads{this, "Preferred SPU Threads", 0}; //Number of hardware threads dedicated to heavy simultaneous spu tasks + cfg::_int<0, 16> spu_delay_penalty{this, "SPU delay penalty", 3}; //Number of milliseconds to block a thread if a virtual 'core' isn't free + cfg::_bool spu_loop_detection{this, "SPU loop detection", false}; //Try to detect wait loops and trigger thread yield cfg::_enum lib_loading{this, "Lib Loader", lib_loading_type::automatic}; cfg::_bool
hook_functions{this, "Hook static functions"};