From cbd1b28d0dfcc0a1da72cf454deef1d00e0cb02d Mon Sep 17 00:00:00 2001 From: kd-11 Date: Fri, 14 Jul 2017 17:00:49 +0300 Subject: [PATCH] spu: Add polling loop detection and clean up channel access contention code - spus run a tight gpu-style kernel with no multitasking on the cores themselves -- this does not map well to PC processor cores because they never sleep even when doing nothing -- the poll detection hack tries to find a good place to insert a scheduler yield -- RdDec is a good spot as it signifies the spu kernel is waiting on a timer --- rpcs3/Emu/Cell/SPUThread.cpp | 36 ++++++++++++++++++++++++++++-------- rpcs3/Emu/System.h | 3 +++ 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index 21bdde58ae..156b582fcc 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -72,17 +72,26 @@ namespace spu //TODO: Only initialize loaded memory blocks to save RAM //TODO: Concurrent spu thread limit can be configurable std::array atomic_instruction_table; - constexpr u8 max_concurrent_instructions = 1; + constexpr u32 native_jiffy_duration_us = 2000000; void acquire_pc_address(u32 pc, u32 timeout_ms = 3) { + const u8 max_concurrent_instructions = (u8)g_cfg.core.preferred_spu_threads; + const u32 block = pc >> 12; const u32 offset = (pc & 0xFFF) >> 2; - while (timeout_ms--) + if (timeout_ms > 0) { - if (atomic_instruction_table[block][offset].load(std::memory_order_consume) >= max_concurrent_instructions) - std::this_thread::sleep_for(1ms); + while (timeout_ms--) + { + if (atomic_instruction_table[block][offset].load(std::memory_order_consume) >= max_concurrent_instructions) + std::this_thread::sleep_for(1ms); + } + } + else + { + std::this_thread::yield(); } atomic_instruction_table[block][offset]++; @@ -99,16 +108,22 @@ namespace spu struct concurrent_execution_watchdog { u32 pc = 0; + bool active = false; concurrent_execution_watchdog(SPUThread& spu) :pc(spu.pc) { - 
acquire_pc_address(pc); + if (g_cfg.core.preferred_spu_threads > 0) + { + acquire_pc_address(pc, (u32)g_cfg.core.spu_delay_penalty); + active = true; + } } ~concurrent_execution_watchdog() { - release_pc_address(pc); + if (active) + release_pc_address(pc); } }; } @@ -559,7 +574,7 @@ void SPUThread::process_mfc_cmd() } // TODO: investigate lost notifications - std::this_thread::sleep_for(0us); + std::this_thread::yield(); _mm_lfence(); } }; @@ -957,7 +972,7 @@ bool SPUThread::get_ch_value(u32 ch, u32& out) if (ctr > 10000) { ctr = 0; - std::this_thread::sleep_for(0us); + std::this_thread::yield(); } else { @@ -1038,6 +1053,11 @@ bool SPUThread::get_ch_value(u32 ch, u32& out) case SPU_RdDec: { out = ch_dec_value - (u32)(get_timebased_time() - ch_dec_start_timestamp); + + //Polling: We might as well hint to the scheduler to slot in another thread since this one is counting down + if (g_cfg.core.spu_loop_detection && out > spu::scheduler::native_jiffy_duration_us) + std::this_thread::yield(); + return true; } diff --git a/rpcs3/Emu/System.h b/rpcs3/Emu/System.h index f8a01640c3..d951b534b5 100644 --- a/rpcs3/Emu/System.h +++ b/rpcs3/Emu/System.h @@ -272,6 +272,9 @@ struct cfg_root : cfg::node cfg::_bool lower_spu_priority{this, "Lower SPU thread priority"}; cfg::_bool spu_debug{this, "SPU Debug"}; cfg::_int<32, 16384> max_spu_immediate_write_size{this, "Maximum immediate DMA write size", 16384}; // Maximum size that an SPU thread can write directly without posting to MFC + cfg::_int<0, 6> preferred_spu_threads{this, "Preferred SPU Threads", 0}; //Number of hardware threads dedicated to heavy simultaneous spu tasks + cfg::_int<0, 16> spu_delay_penalty{this, "SPU delay penalty", 3}; //Number of milliseconds to block a thread if a virtual 'core' isn't free + cfg::_bool spu_loop_detection{this, "SPU loop detection", false}; //Try to detect wait loops and trigger thread yield cfg::_enum lib_loading{this, "Lib Loader", lib_loading_type::automatic}; cfg::_bool
hook_functions{this, "Hook static functions"};