Rewrite Shader/LLVM Threads Limit Settings For Portability

2025-04-19 19:15:26 +00:00 · 2025-03-21 11:27:56 +02:00 · 2025-03-21 11:27:56 +02:00 · ac46f6193f
commit ac46f6193f
parent 1b2e286fb3
14 changed files with 106 additions and 71 deletions
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@@ -3684,17 +3684,17 @@ extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value)

 struct jit_core_allocator
 {
-	const s16 thread_count = g_cfg.core.llvm_threads ? std::min<s32>(g_cfg.core.llvm_threads, limit()) : limit();
+	const s32 thread_count = limit();

 	// Initialize global semaphore with the max number of threads
-	::semaphore<0x7fff> sem{std::max<s16>(thread_count, 1)};
+	::semaphore<0x7fff> sem{static_cast<s16>(thread_count)};

 	// Mutex for special extra-large modules to compile alone
 	shared_mutex shared_mtx;

-	static s16 limit()
+	static s32 limit()
 	{
-		return static_cast<s16>(std::min<s32>(0x7fff, utils::get_thread_count()));
+		return std::min<s32>(0x7fff, rpcs3::utils::get_max_threads());
 	}
 };

@@ -4157,10 +4157,10 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
 	// The growth in memory requirements of LLVM is not linear with file size of course
 	// But these estimates should hopefully protect RPCS3 in the coming years
 	// Especially when thread count is on the rise with each CPU generation 
-	atomic_t<u32> file_size_limit = static_cast<u32>(std::clamp<u64>(utils::aligned_div<u64>(utils::get_total_memory(), 2000), 65536, u32{umax}));
+	atomic_t<u32> file_size_limit = static_cast<u32>(std::min<u64>(std::max<u64>(utils::aligned_div<u64>(utils::get_total_memory(), 2000), 65536), u32{umax}));

-	const u32 software_thread_limit = std::min<u32>(g_cfg.core.llvm_threads ? g_cfg.core.llvm_threads : u32{umax}, ::size32(file_queue));
-	const u32 cpu_thread_limit = utils::get_thread_count() > 8u ? std::max<u32>(utils::get_thread_count(), 2) - 1 : utils::get_thread_count(); // One LLVM thread less
+	const u32 software_thread_limit = ::size32(file_queue);
+	const u32 cpu_thread_limit = std::min<u32>(std::max<u32>(rpcs3::utils::get_max_threads(), 2) - 1, software_thread_limit); // One LLVM thread less; subtracted before capping by queue size so an empty queue cannot wrap to u32 max and a single queued file still gets one thread

 	std::vector<u128> decrypt_klics;

--- a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp
@@ -7530,7 +7530,7 @@ struct spu_llvm

 		u32 worker_count = 1;

-		if (uint hc = utils::get_thread_count(); hc >= 12)
+		if (uint hc = rpcs3::utils::get_max_threads(); hc >= 12)
 		{
 			worker_count = hc - 12 + 3;
 		}
--- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
@@ -116,7 +116,7 @@ void GLGSRender::on_init_thread()
 			m_frame->delete_context(ctx);
 		};

-		gl::initialize_pipe_compiler(context_create_func, context_bind_func, context_destroy_func, g_cfg.video.shader_compiler_threads_count);
+		gl::initialize_pipe_compiler(context_create_func, context_bind_func, context_destroy_func, g_cfg.video.shader_threads_use_level);
 	}
 	else
 	{
--- a/rpcs3/Emu/RSX/GL/GLPipelineCompiler.cpp
+++ b/rpcs3/Emu/RSX/GL/GLPipelineCompiler.cpp
@@ -2,6 +2,7 @@
 #include "GLPipelineCompiler.h"
 #include "Utilities/Thread.h"
 #include "util/sysinfo.hpp"
+#include "util/asm.hpp"

 namespace gl
 {
@@ -91,28 +92,37 @@ namespace gl
 		std::function<draw_context_t()> context_create_func,
 		std::function<void(draw_context_t)> context_bind_func,
 		std::function<void(draw_context_t)> context_destroy_func,
-		int num_worker_threads)
+		int worker_threads_level)
 	{
-		if (num_worker_threads == 0)
+		// Select optimal number of compiler threads
+		u32 num_worker_threads = 0;
+
+		const auto hw_threads = utils::get_thread_count();
+		if (hw_threads > 16)
 		{
-			// Select optimal number of compiler threads
-			const auto hw_threads = utils::get_thread_count();
-			if (hw_threads > 12)
-			{
-				num_worker_threads = 6;
-			}
-			else if (hw_threads > 8)
-			{
-				num_worker_threads = 4;
-			}
-			else if (hw_threads == 8)
-			{
-				num_worker_threads = 2;
-			}
-			else
-			{
-				num_worker_threads = 1;
-			}
+			num_worker_threads = 8 + (hw_threads - 16) / 4;
+		}
+		else if (hw_threads > 12)
+		{
+			num_worker_threads = 6;
+		}
+		else if (hw_threads >= 8)
+		{
+			num_worker_threads = hw_threads - 7;
+		}
+		else
+		{
+			num_worker_threads = 1;
+		}
+
+		if (worker_threads_level == 1)
+		{
+			// Forced single-threaded mode
+			num_worker_threads = 1;
+		}
+		else if (worker_threads_level)
+		{
+			num_worker_threads = utils::aligned_div<u32>(num_worker_threads * worker_threads_level, 8);
 		}

 		ensure(num_worker_threads >= 1);
--- a/rpcs3/Emu/RSX/GL/GLPipelineCompiler.h
+++ b/rpcs3/Emu/RSX/GL/GLPipelineCompiler.h
@@ -64,7 +64,7 @@ namespace gl
 		std::function<draw_context_t()> context_create_func,
 		std::function<void(draw_context_t)> context_bind_func,
 		std::function<void(draw_context_t)> context_destroy_func,
-		int num_worker_threads = -1);
+		int worker_threads_level = 0);

 	void destroy_pipe_compiler();
 	pipe_compiler* get_pipe_compiler();
--- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
@@ -572,7 +572,7 @@ VKGSRender::VKGSRender(utils::serial* ar) noexcept : GSRender(ar)
 	null_buffer_view = std::make_unique<vk::buffer_view>(*m_device, null_buffer->value, VK_FORMAT_R8_UINT, 0, 32);

 	spirv::initialize_compiler_context();
-	vk::initialize_pipe_compiler(g_cfg.video.shader_compiler_threads_count);
+	vk::initialize_pipe_compiler(g_cfg.video.shader_threads_use_level);

 	m_prog_buffer = std::make_unique<vk::program_cache>
 	(
--- a/rpcs3/Emu/RSX/VK/VKPipelineCompiler.cpp
+++ b/rpcs3/Emu/RSX/VK/VKPipelineCompiler.cpp
@@ -5,6 +5,7 @@
 #include "Utilities/Thread.h"

 #include "util/sysinfo.hpp"
+#include "util/asm.hpp"

 namespace vk
 {
@@ -206,28 +207,37 @@ namespace vk
 		return {};
 	}

-	void initialize_pipe_compiler(int num_worker_threads)
+	void initialize_pipe_compiler(int worker_threads_level)
 	{
-		if (num_worker_threads == 0)
+		// Select optimal number of compiler threads
+		u32 num_worker_threads = 0;
+
+		const auto hw_threads = utils::get_thread_count();
+		if (hw_threads > 16)
 		{
-			// Select optimal number of compiler threads
-			const auto hw_threads = utils::get_thread_count();
-			if (hw_threads > 12)
-			{
-				num_worker_threads = 6;
-			}
-			else if (hw_threads > 8)
-			{
-				num_worker_threads = 4;
-			}
-			else if (hw_threads == 8)
-			{
-				num_worker_threads = 2;
-			}
-			else
-			{
-				num_worker_threads = 1;
-			}
+			num_worker_threads = 8 + (hw_threads - 16) / 4;
+		}
+		else if (hw_threads > 12)
+		{
+			num_worker_threads = 6;
+		}
+		else if (hw_threads >= 8)
+		{
+			num_worker_threads = hw_threads - 7;
+		}
+		else
+		{
+			num_worker_threads = 1;
+		}
+
+		if (worker_threads_level == 1)
+		{
+			// Forced single-threaded mode
+			num_worker_threads = 1;
+		}
+		else if (worker_threads_level)
+		{
+			num_worker_threads = utils::aligned_div<u32>(num_worker_threads * worker_threads_level, 8);
 		}

 		ensure(num_worker_threads >= 1);
--- a/rpcs3/Emu/RSX/VK/VKPipelineCompiler.h
+++ b/rpcs3/Emu/RSX/VK/VKPipelineCompiler.h
@@ -158,7 +158,7 @@ namespace vk
 			const std::vector<glsl::program_input>& vs_inputs, const std::vector<glsl::program_input>& fs_inputs);
 	};

-	void initialize_pipe_compiler(int num_worker_threads = -1);
+	void initialize_pipe_compiler(int worker_threads_level = 0);
 	void destroy_pipe_compiler();
 	pipe_compiler* get_pipe_compiler();
 }
--- a/rpcs3/Emu/system_config.h
+++ b/rpcs3/Emu/system_config.h
@@ -25,7 +25,7 @@ struct cfg_root : cfg::node
 		cfg::_bool ppu_call_history{ this, "PPU Calling History" }; // Enable PPU calling history recording
 		cfg::_bool llvm_logs{ this, "Save LLVM logs" };
 		cfg::string llvm_cpu{ this, "Use LLVM CPU" };
-		cfg::_int<0, 1024> llvm_threads{ this, "Max LLVM Compile Threads", 0 };
+		cfg::_int<0, 8> llvm_threads_use_level{ this, "LLVM Compiler Threads Usage", 0 };
 		cfg::_bool ppu_llvm_greedy_mode{ this, "PPU LLVM Greedy Mode", false, false };
 		cfg::_bool llvm_precompilation{ this, "LLVM Precompilation", true };
 		cfg::_enum<thread_scheduler_mode> thread_scheduler{this, "Thread Scheduler Mode", thread_scheduler_mode::os};
@@ -170,7 +170,7 @@ struct cfg_root : cfg::node
 		cfg::uint<0, 16> anisotropic_level_override{ this, "Anisotropic Filter Override", 0, true };
 		cfg::_float<-32, 32> texture_lod_bias{ this, "Texture LOD Bias Addend", 0, true };
 		cfg::_int<1, 1024> min_scalable_dimension{ this, "Minimum Scalable Dimension", 16 };
-		cfg::_int<0, 16> shader_compiler_threads_count{ this, "Shader Compiler Threads", 0 };
+		cfg::_int<0, 8> shader_threads_use_level{ this, "Shader Compiler Threads Usage", 0 };
 		cfg::_int<0, 30000000> driver_recovery_timeout{ this, "Driver Recovery Timeout", 1000000, true };
 		cfg::uint<0, 16667> driver_wakeup_delay{ this, "Driver Wake-Up Delay", 1, true };
 		cfg::_int<1, 3000> vblank_rate{ this, "Vblank Rate", 60, true }; // Changing this from 60 may affect game speed in unexpected ways
--- a/rpcs3/Emu/system_utils.cpp
+++ b/rpcs3/Emu/system_utils.cpp
@@ -5,6 +5,7 @@
 #include "Emu/Io/pad_config.h"
 #include "Emu/System.h"
 #include "util/sysinfo.hpp"
+#include "util/asm.hpp"
 #include "Utilities/File.h"
 #include "Utilities/Thread.h"
 #include "Crypto/unpkg.h"
@@ -20,9 +21,23 @@ namespace rpcs3::utils
 {
 	u32 get_max_threads()
 	{
-		const u32 max_threads = static_cast<u32>(g_cfg.core.llvm_threads);
+		const u32 max_threads_level = static_cast<u32>(g_cfg.core.llvm_threads_use_level);
+
+		if (max_threads_level == 1)
+		{
+			// Forced single-threaded mode
+			return 1;
+		}
+
 		const u32 hw_threads = ::utils::get_thread_count();
-		const u32 thread_count = max_threads > 0 ? std::min(max_threads, hw_threads) : hw_threads;
+		const u32 thread_count = max_threads_level > 0 ? ::utils::aligned_div<u32>(hw_threads * max_threads_level, g_cfg.core.llvm_threads_use_level.max) : hw_threads;
+
+		if (max_threads_level == g_cfg.core.llvm_threads_use_level.max - 1 && thread_count == hw_threads)
+		{
+			// Level below the last should always be lower than max threads
+			return std::max<u32>(thread_count, 2) - 1;
+		}
+
 		return thread_count;
 	}

--- a/rpcs3/rpcs3qt/emu_settings_type.h
+++ b/rpcs3/rpcs3qt/emu_settings_type.h
@@ -18,7 +18,7 @@ enum class emu_settings_type
 	PPUDebug,
 	SPUDebug,
 	MFCDebug,
-	MaxLLVMThreads,
+	LLVMThreadsUsage,
 	LLVMPrecompilation,
 	EnableTSX,
 	AccurateSpuDMA,
@@ -91,7 +91,7 @@ enum class emu_settings_type
 	DisableOnDiskShaderCache,
 	DisableVulkanMemAllocator,
 	ShaderMode,
-	ShaderCompilerNumThreads,
+	ShaderThreadsUsage,
 	MultithreadedRSX,
 	VBlankRate,
 	VBlankNTSCFixup,
@@ -220,7 +220,7 @@ inline static const std::map<emu_settings_type, cfg_location> settings_location
 	{ emu_settings_type::PPUDebug,                 { "Core", "PPU Debug"}},
 	{ emu_settings_type::SPUDebug,                 { "Core", "SPU Debug"}},
 	{ emu_settings_type::MFCDebug,                 { "Core", "MFC Debug"}},
-	{ emu_settings_type::MaxLLVMThreads,           { "Core", "Max LLVM Compile Threads"}},
+	{ emu_settings_type::LLVMThreadsUsage,         { "Core", "LLVM Compiler Threads Usage"}},
 	{ emu_settings_type::LLVMPrecompilation,       { "Core", "LLVM Precompilation"}},
 	{ emu_settings_type::EnableTSX,                { "Core", "Enable TSX"}},
 	{ emu_settings_type::AccurateSpuDMA,           { "Core", "Accurate SPU DMA"}},
@@ -281,7 +281,7 @@ inline static const std::map<emu_settings_type, cfg_location> settings_location
 	{ emu_settings_type::DisableOnDiskShaderCache,   { "Video", "Disable On-Disk Shader Cache"}},
 	{ emu_settings_type::DisableVulkanMemAllocator,  { "Video", "Disable Vulkan Memory Allocator"}},
 	{ emu_settings_type::ShaderMode,                 { "Video", "Shader Mode"}},
-	{ emu_settings_type::ShaderCompilerNumThreads,   { "Video", "Shader Compiler Threads"}},
+	{ emu_settings_type::ShaderThreadsUsage,         { "Video", "Shader Compiler Threads Usage"}},
 	{ emu_settings_type::ShaderPrecisionQuality,     { "Video", "Shader Precision"}},
 	{ emu_settings_type::MultithreadedRSX,           { "Video", "Multithreaded RSX"}},
 	{ emu_settings_type::RelaxedZCULL,               { "Video", "Relaxed ZCULL Sync"}},
--- a/rpcs3/rpcs3qt/settings_dialog.cpp
+++ b/rpcs3/rpcs3qt/settings_dialog.cpp
@@ -1793,13 +1793,13 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std

 	// Comboboxes

-	m_emu_settings->EnhanceComboBox(ui->maxLLVMThreads, emu_settings_type::MaxLLVMThreads, true, true, utils::get_thread_count());
-	SubscribeTooltip(ui->gb_max_llvm, tooltips.settings.max_llvm_threads);
-	ui->maxLLVMThreads->setItemText(ui->maxLLVMThreads->findData(0), tr("All (%1)", "Max LLVM Compile Threads").arg(utils::get_thread_count()));
+	m_emu_settings->EnhanceComboBox(ui->llvmThreadsUsage, emu_settings_type::LLVMThreadsUsage, true, true, static_cast<int>(g_cfg.core.llvm_threads_use_level.max));
+	SubscribeTooltip(ui->gb_max_llvm, tooltips.settings.llvm_threads_use_level);
+	ui->llvmThreadsUsage->setItemText(ui->llvmThreadsUsage->findData(0), tr("All Threads", "LLVM Compile Threads Use Level"));

-	m_emu_settings->EnhanceComboBox(ui->shaderCompilerThreads, emu_settings_type::ShaderCompilerNumThreads, true);
+	m_emu_settings->EnhanceComboBox(ui->shaderThreadsUsage, emu_settings_type::ShaderThreadsUsage, true, true, static_cast<int>(g_cfg.video.shader_threads_use_level.max));
 	SubscribeTooltip(ui->gb_shader_compiler_threads, tooltips.settings.shader_compiler_threads);
-	ui->shaderCompilerThreads->setItemText(ui->shaderCompilerThreads->findData(0), tr("Auto", "Max Shader Compile Threads"));
+	ui->shaderThreadsUsage->setItemText(ui->shaderThreadsUsage->findData(0), tr("Auto", "Max Shader Compile Threads"));

 	m_emu_settings->EnhanceComboBox(ui->perfOverlayDetailLevel, emu_settings_type::PerfOverlayDetailLevel);
 	SubscribeTooltip(ui->perf_overlay_detail_level, tooltips.settings.perf_overlay_detail_level);
--- a/rpcs3/rpcs3qt/settings_dialog.ui
+++ b/rpcs3/rpcs3qt/settings_dialog.ui
@@ -3037,11 +3037,11 @@
            <item>
             <widget class="QGroupBox" name="gb_max_llvm">
              <property name="title">
-               <string>Max LLVM Compile Threads</string>
+               <string>Max LLVM Threads Usage</string>
              </property>
              <layout class="QVBoxLayout" name="gb_max_llvm_layout">
               <item>
-                <widget class="QComboBox" name="maxLLVMThreads"/>
+                <widget class="QComboBox" name="llvmThreadsUsage"/>
               </item>
              </layout>
             </widget>
@@ -3049,11 +3049,11 @@
            <item>
             <widget class="QGroupBox" name="gb_shader_compiler_threads">
              <property name="title">
-               <string>Max Shader Compile Threads</string>
+               <string>Max Shader Threads Usage</string>
              </property>
              <layout class="QVBoxLayout" name="gb_shader_compiler_threads_layout">
               <item>
-                <widget class="QComboBox" name="shaderCompilerThreads"/>
+                <widget class="QComboBox" name="shaderThreadsUsage"/>
               </item>
              </layout>
             </widget>
--- a/rpcs3/rpcs3qt/tooltips.h
+++ b/rpcs3/rpcs3qt/tooltips.h
@@ -138,7 +138,7 @@ public:
 		const QString show_rpcn_popups             = tr("Show RPCN friend list pop-ups.");
 		const QString disable_mouse                = tr("Disables the activation of fullscreen mode per double-click while the game screen is active.\nCheck this if you want to play with mouse and keyboard (for example with UCR).");
 		const QString disable_kb_hotkeys           = tr("Disables keyboard hotkeys such as Ctrl+S, Ctrl+E, Ctrl+R, Ctrl+P while the game screen is active.\nThis does not include Ctrl+L (hide and lock mouse) and Alt+Enter (toggle fullscreen).\nCheck this if you want to play with mouse and keyboard.");
-		const QString max_llvm_threads             = tr("Limits the maximum number of threads used for the initial PPU and SPU module compilation.\nLower this in order to increase performance of other open applications.\nThe default uses all available threads.");
+		const QString llvm_threads_use_level       = tr("Limits the maximum number of threads used for the initial PPU and SPU module compilation.\nLower this in order to increase performance of other open applications.\nThe default uses all available threads.\nThe levels are relative and thus for CPUs with less than 8 threads some may behave the same.");
 		const QString show_mouse_in_fullscreen     = tr("Shows the mouse cursor when the fullscreen mode is active.\nCurrently this may not work every time.");
 		const QString lock_mouse_in_fullscreen     = tr("Locks the mouse cursor to the center when the fullscreen mode is active.");
 		const QString hide_mouse_on_idle           = tr("Hides the mouse cursor if no mouse movement is detected for the configured time.");
@@ -191,7 +191,7 @@ public:
 		const QString async_shader_recompiler         = tr("This is the recommended option.\nIf a shader is not found in the cache, nothing will be rendered for this shader until it has compiled.\nYou may experience graphics pop-in.");
 		const QString async_with_shader_interpreter   = tr("Hybrid rendering mode.\nIf a shader is not found in the cache, the interpreter will be used to render approximated graphics for this shader until it has compiled.");
 		const QString shader_interpreter_only         = tr("All rendering is handled by the interpreter with no attempt to compile native shaders.\nThis mode is very slow and experimental.");
-		const QString shader_compiler_threads         = tr("Number of threads to use for the shader compiler backend.\nOnly has an impact when shader mode is set to one of the asynchronous modes.");
+		const QString shader_compiler_threads         = tr("Limits the maximum number of threads used for Shaders.\nOnly has an impact when shader mode is set to one of the asynchronous modes.\nThe levels are relative and thus for CPUs with less than 8 threads some may behave the same.");
 		const QString shader_precision                = tr("Controls the precision level of generated shaders. Low precision generates much faster code depending on the hardware, but can sometimes generate minor visual glitches or flicker.");

 		const QString async_texture_streaming   = tr("Stream textures to GPU in parallel with 3D rendering using asynchronous compute.\nCan improve performance on more powerful GPUs that have spare headroom.\nOnly works with Vulkan renderer and greatly benefits from having MTRSX enabled if you have a capable CPU.");