gl: Use compute shaders for typeless texture decode

2025-04-21 03:55:32 +00:00 · 2019-10-02 03:47:19 +03:00 · 2019-10-02 03:47:19 +03:00 · 105d4b51e6
commit 105d4b51e6
parent 7a6e2e716f
7 changed files with 218 additions and 186 deletions
--- a/rpcs3/Emu/RSX/GL/GLCompute.h
+++ b/rpcs3/Emu/RSX/GL/GLCompute.h
@ -1,6 +1,7 @@
 #pragma once

 #include "Utilities/StrUtil.h"
+#include "Emu/IdManager.h"
 #include "GLHelpers.h"

 namespace gl
@ -16,6 +17,28 @@ namespace gl
 		bool unroll_loops = true;
 		u32 optimal_group_size = 1;
 		u32 optimal_kernel_size = 1;
+		u32 max_invocations_x = 65535;
+
+		// Picks vendor-specific tuning values and queries the dispatch limit along X.
+		void initialize()
+		{
+			const auto& caps = gl::get_driver_caps();
+			const bool amd_or_mesa = caps.vendor_AMD || caps.vendor_MESA;
+
+			// Group size per vendor: 64 for AMD/Mesa, 32 for NVIDIA, 128 for everything else
+			optimal_group_size = amd_or_mesa ? 64 : (caps.vendor_NVIDIA ? 32 : 128);
+
+			// Loop unrolling is only switched off for AMD/Mesa; other vendors keep the current value
+			if (amd_or_mesa)
+			{
+				unroll_loops = false;
+			}
+
+			// Limit on the X workgroup count; run(u32) compares against it before dispatching
+			glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 0, (GLint*)&max_invocations_x);
+		}

        void create()
        {
@ -52,6 +75,7 @@ namespace gl
            GLint old_program;
            glGetIntegerv(GL_CURRENT_PROGRAM, &old_program);

+			bind_resources();
            m_program.use();
            glDispatchCompute(invocations_x, invocations_y, 1);

@ -60,7 +84,23 @@ namespace gl

        void run(u32 num_invocations)
        {
-            run(num_invocations, 1);   
+			u32 invocations_x, invocations_y;
+			if (LIKELY(num_invocations <= max_invocations_x))
+			{
+				invocations_x = num_invocations;
+				invocations_y = 1;
+			}
+			else
+			{
+				// Since all the invocations will run, the optimal distribution is sqrt(count)
+				const auto optimal_length = (u32)floor(std::sqrt(num_invocations));
+				invocations_x = optimal_length;
+				invocations_y = invocations_x;
+
+				if (num_invocations % invocations_x) invocations_y++;
+			}
+
+            run(invocations_x, invocations_y);
        }
    };

@ -89,7 +129,7 @@ namespace gl
 		void build(const char* function_name, u32 _kernel_size = 0)
 		{
 			// Initialize to allow detecting optimal settings
-			create();
+			initialize();

 			kernel_size = _kernel_size? _kernel_size : optimal_kernel_size;

@ -107,15 +147,21 @@ namespace gl
 				"#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n"
 				"\n"
 				"// Depth format conversions\n"
-				"#define d24_to_f32(bits)             floatBitsToUint(float(bits) / 16777215.f)\n"
-				"#define f32_to_d24(bits)             uint(uintBitsToFloat(bits) * 16777215.f)\n"
-				"#define d24x8_to_f32(bits)           d24_to_f32(bits >> 8)\n"
-				"#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16\n"
-				"#define f32_to_d24x8_swapped(bits)   d24x8_to_d24x8_swapped(f32_to_d24(bits))\n"
+                "#define d24x8_to_x8d24(bits) (bits << 8) | (bits >> 24)\n"
+                "#define d24x8_to_x8d24_swapped(bits) bswap_u32(d24x8_to_x8d24(bits))\n"
+                "#define x8d24_to_d24x8(bits) (bits >> 8) | (bits << 24)\n"
+                "#define x8d24_to_d24x8_swapped(bits) x8d24_to_d24x8(bswap_u32(bits))\n"
+				"\n"
+				"uint linear_invocation_id()\n"
+				"{\n"
+				"	uint size_in_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);\n"
+				"	return (gl_GlobalInvocationID.y * size_in_x) + gl_GlobalInvocationID.x;\n"
+				"}\n"
 				"\n"
 				"void main()\n"
 				"{\n"
-				"	uint index = gl_GlobalInvocationID.x * KERNEL_SIZE;\n"
+				"	uint invocation_id = linear_invocation_id();\n"
+				"	uint index = invocation_id * KERNEL_SIZE;\n"
 				"	uint value;\n"
 				"	%vars"
 				"\n";
@ -169,7 +215,7 @@ namespace gl

+		// Binds m_data for the dispatch by handing the ssbo target, GL_COMPUTE_BUFFER_SLOT(0),
+		// m_data_offset and m_data_length to gl::buffer::bind_range.
 		void bind_resources() override
 		{
-            m_data->bind_range(GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_data_length);
+            m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_data_length);
 		}

 		void run(const gl::buffer* data, u32 data_length, u32 data_offset = 0)
@ -220,156 +266,35 @@ namespace gl
 		}
 	};

-	struct cs_shuffle_d24x8_f32 : cs_shuffle_base
+	// Shuffle task built around the d24x8_to_x8d24 GLSL macro; the _SwapBytes
+	// variant builds with the byte-swapped d24x8_to_x8d24_swapped macro instead.
+	template<bool _SwapBytes = false>
+	struct cs_shuffle_d24x8_to_x8d24 : cs_shuffle_base
 	{
-		// convert d24x8 to f32
-		cs_shuffle_d24x8_f32()
+		cs_shuffle_d24x8_to_x8d24()
 		{
-			cs_shuffle_base::build("d24x8_to_f32");
+			// _SwapBytes is a template constant, so exactly one macro name is ever passed
+			cs_shuffle_base::build(_SwapBytes ? "d24x8_to_x8d24_swapped" : "d24x8_to_x8d24");
 		}
 	};

-	struct cs_shuffle_se_f32_d24x8 : cs_shuffle_base
+    template<bool _SwapBytes = false>
+	struct cs_shuffle_x8d24_to_d24x8 : cs_shuffle_base
 	{
-		// convert f32 to d24x8 and swap endianness
-		cs_shuffle_se_f32_d24x8()
+		cs_shuffle_x8d24_to_d24x8()
 		{
-			cs_shuffle_base::build("f32_to_d24x8_swapped");
-		}
-	};
-
-	struct cs_shuffle_se_d24x8 : cs_shuffle_base
-	{
-		// swap endianness of d24x8
-		cs_shuffle_se_d24x8()
-		{
-			cs_shuffle_base::build("d24x8_to_d24x8_swapped");
-		}
-	};
-
-	// NOTE: D24S8 layout has the stencil in the MSB! Its actually S8|D24|S8|D24 starting at offset 0
-	struct cs_interleave_task : cs_shuffle_base
-	{
-		cs_interleave_task()
-		{
-            uniforms =
-            "   uniform uint block_length;\n"
-            "   uniform uint z_offset;\n"
-            "   uniform uint s_offset;\n";
-
-			variables =
-				"	uint depth;\n"
-				"	uint stencil;\n"
-				"	uint stencil_shift;\n"
-				"	uint stencil_offset;\n";
-		}
-
-		void run(const gl::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset)
-		{
-            m_program.uniforms["block_length"] = data_length;
-            m_program.uniforms["z_offset"] = zeta_offset - data_offset;
-            m_program.uniforms["s_offset"] = stencil_offset - data_offset;
-			cs_shuffle_base::run(data, data_length, data_offset);
-		}
-	};
-
-	template<bool _SwapBytes = false>
-	struct cs_gather_d24x8 : cs_interleave_task
-	{
-		cs_gather_d24x8()
-		{
-			work_kernel =
-				"		if (index >= block_length)\n"
-				"			return;\n"
-				"\n"
-				"		depth = data[index + z_offset] & 0x00FFFFFF;\n"
-				"		stencil_offset = (index / 4);\n"
-				"		stencil_shift = (index % 4) * 8;\n"
-				"		stencil = data[stencil_offset + s_offset];\n"
-				"		stencil = (stencil >> stencil_shift) & 0xFF;\n"
-				"		value = (depth << 8) | stencil;\n";
-
-			if constexpr (!_SwapBytes)
-			{
-				work_kernel +=
-				"		data[index] = value;\n";
-			}
-			else
-			{
-				work_kernel +=
-				"		data[index] = bswap_u32(value);\n";
-			}
-
-			cs_shuffle_base::build("");
-		}
-	};
-
-	template<bool _SwapBytes = false>
-	struct cs_gather_d32x8 : cs_interleave_task
-	{
-		cs_gather_d32x8()
-		{
-			work_kernel =
-				"		if (index >= block_length)\n"
-				"			return;\n"
-				"\n"
-				"		depth = f32_to_d24(data[index + z_offset]);\n"
-				"		stencil_offset = (index / 4);\n"
-				"		stencil_shift = (index % 4) * 8;\n"
-				"		stencil = data[stencil_offset + s_offset];\n"
-				"		stencil = (stencil >> stencil_shift) & 0xFF;\n"
-				"		value = (depth << 8) | stencil;\n";
-
-			if constexpr (!_SwapBytes)
-			{
-				work_kernel +=
-				"		data[index] = value;\n";
-			}
-			else
-			{
-				work_kernel +=
-				"		data[index] = bswap_u32(value);\n";
-			}
-
-			cs_shuffle_base::build("");
-		}
-	};
-
-	struct cs_scatter_d24x8 : cs_interleave_task
-	{
-		cs_scatter_d24x8()
-		{
-			work_kernel =
-				"		if (index >= block_length)\n"
-				"			return;\n"
-				"\n"
-				"		value = data[index];\n"
-				"		data[index + z_offset] = (value >> 8);\n"
-				"		stencil_offset = (index / 4);\n"
-				"		stencil_shift = (index % 4) * 8;\n"
-				"		stencil = (value & 0xFF) << stencil_shift;\n"
-				"		data[stencil_offset + s_offset] |= stencil;\n";
-
-			cs_shuffle_base::build("");
-		}
-	};
-
-	struct cs_scatter_d32x8 : cs_interleave_task
-	{
-		cs_scatter_d32x8()
-		{
-			work_kernel =
-				"		if (index >= block_length)\n"
-				"			return;\n"
-				"\n"
-				"		value = data[index];\n"
-				"		data[index + z_offset] = d24_to_f32(value >> 8);\n"
-				"		stencil_offset = (index / 4);\n"
-				"		stencil_shift = (index % 4) * 8;\n"
-				"		stencil = (value & 0xFF) << stencil_shift;\n"
-				"		data[stencil_offset + s_offset] |= stencil;\n";
-
-			cs_shuffle_base::build("");
+			// Build with the x8d24_to_d24x8 GLSL macro, or its byte-swapped variant when
+			// _SwapBytes is set; the choice is fixed per template instantiation.
+			cs_shuffle_base::build(_SwapBytes ? "x8d24_to_d24x8_swapped" : "x8d24_to_d24x8");
 		}
 	};

@ -390,4 +315,6 @@ namespace gl

 		return static_cast<T*>(e.get());
 	}
+
+	// Calls destroy() on every entry of g_compute_tasks and then clears the
+	// container; the definition lives in GLHelpers.cpp.
+	void destroy_compute_tasks();
 }
--- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
@ -2,6 +2,7 @@
 #include "Emu/Memory/vm.h"
 #include "Emu/System.h"
 #include "GLGSRender.h"
+#include "GLCompute.h"
 #include "GLVertexProgram.h"
 #include "../rsx_methods.h"
 #include "../Common/BufferUtils.h"
@ -965,6 +966,8 @@ void GLGSRender::on_init_thread()

 void GLGSRender::on_exit()
 {
+	gl::destroy_compute_tasks();
+
 	zcull_ctrl.release();

 	m_prog_buffer.clear();
--- a/rpcs3/Emu/RSX/GL/GLHelpers.cpp
+++ b/rpcs3/Emu/RSX/GL/GLHelpers.cpp
@ -30,6 +30,16 @@ namespace gl
 		}
 	}

+	// Tears down all registered compute tasks and empties the registry.
+	void destroy_compute_tasks()
+	{
+		// destroy() is invoked on each task before its entry is dropped
+		for (auto& [task_id, task] : g_compute_tasks)
+		{
+			task->destroy();
+		}
+
+		// Remove the now-destroyed entries
+		g_compute_tasks.clear();
+	}
+
 #ifdef WIN32
 	void APIENTRY dbgFunc(GLenum source, GLenum type, GLuint id,
 		GLenum severity, GLsizei lenght, const GLchar* message,
--- a/rpcs3/Emu/RSX/GL/GLHelpers.h
+++ b/rpcs3/Emu/RSX/GL/GLHelpers.h
@ -2454,8 +2454,8 @@ public:
 					case type::fragment:
 						base_name = "shaderlog/FragmentProgram";
 						break;
-					case type::geometry:
-						base_name = "shaderlog/GeometryProgram";
+					case type::compute:
+						base_name = "shaderlog/ComputeProgram";
 						break;
 					}

--- a/rpcs3/Emu/RSX/GL/GLTexture.cpp
+++ b/rpcs3/Emu/RSX/GL/GLTexture.cpp
@ -1,5 +1,6 @@
 #include "stdafx.h"
 #include "GLTexture.h"
+#include "GLCompute.h"
 #include "../GCM.h"
 #include "../RSXThread.h"
 #include "../RSXTexture.h"
@ -90,43 +91,43 @@ namespace gl
 		fmt::throw_exception("Compressed or unknown texture format 0x%x" HERE, texture_format);
 	}

-	std::tuple<GLenum, GLenum, bool> get_format_type(texture::internal_format format)
+	// Maps an internal texture format to the pixel_buffer_layout used for buffer
+	// transfers: GL pixel format, GL data type, a size of 1/2/4 (appears to be the
+	// byte width of one element of `type`) and whether the data needs byte swapping.
+	// Formats not listed below raise via fmt::throw_exception.
+	pixel_buffer_layout get_format_type(texture::internal_format format)
 	{
 		switch (format)
 		{
 		case texture::internal_format::compressed_rgba_s3tc_dxt1:
 		case texture::internal_format::compressed_rgba_s3tc_dxt3:
 		case texture::internal_format::compressed_rgba_s3tc_dxt5:
-			return std::make_tuple(GL_RGBA, GL_UNSIGNED_BYTE, false);
+			return { GL_RGBA, GL_UNSIGNED_BYTE, 1, false };
 		case texture::internal_format::r8:
-			return std::make_tuple(GL_RED, GL_UNSIGNED_BYTE, false);
+			return { GL_RED, GL_UNSIGNED_BYTE, 1, false };
 		case texture::internal_format::r16:
-			return std::make_tuple(GL_RED, GL_UNSIGNED_SHORT, true);
+			return { GL_RED, GL_UNSIGNED_SHORT, 2, true };
 		case texture::internal_format::r32f:
-			return std::make_tuple(GL_RED, GL_FLOAT, true);
+			return { GL_RED, GL_FLOAT, 4, true };
 		case texture::internal_format::rg8:
-			return std::make_tuple(GL_RG, GL_UNSIGNED_BYTE, false);
+			return { GL_RG, GL_UNSIGNED_BYTE, 1, false };
 		case texture::internal_format::rg16:
-			return std::make_tuple(GL_RG, GL_UNSIGNED_SHORT, true);
+			return { GL_RG, GL_UNSIGNED_SHORT, 2, true };
 		case texture::internal_format::rg16f:
-			return std::make_tuple(GL_RG, GL_HALF_FLOAT, true);
+			return { GL_RG, GL_HALF_FLOAT, 2, true };
 		case texture::internal_format::rgb565:
-			return std::make_tuple(GL_RGB, GL_UNSIGNED_SHORT_5_6_5, true);
+			return { GL_RGB, GL_UNSIGNED_SHORT_5_6_5, 2, true };
 		case texture::internal_format::rgb5a1:
+			// NOTE(review): GL_UNSIGNED_SHORT_5_5_5_1 is a packed 4-component type, which the GL
+			// reference only allows with GL_RGBA/GL_BGRA - confirm GL_RGB is intended here.
-			return std::make_tuple(GL_RGB, GL_UNSIGNED_SHORT_5_5_5_1, true);
+			return { GL_RGB, GL_UNSIGNED_SHORT_5_5_5_1, 2, true };
 		case texture::internal_format::rgba4:
-			return std::make_tuple(GL_BGRA, GL_UNSIGNED_SHORT_4_4_4_4, false);
+			return { GL_BGRA, GL_UNSIGNED_SHORT_4_4_4_4, 2, false };
 		case texture::internal_format::rgba8:
-			return std::make_tuple(GL_BGRA, GL_UNSIGNED_INT_8_8_8_8, false);
+			return { GL_BGRA, GL_UNSIGNED_INT_8_8_8_8, 4, false };
 		case texture::internal_format::rgba16f:
-			return std::make_tuple(GL_RGBA, GL_HALF_FLOAT, true);
+			return { GL_RGBA, GL_HALF_FLOAT, 2, true };
 		case texture::internal_format::rgba32f:
-			return std::make_tuple(GL_RGBA, GL_FLOAT, true);
+			return { GL_RGBA, GL_FLOAT, 4, true };
 		case texture::internal_format::depth16:
-			return std::make_tuple(GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT, true);
+			return { GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT, 2, true };
 		case texture::internal_format::depth24_stencil8:
 		case texture::internal_format::depth32f_stencil8:
-			return std::make_tuple(GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, true);
+			return { GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, 4, true };
 		default:
 			fmt::throw_exception("Unexpected internal format 0x%X" HERE, (u32)format);
 		}
 		GLsizeiptr src_mem = src->width() * src->height();
 		GLsizeiptr dst_mem = dst->width() * dst->height();

-		GLenum buffer_copy_flag = GL_STATIC_COPY;
-		if (gl::get_driver_caps().vendor_MESA) buffer_copy_flag = GL_STREAM_COPY;
-		// NOTE: Mesa lacks acceleration for PBO unpacking and is currently fastest with GL_STREAM_COPY
-		// See https://bugs.freedesktop.org/show_bug.cgi?id=111043
-
 		auto max_mem = std::max(src_mem, dst_mem) * 16;
 		if (!g_typeless_transfer_buffer || max_mem > g_typeless_transfer_buffer.size())
 		{
 			if (g_typeless_transfer_buffer) g_typeless_transfer_buffer.remove();
-			g_typeless_transfer_buffer.create(buffer::target::pixel_pack, max_mem, nullptr, buffer::memory_type::local, buffer_copy_flag);
+			g_typeless_transfer_buffer.create(buffer::target::pixel_pack, max_mem, nullptr, buffer::memory_type::local, GL_STATIC_COPY);
 		}

-		auto format_type = get_format_type(src->get_internal_format());
+		const auto pack_info = get_format_type(src->get_internal_format());
+		const auto unpack_info = get_format_type(dst->get_internal_format());
+
 		pixel_pack_settings pack_settings{};
-		pack_settings.swap_bytes(std::get<2>(format_type));
 		g_typeless_transfer_buffer.bind(buffer::target::pixel_pack);
-		src->copy_to(nullptr, (texture::format)std::get<0>(format_type), (texture::type)std::get<1>(format_type), pack_settings);
+		src->copy_to(nullptr, (texture::format)pack_info.format, (texture::type)pack_info.type, pack_settings);
 		glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE);

-		format_type = get_format_type(dst->get_internal_format());
+		// An image with a stencil aspect is treated as depth-stencil and is routed
+		// through the d24x8 <-> x8d24 shuffles instead of the plain byte swaps.
+		// Each flag has to be derived from its own image.
+		const bool src_is_ds = !!(src->aspect() & gl::image_aspect::stencil);
+		const bool dst_is_ds = !!(dst->aspect() & gl::image_aspect::stencil);
+
+		if (pack_info.swap_bytes || unpack_info.swap_bytes || src_is_ds || dst_is_ds)
+		{
+			gl::cs_shuffle_base *src_transform = nullptr, *dst_transform = nullptr;
+
+			// Pick the transform applied to the data as packed from the source
+			if (src_is_ds)
+			{
+				if (pack_info.swap_bytes)
+				{
+					src_transform = gl::get_compute_task<gl::cs_shuffle_d24x8_to_x8d24<true>>();
+				}
+				else
+				{
+					src_transform = gl::get_compute_task<gl::cs_shuffle_d24x8_to_x8d24<false>>();
+				}
+			}
+			else if (pack_info.swap_bytes)
+			{
+				switch (pack_info.size)
+				{
+				case 1:
+					break;
+				case 2:
+					src_transform = gl::get_compute_task<gl::cs_shuffle_16>();
+					break;
+				case 4:
+					src_transform = gl::get_compute_task<gl::cs_shuffle_32>();
+					break;
+				default:
+					fmt::throw_exception("Unsupported format");
+				}
+			}
+
+			// Pick the transform applied before the data is unpacked into the destination
+			if (dst_is_ds)
+			{
+				if (unpack_info.swap_bytes)
+				{
+					dst_transform = gl::get_compute_task<gl::cs_shuffle_x8d24_to_d24x8<true>>();
+				}
+				else
+				{
+					dst_transform = gl::get_compute_task<gl::cs_shuffle_x8d24_to_d24x8<false>>();
+				}
+			}
+			else if (unpack_info.swap_bytes)
+			{
+				switch (unpack_info.size)
+				{
+				case 1:
+					break;
+				case 2:
+					dst_transform = gl::get_compute_task<gl::cs_shuffle_16>();
+					break;
+				case 4:
+					dst_transform = gl::get_compute_task<gl::cs_shuffle_32>();
+					break;
+				default:
+					fmt::throw_exception("Unsupported format");
+				}
+
+				if (!src_is_ds)
+				{
+					if (src_transform == dst_transform)
+					{
+						// The same swap on both sides cancels out; skip both passes
+						src_transform = dst_transform = nullptr;
+					}
+					else if (src_transform)
+					{
+						// A 16-bit swap on one side and a 32-bit swap on the other fold into one pass
+						src_transform = gl::get_compute_task<gl::cs_shuffle_32_16>();
+						dst_transform = nullptr;
+					}
+				}
+			}
+
+			// Run whatever was selected. This must sit outside the branches above: the
+			// pack/unpack settings no longer swap bytes, so these passes are the only
+			// conversion, including when the destination is depth-stencil or needs no swap.
+			if (src_transform)
+			{
+				const auto image_size = src->pitch() * src->height();
+				src_transform->run(&g_typeless_transfer_buffer, image_size);
+			}
+
+			if (dst_transform)
+			{
+				const auto image_size = dst->pitch() * dst->height();
+				dst_transform->run(&g_typeless_transfer_buffer, image_size);
+			}
+		}
+
 		pixel_unpack_settings unpack_settings{};
-		unpack_settings.swap_bytes(std::get<2>(format_type));
 		g_typeless_transfer_buffer.bind(buffer::target::pixel_unpack);
-		dst->copy_from(nullptr, (texture::format)std::get<0>(format_type), (texture::type)std::get<1>(format_type), unpack_settings);
+		dst->copy_from(nullptr, (texture::format)unpack_info.format, (texture::type)unpack_info.type, unpack_settings);
 		glBindBuffer(GL_PIXEL_UNPACK_BUFFER, GL_NONE);
 	}
 }
--- a/rpcs3/Emu/RSX/GL/GLTexture.h
+++ b/rpcs3/Emu/RSX/GL/GLTexture.h
@ -13,10 +13,18 @@ namespace rsx

 namespace gl
 {
+	// Transfer layout returned by get_format_type(texture::internal_format).
+	struct pixel_buffer_layout
+	{
+		GLenum format;      // GL pixel format; callers cast it to texture::format
+		GLenum type;        // GL data type; callers cast it to texture::type
+		u8     size;        // 1, 2 or 4 in the format table; picks the 16- or 32-bit shuffle
+		bool   swap_bytes;  // true when the data needs a byte swap during transfer
+	};
+
 	GLenum get_target(rsx::texture_dimension_extended type);
 	GLenum get_sized_internal_format(u32 texture_format);
 	std::tuple<GLenum, GLenum> get_format_type(u32 texture_format);
-	std::tuple<GLenum, GLenum, bool> get_format_type(texture::internal_format format);
+	pixel_buffer_layout get_format_type(texture::internal_format format);
 	GLenum wrap_mode(rsx::texture_wrap_mode wrap);
 	float max_aniso(rsx::texture_max_anisotropy aniso);
 	std::array<GLenum, 4> get_swizzle_remap(u32 texture_format);
--- a/rpcs3/Emu/RSX/GL/GLTextureCache.h
+++ b/rpcs3/Emu/RSX/GL/GLTextureCache.h
@ -163,12 +163,12 @@ namespace gl
 			{
 				// Determine unpack config dynamically
 				const auto format_info = gl::get_format_type(src->get_internal_format());
-				format = static_cast<gl::texture::format>(std::get<0>(format_info));
-				type = static_cast<gl::texture::type>(std::get<1>(format_info));
+				format = static_cast<gl::texture::format>(format_info.format);
+				type = static_cast<gl::texture::type>(format_info.type);

 				if ((src->aspect() & gl::image_aspect::stencil) == 0)
 				{
-					pack_unpack_swap_bytes = std::get<2>(format_info);
+					pack_unpack_swap_bytes = format_info.swap_bytes;
 				}
 				else
 				{