diff --git a/rpcs3/Emu/RSX/GL/GLCompute.h b/rpcs3/Emu/RSX/GL/GLCompute.h index 8764f7f24a..147632d21b 100644 --- a/rpcs3/Emu/RSX/GL/GLCompute.h +++ b/rpcs3/Emu/RSX/GL/GLCompute.h @@ -1,6 +1,7 @@ #pragma once #include "Utilities/StrUtil.h" +#include "Emu/IdManager.h" #include "GLHelpers.h" namespace gl @@ -16,6 +17,28 @@ namespace gl bool unroll_loops = true; u32 optimal_group_size = 1; u32 optimal_kernel_size = 1; + u32 max_invocations_x = 65535; + + void initialize() + { + // Set up optimal kernel size + const auto& caps = gl::get_driver_caps(); + if (caps.vendor_AMD || caps.vendor_MESA) + { + optimal_group_size = 64; + unroll_loops = false; + } + else if (caps.vendor_NVIDIA) + { + optimal_group_size = 32; + } + else + { + optimal_group_size = 128; + } + + glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 0, (GLint*)&max_invocations_x); + } void create() { @@ -52,6 +75,7 @@ namespace gl GLint old_program; glGetIntegerv(GL_CURRENT_PROGRAM, &old_program); + bind_resources(); m_program.use(); glDispatchCompute(invocations_x, invocations_y, 1); @@ -60,7 +84,23 @@ namespace gl void run(u32 num_invocations) { - run(num_invocations, 1); + u32 invocations_x, invocations_y; + if (LIKELY(num_invocations <= max_invocations_x)) + { + invocations_x = num_invocations; + invocations_y = 1; + } + else + { + // Since all the invocations will run, the optimal distribution is sqrt(count) + const auto optimal_length = (u32)floor(std::sqrt(num_invocations)); + invocations_x = optimal_length; + + // Round up: floor(sqrt(n)) squared can fall short of n by up to 2x, so y must be ceil(n / x) for x * y to cover every invocation + invocations_y = (num_invocations / invocations_x) + ((num_invocations % invocations_x) ? 1 : 0); + } + + run(invocations_x, invocations_y); } }; @@ -89,7 +129,7 @@ namespace gl void build(const char* function_name, u32 _kernel_size = 0) { // Initialize to allow detecting optimal settings - create(); + initialize(); kernel_size = _kernel_size? 
_kernel_size : optimal_kernel_size; @@ -107,15 +147,21 @@ namespace gl "#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n" "\n" "// Depth format conversions\n" - "#define d24_to_f32(bits) floatBitsToUint(float(bits) / 16777215.f)\n" - "#define f32_to_d24(bits) uint(uintBitsToFloat(bits) * 16777215.f)\n" - "#define d24x8_to_f32(bits) d24_to_f32(bits >> 8)\n" - "#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16\n" - "#define f32_to_d24x8_swapped(bits) d24x8_to_d24x8_swapped(f32_to_d24(bits))\n" + "#define d24x8_to_x8d24(bits) (bits << 8) | (bits >> 24)\n" + "#define d24x8_to_x8d24_swapped(bits) bswap_u32(d24x8_to_x8d24(bits))\n" + "#define x8d24_to_d24x8(bits) (bits >> 8) | (bits << 24)\n" + "#define x8d24_to_d24x8_swapped(bits) x8d24_to_d24x8(bswap_u32(bits))\n" + "\n" + "uint linear_invocation_id()\n" + "{\n" + " uint size_in_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);\n" + " return (gl_GlobalInvocationID.y * size_in_x) + gl_GlobalInvocationID.x;\n" + "}\n" "\n" "void main()\n" "{\n" - " uint index = gl_GlobalInvocationID.x * KERNEL_SIZE;\n" + " uint invocation_id = linear_invocation_id();\n" + " uint index = invocation_id * KERNEL_SIZE;\n" " uint value;\n" " %vars" "\n"; @@ -169,7 +215,7 @@ namespace gl void bind_resources() override { - m_data->bind_range(GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_data_length); + m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_data_length); } void run(const gl::buffer* data, u32 data_length, u32 data_offset = 0) @@ -220,156 +266,35 @@ namespace gl } }; - struct cs_shuffle_d24x8_f32 : cs_shuffle_base + template + struct cs_shuffle_d24x8_to_x8d24 : cs_shuffle_base { - // convert d24x8 to f32 - cs_shuffle_d24x8_f32() + cs_shuffle_d24x8_to_x8d24() { - cs_shuffle_base::build("d24x8_to_f32"); + if constexpr (_SwapBytes) + { + cs_shuffle_base::build("d24x8_to_x8d24_swapped"); + } + else + { + 
cs_shuffle_base::build("d24x8_to_x8d24"); + } } }; - struct cs_shuffle_se_f32_d24x8 : cs_shuffle_base + template + struct cs_shuffle_x8d24_to_d24x8 : cs_shuffle_base { - // convert f32 to d24x8 and swap endianness - cs_shuffle_se_f32_d24x8() + cs_shuffle_x8d24_to_d24x8() { - cs_shuffle_base::build("f32_to_d24x8_swapped"); - } - }; - - struct cs_shuffle_se_d24x8 : cs_shuffle_base - { - // swap endianness of d24x8 - cs_shuffle_se_d24x8() - { - cs_shuffle_base::build("d24x8_to_d24x8_swapped"); - } - }; - - // NOTE: D24S8 layout has the stencil in the MSB! Its actually S8|D24|S8|D24 starting at offset 0 - struct cs_interleave_task : cs_shuffle_base - { - cs_interleave_task() - { - uniforms = - " uniform uint block_length;\n" - " uniform uint z_offset;\n" - " uniform uint s_offset;\n"; - - variables = - " uint depth;\n" - " uint stencil;\n" - " uint stencil_shift;\n" - " uint stencil_offset;\n"; - } - - void run(const gl::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset) - { - m_program.uniforms["block_length"] = data_length; - m_program.uniforms["z_offset"] = zeta_offset - data_offset; - m_program.uniforms["s_offset"] = stencil_offset - data_offset; - cs_shuffle_base::run(data, data_length, data_offset); - } - }; - - template - struct cs_gather_d24x8 : cs_interleave_task - { - cs_gather_d24x8() - { - work_kernel = - " if (index >= block_length)\n" - " return;\n" - "\n" - " depth = data[index + z_offset] & 0x00FFFFFF;\n" - " stencil_offset = (index / 4);\n" - " stencil_shift = (index % 4) * 8;\n" - " stencil = data[stencil_offset + s_offset];\n" - " stencil = (stencil >> stencil_shift) & 0xFF;\n" - " value = (depth << 8) | stencil;\n"; - - if constexpr (!_SwapBytes) - { - work_kernel += - " data[index] = value;\n"; - } - else - { - work_kernel += - " data[index] = bswap_u32(value);\n"; - } - - cs_shuffle_base::build(""); - } - }; - - template - struct cs_gather_d32x8 : cs_interleave_task - { - cs_gather_d32x8() - { - work_kernel = - " 
if (index >= block_length)\n" - " return;\n" - "\n" - " depth = f32_to_d24(data[index + z_offset]);\n" - " stencil_offset = (index / 4);\n" - " stencil_shift = (index % 4) * 8;\n" - " stencil = data[stencil_offset + s_offset];\n" - " stencil = (stencil >> stencil_shift) & 0xFF;\n" - " value = (depth << 8) | stencil;\n"; - - if constexpr (!_SwapBytes) - { - work_kernel += - " data[index] = value;\n"; - } - else - { - work_kernel += - " data[index] = bswap_u32(value);\n"; - } - - cs_shuffle_base::build(""); - } - }; - - struct cs_scatter_d24x8 : cs_interleave_task - { - cs_scatter_d24x8() - { - work_kernel = - " if (index >= block_length)\n" - " return;\n" - "\n" - " value = data[index];\n" - " data[index + z_offset] = (value >> 8);\n" - " stencil_offset = (index / 4);\n" - " stencil_shift = (index % 4) * 8;\n" - " stencil = (value & 0xFF) << stencil_shift;\n" - " data[stencil_offset + s_offset] |= stencil;\n"; - - cs_shuffle_base::build(""); - } - }; - - struct cs_scatter_d32x8 : cs_interleave_task - { - cs_scatter_d32x8() - { - work_kernel = - " if (index >= block_length)\n" - " return;\n" - "\n" - " value = data[index];\n" - " data[index + z_offset] = d24_to_f32(value >> 8);\n" - " stencil_offset = (index / 4);\n" - " stencil_shift = (index % 4) * 8;\n" - " stencil = (value & 0xFF) << stencil_shift;\n" - " data[stencil_offset + s_offset] |= stencil;\n"; - - cs_shuffle_base::build(""); + if constexpr (_SwapBytes) + { + cs_shuffle_base::build("x8d24_to_d24x8_swapped"); + } + else + { + cs_shuffle_base::build("x8d24_to_d24x8"); + } } }; @@ -390,4 +315,6 @@ namespace gl return static_cast(e.get()); } + + void destroy_compute_tasks(); } \ No newline at end of file diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp index ad73f5d029..84665ac916 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp +++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp @@ -2,6 +2,7 @@ #include "Emu/Memory/vm.h" #include "Emu/System.h" #include "GLGSRender.h" +#include "GLCompute.h" 
#include "GLVertexProgram.h" #include "../rsx_methods.h" #include "../Common/BufferUtils.h" @@ -965,6 +966,8 @@ void GLGSRender::on_init_thread() void GLGSRender::on_exit() { + gl::destroy_compute_tasks(); + zcull_ctrl.release(); m_prog_buffer.clear(); diff --git a/rpcs3/Emu/RSX/GL/GLHelpers.cpp b/rpcs3/Emu/RSX/GL/GLHelpers.cpp index c45660ae35..d98c6bed83 100644 --- a/rpcs3/Emu/RSX/GL/GLHelpers.cpp +++ b/rpcs3/Emu/RSX/GL/GLHelpers.cpp @@ -30,6 +30,16 @@ namespace gl } } + void destroy_compute_tasks() + { + for (auto& [key, prog] : g_compute_tasks) + { + prog->destroy(); + } + + g_compute_tasks.clear(); + } + #ifdef WIN32 void APIENTRY dbgFunc(GLenum source, GLenum type, GLuint id, GLenum severity, GLsizei lenght, const GLchar* message, diff --git a/rpcs3/Emu/RSX/GL/GLHelpers.h b/rpcs3/Emu/RSX/GL/GLHelpers.h index ec7b25483b..e1a4c7eb76 100644 --- a/rpcs3/Emu/RSX/GL/GLHelpers.h +++ b/rpcs3/Emu/RSX/GL/GLHelpers.h @@ -2454,8 +2454,8 @@ public: case type::fragment: base_name = "shaderlog/FragmentProgram"; break; - case type::geometry: - base_name = "shaderlog/GeometryProgram"; + case type::compute: + base_name = "shaderlog/ComputeProgram"; break; } diff --git a/rpcs3/Emu/RSX/GL/GLTexture.cpp b/rpcs3/Emu/RSX/GL/GLTexture.cpp index ac5e1f5932..194496b94b 100644 --- a/rpcs3/Emu/RSX/GL/GLTexture.cpp +++ b/rpcs3/Emu/RSX/GL/GLTexture.cpp @@ -1,5 +1,6 @@ #include "stdafx.h" #include "GLTexture.h" +#include "GLCompute.h" #include "../GCM.h" #include "../RSXThread.h" #include "../RSXTexture.h" @@ -90,43 +91,43 @@ namespace gl fmt::throw_exception("Compressed or unknown texture format 0x%x" HERE, texture_format); } - std::tuple get_format_type(texture::internal_format format) + pixel_buffer_layout get_format_type(texture::internal_format format) { switch (format) { case texture::internal_format::compressed_rgba_s3tc_dxt1: case texture::internal_format::compressed_rgba_s3tc_dxt3: case texture::internal_format::compressed_rgba_s3tc_dxt5: - return std::make_tuple(GL_RGBA, 
GL_UNSIGNED_BYTE, false); + return { GL_RGBA, GL_UNSIGNED_BYTE, 1, false }; case texture::internal_format::r8: - return std::make_tuple(GL_RED, GL_UNSIGNED_BYTE, false); + return { GL_RED, GL_UNSIGNED_BYTE, 1, false }; case texture::internal_format::r16: - return std::make_tuple(GL_RED, GL_UNSIGNED_SHORT, true); + return { GL_RED, GL_UNSIGNED_SHORT, 2, true }; case texture::internal_format::r32f: - return std::make_tuple(GL_RED, GL_FLOAT, true); + return { GL_RED, GL_FLOAT, 4, true }; case texture::internal_format::rg8: - return std::make_tuple(GL_RG, GL_UNSIGNED_BYTE, false); + return { GL_RG, GL_UNSIGNED_BYTE, 1, false }; case texture::internal_format::rg16: - return std::make_tuple(GL_RG, GL_UNSIGNED_SHORT, true); + return { GL_RG, GL_UNSIGNED_SHORT, 2, true }; case texture::internal_format::rg16f: - return std::make_tuple(GL_RG, GL_HALF_FLOAT, true); + return { GL_RG, GL_HALF_FLOAT, 2, true }; case texture::internal_format::rgb565: - return std::make_tuple(GL_RGB, GL_UNSIGNED_SHORT_5_6_5, true); + return { GL_RGB, GL_UNSIGNED_SHORT_5_6_5, 2, true }; case texture::internal_format::rgb5a1: - return std::make_tuple(GL_RGB, GL_UNSIGNED_SHORT_5_5_5_1, true); + return { GL_RGB, GL_UNSIGNED_SHORT_5_5_5_1, 2, true }; case texture::internal_format::rgba4: - return std::make_tuple(GL_BGRA, GL_UNSIGNED_SHORT_4_4_4_4, false); + return { GL_BGRA, GL_UNSIGNED_SHORT_4_4_4_4, 2, false }; case texture::internal_format::rgba8: - return std::make_tuple(GL_BGRA, GL_UNSIGNED_INT_8_8_8_8, false); + return { GL_BGRA, GL_UNSIGNED_INT_8_8_8_8, 4, false }; case texture::internal_format::rgba16f: - return std::make_tuple(GL_RGBA, GL_HALF_FLOAT, true); + return { GL_RGBA, GL_HALF_FLOAT, 2, true }; case texture::internal_format::rgba32f: - return std::make_tuple(GL_RGBA, GL_FLOAT, true); + return { GL_RGBA, GL_FLOAT, 4, true }; case texture::internal_format::depth16: - return std::make_tuple(GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT, true); + return { GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT, 2, 
true }; case texture::internal_format::depth24_stencil8: case texture::internal_format::depth32f_stencil8: - return std::make_tuple(GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, true); + return { GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, 4, true }; default: fmt::throw_exception("Unexpected internal format 0x%X" HERE, (u32)format); } @@ -742,30 +743,113 @@ namespace gl GLsizeiptr src_mem = src->width() * src->height(); GLsizeiptr dst_mem = dst->width() * dst->height(); - GLenum buffer_copy_flag = GL_STATIC_COPY; - if (gl::get_driver_caps().vendor_MESA) buffer_copy_flag = GL_STREAM_COPY; - // NOTE: Mesa lacks acceleration for PBO unpacking and is currently fastest with GL_STREAM_COPY - // See https://bugs.freedesktop.org/show_bug.cgi?id=111043 - auto max_mem = std::max(src_mem, dst_mem) * 16; if (!g_typeless_transfer_buffer || max_mem > g_typeless_transfer_buffer.size()) { if (g_typeless_transfer_buffer) g_typeless_transfer_buffer.remove(); - g_typeless_transfer_buffer.create(buffer::target::pixel_pack, max_mem, nullptr, buffer::memory_type::local, buffer_copy_flag); + g_typeless_transfer_buffer.create(buffer::target::pixel_pack, max_mem, nullptr, buffer::memory_type::local, GL_STATIC_COPY); } - auto format_type = get_format_type(src->get_internal_format()); + const auto pack_info = get_format_type(src->get_internal_format()); + const auto unpack_info = get_format_type(dst->get_internal_format()); + pixel_pack_settings pack_settings{}; - pack_settings.swap_bytes(std::get<2>(format_type)); g_typeless_transfer_buffer.bind(buffer::target::pixel_pack); - src->copy_to(nullptr, (texture::format)std::get<0>(format_type), (texture::type)std::get<1>(format_type), pack_settings); + src->copy_to(nullptr, (texture::format)pack_info.format, (texture::type)pack_info.type, pack_settings); glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE); - format_type = get_format_type(dst->get_internal_format()); + const bool src_is_ds = !!(src->aspect() & gl::image_aspect::stencil); + const bool dst_is_ds = 
!!(dst->aspect() & gl::image_aspect::stencil); + + if (pack_info.swap_bytes || unpack_info.swap_bytes || src_is_ds || dst_is_ds) + { + gl::cs_shuffle_base *src_transform = nullptr, *dst_transform = nullptr; + + if (src_is_ds) + { + if (pack_info.swap_bytes) + { + src_transform = gl::get_compute_task>(); + } + else + { + src_transform = gl::get_compute_task>(); + } + } + else if (pack_info.swap_bytes) + { + switch (pack_info.size) + { + case 1: + break; + case 2: + src_transform = gl::get_compute_task(); + break; + case 4: + src_transform = gl::get_compute_task(); + break; + default: + fmt::throw_exception("Unsupported format"); + } + } + + if (dst_is_ds) + { + if (unpack_info.swap_bytes) + { + dst_transform = gl::get_compute_task>(); + } + else + { + dst_transform = gl::get_compute_task>(); + } + } + else if (unpack_info.swap_bytes) + { + switch (unpack_info.size) + { + case 1: + break; + case 2: + dst_transform = gl::get_compute_task(); + break; + case 4: + dst_transform = gl::get_compute_task(); + break; + default: + fmt::throw_exception("Unsupported format"); + } + + if (!src_is_ds) + { + if (src_transform == dst_transform) + { + src_transform = dst_transform = nullptr; + } + else if (src_transform) + { + src_transform = gl::get_compute_task(); + dst_transform = nullptr; + } + } + } // unpack branch ends here so the dispatches below run for every src/dst combination + + if (src_transform) + { + const auto image_size = src->pitch() * src->height(); + src_transform->run(&g_typeless_transfer_buffer, image_size); + } + + if (dst_transform) + { + const auto image_size = dst->pitch() * dst->height(); + dst_transform->run(&g_typeless_transfer_buffer, image_size); + } + } + + pixel_unpack_settings unpack_settings{}; - unpack_settings.swap_bytes(std::get<2>(format_type)); g_typeless_transfer_buffer.bind(buffer::target::pixel_unpack); - dst->copy_from(nullptr, (texture::format)std::get<0>(format_type), (texture::type)std::get<1>(format_type), unpack_settings); + dst->copy_from(nullptr, (texture::format)unpack_info.format, (texture::type)unpack_info.type, 
unpack_settings); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, GL_NONE); } } diff --git a/rpcs3/Emu/RSX/GL/GLTexture.h b/rpcs3/Emu/RSX/GL/GLTexture.h index 4168353560..cba49bc068 100644 --- a/rpcs3/Emu/RSX/GL/GLTexture.h +++ b/rpcs3/Emu/RSX/GL/GLTexture.h @@ -13,10 +13,18 @@ namespace rsx namespace gl { + struct pixel_buffer_layout + { + GLenum format; + GLenum type; + u8 size; + bool swap_bytes; + }; + GLenum get_target(rsx::texture_dimension_extended type); GLenum get_sized_internal_format(u32 texture_format); std::tuple get_format_type(u32 texture_format); - std::tuple get_format_type(texture::internal_format format); + pixel_buffer_layout get_format_type(texture::internal_format format); GLenum wrap_mode(rsx::texture_wrap_mode wrap); float max_aniso(rsx::texture_max_anisotropy aniso); std::array get_swizzle_remap(u32 texture_format); diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.h b/rpcs3/Emu/RSX/GL/GLTextureCache.h index e0dfc11c53..739c4af054 100644 --- a/rpcs3/Emu/RSX/GL/GLTextureCache.h +++ b/rpcs3/Emu/RSX/GL/GLTextureCache.h @@ -163,12 +163,12 @@ namespace gl { // Determine unpack config dynamically const auto format_info = gl::get_format_type(src->get_internal_format()); - format = static_cast(std::get<0>(format_info)); - type = static_cast(std::get<1>(format_info)); + format = static_cast(format_info.format); + type = static_cast(format_info.type); if ((src->aspect() & gl::image_aspect::stencil) == 0) { - pack_unpack_swap_bytes = std::get<2>(format_info); + pack_unpack_swap_bytes = format_info.swap_bytes; } else {