From a6e6df14459b2e21c992b4a16df2a6a257f934fb Mon Sep 17 00:00:00 2001
From: kd-11
Date: Wed, 1 Jun 2022 21:56:33 +0300
Subject: [PATCH] gl: Implement fast texture readback for D24X8 and RGBA8/BGRA8

Add two compute tasks, cs_d24x8_to_ssbo and cs_rgba8_to_ssbo, that copy a
region of a 2D image into an SSBO. The readback path in GLTexture.cpp uses
them when the driver caps report vendor_AMD, the source is a texture2D
viewable_image, and its internal format is depth24_stencil8 or rgba8. All
other cases fall through to the existing pixel-pack copy_to() path.

Supporting changes:
- pixel_pack_settings: add get_swap_bytes() and get_row_length().
- uniform_t: add a bool assignment, uploaded as an unsigned 0/1.
- GLOverlays: fall back to the region width when no row length is set and
  bind row_length * 4 * height bytes of the source SSBO.
- CopyBufferToD24x8.glsl: make swap_bytes, src_pitch and the data offset
  unsigned.

Shader notes:
- The linear texel index is split into (x, y) using the region width; only
  the output address uses output_pitch. The two differ whenever a row
  length larger than the region width is set.
- Invocations whose index is >= width * height return without touching
  memory, so the last work item never reads or writes one texel too far.
- RGBA8 channels are rounded to the nearest byte instead of truncated.

---
Note: the context lines of the emucore.vcxproj and emucore.vcxproj.filters
hunks were not available, so those two hunks carry no context and need
`git apply --unidiff-zero`.

 rpcs3/Emu/RSX/GL/GLCompute.cpp                | 79 +++++++++++++++
 rpcs3/Emu/RSX/GL/GLCompute.h                  | 19 ++++
 rpcs3/Emu/RSX/GL/GLHelpers.h                  | 10 ++
 rpcs3/Emu/RSX/GL/GLOverlays.cpp               |  8 +-
 rpcs3/Emu/RSX/GL/GLTexture.cpp                | 24 +++++
 .../GLSLSnippets/CopyBufferToD24x8.glsl       |  8 +-
 .../GLSLSnippets/CopyD24x8ToBuffer.glsl       | 74 ++++++++++++++
 .../GLSLSnippets/CopyRGBA8ToBuffer.glsl       | 96 +++++++++++++++++++
 rpcs3/emucore.vcxproj                         |  2 +
 rpcs3/emucore.vcxproj.filters                 |  6 ++
 10 files changed, 318 insertions(+), 8 deletions(-)
 create mode 100644 rpcs3/Emu/RSX/Program/GLSLSnippets/CopyD24x8ToBuffer.glsl
 create mode 100644 rpcs3/Emu/RSX/Program/GLSLSnippets/CopyRGBA8ToBuffer.glsl

diff --git a/rpcs3/Emu/RSX/GL/GLCompute.cpp b/rpcs3/Emu/RSX/GL/GLCompute.cpp
index d88b66cdce..79bff90c30 100644
--- a/rpcs3/Emu/RSX/GL/GLCompute.cpp
+++ b/rpcs3/Emu/RSX/GL/GLCompute.cpp
@@ -1,4 +1,5 @@
 #include "GLCompute.h"
+#include "GLTexture.h"
 #include "Utilities/StrUtil.h"
 
 namespace gl
@@ -272,4 +273,82 @@ namespace gl
 		m_program.uniforms["out_ptr"] = dst_offset - data_offset;
 		cs_shuffle_base::run(cmd, data, num_texels * 4, data_offset);
 	}
+
+	cs_d24x8_to_ssbo::cs_d24x8_to_ssbo()
+	{
+		initialize();
+
+		const auto raw_data =
+		#include "../Program/GLSLSnippets/CopyD24x8ToBuffer.glsl"
+		;
+
+		const std::pair<std::string_view, std::string> repl_list[] =
+		{
+			{ "%set, ", "" },
+			{ "%loc", std::to_string(GL_COMPUTE_BUFFER_SLOT(0)) },
+			{ "%ws", std::to_string(optimal_group_size) },
+			{ "%wks", std::to_string(optimal_kernel_size) }
+		};
+
+		m_src = fmt::replace_all(raw_data, repl_list);
+	}
+
+	void cs_d24x8_to_ssbo::run(gl::command_context& cmd, gl::viewable_image* src, const gl::buffer* dst, u32 out_offset, const coordu& region, const gl::pixel_buffer_layout& /*layout*/, const gl::pixel_pack_settings& settings)
+	{
+		const auto row_pitch = settings.get_row_length() ? settings.get_row_length() : region.width;
+
+		m_program.uniforms["swap_bytes"] = settings.get_swap_bytes();
+		m_program.uniforms["output_pitch"] = row_pitch;
+		m_program.uniforms["region_offset"] = color2i(region.x, region.y);
+		m_program.uniforms["region_size"] = color2i(region.width, region.height);
+
+		auto depth_view = src->get_view(0xAAE4, rsx::default_remap_vector, gl::image_aspect::depth);
+		auto stencil_view = src->get_view(0xAAE4, rsx::default_remap_vector, gl::image_aspect::stencil);
+
+		depth_view->bind(cmd, GL_COMPUTE_BUFFER_SLOT(0));
+		stencil_view->bind(cmd, GL_COMPUTE_BUFFER_SLOT(1));
+		dst->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(2), out_offset, row_pitch * 4 * region.height);
+
+		const int num_invocations = utils::aligned_div(region.width * region.height, optimal_kernel_size);
+		compute_task::run(cmd, num_invocations);
+	}
+
+	cs_rgba8_to_ssbo::cs_rgba8_to_ssbo()
+	{
+		initialize();
+
+		const auto raw_data =
+		#include "../Program/GLSLSnippets/CopyRGBA8ToBuffer.glsl"
+		;
+
+		const std::pair<std::string_view, std::string> repl_list[] =
+		{
+			{ "%set, ", "" },
+			{ "%loc", std::to_string(GL_COMPUTE_BUFFER_SLOT(0)) },
+			{ "%ws", std::to_string(optimal_group_size) },
+			{ "%wks", std::to_string(optimal_kernel_size) }
+		};
+
+		m_src = fmt::replace_all(raw_data, repl_list);
+	}
+
+	void cs_rgba8_to_ssbo::run(gl::command_context& cmd, gl::viewable_image* src, const gl::buffer* dst, u32 out_offset, const coordu& region, const gl::pixel_buffer_layout& layout, const gl::pixel_pack_settings& settings)
+	{
+		const auto row_pitch = settings.get_row_length() ? settings.get_row_length() : region.width;
+
+		m_program.uniforms["swap_bytes"] = settings.get_swap_bytes();
+		m_program.uniforms["output_pitch"] = row_pitch;
+		m_program.uniforms["region_offset"] = color2i(region.x, region.y);
+		m_program.uniforms["region_size"] = color2i(region.width, region.height);
+		m_program.uniforms["is_bgra"] = (layout.format == static_cast<GLenum>(gl::texture::format::bgra));
+		m_program.uniforms["block_width"] = static_cast<u32>(layout.size);
+
+		auto data_view = src->get_view(0xAAE4, rsx::default_remap_vector, gl::image_aspect::color);
+
+		data_view->bind(cmd, GL_COMPUTE_BUFFER_SLOT(0));
+		dst->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(1), out_offset, row_pitch * 4 * region.height);
+
+		const int num_invocations = utils::aligned_div(region.width * region.height, optimal_kernel_size);
+		compute_task::run(cmd, num_invocations);
+	}
 }
diff --git a/rpcs3/Emu/RSX/GL/GLCompute.h b/rpcs3/Emu/RSX/GL/GLCompute.h
index 4021722ee3..b2fc3c78db 100644
--- a/rpcs3/Emu/RSX/GL/GLCompute.h
+++ b/rpcs3/Emu/RSX/GL/GLCompute.h
@@ -340,6 +340,25 @@ namespace gl
 		}
 	};
 
+	struct pixel_buffer_layout;
+
+	struct cs_image_to_ssbo : compute_task
+	{
+		virtual void run(gl::command_context& cmd, gl::viewable_image* src, const gl::buffer* dst, u32 out_offset, const coordu& region, const gl::pixel_buffer_layout& layout, const gl::pixel_pack_settings& settings) = 0;
+	};
+
+	struct cs_d24x8_to_ssbo : cs_image_to_ssbo
+	{
+		cs_d24x8_to_ssbo();
+		void run(gl::command_context& cmd, gl::viewable_image* src, const gl::buffer* dst, u32 out_offset, const coordu& region, const gl::pixel_buffer_layout& layout, const gl::pixel_pack_settings& settings) override;
+	};
+
+	struct cs_rgba8_to_ssbo : cs_image_to_ssbo
+	{
+		cs_rgba8_to_ssbo();
+		void run(gl::command_context& cmd, gl::viewable_image* src, const gl::buffer* dst, u32 out_offset, const coordu& region, const gl::pixel_buffer_layout& layout, const gl::pixel_pack_settings& settings) override;
+	};
+
 	// TODO: Replace with a proper manager
 	extern std::unordered_map<u32, std::unique_ptr<gl::compute_task>> g_compute_tasks;
 
diff --git a/rpcs3/Emu/RSX/GL/GLHelpers.h b/rpcs3/Emu/RSX/GL/GLHelpers.h
index 5c8dfc5142..b73a4640ac 100644
--- a/rpcs3/Emu/RSX/GL/GLHelpers.h
+++ b/rpcs3/Emu/RSX/GL/GLHelpers.h
@@ -336,6 +336,15 @@ namespace gl
 			m_alignment = value;
 			return *this;
 		}
+
+		bool get_swap_bytes() const
+		{
+			return m_swap_bytes;
+		}
+		int get_row_length() const
+		{
+			return m_row_length;
+		}
 	};
 
 	class pixel_unpack_settings
@@ -2558,6 +2567,7 @@ public:
 				void operator = (int rhs) const { glProgramUniform1i(m_program.id(), location(), rhs); }
 				void operator = (unsigned rhs) const { glProgramUniform1ui(m_program.id(), location(), rhs); }
 				void operator = (float rhs) const { glProgramUniform1f(m_program.id(), location(), rhs); }
+				void operator = (bool rhs) const { glProgramUniform1ui(m_program.id(), location(), rhs ? 1 : 0); }
 				void operator = (const color1i& rhs) const { glProgramUniform1i(m_program.id(), location(), rhs.r); }
 				void operator = (const color1f& rhs) const { glProgramUniform1f(m_program.id(), location(), rhs.r); }
 				void operator = (const color2i& rhs) const { glProgramUniform2i(m_program.id(), location(), rhs.r, rhs.g); }
diff --git a/rpcs3/Emu/RSX/GL/GLOverlays.cpp b/rpcs3/Emu/RSX/GL/GLOverlays.cpp
index a6ec591b26..ede1a8e84e 100644
--- a/rpcs3/Emu/RSX/GL/GLOverlays.cpp
+++ b/rpcs3/Emu/RSX/GL/GLOverlays.cpp
@@ -608,10 +608,10 @@ namespace gl
 		const u32 src_offset, const coordu& dst_region,
 		const pixel_unpack_settings& settings)
 	{
-		const int row_length = settings.get_row_length();
-		program_handle.uniforms["src_pitch"] = row_length ? row_length : static_cast<int>(dst_region.width);
-		program_handle.uniforms["swap_bytes"] = settings.get_swap_bytes() ? 1 : 0;
-		src->bind_range(GL_COMPUTE_BUFFER_SLOT(0), src_offset, row_length * dst_region.height);
+		const u32 row_length = settings.get_row_length() ? settings.get_row_length() : static_cast<u32>(dst_region.width);
+		program_handle.uniforms["src_pitch"] = row_length;
+		program_handle.uniforms["swap_bytes"] = settings.get_swap_bytes();
+		src->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), src_offset, row_length * 4 * dst_region.height);
 
 		cmd->stencil_mask(0xFF);
 
diff --git a/rpcs3/Emu/RSX/GL/GLTexture.cpp b/rpcs3/Emu/RSX/GL/GLTexture.cpp
index 98ec717349..78ed8c8bf4 100644
--- a/rpcs3/Emu/RSX/GL/GLTexture.cpp
+++ b/rpcs3/Emu/RSX/GL/GLTexture.cpp
@@ -508,6 +508,30 @@ namespace gl
 				dst->create(buffer::target::pixel_pack, max_mem, nullptr, buffer::memory_type::local, GL_STATIC_COPY);
 			}
 
+			if (auto as_vi = dynamic_cast<const gl::viewable_image*>(src);
+				gl::get_driver_caps().vendor_AMD &&
+				src->get_target() == gl::texture::target::texture2D &&
+				as_vi)
+			{
+				switch (src->get_internal_format())
+				{
+				case gl::texture::internal_format::depth24_stencil8:
+					gl::get_compute_task<gl::cs_d24x8_to_ssbo>()->run(cmd,
+						const_cast<gl::viewable_image*>(as_vi), dst, 0,
+						{ {src_region.x, src_region.y}, {src_region.width, src_region.height} },
+						pack_info, {});
+					return;
+				case gl::texture::internal_format::rgba8:
+					gl::get_compute_task<gl::cs_rgba8_to_ssbo>()->run(cmd,
+						const_cast<gl::viewable_image*>(as_vi), dst, 0,
+						{ {src_region.x, src_region.y}, {src_region.width, src_region.height} },
+						pack_info, {});
+					return;
+				default:
+					break;
+				}
+			}
+
 			dst->bind(buffer::target::pixel_pack);
 			src->copy_to(nullptr, static_cast<texture::format>(pack_info.format), static_cast<texture::type>(pack_info.type), src_level, src_region, {});
 		};
diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/CopyBufferToD24x8.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/CopyBufferToD24x8.glsl
index 9ff841fb90..f089add963 100644
--- a/rpcs3/Emu/RSX/Program/GLSLSnippets/CopyBufferToD24x8.glsl
+++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/CopyBufferToD24x8.glsl
@@ -14,11 +14,11 @@ layout(%push_block) uniform UnpackConfiguration
 	uint src_pitch;
 };
 #else
-	uniform int swap_bytes;
-	uniform int src_pitch;
+	uniform uint swap_bytes;
+	uniform uint src_pitch;
 #endif
 
-int getDataOffset()
+uint getDataOffset()
 {
 	const ivec2 coords = ivec2(gl_FragCoord.xy);
 	return coords.y * src_pitch + coords.x;
@@ -26,7 +26,7 @@ int getDataOffset()
 
 void main()
 {
-	const int virtual_address = getDataOffset();
+	const uint virtual_address = getDataOffset();
 	uint real_data = data[virtual_address];
 
 	const uint stencil_byte = bitfieldExtract(real_data, 0, 8);
diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/CopyD24x8ToBuffer.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/CopyD24x8ToBuffer.glsl
new file mode 100644
index 0000000000..9ae21bfd7c
--- /dev/null
+++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/CopyD24x8ToBuffer.glsl
@@ -0,0 +1,74 @@
+R"(
+#version 450
+layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;
+
+#define IMAGE_LOCATION(x) (x + %loc)
+#define SSBO_LOCATION IMAGE_LOCATION(2)
+
+layout(%set, binding=IMAGE_LOCATION(0)) uniform sampler2D depthData;
+layout(%set, binding=IMAGE_LOCATION(1)) uniform usampler2D stencilData;
+
+layout(%set, binding=SSBO_LOCATION, std430) writeonly restrict buffer OutputBlock
+{
+	uint data[];
+};
+
+#if USE_UBO
+layout(%push_block) uniform Configuration
+{
+	uint swap_bytes;
+	uint output_pitch;
+	ivec2 region_offset;
+	ivec2 region_size;
+};
+#else
+	uniform uint swap_bytes;
+	uniform uint output_pitch;
+	uniform ivec2 region_offset;
+	uniform ivec2 region_size;
+#endif
+
+#define KERNEL_SIZE %wks
+
+uint linear_invocation_id()
+{
+	uint size_in_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);
+	return (gl_GlobalInvocationID.y * size_in_x) + gl_GlobalInvocationID.x;
+}
+
+ivec2 linear_id_to_input_coord(uint index)
+{
+	return ivec2(int(index % uint(region_size.x)), int(index / uint(region_size.x))) + region_offset; // index counts region texels, so split by region width, not output pitch
+}
+
+uint input_coord_to_output_id(ivec2 coord)
+{
+	coord -= region_offset;
+	return coord.y * output_pitch + coord.x;
+}
+
+void main()
+{
+	uint index = linear_invocation_id() * KERNEL_SIZE;
+
+	for (int loop = 0; loop < KERNEL_SIZE; ++loop, ++index)
+	{
+		if (index >= uint(region_size.x * region_size.y)) // valid indices are [0, width * height)
+		{
+			return;
+		}
+
+		ivec2 coord = linear_id_to_input_coord(index);
+		float depth = texelFetch(depthData, coord, 0).x;
+		uint stencil = texelFetch(stencilData, coord, 0).x;
+		uint depth_bytes = uint(depth * 0xffffff);
+
+		if (swap_bytes != 0)
+		{
+			depth_bytes = (bitfieldExtract(depth_bytes, 0, 8) << 16u) | (bitfieldExtract(depth_bytes, 16, 8) << 0u) | (depth_bytes & 0xFF00u);
+		}
+
+		data[input_coord_to_output_id(coord)] = (depth_bytes << 8) | stencil;
+	}
+}
+)"
diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/CopyRGBA8ToBuffer.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/CopyRGBA8ToBuffer.glsl
new file mode 100644
index 0000000000..1990359a62
--- /dev/null
+++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/CopyRGBA8ToBuffer.glsl
@@ -0,0 +1,96 @@
+R"(
+#version 450
+layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;
+
+#define IMAGE_LOCATION(x) (x + %loc)
+#define SSBO_LOCATION IMAGE_LOCATION(1)
+
+layout(%set, binding=IMAGE_LOCATION(0)) uniform sampler2D colorData;
+layout(%set, binding=SSBO_LOCATION, std430) writeonly restrict buffer OutputBlock
+{
+	uint data[];
+};
+
+#if USE_UBO
+layout(%push_block) uniform Configuration
+{
+	uint swap_bytes;
+	uint output_pitch;
+	uint block_width;
+	uint is_bgra;
+	ivec2 region_offset;
+	ivec2 region_size;
+};
+#else
+	uniform uint swap_bytes;
+	uniform uint output_pitch;
+	uniform uint block_width;
+	uniform uint is_bgra;
+	uniform ivec2 region_offset;
+	uniform ivec2 region_size;
+#endif
+
+#define KERNEL_SIZE %wks
+
+uint linear_invocation_id()
+{
+	uint size_in_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);
+	return (gl_GlobalInvocationID.y * size_in_x) + gl_GlobalInvocationID.x;
+}
+
+ivec2 linear_id_to_input_coord(uint index)
+{
+	return ivec2(int(index % uint(region_size.x)), int(index / uint(region_size.x))) + region_offset; // index counts region texels, so split by region width, not output pitch
+}
+
+uint input_coord_to_output_id(ivec2 coord)
+{
+	coord -= region_offset;
+	return coord.y * output_pitch + coord.x;
+}
+
+void main()
+{
+	uint index = linear_invocation_id() * KERNEL_SIZE;
+
+	for (int loop = 0; loop < KERNEL_SIZE; ++loop, ++index)
+	{
+		if (index >= uint(region_size.x * region_size.y)) // valid indices are [0, width * height)
+		{
+			return;
+		}
+
+		ivec2 coord = linear_id_to_input_coord(index);
+		vec4 color = texelFetch(colorData, coord, 0);
+
+		if (is_bgra != 0)
+		{
+			color = color.bgra;
+		}
+
+		// Specific to 8-bit color in ARGB8 format. Need to generalize later
+		if (swap_bytes != 0 && block_width > 1)
+		{
+			color = (block_width == 4) ?
+				color.wzyx :
+				color.yxwz;
+		}
+
+		uvec4 bytes = uvec4(color * 255.0 + 0.5); // round to nearest; plain truncation can land one below the stored byte
+		uint result;
+
+		if (block_width > 1)
+		{
+			// Simulate BE packing as in UINT_8_8_8_8
+			result = bytes.w | (bytes.z << 8u) | (bytes.y << 16u) | (bytes.x << 24u);
+		}
+		else
+		{
+			result = bytes.x | (bytes.y << 8u) | (bytes.z << 16u) | (bytes.w << 24u);
+		}
+
+		uint output_id = input_coord_to_output_id(coord);
+		data[output_id] = result;
+	}
+}
+)"
diff --git a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj
index bacfa3d541..74b1b6ef32 100644
--- a/rpcs3/emucore.vcxproj
+++ b/rpcs3/emucore.vcxproj
@@ -819,0 +820,2 @@
+    <None Include="Emu\RSX\Program\GLSLSnippets\CopyD24x8ToBuffer.glsl" />
+    <None Include="Emu\RSX\Program\GLSLSnippets\CopyRGBA8ToBuffer.glsl" />
diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters
index 3e2ec06d59..24bc4d564b 100644
--- a/rpcs3/emucore.vcxproj.filters
+++ b/rpcs3/emucore.vcxproj.filters
@@ -2153,0 +2154,6 @@
+    <None Include="Emu\RSX\Program\GLSLSnippets\CopyD24x8ToBuffer.glsl">
+      <Filter>Emu\GPU\RSX\Program\Snippets</Filter>
+    </None>
+    <None Include="Emu\RSX\Program\GLSLSnippets\CopyRGBA8ToBuffer.glsl">
+      <Filter>Emu\GPU\RSX\Program\Snippets</Filter>
+    </None>