gl: Use compute shaders for typeless texture decode

This commit is contained in:
kd-11 2019-10-02 03:47:19 +03:00 committed by kd-11
parent 7a6e2e716f
commit 105d4b51e6
7 changed files with 218 additions and 186 deletions

View file

@ -1,6 +1,7 @@
#pragma once
#include "Utilities/StrUtil.h"
#include "Emu/IdManager.h"
#include "GLHelpers.h"
namespace gl
@ -16,6 +17,28 @@ namespace gl
bool unroll_loops = true;
u32 optimal_group_size = 1;
u32 optimal_kernel_size = 1;
u32 max_invocations_x = 65535;
void initialize()
{
// Set up optimal kernel size
const auto& caps = gl::get_driver_caps();
if (caps.vendor_AMD || caps.vendor_MESA)
{
optimal_group_size = 64;
unroll_loops = false;
}
else if (caps.vendor_NVIDIA)
{
optimal_group_size = 32;
}
else
{
optimal_group_size = 128;
}
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 0, (GLint*)&max_invocations_x);
}
void create()
{
@ -52,6 +75,7 @@ namespace gl
GLint old_program;
glGetIntegerv(GL_CURRENT_PROGRAM, &old_program);
bind_resources();
m_program.use();
glDispatchCompute(invocations_x, invocations_y, 1);
@ -60,7 +84,23 @@ namespace gl
// Dispatches at least num_invocations along a 1D or 2D grid.
//
// When the request fits within max_invocations_x it is dispatched as a single
// row. Otherwise it is folded into a roughly square X*Y grid so that neither
// dimension exceeds the per-dimension limit. The grid may overshoot the
// requested count by less than one row; it never undershoots.
void run(u32 num_invocations)
{
	u32 invocations_x, invocations_y;
	if (LIKELY(num_invocations <= max_invocations_x))
	{
		invocations_x = num_invocations;
		invocations_y = 1;
	}
	else
	{
		// Since all the invocations will run, the optimal distribution is sqrt(count)
		invocations_x = static_cast<u32>(floor(std::sqrt(num_invocations)));

		// Rows needed = ceil(count / x). Written as quotient plus remainder test
		// so the arithmetic cannot overflow u32. Bumping an x-by-x grid by one
		// row only when there is a remainder is not enough: e.g. 66048 with
		// x = 256 divides evenly yet needs 258 rows, not 256.
		invocations_y = num_invocations / invocations_x;
		if (num_invocations % invocations_x) invocations_y++;
	}

	// Single dispatch with the folded dimensions.
	run(invocations_x, invocations_y);
}
};
@ -89,7 +129,7 @@ namespace gl
void build(const char* function_name, u32 _kernel_size = 0)
{
// Initialize to allow detecting optimal settings
create();
initialize();
kernel_size = _kernel_size? _kernel_size : optimal_kernel_size;
@ -107,15 +147,21 @@ namespace gl
"#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n"
"\n"
"// Depth format conversions\n"
"#define d24_to_f32(bits) floatBitsToUint(float(bits) / 16777215.f)\n"
"#define f32_to_d24(bits) uint(uintBitsToFloat(bits) * 16777215.f)\n"
"#define d24x8_to_f32(bits) d24_to_f32(bits >> 8)\n"
"#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16\n"
"#define f32_to_d24x8_swapped(bits) d24x8_to_d24x8_swapped(f32_to_d24(bits))\n"
"#define d24x8_to_x8d24(bits) (bits << 8) | (bits >> 24)\n"
"#define d24x8_to_x8d24_swapped(bits) bswap_u32(d24x8_to_x8d24(bits))\n"
"#define x8d24_to_d24x8(bits) (bits >> 8) | (bits << 24)\n"
"#define x8d24_to_d24x8_swapped(bits) x8d24_to_d24x8(bswap_u32(bits))\n"
"\n"
"uint linear_invocation_id()\n"
"{\n"
" uint size_in_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);\n"
" return (gl_GlobalInvocationID.y * size_in_x) + gl_GlobalInvocationID.x;\n"
"}\n"
"\n"
"void main()\n"
"{\n"
" uint index = gl_GlobalInvocationID.x * KERNEL_SIZE;\n"
" uint invocation_id = linear_invocation_id();\n"
" uint index = invocation_id * KERNEL_SIZE;\n"
" uint value;\n"
" %vars"
"\n";
@ -169,7 +215,7 @@ namespace gl
// Binds the working range of m_data (m_data_offset, m_data_length) to compute
// buffer slot 0 so the shader can access it.
// NOTE(review): the two bind_range calls below look like the old (3-argument)
// and new (explicit ssbo target) form of the same bind - confirm that only one
// of them is meant to remain.
void bind_resources() override
{
m_data->bind_range(GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_data_length);
m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_data_length);
}
void run(const gl::buffer* data, u32 data_length, u32 data_offset = 0)
@ -220,156 +266,35 @@ namespace gl
}
};
struct cs_shuffle_d24x8_f32 : cs_shuffle_base
template<bool _SwapBytes = false>
struct cs_shuffle_d24x8_to_x8d24 : cs_shuffle_base
{
// convert d24x8 to f32
cs_shuffle_d24x8_f32()
cs_shuffle_d24x8_to_x8d24()
{
cs_shuffle_base::build("d24x8_to_f32");
if constexpr (_SwapBytes)
{
cs_shuffle_base::build("d24x8_to_x8d24_swapped");
}
else
{
cs_shuffle_base::build("d24x8_to_x8d24");
}
}
};
struct cs_shuffle_se_f32_d24x8 : cs_shuffle_base
template<bool _SwapBytes = false>
struct cs_shuffle_x8d24_to_d24x8 : cs_shuffle_base
{
// convert f32 to d24x8 and swap endianness
cs_shuffle_se_f32_d24x8()
cs_shuffle_x8d24_to_d24x8()
{
cs_shuffle_base::build("f32_to_d24x8_swapped");
}
};
struct cs_shuffle_se_d24x8 : cs_shuffle_base
{
// swap endianness of d24x8
cs_shuffle_se_d24x8()
{
cs_shuffle_base::build("d24x8_to_d24x8_swapped");
}
};
// NOTE: D24S8 layout has the stencil in the MSB! It's actually S8|D24|S8|D24 starting at offset 0
struct cs_interleave_task : cs_shuffle_base
{
cs_interleave_task()
{
uniforms =
" uniform uint block_length;\n"
" uniform uint z_offset;\n"
" uniform uint s_offset;\n";
variables =
" uint depth;\n"
" uint stencil;\n"
" uint stencil_shift;\n"
" uint stencil_offset;\n";
}
void run(const gl::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset)
{
m_program.uniforms["block_length"] = data_length;
m_program.uniforms["z_offset"] = zeta_offset - data_offset;
m_program.uniforms["s_offset"] = stencil_offset - data_offset;
cs_shuffle_base::run(data, data_length, data_offset);
}
};
template<bool _SwapBytes = false>
struct cs_gather_d24x8 : cs_interleave_task
{
cs_gather_d24x8()
{
work_kernel =
" if (index >= block_length)\n"
" return;\n"
"\n"
" depth = data[index + z_offset] & 0x00FFFFFF;\n"
" stencil_offset = (index / 4);\n"
" stencil_shift = (index % 4) * 8;\n"
" stencil = data[stencil_offset + s_offset];\n"
" stencil = (stencil >> stencil_shift) & 0xFF;\n"
" value = (depth << 8) | stencil;\n";
if constexpr (!_SwapBytes)
{
work_kernel +=
" data[index] = value;\n";
}
else
{
work_kernel +=
" data[index] = bswap_u32(value);\n";
}
cs_shuffle_base::build("");
}
};
template<bool _SwapBytes = false>
struct cs_gather_d32x8 : cs_interleave_task
{
cs_gather_d32x8()
{
work_kernel =
" if (index >= block_length)\n"
" return;\n"
"\n"
" depth = f32_to_d24(data[index + z_offset]);\n"
" stencil_offset = (index / 4);\n"
" stencil_shift = (index % 4) * 8;\n"
" stencil = data[stencil_offset + s_offset];\n"
" stencil = (stencil >> stencil_shift) & 0xFF;\n"
" value = (depth << 8) | stencil;\n";
if constexpr (!_SwapBytes)
{
work_kernel +=
" data[index] = value;\n";
}
else
{
work_kernel +=
" data[index] = bswap_u32(value);\n";
}
cs_shuffle_base::build("");
}
};
struct cs_scatter_d24x8 : cs_interleave_task
{
cs_scatter_d24x8()
{
work_kernel =
" if (index >= block_length)\n"
" return;\n"
"\n"
" value = data[index];\n"
" data[index + z_offset] = (value >> 8);\n"
" stencil_offset = (index / 4);\n"
" stencil_shift = (index % 4) * 8;\n"
" stencil = (value & 0xFF) << stencil_shift;\n"
" data[stencil_offset + s_offset] |= stencil;\n";
cs_shuffle_base::build("");
}
};
struct cs_scatter_d32x8 : cs_interleave_task
{
cs_scatter_d32x8()
{
work_kernel =
" if (index >= block_length)\n"
" return;\n"
"\n"
" value = data[index];\n"
" data[index + z_offset] = d24_to_f32(value >> 8);\n"
" stencil_offset = (index / 4);\n"
" stencil_shift = (index % 4) * 8;\n"
" stencil = (value & 0xFF) << stencil_shift;\n"
" data[stencil_offset + s_offset] |= stencil;\n";
cs_shuffle_base::build("");
if constexpr (_SwapBytes)
{
cs_shuffle_base::build("x8d24_to_d24x8_swapped");
}
else
{
cs_shuffle_base::build("x8d24_to_d24x8");
}
}
};
@ -390,4 +315,6 @@ namespace gl
return static_cast<T*>(e.get());
}
void destroy_compute_tasks();
}

View file

@ -2,6 +2,7 @@
#include "Emu/Memory/vm.h"
#include "Emu/System.h"
#include "GLGSRender.h"
#include "GLCompute.h"
#include "GLVertexProgram.h"
#include "../rsx_methods.h"
#include "../Common/BufferUtils.h"
@ -965,6 +966,8 @@ void GLGSRender::on_init_thread()
void GLGSRender::on_exit()
{
gl::destroy_compute_tasks();
zcull_ctrl.release();
m_prog_buffer.clear();

View file

@ -30,6 +30,16 @@ namespace gl
}
}
// Tears down every registered compute task, then empties the registry.
void destroy_compute_tasks()
{
	// Release each task's resources first; the container entries go afterwards.
	for (auto& entry : g_compute_tasks)
	{
		entry.second->destroy();
	}

	g_compute_tasks.clear();
}
#ifdef WIN32
void APIENTRY dbgFunc(GLenum source, GLenum type, GLuint id,
GLenum severity, GLsizei lenght, const GLchar* message,

View file

@ -2454,8 +2454,8 @@ public:
case type::fragment:
base_name = "shaderlog/FragmentProgram";
break;
case type::geometry:
base_name = "shaderlog/GeometryProgram";
case type::compute:
base_name = "shaderlog/ComputeProgram";
break;
}

View file

@ -1,5 +1,6 @@
#include "stdafx.h"
#include "GLTexture.h"
#include "GLCompute.h"
#include "../GCM.h"
#include "../RSXThread.h"
#include "../RSXTexture.h"
@ -90,43 +91,43 @@ namespace gl
fmt::throw_exception("Compressed or unknown texture format 0x%x" HERE, texture_format);
}
std::tuple<GLenum, GLenum, bool> get_format_type(texture::internal_format format)
pixel_buffer_layout get_format_type(texture::internal_format format)
{
switch (format)
{
case texture::internal_format::compressed_rgba_s3tc_dxt1:
case texture::internal_format::compressed_rgba_s3tc_dxt3:
case texture::internal_format::compressed_rgba_s3tc_dxt5:
return std::make_tuple(GL_RGBA, GL_UNSIGNED_BYTE, false);
return { GL_RGBA, GL_UNSIGNED_BYTE, 1, false };
case texture::internal_format::r8:
return std::make_tuple(GL_RED, GL_UNSIGNED_BYTE, false);
return { GL_RED, GL_UNSIGNED_BYTE, 1, false };
case texture::internal_format::r16:
return std::make_tuple(GL_RED, GL_UNSIGNED_SHORT, true);
return { GL_RED, GL_UNSIGNED_SHORT, 2, true };
case texture::internal_format::r32f:
return std::make_tuple(GL_RED, GL_FLOAT, true);
return { GL_RED, GL_FLOAT, 4, true };
case texture::internal_format::rg8:
return std::make_tuple(GL_RG, GL_UNSIGNED_BYTE, false);
return { GL_RG, GL_UNSIGNED_BYTE, 1, false };
case texture::internal_format::rg16:
return std::make_tuple(GL_RG, GL_UNSIGNED_SHORT, true);
return { GL_RG, GL_UNSIGNED_SHORT, 2, true };
case texture::internal_format::rg16f:
return std::make_tuple(GL_RG, GL_HALF_FLOAT, true);
return { GL_RG, GL_HALF_FLOAT, 2, true };
case texture::internal_format::rgb565:
return std::make_tuple(GL_RGB, GL_UNSIGNED_SHORT_5_6_5, true);
return { GL_RGB, GL_UNSIGNED_SHORT_5_6_5, 2, true };
case texture::internal_format::rgb5a1:
return std::make_tuple(GL_RGB, GL_UNSIGNED_SHORT_5_5_5_1, true);
return { GL_RGB, GL_UNSIGNED_SHORT_5_5_5_1, 2, true };
case texture::internal_format::rgba4:
return std::make_tuple(GL_BGRA, GL_UNSIGNED_SHORT_4_4_4_4, false);
return { GL_BGRA, GL_UNSIGNED_SHORT_4_4_4_4, 2, false };
case texture::internal_format::rgba8:
return std::make_tuple(GL_BGRA, GL_UNSIGNED_INT_8_8_8_8, false);
return { GL_BGRA, GL_UNSIGNED_INT_8_8_8_8, 4, false };
case texture::internal_format::rgba16f:
return std::make_tuple(GL_RGBA, GL_HALF_FLOAT, true);
return { GL_RGBA, GL_HALF_FLOAT, 2, true };
case texture::internal_format::rgba32f:
return std::make_tuple(GL_RGBA, GL_FLOAT, true);
return { GL_RGBA, GL_FLOAT, 4, true };
case texture::internal_format::depth16:
return std::make_tuple(GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT, true);
return { GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT, 2, true };
case texture::internal_format::depth24_stencil8:
case texture::internal_format::depth32f_stencil8:
return std::make_tuple(GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, true);
return { GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, 4, true };
default:
fmt::throw_exception("Unexpected internal format 0x%X" HERE, (u32)format);
}
@ -742,30 +743,113 @@ namespace gl
GLsizeiptr src_mem = src->width() * src->height();
GLsizeiptr dst_mem = dst->width() * dst->height();
GLenum buffer_copy_flag = GL_STATIC_COPY;
if (gl::get_driver_caps().vendor_MESA) buffer_copy_flag = GL_STREAM_COPY;
// NOTE: Mesa lacks acceleration for PBO unpacking and is currently fastest with GL_STREAM_COPY
// See https://bugs.freedesktop.org/show_bug.cgi?id=111043
auto max_mem = std::max(src_mem, dst_mem) * 16;
if (!g_typeless_transfer_buffer || max_mem > g_typeless_transfer_buffer.size())
{
if (g_typeless_transfer_buffer) g_typeless_transfer_buffer.remove();
g_typeless_transfer_buffer.create(buffer::target::pixel_pack, max_mem, nullptr, buffer::memory_type::local, buffer_copy_flag);
g_typeless_transfer_buffer.create(buffer::target::pixel_pack, max_mem, nullptr, buffer::memory_type::local, GL_STATIC_COPY);
}
auto format_type = get_format_type(src->get_internal_format());
const auto pack_info = get_format_type(src->get_internal_format());
const auto unpack_info = get_format_type(dst->get_internal_format());
pixel_pack_settings pack_settings{};
pack_settings.swap_bytes(std::get<2>(format_type));
g_typeless_transfer_buffer.bind(buffer::target::pixel_pack);
src->copy_to(nullptr, (texture::format)std::get<0>(format_type), (texture::type)std::get<1>(format_type), pack_settings);
src->copy_to(nullptr, (texture::format)pack_info.format, (texture::type)pack_info.type, pack_settings);
glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE);
format_type = get_format_type(dst->get_internal_format());
const bool src_is_ds = !!(src->aspect() & gl::image_aspect::stencil);
const bool dst_is_ds = !!(src->aspect() & gl::image_aspect::stencil);
if (pack_info.swap_bytes || unpack_info.swap_bytes || src_is_ds || dst_is_ds)
{
gl::cs_shuffle_base *src_transform = nullptr, *dst_transform = nullptr;
if (src_is_ds)
{
if (pack_info.swap_bytes)
{
src_transform = gl::get_compute_task<gl::cs_shuffle_d24x8_to_x8d24<true>>();
}
else
{
src_transform = gl::get_compute_task<gl::cs_shuffle_d24x8_to_x8d24<false>>();
}
}
else if (pack_info.swap_bytes)
{
switch (pack_info.size)
{
case 1:
break;
case 2:
src_transform = gl::get_compute_task<gl::cs_shuffle_16>();
break;
case 4:
src_transform = gl::get_compute_task<gl::cs_shuffle_32>();
break;
default:
fmt::throw_exception("Unsupported format");
}
}
if (dst_is_ds)
{
if (unpack_info.swap_bytes)
{
dst_transform = gl::get_compute_task<gl::cs_shuffle_x8d24_to_d24x8<true>>();
}
else
{
dst_transform = gl::get_compute_task<gl::cs_shuffle_x8d24_to_d24x8<false>>();
}
}
else if (unpack_info.swap_bytes)
{
switch (unpack_info.size)
{
case 1:
break;
case 2:
dst_transform = gl::get_compute_task<gl::cs_shuffle_16>();
break;
case 4:
dst_transform = gl::get_compute_task<gl::cs_shuffle_32>();
break;
default:
fmt::throw_exception("Unsupported format");
}
if (!src_is_ds)
{
if (src_transform == dst_transform)
{
src_transform = dst_transform = nullptr;
}
else if (src_transform)
{
src_transform = gl::get_compute_task<gl::cs_shuffle_32_16>();
dst_transform = nullptr;
}
}
if (src_transform)
{
const auto image_size = src->pitch() * src->height();
src_transform->run(&g_typeless_transfer_buffer, image_size);
}
if (dst_transform)
{
const auto image_size = dst->pitch() * dst->height();
dst_transform->run(&g_typeless_transfer_buffer, image_size);
}
}
}
pixel_unpack_settings unpack_settings{};
unpack_settings.swap_bytes(std::get<2>(format_type));
g_typeless_transfer_buffer.bind(buffer::target::pixel_unpack);
dst->copy_from(nullptr, (texture::format)std::get<0>(format_type), (texture::type)std::get<1>(format_type), unpack_settings);
dst->copy_from(nullptr, (texture::format)unpack_info.format, (texture::type)unpack_info.type, unpack_settings);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, GL_NONE);
}
}

View file

@ -13,10 +13,18 @@ namespace rsx
namespace gl
{
// Pixel transfer description for an internal texture format, as returned by
// get_format_type(texture::internal_format).
// Initialised positionally ({ format, type, size, swap_bytes }), so the member
// order is part of the contract.
struct pixel_buffer_layout
{
// GL pixel format enum (e.g. GL_RGBA, GL_DEPTH_STENCIL)
GLenum format;
// GL pixel data type enum (e.g. GL_UNSIGNED_BYTE, GL_UNSIGNED_INT_24_8)
GLenum type;
// Width in bytes of one swappable element (1, 2 or 4); selects the 16-bit or
// 32-bit shuffle - presumably the size of `type`, confirm against callers
u8 size;
// True when the data needs a byte swap during transfer
bool swap_bytes;
};
GLenum get_target(rsx::texture_dimension_extended type);
GLenum get_sized_internal_format(u32 texture_format);
std::tuple<GLenum, GLenum> get_format_type(u32 texture_format);
std::tuple<GLenum, GLenum, bool> get_format_type(texture::internal_format format);
pixel_buffer_layout get_format_type(texture::internal_format format);
GLenum wrap_mode(rsx::texture_wrap_mode wrap);
float max_aniso(rsx::texture_max_anisotropy aniso);
std::array<GLenum, 4> get_swizzle_remap(u32 texture_format);

View file

@ -163,12 +163,12 @@ namespace gl
{
// Determine unpack config dynamically
const auto format_info = gl::get_format_type(src->get_internal_format());
format = static_cast<gl::texture::format>(std::get<0>(format_info));
type = static_cast<gl::texture::type>(std::get<1>(format_info));
format = static_cast<gl::texture::format>(format_info.format);
type = static_cast<gl::texture::type>(format_info.type);
if ((src->aspect() & gl::image_aspect::stencil) == 0)
{
pack_unpack_swap_bytes = std::get<2>(format_info);
pack_unpack_swap_bytes = format_info.swap_bytes;
}
else
{