gl: Add support for hardware instancing

This commit is contained in:
kd-11 2025-03-15 22:52:36 +03:00 committed by kd-11
parent 3d3fc2f3cd
commit 65c0d3d425
6 changed files with 133 additions and 52 deletions

View file

@ -599,7 +599,11 @@ void GLGSRender::emit_geometry(u32 sub_index)
if (!upload_info.index_info)
{
if (draw_call.is_single_draw())
if (draw_call.is_trivial_instanced_draw)
{
glDrawArraysInstanced(draw_mode, 0, upload_info.vertex_draw_count, draw_call.pass_count());
}
else if (draw_call.is_single_draw())
{
glDrawArrays(draw_mode, 0, upload_info.vertex_draw_count);
}
@ -625,7 +629,7 @@ void GLGSRender::emit_geometry(u32 sub_index)
if (driver_caps.vendor_AMD && (first + range.count) > (0x100000 >> 2))
{
//Unlikely, but added here in case the identity buffer is not large enough somehow
// Unlikely, but added here in case the identity buffer is not large enough somehow
use_draw_arrays_fallback = true;
break;
}
@ -635,7 +639,7 @@ void GLGSRender::emit_geometry(u32 sub_index)
if (use_draw_arrays_fallback)
{
//MultiDrawArrays is broken on some primitive types using AMD. One known type is GL_TRIANGLE_STRIP but there could be more
// MultiDrawArrays is broken on some primitive types using AMD. One known type is GL_TRIANGLE_STRIP but there could be more
for (u32 n = 0; n < draw_count; ++n)
{
glDrawArrays(draw_mode, firsts[n], counts[n]);
@ -643,13 +647,13 @@ void GLGSRender::emit_geometry(u32 sub_index)
}
else if (driver_caps.vendor_AMD)
{
//Use identity index buffer to fix broken vertexID on AMD
// Use identity index buffer to fix broken vertexID on AMD
m_identity_index_buffer->bind();
glMultiDrawElements(draw_mode, counts, GL_UNSIGNED_INT, offsets, static_cast<GLsizei>(draw_count));
}
else
{
//Normal render
// Normal render
glMultiDrawArrays(draw_mode, firsts, counts, static_cast<GLsizei>(draw_count));
}
}
@ -667,7 +671,11 @@ void GLGSRender::emit_geometry(u32 sub_index)
m_index_ring_buffer->bind();
if (draw_call.is_single_draw())
if (draw_call.is_trivial_instanced_draw)
{
glDrawElementsInstanced(draw_mode, upload_info.vertex_draw_count, index_type, reinterpret_cast<GLvoid*>(u64{ index_offset }), draw_call.pass_count());
}
else if (draw_call.is_single_draw())
{
glDrawElements(draw_mode, upload_info.vertex_draw_count, index_type, reinterpret_cast<GLvoid*>(u64{index_offset}));
}
@ -781,13 +789,20 @@ void GLGSRender::end()
m_program->validate();
}
rsx::method_registers.current_draw_clause.begin();
auto& draw_call = REGS(m_ctx)->current_draw_clause;
draw_call.begin();
u32 subdraw = 0u;
do
{
emit_geometry(subdraw++);
if (draw_call.is_trivial_instanced_draw)
{
// We already completed. End the draw.
draw_call.end();
}
}
while (rsx::method_registers.current_draw_clause.next());
while (draw_call.next());
m_rtts.on_write(m_framebuffer_layout.color_write_enabled, m_framebuffer_layout.zeta_write_enabled);

View file

@ -296,6 +296,7 @@ void GLGSRender::on_init_thread()
m_fragment_instructions_buffer = std::make_unique<gl::legacy_ring_buffer>();
m_raster_env_ring_buffer = std::make_unique<gl::legacy_ring_buffer>();
m_scratch_ring_buffer = std::make_unique<gl::legacy_ring_buffer>();
m_instancing_ring_buffer = std::make_unique<gl::legacy_ring_buffer>();
}
else
{
@ -311,6 +312,7 @@ void GLGSRender::on_init_thread()
m_fragment_instructions_buffer = std::make_unique<gl::ring_buffer>();
m_raster_env_ring_buffer = std::make_unique<gl::ring_buffer>();
m_scratch_ring_buffer = std::make_unique<gl::ring_buffer>();
m_instancing_ring_buffer = std::make_unique<gl::ring_buffer>();
}
m_attrib_ring_buffer->create(gl::buffer::target::texture, 256 * 0x100000);
@ -323,6 +325,7 @@ void GLGSRender::on_init_thread()
m_vertex_layout_buffer->create(gl::buffer::target::uniform, 16 * 0x100000);
m_raster_env_ring_buffer->create(gl::buffer::target::uniform, 16 * 0x100000);
m_scratch_ring_buffer->create(gl::buffer::target::uniform, 16 * 0x100000);
m_instancing_ring_buffer->create(gl::buffer::target::ssbo, 64 * 0x100000);
if (shadermode == shader_mode::async_with_interpreter || shadermode == shader_mode::interpreter_only)
{
@ -547,6 +550,11 @@ void GLGSRender::on_exit()
m_scratch_ring_buffer->remove();
}
if (m_instancing_ring_buffer)
{
m_instancing_ring_buffer->remove();
}
m_null_textures.clear();
m_gl_texture_cache.destroy();
m_ui_renderer.destroy();
@ -866,7 +874,8 @@ void GLGSRender::load_program_env()
const bool update_fragment_env = m_graphics_state & rsx::pipeline_state::fragment_state_dirty;
const bool update_fragment_texture_env = m_graphics_state & rsx::pipeline_state::fragment_texture_state_dirty;
const bool update_instruction_buffers = !!m_interpreter_state && m_shader_interpreter.is_interpreter(m_program);
const bool update_raster_env = rsx::method_registers.polygon_stipple_enabled() && (m_graphics_state & rsx::pipeline_state::polygon_stipple_pattern_dirty);
const bool update_raster_env = REGS(m_ctx)->polygon_stipple_enabled() && (m_graphics_state & rsx::pipeline_state::polygon_stipple_pattern_dirty);
const bool update_instancing_data = REGS(m_ctx)->current_draw_clause.is_trivial_instanced_draw;
if (manually_flush_ring_buffers)
{
@ -876,6 +885,7 @@ void GLGSRender::load_program_env()
if (update_fragment_constants) m_fragment_constants_buffer->reserve_storage_on_heap(utils::align(fragment_constants_size, 256));
if (update_transform_constants) m_transform_constants_buffer->reserve_storage_on_heap(8192);
if (update_raster_env) m_raster_env_ring_buffer->reserve_storage_on_heap(128);
if (update_instancing_data) m_instancing_ring_buffer->reserve_storage_on_heap(8192 * REGS(m_ctx)->current_draw_clause.pass_count());
if (update_instruction_buffers)
{
@ -899,6 +909,33 @@ void GLGSRender::load_program_env()
m_vertex_env_buffer->bind_range(GL_VERTEX_PARAMS_BIND_SLOT, mapping.second, 144);
}
if (update_instancing_data)
{
// Combines transform load + instancing lookup table
const auto alignment = m_min_ssbo_alignment;
u32 indirection_table_offset = 0;
u32 constants_data_table_offset = 0;
rsx::io_buffer indirection_table_buf([&](usz size) -> std::pair<void*, usz>
{
const auto mapping = m_instancing_ring_buffer->alloc_from_heap(static_cast<u32>(size), alignment);
indirection_table_offset = mapping.second;
return mapping;
});
rsx::io_buffer constants_array_buf([&](usz size) -> std::pair<void*, usz>
{
const auto mapping = m_instancing_ring_buffer->alloc_from_heap(static_cast<u32>(size), alignment);
constants_data_table_offset = mapping.second;
return mapping;
});
m_draw_processor.fill_constants_instancing_buffer(indirection_table_buf, constants_array_buf, m_vertex_prog);
m_instancing_ring_buffer->bind_range(GL_INSTANCING_LUT_BIND_SLOT, indirection_table_offset, ::size32(indirection_table_buf));
m_instancing_ring_buffer->bind_range(GL_INSTANCING_XFORM_CONSTANTS_SLOT, constants_data_table_offset, ::size32(constants_array_buf));
}
if (update_transform_constants)
{
// Vertex constants
@ -1011,6 +1048,7 @@ void GLGSRender::load_program_env()
if (update_fragment_constants) m_fragment_constants_buffer->unmap();
if (update_transform_constants) m_transform_constants_buffer->unmap();
if (update_raster_env) m_raster_env_ring_buffer->unmap();
if (update_instancing_data) m_instancing_ring_buffer->unmap();
if (update_instruction_buffers)
{

View file

@ -105,6 +105,7 @@ class GLGSRender : public GSRender, public ::rsx::reports::ZCULL_control
std::unique_ptr<gl::ring_buffer> m_vertex_instructions_buffer;
std::unique_ptr<gl::ring_buffer> m_fragment_instructions_buffer;
std::unique_ptr<gl::ring_buffer> m_raster_env_ring_buffer;
std::unique_ptr<gl::ring_buffer> m_instancing_ring_buffer;
// Identity buffer used to fix broken gl_VertexID on ATI stack
std::unique_ptr<gl::buffer> m_identity_index_buffer;
@ -117,6 +118,7 @@ class GLGSRender : public GSRender, public ::rsx::reports::ZCULL_control
GLint m_min_texbuffer_alignment = 256;
GLint m_uniform_buffer_offset_align = 256;
GLint m_min_ssbo_alignment = 256;
GLint m_max_texbuffer_size = 65536;
bool manually_flush_ring_buffers = false;

View file

@ -183,6 +183,9 @@ OPENGL_PROC(PFNGLUNMAPNAMEDBUFFEREXTPROC, UnmapNamedBufferEXT);
OPENGL_PROC(PFNGLMULTIDRAWELEMENTSPROC, MultiDrawElements);
OPENGL_PROC(PFNGLMULTIDRAWARRAYSPROC, MultiDrawArrays);
OPENGL_PROC(PFNGLDRAWARRAYSINSTANCEDPROC, DrawArraysInstanced);
OPENGL_PROC(PFNGLDRAWELEMENTSINSTANCEDPROC, DrawElementsInstanced);
OPENGL_PROC(PFNGLGETTEXTUREIMAGEEXTPROC, GetTextureImageEXT);
OPENGL_PROC(PFNGLGETTEXTUREIMAGEPROC, GetTextureImage);
OPENGL_PROC(PFNGLGETTEXTURESUBIMAGEPROC, GetTextureSubImage);

View file

@ -28,30 +28,31 @@ std::string GLVertexDecompilerThread::compareFunction(COMPARE f, const std::stri
void GLVertexDecompilerThread::insertHeader(std::stringstream &OS)
{
OS << "#version 430\n";
OS << "layout(std140, binding = " << GL_VERTEX_PARAMS_BIND_SLOT << ") uniform VertexContextBuffer\n";
OS << "{\n";
OS << " mat4 scale_offset_mat;\n";
OS << " ivec4 user_clip_enabled[2];\n";
OS << " vec4 user_clip_factor[2];\n";
OS << " uint transform_branch_bits;\n";
OS << " float point_size;\n";
OS << " float z_near;\n";
OS << " float z_far;\n";
OS << "};\n\n";
OS <<
"#version 430\n"
"layout(std140, binding = " << GL_VERTEX_PARAMS_BIND_SLOT << ") uniform VertexContextBuffer\n"
"{\n"
" mat4 scale_offset_mat;\n"
" ivec4 user_clip_enabled[2];\n"
" vec4 user_clip_factor[2];\n"
" uint transform_branch_bits;\n"
" float point_size;\n"
" float z_near;\n"
" float z_far;\n"
"};\n\n"
OS << "layout(std140, binding = " << GL_VERTEX_LAYOUT_BIND_SLOT << ") uniform VertexLayoutBuffer\n";
OS << "{\n";
OS << " uint vertex_base_index;\n";
OS << " uint vertex_index_offset;\n";
OS << " uvec4 input_attributes_blob[16 / 2];\n";
OS << "};\n\n";
"layout(std140, binding = " << GL_VERTEX_LAYOUT_BIND_SLOT << ") uniform VertexLayoutBuffer\n"
"{\n"
" uint vertex_base_index;\n"
" uint vertex_index_offset;\n"
" uvec4 input_attributes_blob[16 / 2];\n"
"};\n\n";
}
void GLVertexDecompilerThread::insertInputs(std::stringstream& OS, const std::vector<ParamType>& /*inputs*/)
{
OS << "layout(location=0) uniform usamplerBuffer persistent_input_stream;\n"; //Data stream with persistent vertex data (cacheable)
OS << "layout(location=1) uniform usamplerBuffer volatile_input_stream;\n"; //Data stream with per-draw data (registers and immediate draw data)
OS << "layout(location=0) uniform usamplerBuffer persistent_input_stream;\n"; // Data stream with persistent vertex data (cacheable)
OS << "layout(location=1) uniform usamplerBuffer volatile_input_stream;\n"; // Data stream with per-draw data (registers and immediate draw data)
}
void GLVertexDecompilerThread::insertConstants(std::stringstream& OS, const std::vector<ParamType>& constants)
@ -62,10 +63,29 @@ void GLVertexDecompilerThread::insertConstants(std::stringstream& OS, const std:
{
if (PI.name.starts_with("vc["))
{
OS << "layout(std140, binding = " << GL_VERTEX_CONSTANT_BUFFERS_BIND_SLOT << ") uniform VertexConstantsBuffer\n";
OS << "{\n";
OS << " vec4 " << PI.name << ";\n";
OS << "};\n\n";
if (!(m_prog.ctrl & RSX_SHADER_CONTROL_INSTANCED_CONSTANTS))
{
OS <<
"layout(std140, binding = " << GL_VERTEX_CONSTANT_BUFFERS_BIND_SLOT << ") uniform VertexConstantsBuffer\n"
"{\n"
" vec4 " << PI.name << ";\n"
"};\n\n";
}
else
{
OS <<
"layout(std430, binding = " << GL_INSTANCING_LUT_BIND_SLOT << ") readonly buffer InstancingIndirectionLUT\n"
"{\n"
" int constants_addressing_lookup[];\n"
"};\n\n"
"layout(std430, binding = " << GL_INSTANCING_XFORM_CONSTANTS_SLOT << ") readonly buffer InstancingVertexConstantsBlock\n"
"{\n"
" vec4 instanced_constants_array[];\n"
"};\n\n"
"#define CONSTANTS_ARRAY_LENGTH " << (properties.has_indexed_constants ? 468 : ::size32(m_constant_ids)) << "\n\n";
}
continue;
}
@ -104,12 +124,12 @@ static const vertex_reg_info reg_table[] =
{ "gl_Position", false, "dst_reg0", "", false },
{ "diff_color", true, "dst_reg1", "", false, "", "", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_FRONTDIFFUSE | CELL_GCM_ATTRIB_OUTPUT_MASK_BACKDIFFUSE },
{ "spec_color", true, "dst_reg2", "", false, "", "", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_FRONTSPECULAR | CELL_GCM_ATTRIB_OUTPUT_MASK_BACKSPECULAR },
//These are only present when back variants are specified, otherwise the default diff/spec color vars are for both front and back
// These are only present when back variants are specified, otherwise the default diff/spec color vars are for both front and back
{ "diff_color1", true, "dst_reg3", "", false, "", "", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_FRONTDIFFUSE | CELL_GCM_ATTRIB_OUTPUT_MASK_BACKDIFFUSE },
{ "spec_color1", true, "dst_reg4", "", false, "", "", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_FRONTSPECULAR | CELL_GCM_ATTRIB_OUTPUT_MASK_BACKSPECULAR },
//Fog output shares a data source register with clip planes 0-2 so only declare when specified
// Fog output shares a data source register with clip planes 0-2 so only declare when specified
{ "fog_c", true, "dst_reg5", ".xxxx", true, "", "", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_FOG },
//Warning: Always define all 3 clip plane groups together to avoid flickering with openGL
// Warning: Always define all 3 clip plane groups together to avoid flickering with openGL
{ "gl_ClipDistance[0]", false, "dst_reg5", ".y * user_clip_factor[0].x", false, "user_clip_enabled[0].x > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC0 },
{ "gl_ClipDistance[1]", false, "dst_reg5", ".z * user_clip_factor[0].y", false, "user_clip_enabled[0].y > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC1 },
{ "gl_ClipDistance[2]", false, "dst_reg5", ".w * user_clip_factor[0].z", false, "user_clip_enabled[0].z > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC2 },
@ -152,6 +172,7 @@ void GLVertexDecompilerThread::insertMainStart(std::stringstream & OS)
properties2.emulate_depth_clip_only = dev_caps.NV_depth_buffer_float_supported;
properties2.low_precision_tests = dev_caps.vendor_NVIDIA;
properties2.require_explicit_invariance = dev_caps.vendor_MESA || (dev_caps.vendor_NVIDIA && g_cfg.video.shader_precision != gpu_preset_level::low);
properties2.require_instanced_render = !!(m_prog.ctrl & RSX_SHADER_CONTROL_INSTANCED_CONSTANTS);
insert_glsl_legacy_function(OS, properties2);
glsl::insert_vertex_input_fetch(OS, glsl::glsl_rules_opengl4, dev_caps.vendor_INTEL == false);
@ -188,7 +209,7 @@ void GLVertexDecompilerThread::insertMainStart(std::stringstream & OS)
OS << "void vs_main()\n";
OS << "{\n";
//Declare temporary registers, ignoring those mapped to outputs
// Declare temporary registers, ignoring those mapped to outputs
for (const ParamType &PT : m_parr.params[PF_PARAM_NONE])
{
for (const ParamItem &PI : PT.items)
@ -237,7 +258,7 @@ void GLVertexDecompilerThread::insertMainEnd(std::stringstream & OS)
}
else
{
//Insert if-else condition
// Insert if-else condition
OS << " " << i.name << " = " << condition << "? " << i.src_reg << i.src_reg_mask << ": " << i.default_val << ";\n";
}
@ -261,21 +282,21 @@ void GLVertexDecompilerThread::insertMainEnd(std::stringstream & OS)
OS << " gl_Position = gl_Position * scale_offset_mat;\n";
OS << " gl_Position = apply_zclip_xform(gl_Position, z_near, z_far);\n";
//Since our clip_space is symmetrical [-1, 1] we map it to linear space using the eqn:
//ln = (clip * 2) - 1 to fully utilize the 0-1 range of the depth buffer
//RSX matrices passed already map to the [0, 1] range but mapping to classic OGL requires that we undo this step
//This can be made unnecessary using the call glClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE).
//However, ClipControl only made it to opengl core in ver 4.5 though, so this is a workaround.
//NOTE: It is completely valid for games to use very large w values, causing the post-multiplied z to be in the hundreds
//It is therefore critical that this step is done post-transform and the result re-scaled by w
//SEE Naruto: UNS
//NOTE: On GPUs, poor fp32 precision means dividing z by w, then multiplying by w again gives slightly incorrect results
//This equation is simplified algebraically to an addition and subtraction which gives more accurate results (Fixes flickering skybox in Dark Souls 2)
//OS << " float ndc_z = gl_Position.z / gl_Position.w;\n";
//OS << " ndc_z = (ndc_z * 2.) - 1.;\n";
//OS << " gl_Position.z = ndc_z * gl_Position.w;\n";
// Since our clip_space is symmetrical [-1, 1] we map it to linear space using the eqn:
// ln = (clip * 2) - 1 to fully utilize the 0-1 range of the depth buffer
// RSX matrices passed already map to the [0, 1] range but mapping to classic OGL requires that we undo this step
// This can be made unnecessary using the call glClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE).
// However, ClipControl only made it to opengl core in ver 4.5 though, so this is a workaround.
// NOTE: It is completely valid for games to use very large w values, causing the post-multiplied z to be in the hundreds
// It is therefore critical that this step is done post-transform and the result re-scaled by w
// SEE Naruto: UNS
// NOTE: On GPUs, poor fp32 precision means dividing z by w, then multiplying by w again gives slightly incorrect results
// This equation is simplified algebraically to an addition and subtraction which gives more accurate results (Fixes flickering skybox in Dark Souls 2)
// OS << " float ndc_z = gl_Position.z / gl_Position.w;\n";
// OS << " ndc_z = (ndc_z * 2.) - 1.;\n";
// OS << " gl_Position.z = ndc_z * gl_Position.w;\n";
OS << " gl_Position.z = (gl_Position.z + gl_Position.z) - gl_Position.w;\n";
OS << "}\n";
}

View file

@ -20,6 +20,8 @@
#define GL_RASTERIZER_STATE_BIND_SLOT UBO_SLOT(6)
#define GL_INTERPRETER_VERTEX_BLOCK SSBO_SLOT(0)
#define GL_INTERPRETER_FRAGMENT_BLOCK SSBO_SLOT(1)
// Storage-buffer slots consumed by vertex programs built with instanced constants:
// the LUT slot backs the `constants_addressing_lookup[]` indirection table and the
// XFORM_CONSTANTS slot backs the `instanced_constants_array[]` vec4 block.
#define GL_INSTANCING_LUT_BIND_SLOT SSBO_SLOT(2)
#define GL_INSTANCING_XFORM_CONSTANTS_SLOT SSBO_SLOT(3)
// NOTE(review): GL_COMPUTE_BUFFER_SLOT(0) and (1) expand to SSBO_SLOT(2) and SSBO_SLOT(3),
// the same arguments as the two instancing slots above. Presumably harmless because compute
// jobs bind their own buffers before dispatch - confirm nothing relies on the instancing
// bindings surviving a compute pass.
#define GL_COMPUTE_BUFFER_SLOT(index) SSBO_SLOT(2 + index)
#define GL_COMPUTE_IMAGE_SLOT(index) SSBO_SLOT(index)