gl: Move vertex processing to the GPU

- Significant performance gains from greatly reduced CPU work
- Also reorders command submission in end() to improve throughput

- Refactors most of the vertex buffer handling
- All vertex processing is moved GPU-side
kd-11 2017-07-31 14:38:28 +03:00
parent e668ecd453
commit d54c2dd39a
21 changed files with 1025 additions and 1149 deletions
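
The core of this change: attribute layouts are no longer baked into the generated shaders. Instead, the host packs one small descriptor per attribute into a uniform block and the vertex shader decodes the raw byte streams itself. Below is a minimal sketch of host-side packing that would line up with the fetch_desc() unpacking emitted by the GL decompiler later in this diff; the struct and helper names are illustrative, not the commit's actual code:

struct packed_attribute_desc
{
	s32 type;            // rsx::vertex_base_type, consumed by fetch_attribute()
	s32 size;            // component count, 1-4
	s32 starting_offset; // byte offset of the first element in its stream
	s32 flags;           // stride / swap / volatile / frequency, packed below
};

s32 pack_attribute_flags(u8 stride, bool swap_bytes, bool is_volatile, u8 frequency_op, bool modulo, u16 divisor)
{
	// Bit layout mirrors fetch_desc() in the GL vertex decompiler:
	// [0:7] stride, [8] swap_bytes, [9] is_volatile, [10:11] frequency, [12] modulo, [13:28] divisor
	return (stride & 0xFF) |
		(swap_bytes ? (1 << 8) : 0) |
		(is_volatile ? (1 << 9) : 0) |
		((frequency_op & 0x3) << 10) |
		(modulo ? (1 << 12) : 0) |
		(s32(divisor) << 13);
}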

View file

@ -28,7 +28,7 @@ bool vertex_program_compare::operator()(const RSXVertexProgram &binary1, const R
return false;
if (binary1.data.size() != binary2.data.size())
return false;
if (binary1.rsx_vertex_inputs != binary2.rsx_vertex_inputs)
if (!binary1.skip_vertex_input_check && binary1.rsx_vertex_inputs != binary2.rsx_vertex_inputs)
return false;
const qword *instBuffer1 = (const qword*)binary1.data.data();
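
With decoding moved into the shader, the generated GLSL no longer depends on the input layout, so the program cache can treat binaries that differ only in rsx_vertex_inputs as the same program. A sketch of how the GL backend opts in (the flag is set in GLGSRender::load_program() later in this diff):

// Decoding happens in the shader from a generic descriptor block,
// so the cached program stays valid for any input layout
current_vertex_program.skip_vertex_input_check = true;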

View file

@ -89,7 +89,7 @@ void D3D12GSRender::upload_and_bind_scale_offset_matrix(size_t descriptorIndex)
void *mapped_buffer = m_buffer_data.map<void>(CD3DX12_RANGE(heap_offset, heap_offset + 512));
fill_scale_offset_data(mapped_buffer, true);
fill_user_clip_data((char*)mapped_buffer + 64);
fill_fragment_state_buffer((char *)mapped_buffer + 128, m_fragment_program);
fill_fragment_state_buffer((char *)mapped_buffer + 128, current_fragment_program);
m_buffer_data.unmap(CD3DX12_RANGE(heap_offset, heap_offset + 512));
D3D12_CONSTANT_BUFFER_VIEW_DESC constant_buffer_view_desc = {
@ -124,7 +124,7 @@ void D3D12GSRender::upload_and_bind_vertex_shader_constants(size_t descriptor_in
D3D12_CONSTANT_BUFFER_VIEW_DESC D3D12GSRender::upload_fragment_shader_constants()
{
// Get constant from fragment program
size_t buffer_size = m_pso_cache.get_fragment_constants_buffer_size(m_fragment_program);
size_t buffer_size = m_pso_cache.get_fragment_constants_buffer_size(current_fragment_program);
// Multiple of 256, never 0
buffer_size = (buffer_size + 255) & ~255;
@ -132,7 +132,7 @@ D3D12_CONSTANT_BUFFER_VIEW_DESC D3D12GSRender::upload_fragment_shader_constants(
size_t offset = 0;
float *mapped_buffer = m_buffer_data.map<float>(CD3DX12_RANGE(heap_offset, heap_offset + buffer_size));
m_pso_cache.fill_fragment_constants_buffer({ mapped_buffer, ::narrow<int>(buffer_size) }, m_fragment_program);
m_pso_cache.fill_fragment_constants_buffer({ mapped_buffer, ::narrow<int>(buffer_size) }, current_fragment_program);
m_buffer_data.unmap(CD3DX12_RANGE(heap_offset, heap_offset + buffer_size));
return {

View file

@ -329,7 +329,7 @@ void D3D12GSRender::end()
std::chrono::time_point<steady_clock> program_load_end = steady_clock::now();
m_timers.program_load_duration += std::chrono::duration_cast<std::chrono::microseconds>(program_load_end - program_load_start).count();
if (!m_fragment_program.valid)
if (!current_fragment_program.valid)
{
rsx::thread::end();
return;

View file

@ -61,8 +61,6 @@ private:
data_cache m_texture_cache;
bool invalidate_address(u32 addr);
RSXVertexProgram m_vertex_program;
RSXFragmentProgram m_fragment_program;
PipelineStateObjectCache m_pso_cache;
std::tuple<ComPtr<ID3D12PipelineState>, size_t, size_t> m_current_pso;

View file

@ -53,10 +53,10 @@ void D3D12GSRender::load_program()
return std::make_tuple(true, native_pitch);
};
m_vertex_program = get_current_vertex_program();
m_fragment_program = get_current_fragment_program(rtt_lookup_func);
get_current_vertex_program();
get_current_fragment_program(rtt_lookup_func);
if (!m_fragment_program.valid)
if (!current_fragment_program.valid)
return;
D3D12PipelineProperties prop = {};
@ -308,12 +308,12 @@ void D3D12GSRender::load_program()
}
}
m_current_pso = m_pso_cache.getGraphicPipelineState(m_vertex_program, m_fragment_program, prop, m_device.Get(), m_shared_root_signature.Get());
m_current_pso = m_pso_cache.getGraphicPipelineState(current_vertex_program, current_fragment_program, prop, m_device.Get(), m_shared_root_signature.Get());
return;
}
std::pair<std::string, std::string> D3D12GSRender::get_programs() const
{
return std::make_pair(m_pso_cache.get_transform_program(m_vertex_program).content, m_pso_cache.get_shader_program(m_fragment_program).content);
return std::make_pair(m_pso_cache.get_transform_program(current_vertex_program).content, m_pso_cache.get_shader_program(current_fragment_program).content);
}
#endif
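
The D3D12 changes are mechanical: the backend-local m_vertex_program / m_fragment_program copies are dropped in favour of program state shared with the base rsx::thread, which the now-void get_current_*_program() calls fill in place. The shared members presumably look like this (declarations inferred from usage in this diff, not shown in it):

// In rsx::thread (inferred; exact placement and access specifiers assumed)
RSXVertexProgram current_vertex_program;
RSXFragmentProgram current_fragment_program;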

View file

@ -306,18 +306,12 @@ namespace
void GLGSRender::end()
{
std::chrono::time_point<steady_clock> program_start = steady_clock::now();
//Load program here since it is dependent on vertex state
if (skip_frame || !framebuffer_status_valid || (conditional_render_enabled && conditional_render_test_failed) || !load_program())
if (skip_frame || !framebuffer_status_valid || (conditional_render_enabled && conditional_render_test_failed) || !check_program_state())
{
rsx::thread::end();
return;
}
std::chrono::time_point<steady_clock> program_stop = steady_clock::now();
m_begin_time += (u32)std::chrono::duration_cast<std::chrono::microseconds>(program_stop - program_start).count();
if (manually_flush_ring_buffers)
{
//Use approximations to reserve space. This path is mostly for debug purposes anyway
@ -329,6 +323,32 @@ void GLGSRender::end()
m_index_ring_buffer->reserve_storage_on_heap(16 * 1024);
}
//Do vertex upload before RTT prep / texture lookups to give the driver time to push data
u32 vertex_draw_count;
u32 actual_vertex_count;
u32 vertex_base;
std::optional<std::tuple<GLenum, u32> > indexed_draw_info;
std::tie(vertex_draw_count, actual_vertex_count, vertex_base, indexed_draw_info) = set_vertex_buffer();
std::chrono::time_point<steady_clock> program_start = steady_clock::now();
//Load program here since it is dependent on vertex state
load_program(vertex_base, actual_vertex_count);
std::chrono::time_point<steady_clock> program_stop = steady_clock::now();
m_begin_time += (u32)std::chrono::duration_cast<std::chrono::microseconds>(program_stop - program_start).count();
if (manually_flush_ring_buffers)
{
m_attrib_ring_buffer->unmap();
m_index_ring_buffer->unmap();
}
else
{
//DMA push; not needed with MAP_COHERENT
//glMemoryBarrier(GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT);
}
//Check if depth buffer is bound and valid
//If ds is not initialized clear it; it seems new depth textures should have depth cleared
auto copy_rtt_contents = [](gl::render_target *surface)
@ -470,20 +490,6 @@ void GLGSRender::end()
std::chrono::time_point<steady_clock> textures_end = steady_clock::now();
m_textures_upload_time += (u32)std::chrono::duration_cast<std::chrono::microseconds>(textures_end - textures_start).count();
u32 vertex_draw_count = m_last_vertex_count;
std::optional<std::tuple<GLenum, u32> > indexed_draw_info;
bool skip_upload = false;
if (!is_probable_instanced_draw())
{
std::tie(vertex_draw_count, indexed_draw_info) = set_vertex_buffer();
m_last_vertex_count = vertex_draw_count;
}
else
{
skip_upload = true;
}
std::chrono::time_point<steady_clock> draw_start = steady_clock::now();
if (g_cfg.video.debug_output)
@ -491,45 +497,31 @@ void GLGSRender::end()
m_program->validate();
}
if (manually_flush_ring_buffers)
if (indexed_draw_info)
{
m_attrib_ring_buffer->unmap();
m_index_ring_buffer->unmap();
}
const GLenum index_type = std::get<0>(indexed_draw_info.value());
const u32 index_offset = std::get<1>(indexed_draw_info.value());
if (indexed_draw_info || (skip_upload && m_last_draw_indexed == true))
{
if (__glcheck gl_state.enable(rsx::method_registers.restart_index_enabled(), GL_PRIMITIVE_RESTART))
{
GLenum index_type = (skip_upload)? m_last_ib_type: std::get<0>(indexed_draw_info.value());
__glcheck glPrimitiveRestartIndex((index_type == GL_UNSIGNED_SHORT)? 0xffff: 0xffffffff);
}
m_last_draw_indexed = true;
if (!skip_upload)
{
m_last_ib_type = std::get<0>(indexed_draw_info.value());
m_last_index_offset = std::get<1>(indexed_draw_info.value());
}
__glcheck glDrawElements(gl::draw_mode(rsx::method_registers.current_draw_clause.primitive), vertex_draw_count, m_last_ib_type, (GLvoid *)(std::ptrdiff_t)m_last_index_offset);
__glcheck glDrawElements(gl::draw_mode(rsx::method_registers.current_draw_clause.primitive), vertex_draw_count, index_type, (GLvoid *)(uintptr_t)index_offset);
}
else
{
draw_fbo.draw_arrays(rsx::method_registers.current_draw_clause.primitive, vertex_draw_count);
m_last_draw_indexed = false;
glDrawArrays(gl::draw_mode(rsx::method_registers.current_draw_clause.primitive), 0, vertex_draw_count);
}
m_attrib_ring_buffer->notify();
m_index_ring_buffer->notify();
m_scale_offset_buffer->notify();
m_vertex_state_buffer->notify();
m_fragment_constants_buffer->notify();
m_transform_constants_buffer->notify();
std::chrono::time_point<steady_clock> draw_end = steady_clock::now();
m_draw_time += (u32)std::chrono::duration_cast<std::chrono::microseconds>(draw_end - draw_start).count();
m_draw_calls++;
if (zcull_task_queue.active_query &&
@ -537,7 +529,6 @@ void GLGSRender::end()
zcull_task_queue.active_query->num_draws++;
synchronize_buffers();
rsx::thread::end();
}
@ -620,13 +611,21 @@ void GLGSRender::on_init_thread()
const u32 texture_index_offset = rsx::limits::fragment_textures_count + rsx::limits::vertex_textures_count;
for (int index = 0; index < rsx::limits::vertex_count; ++index)
//Array stream buffer
{
auto &tex = m_gl_attrib_buffers[index];
auto &tex = m_gl_persistent_stream_buffer;
tex.create();
tex.set_target(gl::texture::target::textureBuffer);
glActiveTexture(GL_TEXTURE0 + texture_index_offset);
tex.bind();
}
glActiveTexture(GL_TEXTURE0 + texture_index_offset + index);
//Register stream buffer
{
auto &tex = m_gl_volatile_stream_buffer;
tex.create();
tex.set_target(gl::texture::target::textureBuffer);
glActiveTexture(GL_TEXTURE0 + texture_index_offset + 1);
tex.bind();
}
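
Both stream buffers live at fixed texture units directly after the fragment and vertex texture slots, matching the fixed uniform values assigned in GLTraits later in this diff (locations 0 and 1). With rsx::limits::fragment_textures_count = 16 and vertex_textures_count = 4, that is units 20 and 21:

const int stream_buffer_start = rsx::limits::fragment_textures_count + rsx::limits::vertex_textures_count;
// persistent_input_stream -> GL_TEXTURE0 + 20 (uniform location 0)
// volatile_input_stream   -> GL_TEXTURE0 + 21 (uniform location 1)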
@ -645,7 +644,7 @@ void GLGSRender::on_init_thread()
m_attrib_ring_buffer.reset(new gl::legacy_ring_buffer());
m_transform_constants_buffer.reset(new gl::legacy_ring_buffer());
m_fragment_constants_buffer.reset(new gl::legacy_ring_buffer());
m_scale_offset_buffer.reset(new gl::legacy_ring_buffer());
m_vertex_state_buffer.reset(new gl::legacy_ring_buffer());
m_index_ring_buffer.reset(new gl::legacy_ring_buffer());
}
else
@ -653,7 +652,7 @@ void GLGSRender::on_init_thread()
m_attrib_ring_buffer.reset(new gl::ring_buffer());
m_transform_constants_buffer.reset(new gl::ring_buffer());
m_fragment_constants_buffer.reset(new gl::ring_buffer());
m_scale_offset_buffer.reset(new gl::ring_buffer());
m_vertex_state_buffer.reset(new gl::ring_buffer());
m_index_ring_buffer.reset(new gl::ring_buffer());
}
@ -661,7 +660,7 @@ void GLGSRender::on_init_thread()
m_index_ring_buffer->create(gl::buffer::target::element_array, 64 * 0x100000);
m_transform_constants_buffer->create(gl::buffer::target::uniform, 16 * 0x100000);
m_fragment_constants_buffer->create(gl::buffer::target::uniform, 16 * 0x100000);
m_scale_offset_buffer->create(gl::buffer::target::uniform, 16 * 0x100000);
m_vertex_state_buffer->create(gl::buffer::target::uniform, 16 * 0x100000);
m_vao.element_array_buffer = *m_index_ring_buffer;
@ -704,7 +703,7 @@ void GLGSRender::on_init_thread()
void GLGSRender::on_exit()
{
glDisable(GL_VERTEX_PROGRAM_POINT_SIZE);
glFinish();
m_prog_buffer.clear();
@ -728,10 +727,8 @@ void GLGSRender::on_exit()
m_vao.remove();
}
for (gl::texture &tex : m_gl_attrib_buffers)
{
tex.remove();
}
m_gl_persistent_stream_buffer.remove();
m_gl_volatile_stream_buffer.remove();
for (auto &sampler : m_gl_sampler_states)
{
@ -753,9 +750,9 @@ void GLGSRender::on_exit()
m_fragment_constants_buffer->remove();
}
if (m_scale_offset_buffer)
if (m_vertex_state_buffer)
{
m_scale_offset_buffer->remove();
m_vertex_state_buffer->remove();
}
if (m_index_ring_buffer)
@ -865,7 +862,7 @@ bool GLGSRender::do_method(u32 cmd, u32 arg)
return false;
}
bool GLGSRender::load_program()
bool GLGSRender::check_program_state()
{
auto rtt_lookup_func = [this](u32 texaddr, rsx::fragment_texture &tex, bool is_depth) -> std::tuple<bool, u16>
{
@ -887,12 +884,19 @@ bool GLGSRender::load_program()
return std::make_tuple(true, surface->get_native_pitch());
};
RSXFragmentProgram fragment_program = get_current_fragment_program(rtt_lookup_func);
if (!fragment_program.valid) return false;
get_current_fragment_program(rtt_lookup_func);
RSXVertexProgram vertex_program = get_current_vertex_program();
if (current_fragment_program.valid == false)
return false;
u32 unnormalized_rtts = 0;
get_current_vertex_program();
return true;
}
void GLGSRender::load_program(u32 vertex_base, u32 vertex_count)
{
auto &fragment_program = current_fragment_program;
auto &vertex_program = current_vertex_program;
for (auto &vtx : vertex_program.rsx_vertex_inputs)
{
@ -906,12 +910,13 @@ bool GLGSRender::load_program()
}
}
vertex_program.skip_vertex_input_check = true; //not needed for us since decoding is done server side
auto old_program = m_program;
m_program = &m_prog_buffer.getGraphicPipelineState(vertex_program, fragment_program, nullptr);
m_program->use();
u8 *buf;
u32 scale_offset_offset;
u32 vertex_state_offset;
u32 vertex_constants_offset;
u32 fragment_constants_offset;
@ -920,17 +925,20 @@ bool GLGSRender::load_program()
if (manually_flush_ring_buffers)
{
m_scale_offset_buffer->reserve_storage_on_heap(512);
m_vertex_state_buffer->reserve_storage_on_heap(512);
m_fragment_constants_buffer->reserve_storage_on_heap(align(fragment_buffer_size, 256));
if (m_transform_constants_dirty) m_transform_constants_buffer->reserve_storage_on_heap(8192);
}
// Scale offset
auto mapping = m_scale_offset_buffer->alloc_from_heap(512, m_uniform_buffer_offset_align);
// Vertex state
auto mapping = m_vertex_state_buffer->alloc_from_heap(512, m_uniform_buffer_offset_align);
buf = static_cast<u8*>(mapping.first);
scale_offset_offset = mapping.second;
vertex_state_offset = mapping.second;
fill_scale_offset_data(buf, false);
fill_user_clip_data((char *)buf + 64);
fill_user_clip_data(buf + 64);
*(reinterpret_cast<u32*>(buf + 128)) = rsx::method_registers.transform_branch_bits();
*(reinterpret_cast<u32*>(buf + 132)) = vertex_base;
fill_vertex_layout_state(m_vertex_layout, vertex_count, reinterpret_cast<s32*>(buf + 144));
if (m_transform_constants_dirty)
{
@ -939,7 +947,6 @@ bool GLGSRender::load_program()
buf = static_cast<u8*>(mapping.first);
vertex_constants_offset = mapping.second;
fill_vertex_program_constants_data(buf);
*(reinterpret_cast<u32*>(buf + (468 * 4 * sizeof(float)))) = rsx::method_registers.transform_branch_bits();
}
// Fragment constants
@ -952,21 +959,20 @@ bool GLGSRender::load_program()
// Fragment state
fill_fragment_state_buffer(buf+fragment_constants_size, fragment_program);
m_scale_offset_buffer->bind_range(0, scale_offset_offset, 512);
m_vertex_state_buffer->bind_range(0, vertex_state_offset, 512);
m_fragment_constants_buffer->bind_range(2, fragment_constants_offset, fragment_buffer_size);
if (m_transform_constants_dirty) m_transform_constants_buffer->bind_range(1, vertex_constants_offset, 8192);
if (manually_flush_ring_buffers)
{
m_scale_offset_buffer->unmap();
m_vertex_state_buffer->unmap();
m_fragment_constants_buffer->unmap();
if (m_transform_constants_dirty) m_transform_constants_buffer->unmap();
}
m_transform_constants_dirty = false;
return true;
}
void GLGSRender::flip(int buffer)
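
Taken together, the writes above define the 512-byte vertex state block backing the decompiler's std140 VertexContextBuffer shown later in this diff. The offsets are inferred from the fill_* calls; the struct itself is illustrative:

struct vertex_context_layout       // std140, binding = 0
{
	float scale_offset_mat[16];    // bytes 0..63, fill_scale_offset_data()
	s32 user_clip_enabled[2][4];   // bytes 64..95, fill_user_clip_data()
	float user_clip_factor[2][4];  // bytes 96..127, fill_user_clip_data()
	u32 transform_branch_bits;     // byte 128
	u32 vertex_base_index;         // byte 132
	u8 pad[8];                     // bytes 136..143; the next ivec4[] aligns to 16
	s32 input_attributes[16][4];   // bytes 144..399, fill_vertex_layout_state()
};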

View file

@ -50,12 +50,13 @@ private:
gl::texture_cache m_gl_texture_cache;
gl::texture m_gl_attrib_buffers[rsx::limits::vertex_count];
gl::texture m_gl_persistent_stream_buffer;
gl::texture m_gl_volatile_stream_buffer;
std::unique_ptr<gl::ring_buffer> m_attrib_ring_buffer;
std::unique_ptr<gl::ring_buffer> m_fragment_constants_buffer;
std::unique_ptr<gl::ring_buffer> m_transform_constants_buffer;
std::unique_ptr<gl::ring_buffer> m_scale_offset_buffer;
std::unique_ptr<gl::ring_buffer> m_vertex_state_buffer;
std::unique_ptr<gl::ring_buffer> m_index_ring_buffer;
u32 m_draw_calls = 0;
@ -83,11 +84,6 @@ private:
bool flush_draw_buffers = false;
bool m_last_draw_indexed;
GLenum m_last_ib_type;
size_t m_last_index_offset;
u32 m_last_vertex_count;
public:
gl::fbo draw_fbo;
@ -391,13 +387,16 @@ private:
gl_state;
// Return element to draw and in case of indexed draw index type and offset in index buffer
std::tuple<u32, std::optional<std::tuple<GLenum, u32> > > set_vertex_buffer();
std::tuple<u32, u32, u32, std::optional<std::tuple<GLenum, u32> > > set_vertex_buffer();
rsx::vertex_input_layout m_vertex_layout = {};
void clear_surface(u32 arg);
void init_buffers(bool skip_reading = false);
bool check_program_state();
void load_program(u32 vertex_base, u32 vertex_count);
public:
bool load_program();
void init_buffers(bool skip_reading = false);
void read_buffers();
void write_buffers();
void set_viewport();

View file

@ -154,8 +154,17 @@ namespace gl
}
//Workaround for intel drivers which have terrible capability reporting
std::string vendor_string = (const char*)glGetString(GL_VENDOR);
std::transform(vendor_string.begin(), vendor_string.end(), vendor_string.begin(), ::tolower);
std::string vendor_string;
if (const char* raw_string = (const char*)glGetString(GL_VENDOR))
{
vendor_string = raw_string;
std::transform(vendor_string.begin(), vendor_string.end(), vendor_string.begin(), ::tolower);
}
else
{
LOG_ERROR(RSX, "Failed to get vendor string from driver. Are we missing a context?");
vendor_string = "intel"; //lowest acceptable value
}
if (vendor_string.find("intel") != std::string::npos)
{
@ -826,7 +835,7 @@ namespace gl
buffer::create();
glBindBuffer((GLenum)m_target, m_id);
glBufferStorage((GLenum)m_target, size, data, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT);
glBufferStorage((GLenum)m_target, size, data, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_CLIENT_STORAGE_BIT | GL_MAP_COHERENT_BIT);
m_memory_mapping = glMapBufferRange((GLenum)m_target, 0, size, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT);
verify(HERE), m_memory_mapping != nullptr;
@ -2538,7 +2547,7 @@ namespace gl
error_msg = buf.get();
}
throw validation_exception(error_msg);
LOG_ERROR(RSX, "Validation failed: %s", error_msg.c_str());
}
}

View file

@ -54,24 +54,11 @@ struct GLTraits
result.uniforms[location] = (i + rsx::limits::fragment_textures_count);
}
//We use texture buffers for vertex attributes. Bind these here as well
//as they are guaranteed to be fixed (1 to 1 mapping)
std::array<const char*, 16> s_reg_table =
{
"in_pos_buffer", "in_weight_buffer", "in_normal_buffer",
"in_diff_color_buffer", "in_spec_color_buffer",
"in_fog_buffer",
"in_point_size_buffer", "in_7_buffer",
"in_tc0_buffer", "in_tc1_buffer", "in_tc2_buffer", "in_tc3_buffer",
"in_tc4_buffer", "in_tc5_buffer", "in_tc6_buffer", "in_tc7_buffer"
};
const int stream_buffer_start = rsx::limits::fragment_textures_count + rsx::limits::vertex_textures_count;
for (int i = 0; i < rsx::limits::vertex_count; ++i)
{
int location;
if (result.uniforms.has_location(s_reg_table[i], &location))
result.uniforms[location] = (i + rsx::limits::fragment_textures_count + rsx::limits::vertex_textures_count);
}
//Bind locations 0 and 1 to the stream buffers
result.uniforms[0] = stream_buffer_start;
result.uniforms[1] = stream_buffer_start + 1;
LOG_NOTICE(RSX, "*** prog id = %d", result.id());
LOG_NOTICE(RSX, "*** vp id = %d", vertexProgramData.id);

View file

@ -19,130 +19,6 @@ namespace
namespace
{
u32 to_gl_internal_type(rsx::vertex_base_type type, u8 size)
{
/**
* NOTE 1. The buffer texture spec only allows fetches aligned to 8, 16, 32, etc...
* This rules out most 3-component formats, except for the 32-wide RGB32F, RGB32I, RGB32UI
*
* NOTE 2. While s1 & cmp types are signed normalized 16-bit integers, some GPU vendors don't support texture buffer access
* using these formats. Pass a 16 bit unnormalized integer and convert it in the vertex shader
*/
const u32 vec1_types[] = { GL_R16I, GL_R32F, GL_R16F, GL_R8, GL_R16I, GL_RGBA16I, GL_R8UI };
const u32 vec2_types[] = { GL_RG16I, GL_RG32F, GL_RG16F, GL_RG8, GL_RG16I, GL_RGBA16I, GL_RG8UI };
const u32 vec3_types[] = { GL_RGBA16I, GL_RGB32F, GL_RGBA16F, GL_RGBA8, GL_RGBA16I, GL_RGBA16I, GL_RGBA8UI }; //VEC3 COMPONENTS NOT SUPPORTED!
const u32 vec4_types[] = { GL_RGBA16I, GL_RGBA32F, GL_RGBA16F, GL_RGBA8, GL_RGBA16I, GL_RGBA16I, GL_RGBA8UI };
const u32* vec_selectors[] = { 0, vec1_types, vec2_types, vec3_types, vec4_types };
if (type > rsx::vertex_base_type::ub256)
fmt::throw_exception("OpenGL error: unknown vertex base type 0x%x" HERE, (u32)type);
return vec_selectors[size][(int)type];
}
void prepare_buffer_for_writing(void *data, rsx::vertex_base_type type, u8 vertex_size, u32 vertex_count)
{
switch (type)
{
case rsx::vertex_base_type::sf:
{
if (vertex_size == 3)
{
/**
* Pad the 4th component for half-float arrays to 1, since texelfetch does not mask components
*/
u16 *dst = reinterpret_cast<u16*>(data);
for (u32 i = 0, idx = 3; i < vertex_count; ++i, idx += 4)
dst[idx] = 0x3c00;
}
break;
}
}
}
template<typename T, int count>
struct apply_attrib_t;
template<typename T>
struct apply_attrib_t<T, 1>
{
static void func(gl::glsl::program& program, int location, const T* data)
{
program.attribs[location] = data[0];
}
};
template<typename T>
struct apply_attrib_t<T, 2>
{
static void func(gl::glsl::program& program, int location, const T* data)
{
program.attribs[location] = color2_base<T>{ data[0], data[1] };
}
};
template<typename T>
struct apply_attrib_t<T, 3>
{
static void func(gl::glsl::program& program, int location, const T* data)
{
program.attribs[location] = color3_base<T>{ data[0], data[1], data[2] };
}
};
template<typename T>
struct apply_attrib_t<T, 4>
{
static void func(gl::glsl::program& program, int location, const T* data)
{
program.attribs[location] = color4_base<T>{ data[0], data[1], data[2], data[3] };
}
};
template<typename T, int count>
void apply_attrib_array(gl::glsl::program& program, int location, const std::vector<u8>& data)
{
for (size_t offset = 0; offset < data.size(); offset += count * sizeof(T))
{
apply_attrib_t<T, count>::func(program, location, (T*)(data.data() + offset));
}
}
gl::buffer_pointer::type gl_types(rsx::vertex_base_type type)
{
switch (type)
{
case rsx::vertex_base_type::s1: return gl::buffer_pointer::type::s16;
case rsx::vertex_base_type::f: return gl::buffer_pointer::type::f32;
case rsx::vertex_base_type::sf: return gl::buffer_pointer::type::f16;
case rsx::vertex_base_type::ub: return gl::buffer_pointer::type::u8;
case rsx::vertex_base_type::s32k: return gl::buffer_pointer::type::s32;
case rsx::vertex_base_type::cmp: return gl::buffer_pointer::type::s16; // Needs conversion
case rsx::vertex_base_type::ub256: gl::buffer_pointer::type::u8;
}
fmt::throw_exception("unknown vertex type" HERE);
}
bool gl_normalized(rsx::vertex_base_type type)
{
switch (type)
{
case rsx::vertex_base_type::s1:
case rsx::vertex_base_type::ub:
case rsx::vertex_base_type::cmp:
return true;
case rsx::vertex_base_type::f:
case rsx::vertex_base_type::sf:
case rsx::vertex_base_type::ub256:
case rsx::vertex_base_type::s32k:
return false;
}
fmt::throw_exception("unknown vertex type" HERE);
}
// return vertex count if primitive type is not native (empty array otherwise)
std::tuple<u32, u32> get_index_array_for_emulated_non_indexed_draw(const std::vector<std::pair<u32, u32>> &first_count_commands, rsx::primitive_type primitive_mode, gl::ring_buffer &dst)
{
@ -195,85 +71,13 @@ namespace
throw;
}
struct vertex_buffer_visitor
struct vertex_input_state
{
vertex_buffer_visitor(u32 vtx_cnt, gl::ring_buffer& heap, gl::glsl::program* prog, gl::texture* attrib_buffer, u32 min_texbuffer_offset, gl::vertex_cache* vertex_cache)
: vertex_count(vtx_cnt)
, m_attrib_ring_info(heap)
, m_program(prog)
, m_gl_attrib_buffers(attrib_buffer)
, m_min_texbuffer_alignment(min_texbuffer_offset)
, m_vertex_cache(vertex_cache)
{
}
void operator()(const rsx::vertex_array_buffer& vertex_array)
{
int location;
if (!m_program->uniforms.has_location(s_reg_table[vertex_array.index], &location))
return;
GLenum gl_type = to_gl_internal_type(vertex_array.type, vertex_array.attribute_size);
auto& texture = m_gl_attrib_buffers[vertex_array.index];
const u32 element_size = rsx::get_vertex_type_size_on_host(vertex_array.type, vertex_array.attribute_size);
const u32 data_size = vertex_count * element_size;
const uintptr_t local_addr = (uintptr_t)vertex_array.data.data();
u32 buffer_offset = 0;
if (auto uploaded = m_vertex_cache->find_vertex_range(local_addr, gl_type, data_size))
{
buffer_offset = uploaded->offset_in_heap;
}
else
{
// Fill vertex_array
auto mapping = m_attrib_ring_info.alloc_from_heap(data_size, m_min_texbuffer_alignment);
gsl::byte* dst = static_cast<gsl::byte*>(mapping.first);
buffer_offset = mapping.second;
gsl::span<gsl::byte> dest_span(dst, data_size);
m_vertex_cache->store_range(local_addr, gl_type, data_size, buffer_offset);
write_vertex_array_data_to_buffer(dest_span, vertex_array.data, vertex_count, vertex_array.type, vertex_array.attribute_size, vertex_array.stride, rsx::get_vertex_type_size_on_host(vertex_array.type, vertex_array.attribute_size));
prepare_buffer_for_writing(dst, vertex_array.type, vertex_array.attribute_size, vertex_count);
}
texture.copy_from(m_attrib_ring_info, gl_type, buffer_offset, data_size);
}
void operator()(const rsx::vertex_array_register& vertex_register)
{
int location;
if (!m_program->uniforms.has_location(s_reg_table[vertex_register.index], &location))
return;
const u32 element_size = rsx::get_vertex_type_size_on_host(vertex_register.type, vertex_register.attribute_size);
const u32 gl_type = to_gl_internal_type(vertex_register.type, vertex_register.attribute_size);
const u32 data_size = element_size;
auto& texture = m_gl_attrib_buffers[vertex_register.index];
auto mapping = m_attrib_ring_info.alloc_from_heap(data_size, m_min_texbuffer_alignment);
u8 *dst = static_cast<u8*>(mapping.first);
memcpy(dst, vertex_register.data.data(), element_size);
prepare_buffer_for_writing(dst, vertex_register.type, vertex_register.attribute_size, vertex_count);
texture.copy_from(m_attrib_ring_info, gl_type, mapping.second, data_size);
}
void operator()(const rsx::empty_vertex_array& vbo)
{
}
protected:
u32 vertex_count;
gl::ring_buffer& m_attrib_ring_info;
gl::glsl::program* m_program;
gl::texture* m_gl_attrib_buffers;
GLint m_min_texbuffer_alignment;
gl::vertex_cache* m_vertex_cache;
u32 vertex_draw_count;
u32 allocated_vertex_count;
u32 vertex_data_base;
u32 vertex_index_base;
std::optional<std::tuple<GLenum, u32>> index_info;
};
struct draw_command_visitor
@ -281,54 +85,32 @@ namespace
using attribute_storage = std::vector<
std::variant<rsx::vertex_array_buffer, rsx::vertex_array_register, rsx::empty_vertex_array>>;
draw_command_visitor(gl::ring_buffer& index_ring_buffer, gl::ring_buffer& attrib_ring_buffer,
gl::texture* gl_attrib_buffers, gl::glsl::program* program, GLint min_texbuffer_alignment,
gl::vertex_cache* vertex_cache,
std::function<attribute_storage(rsx::rsx_state, std::vector<std::pair<u32, u32>>)> gvb)
draw_command_visitor(gl::ring_buffer& index_ring_buffer, rsx::vertex_input_layout& vertex_layout)
: m_index_ring_buffer(index_ring_buffer)
, m_attrib_ring_buffer(attrib_ring_buffer)
, m_gl_attrib_buffers(gl_attrib_buffers)
, m_program(program)
, m_min_texbuffer_alignment(min_texbuffer_alignment)
, get_vertex_buffers(gvb)
, m_vertex_cache(vertex_cache)
{
for (u8 index = 0; index < rsx::limits::vertex_count; ++index) {
if (rsx::method_registers.vertex_arrays_info[index].size() ||
rsx::method_registers.register_vertex_info[index].size)
{
max_vertex_attrib_size += 16;
}
}
}
, m_vertex_layout(vertex_layout)
{}
std::tuple<u32, std::optional<std::tuple<GLenum, u32>>> operator()(
const rsx::draw_array_command& command)
vertex_input_state operator()(const rsx::draw_array_command& command)
{
u32 vertex_count = rsx::method_registers.current_draw_clause.get_elements_count();
u32 min_index = rsx::method_registers.current_draw_clause.first_count_commands.front().first;
u32 max_index = vertex_count - 1 + min_index;
if (!gl::is_primitive_native(rsx::method_registers.current_draw_clause.primitive)) {
if (!gl::is_primitive_native(rsx::method_registers.current_draw_clause.primitive))
{
u32 index_count;
u32 offset_in_index_buffer;
std::tie(index_count, offset_in_index_buffer) = get_index_array_for_emulated_non_indexed_draw(
rsx::method_registers.current_draw_clause.first_count_commands,
rsx::method_registers.current_draw_clause.primitive, m_index_ring_buffer);
upload_vertex_buffers(min_index, max_index, max_vertex_attrib_size);
return std::make_tuple(index_count,
std::make_tuple(static_cast<GLenum>(GL_UNSIGNED_SHORT), offset_in_index_buffer));
return{ index_count, vertex_count, min_index, 0, std::make_tuple(static_cast<GLenum>(GL_UNSIGNED_SHORT), offset_in_index_buffer) };
}
upload_vertex_buffers(min_index, max_index, max_vertex_attrib_size);
return std::make_tuple(vertex_count, std::optional<std::tuple<GLenum, u32>>());
return{ vertex_count, vertex_count, min_index, 0, std::optional<std::tuple<GLenum, u32>>() };
}
std::tuple<u32, std::optional<std::tuple<GLenum, u32>>> operator()(
const rsx::draw_indexed_array_command& command)
vertex_input_state operator()(const rsx::draw_indexed_array_command& command)
{
u32 min_index = 0, max_index = 0;
@ -338,7 +120,7 @@ namespace
u32 type_size = ::narrow<u32>(get_index_type_size(type));
u32 vertex_count = rsx::method_registers.current_draw_clause.get_elements_count();
const u32 vertex_count = rsx::method_registers.current_draw_clause.get_elements_count();
u32 index_count = vertex_count;
if (!gl::is_primitive_native(rsx::method_registers.current_draw_clause.primitive))
@ -352,129 +134,114 @@ namespace
std::tie(min_index, max_index, index_count) = upload_index_buffer(
command.raw_index_buffer, ptr, type, rsx::method_registers.current_draw_clause.primitive,
rsx::method_registers.current_draw_clause.first_count_commands, vertex_count);
upload_vertex_buffers(0, max_index, max_vertex_attrib_size);
return std::make_tuple(index_count, std::make_tuple(get_index_type(type), offset_in_index_buffer));
//Check for vertex arrays with frequency modifiers
for (auto &block : m_vertex_layout.interleaved_blocks)
{
if (block.min_divisor > 1)
{
//Ignore base offsets and return real results
//The upload function will optimize the uploaded range anyway
return{ index_count, max_index, 0, 0, std::make_tuple(get_index_type(type), offset_in_index_buffer) };
}
}
//Prefer only reading the vertices that are referenced in the index buffer itself
//Offset data source by min_index verts, but also notify the shader to offset the vertexID
return{ index_count, (max_index - min_index + 1), min_index, min_index, std::make_tuple(get_index_type(type), offset_in_index_buffer) };
}
std::tuple<u32, std::optional<std::tuple<GLenum, u32>>> operator()(
const rsx::draw_inlined_array& command)
vertex_input_state operator()(const rsx::draw_inlined_array& command)
{
// We need to go through the array to determine the vertex count, so upload it
u32 vertex_count = upload_inline_array(max_vertex_attrib_size);
u32 vertex_count = (u32)command.inline_vertex_array.size() * sizeof(u32) / m_vertex_layout.interleaved_blocks[0].attribute_stride;
if (!gl::is_primitive_native(rsx::method_registers.current_draw_clause.primitive)) {
if (!gl::is_primitive_native(rsx::method_registers.current_draw_clause.primitive))
{
u32 offset_in_index_buffer;
u32 index_count;
std::tie(index_count, offset_in_index_buffer) = get_index_array_for_emulated_non_indexed_draw(
{ std::make_pair(0, vertex_count) },
rsx::method_registers.current_draw_clause.primitive, m_index_ring_buffer);
return std::make_tuple(index_count,
std::make_tuple(static_cast<GLenum>(GL_UNSIGNED_SHORT), offset_in_index_buffer));
rsx::method_registers.current_draw_clause.primitive, m_index_ring_buffer);
return{ index_count, vertex_count, 0, 0, std::make_tuple(static_cast<GLenum>(GL_UNSIGNED_SHORT), offset_in_index_buffer) };
}
return std::make_tuple(vertex_count, std::optional<std::tuple<GLenum, u32>>());
return{ vertex_count, vertex_count, 0, 0, std::optional<std::tuple<GLenum, u32>>() };
}
private:
u32 max_vertex_attrib_size = 0;
gl::ring_buffer& m_index_ring_buffer;
gl::ring_buffer& m_attrib_ring_buffer;
gl::texture* m_gl_attrib_buffers;
gl::vertex_cache* m_vertex_cache;
gl::glsl::program* m_program;
GLint m_min_texbuffer_alignment;
std::function<attribute_storage(rsx::rsx_state, std::vector<std::pair<u32, u32>>)>
get_vertex_buffers;
void upload_vertex_buffers(u32 min_index, u32 max_index, const u32& max_vertex_attrib_size)
{
u32 verts_allocated = max_index - min_index + 1;
vertex_buffer_visitor visitor(verts_allocated, m_attrib_ring_buffer,
m_program, m_gl_attrib_buffers, m_min_texbuffer_alignment, m_vertex_cache);
const auto& vertex_buffers =
get_vertex_buffers(rsx::method_registers, {{min_index, verts_allocated}});
for (const auto& vbo : vertex_buffers) std::apply_visitor(visitor, vbo);
}
u32 upload_inline_array(const u32& max_vertex_attrib_size)
{
u32 stride = 0;
u32 offsets[rsx::limits::vertex_count] = {0};
for (u32 i = 0; i < rsx::limits::vertex_count; ++i) {
const auto& info = rsx::method_registers.vertex_arrays_info[i];
if (!info.size()) continue;
offsets[i] = stride;
stride += rsx::get_vertex_type_size_on_host(info.type(), info.size());
}
u32 vertex_draw_count =
(u32)(rsx::method_registers.current_draw_clause.inline_vertex_array.size() * sizeof(u32)) /
stride;
for (int index = 0; index < rsx::limits::vertex_count; ++index) {
auto& vertex_info = rsx::method_registers.vertex_arrays_info[index];
int location;
if (!m_program->uniforms.has_location(s_reg_table[index], &location)) continue;
if (!vertex_info.size())
continue;
const u32 element_size =
rsx::get_vertex_type_size_on_host(vertex_info.type(), vertex_info.size());
u32 data_size = element_size * vertex_draw_count;
u32 gl_type = to_gl_internal_type(vertex_info.type(), vertex_info.size());
auto& texture = m_gl_attrib_buffers[index];
u8* src =
reinterpret_cast<u8*>(rsx::method_registers.current_draw_clause.inline_vertex_array.data());
auto mapping = m_attrib_ring_buffer.alloc_from_heap(data_size, m_min_texbuffer_alignment);
u8* dst = static_cast<u8*>(mapping.first);
src += offsets[index];
prepare_buffer_for_writing(dst, vertex_info.type(), vertex_info.size(), vertex_draw_count);
// TODO: properly handle compressed data
for (u32 i = 0; i < vertex_draw_count; ++i) {
if (vertex_info.type() == rsx::vertex_base_type::ub && vertex_info.size() == 4) {
dst[0] = src[3];
dst[1] = src[2];
dst[2] = src[1];
dst[3] = src[0];
}
else
memcpy(dst, src, element_size);
src += stride;
dst += element_size;
}
texture.copy_from(m_attrib_ring_buffer, gl_type, mapping.second, data_size);
}
return vertex_draw_count;
}
rsx::vertex_input_layout& m_vertex_layout;
};
}
std::tuple<u32, std::optional<std::tuple<GLenum, u32>>> GLGSRender::set_vertex_buffer()
std::tuple<u32, u32, u32, std::optional<std::tuple<GLenum, u32>>> GLGSRender::set_vertex_buffer()
{
std::chrono::time_point<steady_clock> then = steady_clock::now();
auto result = std::apply_visitor(draw_command_visitor(*m_index_ring_buffer, *m_attrib_ring_buffer,
m_gl_attrib_buffers, m_program, m_min_texbuffer_alignment,
m_vertex_cache.get(),
[this](const auto& state, const auto& list) {
return this->get_vertex_buffers(state, list, 0);
}),
get_draw_command(rsx::method_registers));
m_vertex_layout = analyse_inputs_interleaved();
//Write index buffers and count verts
auto result = std::apply_visitor(draw_command_visitor(*m_index_ring_buffer, m_vertex_layout), get_draw_command(rsx::method_registers));
auto &vertex_count = result.allocated_vertex_count;
auto &vertex_base = result.vertex_data_base;
//Do actual vertex upload
auto &required = calculate_memory_requirements(m_vertex_layout, vertex_count);
std::pair<void*, u32> persistent_mapping = {}, volatile_mapping = {};
if (required.first > 0)
{
//Check if cacheable
//Only data in the 'persistent' block may be cached
//TODO: make vertex cache keep local data beyond frame boundaries and hook notify command
bool in_cache = false;
bool to_store = false;
u32 storage_address = UINT32_MAX;
if (m_vertex_layout.interleaved_blocks.size() == 1 &&
rsx::method_registers.current_draw_clause.command != rsx::draw_command::inlined_array)
{
storage_address = m_vertex_layout.interleaved_blocks[0].real_offset_address + vertex_base;
if (auto cached = m_vertex_cache->find_vertex_range(storage_address, GL_R8UI, required.first))
{
in_cache = true;
m_gl_persistent_stream_buffer.copy_from(*m_attrib_ring_buffer, GL_R8UI, cached->offset_in_heap, required.first);
}
else
{
to_store = true;
}
}
if (!in_cache)
{
persistent_mapping = m_attrib_ring_buffer->alloc_from_heap(required.first, m_min_texbuffer_alignment);
m_gl_persistent_stream_buffer.copy_from(*m_attrib_ring_buffer, GL_R8UI, persistent_mapping.second, required.first);
if (to_store)
{
//store ref in vertex cache
m_vertex_cache->store_range(storage_address, GL_R8UI, required.first, persistent_mapping.second);
}
}
}
if (required.second > 0)
{
volatile_mapping = m_attrib_ring_buffer->alloc_from_heap(required.second, m_min_texbuffer_alignment);
m_gl_volatile_stream_buffer.copy_from(*m_attrib_ring_buffer, GL_R8UI, volatile_mapping.second, required.second);
}
//Write all the data
write_vertex_data_to_memory(m_vertex_layout, vertex_base, vertex_count, persistent_mapping.first, volatile_mapping.first);
std::chrono::time_point<steady_clock> now = steady_clock::now();
m_vertex_upload_time += std::chrono::duration_cast<std::chrono::microseconds>(now - then).count();
return result;
return std::make_tuple(result.vertex_draw_count, result.allocated_vertex_count, result.vertex_index_base, result.index_info);
}
namespace
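
For reference, the four values handed back to GLGSRender::end() unpack as follows (names taken from vertex_input_state above; the comments paraphrase how each value is used in this diff):

u32 vertex_draw_count;      // count passed to glDrawArrays/glDrawElements (index count when indexed)
u32 allocated_vertex_count; // vertices actually written to the attribute streams
u32 vertex_index_base;      // first vertex sourced; the shader subtracts this from gl_VertexID
std::optional<std::tuple<GLenum, u32>> index_info; // index type and heap offset for indexed draws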

View file

@ -31,57 +31,21 @@ std::string GLVertexDecompilerThread::compareFunction(COMPARE f, const std::stri
void GLVertexDecompilerThread::insertHeader(std::stringstream &OS)
{
OS << "#version 430\n\n";
OS << "layout(std140, binding = 0) uniform ScaleOffsetBuffer\n";
OS << "layout(std140, binding = 0) uniform VertexContextBuffer\n";
OS << "{\n";
OS << " mat4 scaleOffsetMat;\n";
OS << " ivec4 userClipEnabled[2];\n";
OS << " vec4 userClipFactor[2];\n";
OS << "};\n";
OS << " mat4 scale_offset_mat;\n";
OS << " ivec4 user_clip_enabled[2];\n";
OS << " vec4 user_clip_factor[2];\n";
OS << " uint transform_branch_bits;\n";
OS << " uint vertex_base_index;\n";
OS << " ivec4 input_attributes[16];\n";
OS << "};\n\n";
}
void GLVertexDecompilerThread::insertInputs(std::stringstream & OS, const std::vector<ParamType>& inputs)
{
std::vector<std::tuple<size_t, std::string>> input_data;
for (const ParamType &PT : inputs)
{
for (const ParamItem &PI : PT.items)
{
input_data.push_back(std::make_tuple(PI.location, PI.name));
}
}
/**
* It is important that the locations are in the order that vertex attributes are expected.
* If order is not adhered to, channels may be swapped leading to corruption
*/
std::sort(input_data.begin(), input_data.end());
int location = 1;
for (const std::tuple<size_t, std::string>& item : input_data)
{
for (const ParamType &PT : inputs)
{
for (const ParamItem &PI : PT.items)
{
if (PI.name == std::get<1>(item))
{
bool is_int = false;
for (const auto &attrib : rsx_vertex_program.rsx_vertex_inputs)
{
if (attrib.location == std::get<0>(item))
{
if (attrib.int_type || attrib.flags & GL_VP_SINT_MASK) is_int = true;
break;
}
}
std::string samplerType = is_int ? "isamplerBuffer" : "samplerBuffer";
OS << "layout(location=" << location++ << ")" << " uniform " << samplerType << " " << PI.name << "_buffer;\n";
}
}
}
}
OS << "layout(location=0) uniform usamplerBuffer persistent_input_stream;\n"; //Data stream with persistent vertex data (cacheable)
OS << "layout(location=1) uniform usamplerBuffer volatile_input_stream;\n"; //Data stream with per-draw data (registers and immediate draw data)
}
void GLVertexDecompilerThread::insertConstants(std::stringstream & OS, const std::vector<ParamType> & constants)
@ -89,7 +53,6 @@ void GLVertexDecompilerThread::insertConstants(std::stringstream & OS, const std
OS << "layout(std140, binding = 1) uniform VertexConstantsBuffer\n";
OS << "{\n";
OS << " vec4 vc[468];\n";
OS << " uint transform_branch_bits;\n";
OS << "};\n\n";
for (const ParamType &PT: constants)
@ -115,13 +78,13 @@ static const vertex_reg_info reg_table[] =
//Fog output shares a data source register with clip planes 0-2 so only declare when specified
{ "fog_c", true, "dst_reg5", ".xxxx", true, "", "", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_FOG },
//Warning: Always define all 3 clip plane groups together to avoid flickering with OpenGL
{ "gl_ClipDistance[0]", false, "dst_reg5", ".y * userClipFactor[0].x", false, "userClipEnabled[0].x > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC0 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC1 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC2 },
{ "gl_ClipDistance[1]", false, "dst_reg5", ".z * userClipFactor[0].y", false, "userClipEnabled[0].y > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC0 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC1 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC2 },
{ "gl_ClipDistance[2]", false, "dst_reg5", ".w * userClipFactor[0].z", false, "userClipEnabled[0].z > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC0 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC1 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC2 },
{ "gl_ClipDistance[0]", false, "dst_reg5", ".y * user_clip_factor[0].x", false, "user_clip_enabled[0].x > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC0 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC1 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC2 },
{ "gl_ClipDistance[1]", false, "dst_reg5", ".z * user_clip_factor[0].y", false, "user_clip_enabled[0].y > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC0 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC1 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC2 },
{ "gl_ClipDistance[2]", false, "dst_reg5", ".w * user_clip_factor[0].z", false, "user_clip_enabled[0].z > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC0 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC1 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC2 },
{ "gl_PointSize", false, "dst_reg6", ".x", false },
{ "gl_ClipDistance[3]", false, "dst_reg6", ".y * userClipFactor[0].w", false, "userClipEnabled[0].w > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC3 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC4 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC5 },
{ "gl_ClipDistance[4]", false, "dst_reg6", ".z * userClipFactor[1].x", false, "userClipEnabled[1].x > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC3 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC4 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC5 },
{ "gl_ClipDistance[5]", false, "dst_reg6", ".w * userClipFactor[1].y", false, "userClipEnabled[1].y > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC3 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC4 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC5 },
{ "gl_ClipDistance[3]", false, "dst_reg6", ".y * user_clip_factor[0].w", false, "user_clip_enabled[0].w > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC3 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC4 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC5 },
{ "gl_ClipDistance[4]", false, "dst_reg6", ".z * user_clip_factor[1].x", false, "user_clip_enabled[1].x > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC3 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC4 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC5 },
{ "gl_ClipDistance[5]", false, "dst_reg6", ".w * user_clip_factor[1].y", false, "user_clip_enabled[1].y > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC3 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC4 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC5 },
{ "tc0", true, "dst_reg7", "", false, "", "", "", false, CELL_GCM_ATTRIB_OUTPUT_MASK_TEX0 },
{ "tc1", true, "dst_reg8", "", false, "", "", "", false, CELL_GCM_ATTRIB_OUTPUT_MASK_TEX1 },
{ "tc2", true, "dst_reg9", "", false, "", "", "", false, CELL_GCM_ATTRIB_OUTPUT_MASK_TEX2 },
@ -206,57 +169,175 @@ namespace
}
}
void add_input(std::stringstream & OS, const ParamItem &PI, const std::vector<rsx_vertex_input> &inputs)
void insert_vertex_input_fetch(std::stringstream& OS)
{
for (const auto &real_input : inputs)
{
if (real_input.location != PI.location)
continue;
//Actually decode a vertex attribute from a raw byte stream
OS << "struct attribute_desc\n";
OS << "{\n";
OS << " int type;\n";
OS << " int attribute_size;\n";
OS << " int starting_offset;\n";
OS << " int stride;\n";
OS << " int swap_bytes;\n";
OS << " int is_volatile;\n";
OS << " int frequency;\n";
OS << " int divisor;\n";
OS << " int modulo;\n";
OS << "};\n\n";
std::string vecType = " vec4 ";
if (real_input.int_type)
vecType = " ivec4 ";
OS << "uint get_bits(uvec4 v, int swap)\n";
OS << "{\n";
OS << " if (swap != 0) return (v.w | v.z << 8 | v.y << 16 | v.x << 24);\n";
OS << " return (v.x | v.y << 8 | v.z << 16 | v.w << 24);\n";
OS << "}\n\n";
std::string scale = "";
if (real_input.flags & GL_VP_SINT_MASK)
{
if (real_input.flags & GL_VP_ATTRIB_S16_INT)
scale = " / " + expand_to_vec4("32767.", real_input.size);
else
scale = " / " + expand_to_vec4("2147483647.", real_input.size);
}
OS << "uint get_bits(uvec2 v, int swap)\n";
OS << "{\n";
OS << " if (swap != 0) return (v.y | v.x << 8);\n";
OS << " return (v.x | v.y << 8);\n";
OS << "}\n\n";
if (!real_input.is_array)
{
OS << vecType << PI.name << " = texelFetch(" << PI.name << "_buffer, 0)" << scale << ";\n";
return;
}
OS << "int preserve_sign_s16(uint bits)\n";
OS << "{\n";
OS << " //convert raw 16 bit value into signed 32-bit integer counterpart\n";
OS << " uint sign = bits & 0x8000;\n";
OS << " if (sign != 0) return int(bits | 0xFFFF0000);\n";
OS << " return int(bits);\n";
OS << "}\n\n";
if (real_input.frequency > 1)
{
if (real_input.is_modulo)
{
OS << vecType << PI.name << "= texelFetch(" << PI.name << "_buffer, gl_VertexID %" << real_input.frequency << ")" << scale << ";\n";
return;
}
OS << "float convert_to_f32(uint bits)\n";
OS << "{\n";
OS << " uint sign = (bits >> 31) & 1;\n";
OS << " uint exp = (bits >> 23) & 0xff;\n";
OS << " uint mantissa = bits & 0x7fffff;\n";
OS << " float base = (sign != 0)? -1.f: 1.f;\n";
OS << " base *= exp2(exp - 127);\n";
OS << " float scale = 0.f;\n\n";
OS << " for (int x = 0; x < 23; x++)\n";
OS << " {\n";
OS << " int inv = (22 - x);\n";
OS << " if ((mantissa & (1 << inv)) == 0) continue;\n";
OS << " scale += 1.f / pow(2.f, float(inv));\n";
OS << " }\n";
OS << " return base * scale;\n";
OS << "}\n";
OS << vecType << PI.name << "= texelFetch(" << PI.name << "_buffer, gl_VertexID /" << real_input.frequency << ")" << scale << ";\n";
return;
}
OS << "#define get_s16(v, s) preserve_sign_s16(get_bits(v, s))\n\n";
OS << vecType << PI.name << "= texelFetch(" << PI.name << "_buffer, gl_VertexID)" << scale << ";\n";
return;
}
OS << "vec4 fetch_attribute(attribute_desc desc, int vertex_id, usamplerBuffer input_stream)\n";
OS << "{\n";
OS << " vec4 result = vec4(0., 0., 0., 1.);\n";
OS << " vec4 scale = vec4(1.);\n";
OS << " uvec4 tmp;\n";
OS << " uint bits;\n";
OS << " bool reverse_order = false;\n";
OS << "\n";
OS << " int first_byte = (vertex_id * desc.stride) + desc.starting_offset;\n";
OS << " for (int n = 0; n < desc.attribute_size; n++)\n";
OS << " {\n";
OS << " switch (desc.type)\n";
OS << " {\n";
OS << " case 0:\n";
OS << " //signed normalized 16-bit\n";
OS << " tmp[0] = texelFetch(input_stream, first_byte++).x;\n";
OS << " tmp[1] = texelFetch(input_stream, first_byte++).x;\n";
OS << " result[n] = get_s16(tmp.xy, desc.swap_bytes);\n";
OS << " scale[n] = 32767.;\n";
OS << " break;\n";
OS << " case 1:\n";
OS << " //float\n";
OS << " tmp[0] = texelFetch(input_stream, first_byte++).x;\n";
OS << " tmp[1] = texelFetch(input_stream, first_byte++).x;\n";
OS << " tmp[2] = texelFetch(input_stream, first_byte++).x;\n";
OS << " tmp[3] = texelFetch(input_stream, first_byte++).x;\n";
OS << " result[n] = uintBitsToFloat(get_bits(tmp, desc.swap_bytes));\n";
OS << " break;\n";
OS << " case 2:\n";
OS << " //half\n";
OS << " tmp[0] = texelFetch(input_stream, first_byte++).x;\n";
OS << " tmp[1] = texelFetch(input_stream, first_byte++).x;\n";
OS << " result[n] = unpackHalf2x16(uint(get_bits(tmp.xy, desc.swap_bytes))).x;\n";
OS << " break;\n";
OS << " case 3:\n";
OS << " //unsigned byte\n";
OS << " result[n] = texelFetch(input_stream, first_byte++).x;\n";
OS << " scale[n] = 255.;\n";
OS << " reverse_order = (desc.swap_bytes != 0);\n";
OS << " break;\n";
OS << " case 4:\n";
OS << " //signed word\n";
OS << " tmp[0] = texelFetch(input_stream, first_byte++).x;\n";
OS << " tmp[1] = texelFetch(input_stream, first_byte++).x;\n";
OS << " result[n] = get_s16(tmp.xy, desc.swap_bytes);\n";
OS << " break;\n";
OS << " case 5:\n";
OS << " //cmp\n";
OS << " tmp[0] = texelFetch(input_stream, first_byte++).x;\n";
OS << " tmp[1] = texelFetch(input_stream, first_byte++).x;\n";
OS << " tmp[2] = texelFetch(input_stream, first_byte++).x;\n";
OS << " tmp[3] = texelFetch(input_stream, first_byte++).x;\n";
OS << " bits = get_bits(tmp, desc.swap_bytes);\n";
OS << " result.x = preserve_sign_s16((bits & 0x7FF) << 5);\n";
OS << " result.y = preserve_sign_s16(((bits >> 11) & 0x7FF) << 5);\n";
OS << " result.z = preserve_sign_s16(((bits >> 22) & 0x3FF) << 6);\n";
OS << " result.w = 1.;\n";
OS << " scale = vec4(32767., 32767., 32767., 1.);\n";
OS << " break;\n";
OS << " case 6:\n";
OS << " //ub256\n";
OS << " result[n] = float(texelFetch(input_stream, first_byte++).x);\n";
OS << " reverse_order = (desc.swap_bytes != 0);\n";
OS << " break;\n";
OS << " }\n";
OS << " }\n\n";
OS << " result /= scale;\n";
OS << " return (reverse_order)? result.wzyx: result;\n";
OS << "}\n\n";
LOG_WARNING(RSX, "Vertex input %s does not have a matching vertex_input declaration", PI.name.c_str());
OS << "attribute_desc fetch_desc(int location)\n";
OS << "{\n";
OS << " attribute_desc result;\n";
OS << " int attribute_flags = input_attributes[location].w;\n";
OS << " result.type = input_attributes[location].x;\n";
OS << " result.attribute_size = input_attributes[location].y;\n";
OS << " result.starting_offset = input_attributes[location].z;\n";
OS << " result.stride = attribute_flags & 0xFF;\n";
OS << " result.swap_bytes = (attribute_flags >> 8) & 0x1;\n";
OS << " result.is_volatile = (attribute_flags >> 9) & 0x1;\n";
OS << " result.frequency = (attribute_flags >> 10) & 0x3;\n";
OS << " result.modulo = (attribute_flags >> 12) & 0x1;\n";
OS << " result.divisor = (attribute_flags >> 13) & 0xFFFF;\n";
OS << " return result;\n";
OS << "}\n\n";
OS << " vec4 " << PI.name << "= texelFetch(" << PI.name << "_buffer, gl_VertexID);\n";
OS << "vec4 read_location(int location)\n";
OS << "{\n";
OS << " attribute_desc desc = fetch_desc(location);\n";
OS << "\n";
OS << " int vertex_id = gl_VertexID - int(vertex_base_index);\n";
OS << " if (desc.frequency == 0)\n";
OS << " vertex_id = 0;\n";
OS << " else if (desc.frequency > 1)\n";
OS << " {\n";
OS << " //if a vertex modifier is active; vertex_base must be 0 and is ignored\n";
OS << " if (desc.modulo != 0)\n";
OS << " vertex_id = gl_VertexID % desc.divisor;\n";
OS << " else\n";
OS << " vertex_id = gl_VertexID / desc.divisor;\n";
OS << " }\n";
OS << "\n";
OS << " if (desc.is_volatile != 0)\n";
OS << " return fetch_attribute(desc, vertex_id, volatile_input_stream);\n";
OS << " else\n";
OS << " return fetch_attribute(desc, vertex_id, persistent_input_stream);\n";
OS << "}\n\n";
}
}
void GLVertexDecompilerThread::insertMainStart(std::stringstream & OS)
{
insert_glsl_legacy_function(OS, gl::glsl::glsl_vertex_program);
insert_vertex_input_fetch(OS);
std::string parameters = "";
for (int i = 0; i < 16; ++i)
@ -293,7 +374,9 @@ void GLVertexDecompilerThread::insertMainStart(std::stringstream & OS)
for (const ParamType &PT : m_parr.params[PF_PARAM_IN])
{
for (const ParamItem &PI : PT.items)
add_input(OS, PI, rsx_vertex_program.rsx_vertex_inputs);
{
OS << " vec4 " << PI.name << "= read_location(" << std::to_string(PI.location) << ");\n";
}
}
for (const ParamType &PT : m_parr.params[PF_PARAM_UNIFORM])
@ -401,7 +484,7 @@ void GLVertexDecompilerThread::insertMainEnd(std::stringstream & OS)
if (m_parr.HasParam(PF_PARAM_NONE, "vec4", "dst_reg2"))
OS << " front_spec_color = dst_reg2;\n";
OS << " gl_Position = gl_Position * scaleOffsetMat;\n";
OS << " gl_Position = gl_Position * scale_offset_mat;\n";
//Since our clip_space is symmetrical [-1, 1] we map it to linear space using the eqn:
//ln = (clip * 2) - 1 to fully utilize the 0-1 range of the depth buffer

View file

@ -30,6 +30,7 @@ protected:
virtual void insertMainEnd(std::stringstream &OS) override;
const RSXVertexProgram &rsx_vertex_program;
std::unordered_map<std::string, int> input_locations;
public:
GLVertexDecompilerThread(const RSXVertexProgram &prog, std::string& shader, ParamArray&)
: VertexProgramDecompiler(prog)
@ -50,6 +51,7 @@ public:
ParamArray parr;
u32 id = 0;
std::string shader;
bool interleaved;
void Decompile(const RSXVertexProgram& prog);
void Compile();

View file

@ -125,7 +125,7 @@ namespace rsx
return sizeof(u8) * 4;
}
fmt::throw_exception("Wrong vector size" HERE);
case vertex_base_type::cmp: return sizeof(u16) * 4;
case vertex_base_type::cmp: return 4;
case vertex_base_type::ub256: verify(HERE), (size == 4); return sizeof(u8) * 4;
}
fmt::throw_exception("RSXVertexData::GetTypeSize: Bad vertex data type (%d)!" HERE, (u8)type);
@ -309,32 +309,6 @@ namespace rsx
return (u32)element_push_buffer.size();
}
bool thread::is_probable_instanced_draw()
{
if (!g_cfg.video.batch_instanced_geometry)
return false;
//If the array registers have not been touched, the index array has also not been touched via notify or via register set; it's likely an instanced draw
//gcm lib will set the registers once and then call the same draw command over and over with different transform params to achieve this
if (m_index_buffer_changed || m_vertex_attribs_changed)
return false;
auto& draw_clause = rsx::method_registers.current_draw_clause;
if (draw_clause.command != m_last_command)
return false;
if (draw_clause.command != rsx::draw_command::inlined_array)
{
if (draw_clause.first_count_commands.back().second != m_last_first_count.second ||
draw_clause.first_count_commands.front().first != m_last_first_count.first)
return false;
}
else if (m_last_first_count.second != draw_clause.inline_vertex_array.size())
return false;
return true;
}
void thread::end()
{
rsx::method_registers.transform_constants.clear();
@ -355,17 +329,6 @@ namespace rsx
u32 element_count = rsx::method_registers.current_draw_clause.get_elements_count();
capture_frame("Draw " + rsx::to_string(rsx::method_registers.current_draw_clause.primitive) + std::to_string(element_count));
}
auto& clause = rsx::method_registers.current_draw_clause;
m_last_command = clause.command;
if (m_last_command == rsx::draw_command::inlined_array)
m_last_first_count = std::make_pair(0, (u32)clause.inline_vertex_array.size());
else
m_last_first_count = std::make_pair(clause.first_count_commands.front().first, clause.first_count_commands.back().second);
m_index_buffer_changed = false;
m_vertex_attribs_changed = false;
}
void thread::on_task()
@ -543,20 +506,6 @@ namespace rsx
m_vblank_thread->join();
m_vblank_thread.reset();
}
if (m_vertex_streaming_task.available_threads > 0)
{
for (auto &task : m_vertex_streaming_task.worker_threads)
{
if (!task.worker_thread)
break;
task.worker_thread->join();
task.worker_thread.reset();
}
m_vertex_streaming_task.available_threads = 0;
}
}
std::string thread::get_name() const
@ -920,9 +869,10 @@ namespace rsx
return rsx::get_address(offset_zeta, m_context_dma_z);
}
RSXVertexProgram thread::get_current_vertex_program() const
void thread::get_current_vertex_program()
{
RSXVertexProgram result = {};
auto &result = current_vertex_program = {};
const u32 transform_program_start = rsx::method_registers.transform_program_start();
result.data.reserve((512 - transform_program_start) * 4);
result.rsx_vertex_inputs.reserve(rsx::limits::vertex_count);
@ -980,16 +930,144 @@ namespace rsx
is_int_type(rsx::method_registers.vertex_arrays_info[index].type()), 0});
}
}
}
vertex_input_layout thread::analyse_inputs_interleaved() const
{
const rsx_state& state = rsx::method_registers;
const u32 input_mask = state.vertex_attrib_input_mask();
if (state.current_draw_clause.command == rsx::draw_command::inlined_array)
{
vertex_input_layout result = {};
interleaved_range_info info = {};
info.interleaved = true;
info.locations.reserve(8);
for (u8 index = 0; index < rsx::limits::vertex_count; ++index)
{
const u32 mask = (1u << index);
auto &vinfo = state.vertex_arrays_info[index];
if (vinfo.size() > 0)
{
info.locations.push_back(index);
info.attribute_stride += rsx::get_vertex_type_size_on_host(vinfo.type(), vinfo.size());
result.attribute_placement[index] = attribute_buffer_placement::transient;
}
}
result.interleaved_blocks.push_back(info);
return result;
}
const u32 frequency_divider_mask = rsx::method_registers.frequency_divider_operation_mask();
vertex_input_layout result = {};
for (u8 index = 0; index < rsx::limits::vertex_count; ++index)
{
const bool enabled = !!(input_mask & (1 << index));
if (!enabled)
continue;
if (vertex_push_buffers[index].size > 0)
{
std::pair<u8, u32> volatile_range_info = std::make_pair(index, vertex_push_buffers[index].data.size() * (u32)sizeof(u32));
result.volatile_blocks.push_back(volatile_range_info);
result.attribute_placement[index] = attribute_buffer_placement::transient;
continue;
}
//Check for interleaving
auto &info = state.vertex_arrays_info[index];
if (info.size() == 0 && state.register_vertex_info[index].size > 0)
{
//Reads from register
result.referenced_registers.push_back(index);
result.attribute_placement[index] = attribute_buffer_placement::transient;
continue;
}
if (info.size() > 0)
{
result.attribute_placement[index] = attribute_buffer_placement::persistent;
const u32 base_address = info.offset() & 0x7fffffff;
bool alloc_new_block = true;
for (auto &block : result.interleaved_blocks)
{
if (block.attribute_stride != info.stride())
{
//Stride does not match, continue
continue;
}
if (base_address > block.base_offset)
{
const u32 diff = base_address - block.base_offset;
if (diff > info.stride())
{
//Not interleaved, continue
continue;
}
}
else
{
const u32 diff = block.base_offset - base_address;
if (diff > info.stride())
{
//Not interleaved, continue
continue;
}
//Matches, and this address is lower than the existing block base
block.base_offset = base_address;
}
alloc_new_block = false;
block.locations.push_back(index);
block.interleaved = true;
block.min_divisor = std::min(block.min_divisor, info.frequency());
if (block.all_modulus)
block.all_modulus = !!(frequency_divider_mask & (1 << index));
break;
}
if (alloc_new_block)
{
interleaved_range_info block = {};
block.base_offset = base_address;
block.attribute_stride = info.stride();
block.memory_location = info.offset() >> 31;
block.locations.reserve(4);
block.locations.push_back(index);
block.min_divisor = info.frequency();
block.all_modulus = !!(frequency_divider_mask & (1 << index));
result.interleaved_blocks.push_back(block);
}
}
}
for (auto &info : result.interleaved_blocks)
{
//Calculate real data address to be used during upload
info.real_offset_address = state.vertex_data_base_offset() + rsx::get_address(info.base_offset, info.memory_location);
}
return result;
}
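For clarity, here is a self-contained sketch of the grouping rule above: two attributes fold into one interleaved block when their strides match and their base offsets lie within one stride of each other, keeping the lowest address as the block base. The offsets and strides below are hypothetical values, not actual RSX register state.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>
struct attr { uint32_t offset; uint32_t stride; };
int main()
{
	//Hypothetical attribute registers
	const std::vector<attr> attrs = { { 0x1000, 32 }, { 0x100C, 32 }, { 0x2000, 16 } };
	std::vector<attr> blocks;
	for (const auto &a : attrs)
	{
		bool merged = false;
		for (auto &b : blocks)
		{
			if (b.stride != a.stride)
				continue; //Stride mismatch, cannot interleave
			const uint32_t lo = std::min(a.offset, b.offset);
			const uint32_t hi = std::max(a.offset, b.offset);
			if (hi - lo > a.stride)
				continue; //More than one stride apart, separate stream
			b.offset = lo; //Keep the lowest address as the block base
			merged = true;
			break;
		}
		if (!merged)
			blocks.push_back(a);
	}
	for (const auto &b : blocks)
		std::printf("block base=0x%X stride=%u\n", b.offset, b.stride);
	//Prints two blocks: base=0x1000 stride=32 and base=0x2000 stride=16
}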
RSXFragmentProgram thread::get_current_fragment_program(std::function<std::tuple<bool, u16>(u32, fragment_texture&, bool)> get_surface_info) const
void thread::get_current_fragment_program(std::function<std::tuple<bool, u16>(u32, fragment_texture&, bool)> get_surface_info)
{
RSXFragmentProgram result = {};
auto &result = current_fragment_program = {};
const u32 shader_program = rsx::method_registers.shader_program_address();
if (shader_program == 0)
return result;
return;
const u32 program_location = (shader_program & 0x3) - 1;
const u32 program_offset = (shader_program & ~0x3);
@ -1064,8 +1142,6 @@ namespace rsx
}
result.set_texture_dimension(texture_dimensions);
return result;
}
void thread::reset()
@ -1141,121 +1217,282 @@ namespace rsx
}
}
void thread::post_vertex_stream_to_upload(gsl::span<const gsl::byte> src, gsl::span<gsl::byte> dst, rsx::vertex_base_type type, u32 vector_element_count,
u32 attribute_src_stride, u8 dst_stride, u32 vertex_count, std::function<void(void *, rsx::vertex_base_type, u8, u32)> callback)
std::pair<u32, u32> thread::calculate_memory_requirements(vertex_input_layout& layout, const u32 vertex_count)
{
upload_stream_packet packet;
packet.dst_span = dst;
packet.src_span = src;
packet.src_stride = attribute_src_stride;
packet.type = type;
packet.dst_stride = dst_stride;
packet.vector_width = vector_element_count;
packet.post_upload_func = callback;
packet.vertex_count = vertex_count;
u32 persistent_memory_size = 0;
u32 volatile_memory_size = 0;
if (m_vertex_streaming_task.available_threads == 0)
volatile_memory_size += (u32)layout.referenced_registers.size() * 16u;
if (rsx::method_registers.current_draw_clause.is_immediate_draw)
{
const u32 streaming_thread_count = (u32)g_cfg.video.vertex_upload_threads;
m_vertex_streaming_task.available_threads = streaming_thread_count;
for (u32 n = 0; n < streaming_thread_count; ++n)
for (auto &info : layout.volatile_blocks)
{
thread_ctrl::spawn(m_vertex_streaming_task.worker_threads[n].worker_thread, "Vertex Stream " + std::to_string(n), [this, n]()
volatile_memory_size += info.second;
}
}
else if (rsx::method_registers.current_draw_clause.command == rsx::draw_command::inlined_array)
{
for (auto &block : layout.interleaved_blocks)
{
volatile_memory_size += block.attribute_stride * vertex_count;
}
}
else
{
for (auto &block : layout.interleaved_blocks)
{
u32 unique_verts;
if (block.min_divisor > 1)
{
auto &owner = m_vertex_streaming_task;
auto &task = m_vertex_streaming_task.worker_threads[n];
const u32 index = n;
while (!Emu.IsStopped())
if (block.all_modulus)
unique_verts = block.min_divisor;
else
{
if (task.thread_status.load(std::memory_order_consume) != 0)
{
for (auto &packet: task.packets)
{
write_vertex_array_data_to_buffer(packet.dst_span, packet.src_span, packet.vertex_count, packet.type, packet.vector_width, packet.src_stride, packet.dst_stride);
if (packet.post_upload_func)
packet.post_upload_func(packet.dst_span.data(), packet.type, (u8)packet.vector_width, packet.vertex_count);
owner.remaining_tasks--;
}
task.packets.resize(0);
task.thread_status.store(0);
_mm_sfence();
}
std::this_thread::yield();
unique_verts = vertex_count / block.min_divisor;
if (vertex_count % block.min_divisor) unique_verts++;
}
});
}
else
unique_verts = vertex_count;
persistent_memory_size += block.attribute_stride * unique_verts;
}
}
//Increment job counter..
m_vertex_streaming_task.remaining_tasks++;
return std::make_pair(persistent_memory_size, volatile_memory_size);
}
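The unique-vertex computation above is repeated verbatim in fill_vertex_layout_state and write_vertex_data_to_memory below; it deduplicates per-instance data: with modulo addressing only min_divisor distinct vertices exist, otherwise the count is divided by the frequency and rounded up. A condensed sketch of the same math using the codebase's u32/u16 aliases; the helper name is illustrative:
//Worked example: vertex_count = 100, min_divisor = 4
//modulo mode: 4 unique vertices (the data repeats every 4)
//divide mode: ceil(100 / 4) = 25 unique vertices
static u32 unique_vertex_count(u32 vertex_count, u16 min_divisor, bool all_modulus)
{
	if (min_divisor <= 1)
		return vertex_count;
	if (all_modulus)
		return min_divisor;
	return (vertex_count + min_divisor - 1) / min_divisor; //Ceiling division
}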
//Assign this packet to a thread
//Simple round robin based on first available thread
upload_stream_worker *best_fit = nullptr;
for (auto &worker : m_vertex_streaming_task.worker_threads)
void thread::fill_vertex_layout_state(vertex_input_layout& layout, const u32 vertex_count, s32* buffer)
{
std::array<u32, 16> offset_in_block = {};
u32 volatile_offset = 0;
u32 persistent_offset = 0;
if (rsx::method_registers.current_draw_clause.is_immediate_draw)
{
if (!worker.worker_thread)
break;
if (worker.thread_status.load(std::memory_order_consume) == 0)
for (auto &info : layout.volatile_blocks)
{
if (worker.packets.size() == 0)
offset_in_block[info.first] = volatile_offset;
volatile_offset += info.second;
}
}
for (u8 index : layout.referenced_registers)
{
offset_in_block[index] = volatile_offset;
volatile_offset += 16;
}
if (rsx::method_registers.current_draw_clause.command == rsx::draw_command::inlined_array)
{
auto &block = layout.interleaved_blocks[0];
for (u8 index : block.locations)
{
auto &info = rsx::method_registers.vertex_arrays_info[index];
offset_in_block[index] = persistent_offset; //persistent_offset is still 0 at this point; inlined arrays are uploaded as transient memory
persistent_offset += rsx::get_vertex_type_size_on_host(info.type(), info.size());
}
}
else
{
for (auto &block : layout.interleaved_blocks)
{
for (u8 index : block.locations)
{
worker.packets.push_back(packet);
return;
const u32 local_address = (rsx::method_registers.vertex_arrays_info[index].offset() & 0x7fffffff);
offset_in_block[index] = persistent_offset + (local_address - block.base_offset);
}
if (best_fit == nullptr)
best_fit = &worker;
else if (best_fit->packets.size() > worker.packets.size())
best_fit = &worker;
u32 unique_verts;
if (block.min_divisor > 1)
{
if (block.all_modulus)
unique_verts = block.min_divisor;
else
{
unique_verts = vertex_count / block.min_divisor;
if (vertex_count % block.min_divisor) unique_verts++;
}
}
else
unique_verts = vertex_count;
persistent_offset += block.attribute_stride * unique_verts;
}
}
best_fit->packets.push_back(packet);
}
//Fill the data
memset(buffer, 0, 256);
void thread::start_vertex_upload_task()
{
for (auto &worker : m_vertex_streaming_task.worker_threads)
const s32 swap_storage_mask = (1 << 8);
const s32 volatile_storage_mask = (1 << 9);
const s32 default_frequency_mask = (1 << 10);
const s32 repeating_frequency_mask = (3 << 10);
const s32 input_function_modulo_mask = (1 << 12);
const s32 input_divisor_mask = (0xFFFF << 13);
const u32 modulo_mask = rsx::method_registers.frequency_divider_operation_mask();
for (u8 index = 0; index < rsx::limits::vertex_count; ++index)
{
if (!worker.worker_thread)
break;
if (layout.attribute_placement[index] == attribute_buffer_placement::none)
continue;
if (worker.thread_status.load(std::memory_order_consume) == 0 && worker.packets.size() > 0)
rsx::vertex_base_type type = {};
s32 size = 0;
s32 attributes = 0;
bool is_be_type = true;
if (layout.attribute_placement[index] == attribute_buffer_placement::transient)
{
worker.thread_status.store(1);
if (rsx::method_registers.current_draw_clause.command == rsx::draw_command::inlined_array)
{
auto &info = rsx::method_registers.vertex_arrays_info[index];
type = info.type();
size = (s32)info.size();
attributes = layout.interleaved_blocks[0].attribute_stride;
attributes |= default_frequency_mask | volatile_storage_mask;
is_be_type = false;
}
else if (rsx::method_registers.current_draw_clause.is_immediate_draw)
{
auto &info = rsx::method_registers.register_vertex_info[index];
type = info.type;
size = (s32)info.size;
attributes = rsx::get_vertex_type_size_on_host(type, size);
attributes |= default_frequency_mask | volatile_storage_mask;
is_be_type = true;
}
else
{
//Register
auto& info = rsx::method_registers.register_vertex_info[index];
type = info.type;
size = (s32)info.size;
attributes = rsx::get_vertex_type_size_on_host(type, size);
attributes |= volatile_storage_mask;
is_be_type = false;
}
}
else
{
auto &info = rsx::method_registers.vertex_arrays_info[index];
type = info.type();
size = info.size();
const u32 frequency = info.frequency();
switch (frequency)
{
case 0:
case 1:
attributes |= default_frequency_mask;
break;
default:
{
if (modulo_mask & (1 << index))
attributes |= input_function_modulo_mask;
attributes |= repeating_frequency_mask;
attributes |= (frequency << 13) & input_divisor_mask;
}
}
attributes |= info.stride();
} //end attribute placement check
switch (type)
{
case rsx::vertex_base_type::cmp:
size = 1;
//fall through
default:
if (is_be_type) attributes |= swap_storage_mask;
break;
case rsx::vertex_base_type::ub:
case rsx::vertex_base_type::ub256:
if (!is_be_type) attributes |= swap_storage_mask;
break;
}
buffer[index * 4 + 0] = (s32)type;
buffer[index * 4 + 1] = size;
buffer[index * 4 + 2] = (s32)offset_in_block[index];
buffer[index * 4 + 3] = attributes;
}
}
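The packed flags word written to buffer[index * 4 + 3] is intended to be unpacked during GPU-side vertex fetch; the sketch below shows a host-side decode mirroring the masks defined above. The struct and helper are illustrative only, not part of the actual interface (the frequency-mode bits 10-11 are omitted for brevity):
struct attribute_flags
{
	u8 stride;
	bool swap_bytes;
	bool volatile_storage;
	bool modulo_op;
	u16 divisor;
};
static attribute_flags decode_attribute_flags(s32 attributes)
{
	attribute_flags result;
	result.stride = attributes & 0xFF;                   //Low byte carries the stride
	result.swap_bytes = !!(attributes & (1 << 8));       //swap_storage_mask
	result.volatile_storage = !!(attributes & (1 << 9)); //volatile_storage_mask
	result.modulo_op = !!(attributes & (1 << 12));       //input_function_modulo_mask
	result.divisor = (attributes >> 13) & 0xFFFF;        //input_divisor_mask
	return result;
}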
void thread::write_vertex_data_to_memory(vertex_input_layout &layout, const u32 first_vertex, const u32 vertex_count, void *persistent_data, void *volatile_data)
{
char *transient = (char *)volatile_data;
char *persistent = (char *)persistent_data;
auto &draw_call = rsx::method_registers.current_draw_clause;
if (transient != nullptr)
{
if (draw_call.command == rsx::draw_command::inlined_array)
{
memcpy(transient, draw_call.inline_vertex_array.data(), draw_call.inline_vertex_array.size() * sizeof(u32));
//Is it possible to reference data outside of the inlined array?
return;
}
for (u8 index : layout.referenced_registers)
{
memcpy(transient, rsx::method_registers.register_vertex_info[index].data.data(), 16);
transient += 16;
}
if (draw_call.is_immediate_draw)
{
for (auto &info : layout.volatile_blocks)
{
memcpy(transient, vertex_push_buffers[info.first].data.data(), info.second);
transient += info.second;
}
return;
}
}
}
void thread::wait_for_vertex_upload_task()
{
while (m_vertex_streaming_task.remaining_tasks.load(std::memory_order_consume) != 0 && !Emu.IsStopped())
if (persistent != nullptr)
{
_mm_pause();
for (auto &block : layout.interleaved_blocks)
{
u32 unique_verts;
u32 vertex_base = first_vertex * block.attribute_stride;
if (block.min_divisor > 1)
{
if (block.all_modulus)
unique_verts = block.min_divisor;
else
{
unique_verts = vertex_count / block.min_divisor;
if (vertex_count % block.min_divisor) unique_verts++;
}
}
else
unique_verts = vertex_count;
const u32 data_size = block.attribute_stride * unique_verts;
memcpy(persistent, (char*)vm::base(block.real_offset_address) + vertex_base, data_size);
persistent += data_size;
}
}
}
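For orientation, a hedged sketch of how a renderer backend might chain the helpers above for a single draw call; hypothetical_backend and ring_alloc are illustrative placeholders, not actual RPCS3 names, and mapping/error handling is omitted:
void hypothetical_backend::upload_vertex_state(u32 first_vertex, u32 vertex_count, s32 *descriptor_buffer)
{
	vertex_input_layout layout = analyse_inputs_interleaved();
	const auto required = calculate_memory_requirements(layout, vertex_count);
	void *persistent = required.first ? ring_alloc(required.first) : nullptr;      //Hypothetical heap allocator
	void *volatile_data = required.second ? ring_alloc(required.second) : nullptr; //Hypothetical heap allocator
	write_vertex_data_to_memory(layout, first_vertex, vertex_count, persistent, volatile_data);
	//16 attributes x 4 s32 words each; uploaded as a uniform block so the
	//vertex shader can fetch and unpack attributes itself
	fill_vertex_layout_state(layout, vertex_count, descriptor_buffer);
}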
bool thread::vertex_upload_task_ready()
{
if (g_cfg.video.vertex_upload_threads < 2)
return false;
//Not initialized
if (m_vertex_streaming_task.available_threads == 0)
return true;
//At least two threads are available
return (m_vertex_streaming_task.remaining_tasks < (m_vertex_streaming_task.available_threads - 1));
}
void thread::flip(int buffer)
{
if (g_cfg.video.frame_skip_enabled)

View file

@ -105,6 +105,35 @@ namespace rsx
std::vector<u32> inline_vertex_array;
};
struct interleaved_range_info
{
bool interleaved = false;
bool all_modulus = false;
u32 base_offset = 0;
u32 real_offset_address = 0;
u8 memory_location = 0;
u8 attribute_stride = 0;
u16 min_divisor = 0;
std::vector<u8> locations;
};
enum attribute_buffer_placement : u8
{
none = 0,
persistent = 1,
transient = 2
};
struct vertex_input_layout
{
std::vector<interleaved_range_info> interleaved_blocks; //Interleaved blocks to be uploaded as-is
std::vector<std::pair<u8, u32>> volatile_blocks; //Volatile data blocks (immediate draw vertex data for example)
std::vector<u8> referenced_registers; //Volatile register data
std::array<attribute_buffer_placement, 16> attribute_placement;
};
class thread : public named_thread
{
std::shared_ptr<thread_ctrl> m_vblank_thread;
@ -152,8 +181,6 @@ namespace rsx
bool m_rtts_dirty;
bool m_transform_constants_dirty;
bool m_textures_dirty[16];
bool m_vertex_attribs_changed;
bool m_index_buffer_changed;
protected:
s32 m_skip_frame_ctr = 0;
@ -161,14 +188,23 @@ namespace rsx
protected:
std::array<u32, 4> get_color_surface_addresses() const;
u32 get_zeta_surface_address() const;
RSXVertexProgram get_current_vertex_program() const;
/**
* Analyze vertex inputs and group all interleaved blocks
*/
vertex_input_layout analyse_inputs_interleaved() const;
RSXVertexProgram current_vertex_program = {};
RSXFragmentProgram current_fragment_program = {};
void get_current_vertex_program();
/**
* Gets current fragment program and associated fragment state
* get_surface_info is a helper that takes two parameters: rsx_texture_address and surface_is_depth
* and returns whether the surface is a render target together with its pitch in native format
*/
RSXFragmentProgram get_current_fragment_program(std::function<std::tuple<bool, u16>(u32, fragment_texture&, bool)> get_surface_info) const;
void get_current_fragment_program(std::function<std::tuple<bool, u16>(u32, fragment_texture&, bool)> get_surface_info);
public:
double fps_limit = 59.94;
@ -232,7 +268,7 @@ namespace rsx
std::variant<draw_array_command, draw_indexed_array_command, draw_inlined_array>
get_draw_command(const rsx::rsx_state& state) const;
/*
/**
* Immediate mode rendering requires a temp push buffer to hold attrib values
* Appends a value to the push buffer (currently only supports 32-wide types)
*/
@ -243,47 +279,24 @@ namespace rsx
u32 get_push_buffer_index_count() const;
protected:
//Save draw call parameters to detect instanced renders
std::pair<u32, u32> m_last_first_count;
rsx::draw_command m_last_command;
bool is_probable_instanced_draw();
/**
* Computes VRAM requirements needed to upload raw vertex streams
* result.first contains persistent memory requirements
* result.second contains volatile memory requirements
*/
std::pair<u32, u32> calculate_memory_requirements(vertex_input_layout& layout, const u32 vertex_count);
public:
//MT vertex streaming
struct upload_stream_packet
{
std::function<void(void *, rsx::vertex_base_type, u8, u32)> post_upload_func;
gsl::span<const gsl::byte> src_span;
gsl::span<gsl::byte> dst_span;
rsx::vertex_base_type type;
u32 vector_width;
u32 vertex_count;
u32 src_stride;
u8 dst_stride;
};
/**
* Generates vertex input descriptors as an array of 16x4 s32s
*/
void fill_vertex_layout_state(vertex_input_layout& layout, const u32 vertex_count, s32* buffer);
struct upload_stream_worker
{
std::shared_ptr<thread_ctrl> worker_thread;
std::vector<upload_stream_packet> packets;
std::atomic<int> thread_status = { 0 };
};
struct upload_stream_task
{
std::array<upload_stream_worker, 16> worker_threads;
int available_threads = 0;
std::atomic<int> remaining_tasks = {0};
};
upload_stream_task m_vertex_streaming_task;
void post_vertex_stream_to_upload(gsl::span<const gsl::byte> src, gsl::span<gsl::byte> dst, rsx::vertex_base_type type,
u32 vector_element_count, u32 attribute_src_stride, u8 dst_stride, u32 vertex_count,
std::function<void(void *, rsx::vertex_base_type, u8, u32)> callback);
void start_vertex_upload_task();
void wait_for_vertex_upload_task();
bool vertex_upload_task_ready();
/**
* Uploads vertex data described in the layout descriptor
* Copies from local memory to the write-only output buffers provided in a sequential manner
*/
void write_vertex_data_to_memory(vertex_input_layout &layout, const u32 first_vertex, const u32 vertex_count, void *persistent_data, void *volatile_data);
private:
std::mutex m_mtx_task;

View file

@ -228,4 +228,5 @@ struct RSXVertexProgram
std::vector<u32> data;
std::vector<rsx_vertex_input> rsx_vertex_inputs;
u32 output_mask;
bool skip_vertex_input_check;
};

View file

@ -736,6 +736,8 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing)
if (!flushable)
return false;
close_render_pass();
if (synchronized)
{
if (m_last_flushable_cb >= 0)
@ -817,7 +819,6 @@ void VKGSRender::begin()
m_vertex_cache->purge();
CHECK_RESULT(vkResetDescriptorPool(*m_device, descriptor_pool, 0));
m_last_descriptor_set = VK_NULL_HANDLE;
m_used_descriptors = 0;
m_uniform_buffer_ring_info.reset_allocation_stats();
@ -858,20 +859,6 @@ void VKGSRender::begin()
m_setup_time += std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
}
void VKGSRender::emit_geometry_instance(u32/* instance_count*/)
{
begin_render_pass();
m_instanced_draws++;
//Repeat last command
if (!m_last_draw_indexed)
vkCmdDraw(*m_current_command_buffer, m_last_vertex_count, 1, 0, 0);
else
{
vkCmdBindIndexBuffer(*m_current_command_buffer, m_index_buffer_ring_info.heap->value, m_last_ib_offset, m_last_ib_type);
vkCmdDrawIndexed(*m_current_command_buffer, m_last_vertex_count, 1, 0, 0, 0);
}
}
void VKGSRender::begin_render_pass()
{
@ -916,39 +903,8 @@ void VKGSRender::end()
std::chrono::time_point<steady_clock> program_start = steady_clock::now();
const bool is_instanced = is_probable_instanced_draw() && m_last_descriptor_set != VK_NULL_HANDLE && m_program != nullptr;
if (is_instanced)
{
//Copy descriptor set
VkCopyDescriptorSet copy_info[39];
u8 descriptors_count = 0;
for (u8 i = 0; i < 39; ++i)
{
if ((m_program->attribute_location_mask & (1ull << i)) == 0)
continue;
const u8 n = descriptors_count;
copy_info[n] = {};
copy_info[n].sType = VK_STRUCTURE_TYPE_COPY_DESCRIPTOR_SET;
copy_info[n].srcSet = m_last_descriptor_set;
copy_info[n].dstSet = descriptor_sets;
copy_info[n].srcBinding = i;
copy_info[n].dstBinding = i;
copy_info[n].srcArrayElement = 0;
copy_info[n].dstArrayElement = 0;
copy_info[n].descriptorCount = 1;
descriptors_count++;
}
vkUpdateDescriptorSets(*m_device, 0, nullptr, descriptors_count, (const VkCopyDescriptorSet*)&copy_info);
}
//Load program here since it is dependent on vertex state
if (!load_program(is_instanced))
if (!load_program())
{
LOG_ERROR(RSX, "No valid program bound to pipeline. Skipping draw");
rsx::thread::end();
@ -958,18 +914,6 @@ void VKGSRender::end()
std::chrono::time_point<steady_clock> program_stop = steady_clock::now();
//m_setup_time += std::chrono::duration_cast<std::chrono::microseconds>(program_stop - program_start).count();
if (is_instanced)
{
//Only the program constants descriptors should have changed
vkCmdBindPipeline(*m_current_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, m_program->pipeline);
vkCmdBindDescriptorSets(*m_current_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout, 0, 1, &descriptor_sets, 0, nullptr);
emit_geometry_instance(1);
m_last_instanced_cb_index = m_current_cb_index;
rsx::thread::end();
return;
}
close_render_pass(); //Texture upload stuff conflicts active RPs
if (g_cfg.video.strict_rendering_mode)
@ -1173,9 +1117,6 @@ void VKGSRender::end()
{
const auto vertex_count = std::get<1>(upload_info);
vkCmdDraw(*m_current_command_buffer, vertex_count, 1, 0, 0);
m_last_vertex_count = vertex_count;
m_last_draw_indexed = false;
}
else
{
@ -1187,16 +1128,8 @@ void VKGSRender::end()
vkCmdBindIndexBuffer(*m_current_command_buffer, m_index_buffer_ring_info.heap->value, offset, index_type);
vkCmdDrawIndexed(*m_current_command_buffer, index_count, 1, 0, 0, 0);
m_last_draw_indexed = false;
m_last_ib_type = index_type;
m_last_ib_offset = offset;
m_last_vertex_count = index_count;
}
m_last_instanced_cb_index = ~0;
m_last_descriptor_set = descriptor_sets;
vk::leave_uninterruptible();
std::chrono::time_point<steady_clock> draw_end = steady_clock::now();
@ -1207,15 +1140,17 @@ void VKGSRender::end()
if (g_cfg.video.overlay)
{
if (m_last_vertex_count < 1024)
const auto vertex_count = std::get<1>(upload_info);
if (vertex_count < 1024)
m_uploads_small++;
else if (m_last_vertex_count < 2048)
else if (vertex_count < 2048)
m_uploads_1k++;
else if (m_last_vertex_count < 4096)
else if (vertex_count < 4096)
m_uploads_2k++;
else if (m_last_vertex_count < 8192)
else if (vertex_count < 8192)
m_uploads_4k++;
else if (m_last_vertex_count < 16384)
else if (vertex_count < 16384)
m_uploads_8k++;
else
m_uploads_16k++;
@ -1433,12 +1368,6 @@ void VKGSRender::copy_render_targets_to_dma_location()
void VKGSRender::flush_command_queue(bool hard_sync)
{
if (m_attrib_ring_info.mapped)
{
wait_for_vertex_upload_task();
m_attrib_ring_info.unmap();
}
close_render_pass();
close_and_submit_command_buffer({}, m_current_command_buffer->submit_fence);
@ -1580,187 +1509,182 @@ bool VKGSRender::do_method(u32 cmd, u32 arg)
}
}
bool VKGSRender::load_program(bool fast_update)
bool VKGSRender::load_program(bool)
{
RSXVertexProgram vertex_program;
RSXFragmentProgram fragment_program;
auto &vertex_program = current_vertex_program;
auto &fragment_program = current_fragment_program;
if (!fast_update)
auto rtt_lookup_func = [this](u32 texaddr, rsx::fragment_texture&, bool is_depth) -> std::tuple<bool, u16>
{
auto rtt_lookup_func = [this](u32 texaddr, rsx::fragment_texture&, bool is_depth) -> std::tuple<bool, u16>
{
vk::render_target *surface = nullptr;
if (!is_depth)
surface = m_rtts.get_texture_from_render_target_if_applicable(texaddr);
else
surface = m_rtts.get_texture_from_depth_stencil_if_applicable(texaddr);
if (!surface) return std::make_tuple(false, 0);
return std::make_tuple(true, surface->native_pitch);
};
fragment_program = get_current_fragment_program(rtt_lookup_func);
if (!fragment_program.valid) return false;
vertex_program = get_current_vertex_program();
vk::pipeline_props properties = {};
properties.ia.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO;
bool unused;
properties.ia.topology = vk::get_appropriate_topology(rsx::method_registers.current_draw_clause.primitive, unused);
if (rsx::method_registers.restart_index_enabled())
{
properties.ia.primitiveRestartEnable = VK_TRUE;
}
vk::render_target *surface = nullptr;
if (!is_depth)
surface = m_rtts.get_texture_from_render_target_if_applicable(texaddr);
else
properties.ia.primitiveRestartEnable = VK_FALSE;
surface = m_rtts.get_texture_from_depth_stencil_if_applicable(texaddr);
if (!surface) return std::make_tuple(false, 0);
return std::make_tuple(true, surface->native_pitch);
};
get_current_fragment_program(rtt_lookup_func);
if (!fragment_program.valid) return false;
get_current_vertex_program();
vk::pipeline_props properties = {};
properties.ia.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO;
bool unused;
properties.ia.topology = vk::get_appropriate_topology(rsx::method_registers.current_draw_clause.primitive, unused);
if (rsx::method_registers.restart_index_enabled())
{
properties.ia.primitiveRestartEnable = VK_TRUE;
}
else
properties.ia.primitiveRestartEnable = VK_FALSE;
for (int i = 0; i < 4; ++i)
{
properties.att_state[i].colorWriteMask = 0xf;
properties.att_state[i].blendEnable = VK_FALSE;
}
for (int i = 0; i < 4; ++i)
{
properties.att_state[i].colorWriteMask = 0xf;
properties.att_state[i].blendEnable = VK_FALSE;
}
VkColorComponentFlags mask = 0;
if (rsx::method_registers.color_mask_a()) mask |= VK_COLOR_COMPONENT_A_BIT;
if (rsx::method_registers.color_mask_b()) mask |= VK_COLOR_COMPONENT_B_BIT;
if (rsx::method_registers.color_mask_g()) mask |= VK_COLOR_COMPONENT_G_BIT;
if (rsx::method_registers.color_mask_r()) mask |= VK_COLOR_COMPONENT_R_BIT;
VkColorComponentFlags mask = 0;
if (rsx::method_registers.color_mask_a()) mask |= VK_COLOR_COMPONENT_A_BIT;
if (rsx::method_registers.color_mask_b()) mask |= VK_COLOR_COMPONENT_B_BIT;
if (rsx::method_registers.color_mask_g()) mask |= VK_COLOR_COMPONENT_G_BIT;
if (rsx::method_registers.color_mask_r()) mask |= VK_COLOR_COMPONENT_R_BIT;
VkColorComponentFlags color_masks[4] = { mask };
VkColorComponentFlags color_masks[4] = { mask };
u8 render_targets[] = { 0, 1, 2, 3 };
u8 render_targets[] = { 0, 1, 2, 3 };
for (u8 idx = 0; idx < m_draw_buffers_count; ++idx)
{
properties.att_state[render_targets[idx]].colorWriteMask = mask;
}
if (rsx::method_registers.blend_enabled())
{
VkBlendFactor sfactor_rgb = vk::get_blend_factor(rsx::method_registers.blend_func_sfactor_rgb());
VkBlendFactor sfactor_a = vk::get_blend_factor(rsx::method_registers.blend_func_sfactor_a());
VkBlendFactor dfactor_rgb = vk::get_blend_factor(rsx::method_registers.blend_func_dfactor_rgb());
VkBlendFactor dfactor_a = vk::get_blend_factor(rsx::method_registers.blend_func_dfactor_a());
VkBlendOp equation_rgb = vk::get_blend_op(rsx::method_registers.blend_equation_rgb());
VkBlendOp equation_a = vk::get_blend_op(rsx::method_registers.blend_equation_a());
for (u8 idx = 0; idx < m_draw_buffers_count; ++idx)
{
properties.att_state[render_targets[idx]].colorWriteMask = mask;
properties.att_state[render_targets[idx]].blendEnable = VK_TRUE;
properties.att_state[render_targets[idx]].srcColorBlendFactor = sfactor_rgb;
properties.att_state[render_targets[idx]].dstColorBlendFactor = dfactor_rgb;
properties.att_state[render_targets[idx]].srcAlphaBlendFactor = sfactor_a;
properties.att_state[render_targets[idx]].dstAlphaBlendFactor = dfactor_a;
properties.att_state[render_targets[idx]].colorBlendOp = equation_rgb;
properties.att_state[render_targets[idx]].alphaBlendOp = equation_a;
}
if (rsx::method_registers.blend_enabled())
{
VkBlendFactor sfactor_rgb = vk::get_blend_factor(rsx::method_registers.blend_func_sfactor_rgb());
VkBlendFactor sfactor_a = vk::get_blend_factor(rsx::method_registers.blend_func_sfactor_a());
VkBlendFactor dfactor_rgb = vk::get_blend_factor(rsx::method_registers.blend_func_dfactor_rgb());
VkBlendFactor dfactor_a = vk::get_blend_factor(rsx::method_registers.blend_func_dfactor_a());
VkBlendOp equation_rgb = vk::get_blend_op(rsx::method_registers.blend_equation_rgb());
VkBlendOp equation_a = vk::get_blend_op(rsx::method_registers.blend_equation_a());
for (u8 idx = 0; idx < m_draw_buffers_count; ++idx)
{
properties.att_state[render_targets[idx]].blendEnable = VK_TRUE;
properties.att_state[render_targets[idx]].srcColorBlendFactor = sfactor_rgb;
properties.att_state[render_targets[idx]].dstColorBlendFactor = dfactor_rgb;
properties.att_state[render_targets[idx]].srcAlphaBlendFactor = sfactor_a;
properties.att_state[render_targets[idx]].dstAlphaBlendFactor = dfactor_a;
properties.att_state[render_targets[idx]].colorBlendOp = equation_rgb;
properties.att_state[render_targets[idx]].alphaBlendOp = equation_a;
}
auto blend_colors = rsx::get_constant_blend_colors();
properties.cs.blendConstants[0] = blend_colors[0];
properties.cs.blendConstants[1] = blend_colors[1];
properties.cs.blendConstants[2] = blend_colors[2];
properties.cs.blendConstants[3] = blend_colors[3];
}
else
{
for (u8 idx = 0; idx < m_draw_buffers_count; ++idx)
{
properties.att_state[render_targets[idx]].blendEnable = VK_FALSE;
}
}
properties.cs.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO;
properties.cs.attachmentCount = m_draw_buffers_count;
properties.cs.pAttachments = properties.att_state;
if (rsx::method_registers.logic_op_enabled())
{
properties.cs.logicOpEnable = true;
properties.cs.logicOp = vk::get_logic_op(rsx::method_registers.logic_operation());
}
properties.ds.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO;
properties.ds.depthWriteEnable = rsx::method_registers.depth_write_enabled() ? VK_TRUE : VK_FALSE;
if (rsx::method_registers.depth_bounds_test_enabled())
{
properties.ds.depthBoundsTestEnable = VK_TRUE;
properties.ds.minDepthBounds = rsx::method_registers.depth_bounds_min();
properties.ds.maxDepthBounds = rsx::method_registers.depth_bounds_max();
}
else
properties.ds.depthBoundsTestEnable = VK_FALSE;
if (rsx::method_registers.stencil_test_enabled())
{
properties.ds.stencilTestEnable = VK_TRUE;
properties.ds.front.writeMask = rsx::method_registers.stencil_mask();
properties.ds.front.compareMask = rsx::method_registers.stencil_func_mask();
properties.ds.front.reference = rsx::method_registers.stencil_func_ref();
properties.ds.front.failOp = vk::get_stencil_op(rsx::method_registers.stencil_op_fail());
properties.ds.front.passOp = vk::get_stencil_op(rsx::method_registers.stencil_op_zpass());
properties.ds.front.depthFailOp = vk::get_stencil_op(rsx::method_registers.stencil_op_zfail());
properties.ds.front.compareOp = vk::get_compare_func(rsx::method_registers.stencil_func());
if (rsx::method_registers.two_sided_stencil_test_enabled())
{
properties.ds.back.writeMask = rsx::method_registers.back_stencil_mask();
properties.ds.back.compareMask = rsx::method_registers.back_stencil_func_mask();
properties.ds.back.reference = rsx::method_registers.back_stencil_func_ref();
properties.ds.back.failOp = vk::get_stencil_op(rsx::method_registers.back_stencil_op_fail());
properties.ds.back.passOp = vk::get_stencil_op(rsx::method_registers.back_stencil_op_zpass());
properties.ds.back.depthFailOp = vk::get_stencil_op(rsx::method_registers.back_stencil_op_zfail());
properties.ds.back.compareOp = vk::get_compare_func(rsx::method_registers.back_stencil_func());
}
else
properties.ds.back = properties.ds.front;
}
else
properties.ds.stencilTestEnable = VK_FALSE;
if (rsx::method_registers.depth_test_enabled())
{
properties.ds.depthTestEnable = VK_TRUE;
properties.ds.depthCompareOp = vk::get_compare_func(rsx::method_registers.depth_func());
}
else
properties.ds.depthTestEnable = VK_FALSE;
properties.rs.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO;
properties.rs.polygonMode = VK_POLYGON_MODE_FILL;
properties.rs.depthClampEnable = VK_FALSE;
properties.rs.rasterizerDiscardEnable = VK_FALSE;
properties.rs.depthBiasEnable = VK_FALSE;
if (rsx::method_registers.cull_face_enabled())
properties.rs.cullMode = vk::get_cull_face(rsx::method_registers.cull_face_mode());
else
properties.rs.cullMode = VK_CULL_MODE_NONE;
properties.rs.frontFace = vk::get_front_face(rsx::method_registers.front_face_mode());
size_t idx = vk::get_render_pass_location(
vk::get_compatible_surface_format(rsx::method_registers.surface_color()).first,
vk::get_compatible_depth_surface_format(m_optimal_tiling_supported_formats, rsx::method_registers.surface_depth_fmt()),
(u8)m_draw_buffers_count);
properties.render_pass = m_render_passes[idx];
properties.num_targets = m_draw_buffers_count;
vk::enter_uninterruptible();
//Load current program from buffer
m_program = m_prog_buffer.getGraphicPipelineState(vertex_program, fragment_program, properties, *m_device, pipeline_layout).get();
auto blend_colors = rsx::get_constant_blend_colors();
properties.cs.blendConstants[0] = blend_colors[0];
properties.cs.blendConstants[1] = blend_colors[1];
properties.cs.blendConstants[2] = blend_colors[2];
properties.cs.blendConstants[3] = blend_colors[3];
}
else
vk::enter_uninterruptible();
{
for (u8 idx = 0; idx < m_draw_buffers_count; ++idx)
{
properties.att_state[render_targets[idx]].blendEnable = VK_FALSE;
}
}
properties.cs.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO;
properties.cs.attachmentCount = m_draw_buffers_count;
properties.cs.pAttachments = properties.att_state;
if (rsx::method_registers.logic_op_enabled())
{
properties.cs.logicOpEnable = true;
properties.cs.logicOp = vk::get_logic_op(rsx::method_registers.logic_operation());
}
properties.ds.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO;
properties.ds.depthWriteEnable = rsx::method_registers.depth_write_enabled() ? VK_TRUE : VK_FALSE;
if (rsx::method_registers.depth_bounds_test_enabled())
{
properties.ds.depthBoundsTestEnable = VK_TRUE;
properties.ds.minDepthBounds = rsx::method_registers.depth_bounds_min();
properties.ds.maxDepthBounds = rsx::method_registers.depth_bounds_max();
}
else
properties.ds.depthBoundsTestEnable = VK_FALSE;
if (rsx::method_registers.stencil_test_enabled())
{
properties.ds.stencilTestEnable = VK_TRUE;
properties.ds.front.writeMask = rsx::method_registers.stencil_mask();
properties.ds.front.compareMask = rsx::method_registers.stencil_func_mask();
properties.ds.front.reference = rsx::method_registers.stencil_func_ref();
properties.ds.front.failOp = vk::get_stencil_op(rsx::method_registers.stencil_op_fail());
properties.ds.front.passOp = vk::get_stencil_op(rsx::method_registers.stencil_op_zpass());
properties.ds.front.depthFailOp = vk::get_stencil_op(rsx::method_registers.stencil_op_zfail());
properties.ds.front.compareOp = vk::get_compare_func(rsx::method_registers.stencil_func());
if (rsx::method_registers.two_sided_stencil_test_enabled())
{
properties.ds.back.writeMask = rsx::method_registers.back_stencil_mask();
properties.ds.back.compareMask = rsx::method_registers.back_stencil_func_mask();
properties.ds.back.reference = rsx::method_registers.back_stencil_func_ref();
properties.ds.back.failOp = vk::get_stencil_op(rsx::method_registers.back_stencil_op_fail());
properties.ds.back.passOp = vk::get_stencil_op(rsx::method_registers.back_stencil_op_zpass());
properties.ds.back.depthFailOp = vk::get_stencil_op(rsx::method_registers.back_stencil_op_zfail());
properties.ds.back.compareOp = vk::get_compare_func(rsx::method_registers.back_stencil_func());
}
else
properties.ds.back = properties.ds.front;
}
else
properties.ds.stencilTestEnable = VK_FALSE;
if (rsx::method_registers.depth_test_enabled())
{
properties.ds.depthTestEnable = VK_TRUE;
properties.ds.depthCompareOp = vk::get_compare_func(rsx::method_registers.depth_func());
}
else
properties.ds.depthTestEnable = VK_FALSE;
properties.rs.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO;
properties.rs.polygonMode = VK_POLYGON_MODE_FILL;
properties.rs.depthClampEnable = VK_FALSE;
properties.rs.rasterizerDiscardEnable = VK_FALSE;
properties.rs.depthBiasEnable = VK_FALSE;
if (rsx::method_registers.cull_face_enabled())
properties.rs.cullMode = vk::get_cull_face(rsx::method_registers.cull_face_mode());
else
properties.rs.cullMode = VK_CULL_MODE_NONE;
properties.rs.frontFace = vk::get_front_face(rsx::method_registers.front_face_mode());
size_t idx = vk::get_render_pass_location(
vk::get_compatible_surface_format(rsx::method_registers.surface_color()).first,
vk::get_compatible_depth_surface_format(m_optimal_tiling_supported_formats, rsx::method_registers.surface_depth_fmt()),
(u8)m_draw_buffers_count);
properties.render_pass = m_render_passes[idx];
properties.num_targets = m_draw_buffers_count;
vk::enter_uninterruptible();
//Load current program from buffer
m_program = m_prog_buffer.getGraphicPipelineState(vertex_program, fragment_program, properties, *m_device, pipeline_layout).get();
//TODO: Update constant buffers..
//1. Update scale-offset matrix
@ -1781,7 +1705,7 @@ bool VKGSRender::load_program(bool fast_update)
m_program->bind_uniform({ m_uniform_buffer_ring_info.heap->value, scale_offset_offset, 256 }, SCALE_OFFSET_BIND_SLOT, descriptor_sets);
if (!fast_update || m_transform_constants_dirty)
if (true)//m_transform_constants_dirty)
{
const size_t vertex_constants_offset = m_uniform_buffer_ring_info.alloc<256>(512 * 4 * sizeof(float));
buf = (u8*)m_uniform_buffer_ring_info.map(vertex_constants_offset, 512 * 4 * sizeof(float));
@ -1793,21 +1717,18 @@ bool VKGSRender::load_program(bool fast_update)
m_transform_constants_dirty = false;
}
if (!fast_update)
{
const size_t fragment_constants_sz = m_prog_buffer.get_fragment_constants_buffer_size(fragment_program);
const size_t fragment_buffer_sz = fragment_constants_sz + (17 * 4 * sizeof(float));
const size_t fragment_constants_offset = m_uniform_buffer_ring_info.alloc<256>(fragment_buffer_sz);
const size_t fragment_constants_sz = m_prog_buffer.get_fragment_constants_buffer_size(fragment_program);
const size_t fragment_buffer_sz = fragment_constants_sz + (17 * 4 * sizeof(float));
const size_t fragment_constants_offset = m_uniform_buffer_ring_info.alloc<256>(fragment_buffer_sz);
buf = (u8*)m_uniform_buffer_ring_info.map(fragment_constants_offset, fragment_buffer_sz);
if (fragment_constants_sz)
m_prog_buffer.fill_fragment_constants_buffer({ reinterpret_cast<float*>(buf), ::narrow<int>(fragment_constants_sz) }, fragment_program);
buf = (u8*)m_uniform_buffer_ring_info.map(fragment_constants_offset, fragment_buffer_sz);
if (fragment_constants_sz)
m_prog_buffer.fill_fragment_constants_buffer({ reinterpret_cast<float*>(buf), ::narrow<int>(fragment_constants_sz) }, fragment_program);
fill_fragment_state_buffer(buf + fragment_constants_sz, fragment_program);
m_uniform_buffer_ring_info.unmap();
fill_fragment_state_buffer(buf + fragment_constants_sz, fragment_program);
m_uniform_buffer_ring_info.unmap();
m_program->bind_uniform({ m_uniform_buffer_ring_info.heap->value, fragment_constants_offset, fragment_buffer_sz }, FRAGMENT_CONSTANT_BUFFERS_BIND_SLOT, descriptor_sets);
}
m_program->bind_uniform({ m_uniform_buffer_ring_info.heap->value, fragment_constants_offset, fragment_buffer_sz }, FRAGMENT_CONSTANT_BUFFERS_BIND_SLOT, descriptor_sets);
vk::leave_uninterruptible();

View file

@ -200,14 +200,6 @@ private:
std::thread::id rsx_thread;
VkPrimitiveTopology m_last_primititve_type;
VkIndexType m_last_ib_type;
VkDescriptorSet m_last_descriptor_set;
size_t m_last_ib_offset;
u32 m_last_vertex_count;
bool m_last_draw_indexed;
u32 m_last_instanced_cb_index;
bool render_pass_open = false;
#ifdef __linux__
@ -233,8 +225,6 @@ private:
void begin_render_pass();
void close_render_pass();
void emit_geometry_instance(u32 instance_count);
/// returns primitive topology, is_indexed, index_count, offset in index buffer, index type
std::tuple<VkPrimitiveTopology, u32, std::optional<std::tuple<VkDeviceSize, VkIndexType> > > upload_vertex_data();
public:

View file

@ -265,10 +265,18 @@ namespace
return;
// Fill vertex_array
u32 element_size = rsx::get_vertex_type_size_on_host(vertex_array.type, vertex_array.attribute_size);
u32 real_element_size = vk::get_suitable_vk_size(vertex_array.type, vertex_array.attribute_size);
const u32 element_size = rsx::get_vertex_type_size_on_host(vertex_array.type, vertex_array.attribute_size);
const u32 real_element_size = vk::get_suitable_vk_size(vertex_array.type, vertex_array.attribute_size);
const u32 upload_size = real_element_size * vertex_count;
const VkFormat format = vk::get_suitable_vk_format(vertex_array.type, vertex_array.attribute_size);
const uintptr_t local_addr = (uintptr_t)vertex_array.data.data();
u32 upload_size = real_element_size * vertex_count;
if (auto found = vertex_cache->find_vertex_range(local_addr, format, upload_size))
{
m_buffer_view_to_clean.push_back(std::make_unique<vk::buffer_view>(device, m_attrib_ring_info.heap->value, format, found->offset_in_heap, upload_size));
m_program->bind_uniform(m_buffer_view_to_clean.back()->value, s_reg_table[vertex_array.index], descriptor_sets);
return;
}
VkDeviceSize offset_in_attrib_buffer = m_attrib_ring_info.alloc<256>(upload_size);
void *dst = m_attrib_ring_info.map(offset_in_attrib_buffer, upload_size);
@ -281,9 +289,7 @@ namespace
vk::prepare_buffer_for_writing(dst, vertex_array.type, vertex_array.attribute_size, vertex_count);
m_attrib_ring_info.unmap();
const VkFormat format = vk::get_suitable_vk_format(vertex_array.type, vertex_array.attribute_size);
const uintptr_t local_addr = (uintptr_t)vertex_array.data.data();
vertex_cache->store_range(local_addr, format, upload_size, (u32)offset_in_attrib_buffer);
m_buffer_view_to_clean.push_back(std::make_unique<vk::buffer_view>(device, m_attrib_ring_info.heap->value, format, offset_in_attrib_buffer, upload_size));
@ -474,137 +480,9 @@ namespace
const auto& vertex_buffers = get_vertex_buffers(
rsx::method_registers, {{min_index, vertex_max_index - min_index + 1}});
//1. Check if we can get all these allocations at once
std::vector<size_t> memory_allocations(16);
std::vector<u32> allocated_sizes(16);
std::vector<int> upload_jobs(16);
memory_allocations.resize(0);
allocated_sizes.resize(0);
upload_jobs.resize(0);
for (int i = 0; i < vertex_buffers.size(); ++i)
{
const auto &vbo = vertex_buffers[i];
bool can_multithread = false;
if (vbo.which() == 0)
{
//vertex array buffer. We can thread this thing heavily
const auto& v = vbo.get<rsx::vertex_array_buffer>();
const u32 element_size = rsx::get_vertex_type_size_on_host(v.type, v.attribute_size);
const u32 real_element_size = vk::get_suitable_vk_size(v.type, v.attribute_size);
const u32 upload_size = real_element_size * vertex_count;
const VkFormat format = vk::get_suitable_vk_format(v.type, v.attribute_size);
const uintptr_t local_addr = (uintptr_t)v.data.data();
const auto cached = rsxthr->m_vertex_cache->find_vertex_range(local_addr, format, upload_size);
if (cached)
{
m_buffer_view_to_clean.push_back(std::make_unique<vk::buffer_view>(m_device, m_attrib_ring_info.heap->value, format, cached->offset_in_heap, upload_size));
m_program->bind_uniform(m_buffer_view_to_clean.back()->value, s_reg_table[v.index], m_descriptor_sets);
continue;
}
if (v.attribute_size > 1 && vertex_count >= (u32)g_cfg.video.mt_vertex_upload_threshold && rsxthr->vertex_upload_task_ready())
{
can_multithread = true;
size_t offset = m_attrib_ring_info.alloc<256>(upload_size);
memory_allocations.push_back(offset);
allocated_sizes.push_back(upload_size);
upload_jobs.push_back(i);
const uintptr_t local_addr = (uintptr_t)v.data.data();
rsxthr->m_vertex_cache->store_range(local_addr, format, upload_size, (u32)offset);
m_buffer_view_to_clean.push_back(std::make_unique<vk::buffer_view>(m_device, m_attrib_ring_info.heap->value, format, offset, upload_size));
m_program->bind_uniform(m_buffer_view_to_clean.back()->value, s_reg_table[v.index], m_descriptor_sets);
}
}
if (!can_multithread)
std::apply_visitor(visitor, vbo);
}
if (memory_allocations.size() > 0)
{
if (memory_allocations.size() > 1)
{
//2 sets in case the blocks don't fit
u8 available_jobs[2] = {};
u32 allocated_block[2] = {};
size_t last_offset = memory_allocations[0];
u8 current_index = 0;
for (int n = 0; n < memory_allocations.size(); ++n)
{
if (memory_allocations[n] < last_offset)
{
//queue went around
current_index = 1;
}
available_jobs[current_index] ++;
allocated_block[current_index] += allocated_sizes[n];
}
int n = 0;
for (int task = 0; task < 2; ++task)
{
if (available_jobs[task])
{
if (m_attrib_ring_info.mapped)
{
rsxthr->wait_for_vertex_upload_task();
m_attrib_ring_info.unmap();
}
size_t space_remaining = allocated_block[task];
size_t offset_base = memory_allocations[n];
gsl::byte* dst = (gsl::byte*)m_attrib_ring_info.map(memory_allocations[n], space_remaining);
while (true)
{
if (space_remaining == 0)
break;
const auto& vertex_array = vertex_buffers[upload_jobs[n]].get<rsx::vertex_array_buffer>();
const u32 real_element_size = vk::get_suitable_vk_size(vertex_array.type, vertex_array.attribute_size);
gsl::span<gsl::byte> dest_span(dst + (memory_allocations[n] - offset_base), allocated_sizes[n]);
rsxthr->post_vertex_stream_to_upload(vertex_array.data, dest_span, vertex_array.type, vertex_array.attribute_size, vertex_array.stride, real_element_size, vertex_count, vk::prepare_buffer_for_writing);
space_remaining -= allocated_sizes[n];
n++;
}
rsxthr->start_vertex_upload_task();
}
}
}
else
{
const size_t offset_in_attrib_buffer = memory_allocations[0];
const u32 upload_size = allocated_sizes[0];
const auto& vertex_array = vertex_buffers[upload_jobs[0]].get<rsx::vertex_array_buffer>();
const u32 real_element_size = vk::get_suitable_vk_size(vertex_array.type, vertex_array.attribute_size);
void *dst = m_attrib_ring_info.map(offset_in_attrib_buffer, upload_size);
gsl::span<gsl::byte> dest_span(static_cast<gsl::byte*>(dst), upload_size);
write_vertex_array_data_to_buffer(dest_span, vertex_array.data, vertex_count, vertex_array.type, vertex_array.attribute_size, vertex_array.stride, real_element_size);
vk::prepare_buffer_for_writing(dst, vertex_array.type, vertex_array.attribute_size, vertex_count);
m_attrib_ring_info.unmap();
}
}
for (auto &vbo: vertex_buffers)
std::apply_visitor(visitor, vbo);
}
u32 upload_inlined_array()

View file

@ -9,7 +9,7 @@ namespace rsx
f, ///< float
sf, ///< half float
ub, ///< unsigned byte interpreted as 0.f and 1.f
s32k, ///< signed 32bits int
s32k, ///< signed 16-bit int
cmp, ///< compressed aka X11G11Z10 and always 1. W.
ub256, ///< unsigned byte interpreted as between 0 and 255.
};

View file

@ -459,20 +459,6 @@ namespace rsx
rsx->m_textures_dirty[index] = true;
}
};
template<u32 index>
struct set_vertex_array_dirty_bit
{
static void impl(thread* rsx, u32, u32)
{
rsx->m_vertex_attribs_changed = true;
}
};
void set_idbuf_dirty_bit(thread* rsx, u32, u32)
{
rsx->m_index_buffer_changed = true;
}
}
namespace nv308a
@ -1544,8 +1530,6 @@ namespace rsx
bind_range<NV4097_SET_TEXTURE_FILTER, 8, 16, nv4097::set_texture_dirty_bit>();
bind_range<NV4097_SET_TEXTURE_IMAGE_RECT, 8, 16, nv4097::set_texture_dirty_bit>();
bind_range<NV4097_SET_TEXTURE_BORDER_COLOR, 8, 16, nv4097::set_texture_dirty_bit>();
bind_range<NV4097_SET_VERTEX_DATA_ARRAY_OFFSET, 1, 16, nv4097::set_vertex_array_dirty_bit>();
bind<NV4097_SET_INDEX_ARRAY_ADDRESS, nv4097::set_idbuf_dirty_bit>();
bind<NV4097_SET_RENDER_ENABLE, nv4097::set_render_mode>();
bind<NV4097_SET_ZCULL_EN, nv4097::set_zcull_render_enable>();
bind<NV4097_SET_ZCULL_STATS_ENABLE, nv4097::set_zcull_stats_enable>();

View file

@ -60,8 +60,8 @@ public:
struct push_buffer_vertex_info
{
u8 size;
vertex_base_type type;
u8 size = 0;
vertex_base_type type = vertex_base_type::f;
u32 vertex_count = 0;
u32 attribute_mask = ~0;
@ -72,6 +72,7 @@ struct push_buffer_vertex_info
data.resize(0);
attribute_mask = ~0;
vertex_count = 0;
size = 0;
}
u8 get_vertex_size_in_dwords(vertex_base_type type)