From cdd9c12132d082bf4dcabdb70d641198a72395a3 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Tue, 10 Dec 2019 09:10:13 +0300 Subject: [PATCH] vk: Emulate conditional rendering for AMD --- rpcs3/Emu/RSX/RSXThread.cpp | 4 +- rpcs3/Emu/RSX/VK/VKGSRender.cpp | 74 +++++++++++++++++++++++----- rpcs3/Emu/RSX/VK/VKHelpers.cpp | 7 +++ rpcs3/Emu/RSX/VK/VKHelpers.h | 6 ++- rpcs3/Emu/RSX/VK/VKVertexProgram.cpp | 29 ++++++++++- rpcs3/Emu/RSX/VK/VKVertexProgram.h | 9 +++- 6 files changed, 109 insertions(+), 20 deletions(-) diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index f1f7e1d874..19adf61223 100644 --- a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -319,7 +319,7 @@ namespace rsx else { zcull_ctrl->read_barrier(this, cond_render_ctrl.eval_address, 4, reports::sync_no_notify); - cond_render_ctrl.eval_result(this); + verify(HERE), !cond_render_ctrl.eval_pending(); } } @@ -2959,8 +2959,6 @@ namespace rsx { if (hint || ptimer->async_tasks_pending >= max_safe_queue_depth) { - verify(HERE), !active || !hint; - // Prepare the whole queue for reading. This happens when zcull activity is disabled or queue is too long for (auto It = m_pending_writes.rbegin(); It != m_pending_writes.rend(); ++It) { diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp index 91171a9406..d99673404a 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp +++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp @@ -286,6 +286,13 @@ namespace idx++; + bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + bindings[idx].descriptorCount = 1; + bindings[idx].stageFlags = VK_SHADER_STAGE_VERTEX_BIT; + bindings[idx].binding = CONDITIONAL_RENDER_PREDICATE_SLOT; + + idx++; + for (int i = 0; i < rsx::limits::fragment_textures_count; i++) { bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; @@ -311,6 +318,12 @@ namespace push_constants[0].size = 16; push_constants[0].stageFlags = VK_SHADER_STAGE_VERTEX_BIT; + if (vk::emulate_conditional_rendering()) + { + // Conditional render toggle + push_constants[0].size = 20; + } + VkDescriptorSetLayoutCreateInfo infos = {}; infos.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; infos.pBindings = bindings.data(); @@ -439,11 +452,13 @@ VKGSRender::VKGSRender() : GSRender() m_occlusion_query_data[n].driver_handle = n; //Generate frame contexts - VkDescriptorPoolSize uniform_buffer_pool = { VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER , 6 * DESCRIPTOR_MAX_DRAW_CALLS }; - VkDescriptorPoolSize uniform_texel_pool = { VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER , 3 * DESCRIPTOR_MAX_DRAW_CALLS }; - VkDescriptorPoolSize texture_pool = { VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER , 20 * DESCRIPTOR_MAX_DRAW_CALLS }; + std::vector sizes; + sizes.push_back({ VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER , 6 * DESCRIPTOR_MAX_DRAW_CALLS }); + sizes.push_back({ VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER , 3 * DESCRIPTOR_MAX_DRAW_CALLS }); + sizes.push_back({ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER , 20 * DESCRIPTOR_MAX_DRAW_CALLS }); - std::vector sizes{ uniform_buffer_pool, uniform_texel_pool, texture_pool }; + // Conditional rendering predicate slot; refactor to allow skipping this when not needed + sizes.push_back({ VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1 * DESCRIPTOR_MAX_DRAW_CALLS }); VkSemaphoreCreateInfo semaphore_info = {}; semaphore_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; @@ -1153,7 +1168,7 @@ void VKGSRender::emit_geometry(u32 sub_index) update_draw_state(); begin_render_pass(); - if (cond_render_ctrl.hw_cond_active) + if (cond_render_ctrl.hw_cond_active && m_device->get_conditional_render_support()) { // It is inconvenient that conditional rendering breaks other things like compute dispatch // TODO: If this is heavy, add refactor the resources into global and add checks around compute dispatch @@ -2802,6 +2817,12 @@ void VKGSRender::load_program_env() m_program->bind_uniform(m_fragment_texture_params_buffer_info, FRAGMENT_TEXTURE_PARAMS_BIND_SLOT, m_current_frame->descriptor_set); } + if (vk::emulate_conditional_rendering()) + { + auto predicate = m_cond_render_buffer ? m_cond_render_buffer->value : vk::get_scratch_buffer()->value; + m_program->bind_buffer({ predicate, 0, 4 }, CONDITIONAL_RENDER_PREDICATE_SLOT, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_current_frame->descriptor_set); + } + //Clear flags const u32 handled_flags = (rsx::pipeline_state::fragment_state_dirty | rsx::pipeline_state::vertex_state_dirty | rsx::pipeline_state::transform_constants_dirty | rsx::pipeline_state::fragment_constants_dirty | rsx::pipeline_state::fragment_texture_state_dirty); m_graphics_state &= ~handled_flags; @@ -2826,13 +2847,21 @@ void VKGSRender::update_vertex_env(u32 id, const vk::vertex_upload_info& vertex_ base_offset = 0; } - u32 draw_info[4]; + u8 data_size = 16; + u32 draw_info[5]; + draw_info[0] = vertex_info.vertex_index_base; draw_info[1] = vertex_info.vertex_index_offset; draw_info[2] = id; draw_info[3] = (id * 16) + (base_offset / 8); - vkCmdPushConstants(*m_current_command_buffer, pipeline_layout, VK_SHADER_STAGE_VERTEX_BIT, 0, 16, draw_info); + if (vk::emulate_conditional_rendering()) + { + draw_info[4] = cond_render_ctrl.hw_cond_active ? 1 : 0; + data_size = 20; + } + + vkCmdPushConstants(*m_current_command_buffer, pipeline_layout, VK_SHADER_STAGE_VERTEX_BIT, 0, data_size, draw_info); const size_t data_offset = (id * 128) + m_vertex_layout_stream_info.offset; auto dst = m_vertex_layout_ring_info.map(data_offset, 128); @@ -3792,10 +3821,31 @@ void VKGSRender::begin_conditional_rendering(const std::vectorget_memory_mapping(); + auto usage_flags = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; + + if (m_device->get_conditional_render_support()) + { + usage_flags |= VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT; + } + m_cond_render_buffer = std::make_unique( *m_device, 4, memory_props.device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, - VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0); + usage_flags, 0); + } + + VkPipelineStageFlags dst_stage; + VkAccessFlags dst_access; + + if (m_device->get_conditional_render_support()) + { + dst_stage = VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT; + dst_access = VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT; + } + else + { + dst_stage = VK_PIPELINE_STAGE_VERTEX_SHADER_BIT; + dst_access = VK_ACCESS_SHADER_READ_BIT; } if (sources.size() == 1) @@ -3809,8 +3859,8 @@ void VKGSRender::begin_conditional_rendering(const std::vectorvalue, 0); vk::insert_buffer_memory_barrier(*m_current_command_buffer, m_cond_render_buffer->value, 0, 4, - VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, - VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT); + VK_PIPELINE_STAGE_TRANSFER_BIT, dst_stage, + VK_ACCESS_TRANSFER_WRITE_BIT, dst_access); rsx::thread::begin_conditional_rendering(sources); return; @@ -3863,8 +3913,8 @@ void VKGSRender::begin_conditional_rendering(const std::vector()->run(*m_current_command_buffer, m_cond_render_buffer.get(), scratch, dst_offset / 4); vk::insert_buffer_memory_barrier(*m_current_command_buffer, m_cond_render_buffer->value, 0, 4, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, - VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT); + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, dst_stage, + VK_ACCESS_SHADER_WRITE_BIT, dst_access); } else { diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.cpp b/rpcs3/Emu/RSX/VK/VKHelpers.cpp index 9ff3df1e4f..0492f7c5b0 100644 --- a/rpcs3/Emu/RSX/VK/VKHelpers.cpp +++ b/rpcs3/Emu/RSX/VK/VKHelpers.cpp @@ -90,6 +90,7 @@ namespace vk bool g_drv_no_primitive_restart_flag = false; bool g_drv_sanitize_fp_values = false; bool g_drv_disable_fence_reset = false; + bool g_drv_emulate_cond_render = false; u64 g_num_processed_frames = 0; u64 g_num_total_frames = 0; @@ -425,6 +426,7 @@ namespace vk g_drv_no_primitive_restart_flag = false; g_drv_sanitize_fp_values = false; g_drv_disable_fence_reset = false; + g_drv_emulate_cond_render = (g_cfg.video.relaxed_zcull_sync && !g_current_renderer->get_conditional_render_support()); g_num_processed_frames = 0; g_num_total_frames = 0; g_heap_compatible_buffer_types = 0; @@ -533,6 +535,11 @@ namespace vk return g_drv_disable_fence_reset; } + bool emulate_conditional_rendering() + { + return g_drv_emulate_cond_render; + } + void insert_buffer_memory_barrier(VkCommandBuffer cmd, VkBuffer buffer, VkDeviceSize offset, VkDeviceSize length, VkPipelineStageFlags src_stage, VkPipelineStageFlags dst_stage, VkAccessFlags src_mask, VkAccessFlags dst_mask) { VkBufferMemoryBarrier barrier = {}; diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.h b/rpcs3/Emu/RSX/VK/VKHelpers.h index a8e4df589d..7ca143fe35 100644 --- a/rpcs3/Emu/RSX/VK/VKHelpers.h +++ b/rpcs3/Emu/RSX/VK/VKHelpers.h @@ -39,8 +39,9 @@ #define FRAGMENT_STATE_BIND_SLOT 3 #define FRAGMENT_TEXTURE_PARAMS_BIND_SLOT 4 #define VERTEX_BUFFERS_FIRST_BIND_SLOT 5 -#define TEXTURES_FIRST_BIND_SLOT 8 -#define VERTEX_TEXTURES_FIRST_BIND_SLOT 24 //8+16 +#define CONDITIONAL_RENDER_PREDICATE_SLOT 8 +#define TEXTURES_FIRST_BIND_SLOT 9 +#define VERTEX_TEXTURES_FIRST_BIND_SLOT (TEXTURES_FIRST_BIND_SLOT + 16) #define VK_NUM_DESCRIPTOR_BINDINGS (VERTEX_TEXTURES_FIRST_BIND_SLOT + 4) @@ -138,6 +139,7 @@ namespace vk bool emulate_primitive_restart(rsx::primitive_type type); bool sanitize_fp_values(); bool fence_reset_disabled(); + bool emulate_conditional_rendering(); VkFlags get_heap_compatible_buffer_types(); driver_vendor get_driver_vendor(); chip_class get_chip_family(uint32_t vendor_id, uint32_t device_id); diff --git a/rpcs3/Emu/RSX/VK/VKVertexProgram.cpp b/rpcs3/Emu/RSX/VK/VKVertexProgram.cpp index 8fdfa4f50c..9a2bba4056 100644 --- a/rpcs3/Emu/RSX/VK/VKVertexProgram.cpp +++ b/rpcs3/Emu/RSX/VK/VKVertexProgram.cpp @@ -43,12 +43,26 @@ void VKVertexDecompilerThread::insertHeader(std::stringstream &OS) OS << " float z_far;\n"; OS << "};\n\n"; + if (m_device_props.emulate_conditional_rendering) + { + OS << "layout(std430, set = 0, binding = 8) readonly buffer EXT_Conditional_Rendering\n"; + OS << "{\n"; + OS << " uint conditional_rendering_predicate;\n"; + OS << "};\n\n"; + } + OS << "layout(push_constant) uniform VertexLayoutBuffer\n"; OS << "{\n"; OS << " uint vertex_base_index;\n"; OS << " uint vertex_index_offset;\n"; OS << " uint draw_id;\n"; OS << " uint layout_ptr_offset;\n"; + + if (m_device_props.emulate_conditional_rendering) + { + OS << " uint conditional_rendering_enabled;\n"; + } + OS << "};\n\n"; vk::glsl::program_input in; @@ -238,9 +252,18 @@ void VKVertexDecompilerThread::insertMainEnd(std::stringstream & OS) OS << "}\n\n"; OS << "void main ()\n"; - OS << "{\n"; + OS << "{\n\n"; - OS << "\n" << " vs_main();\n\n"; + if (m_device_props.emulate_conditional_rendering) + { + OS << " if (conditional_rendering_enabled != 0 && conditional_rendering_predicate == 0)\n"; + OS << " {\n"; + OS << " gl_Position = vec4(0.);\n"; + OS << " return;\n"; + OS << "}\n\n"; + } + + OS << " vs_main();\n\n"; for (auto &i : reg_table) { @@ -286,6 +309,8 @@ void VKVertexDecompilerThread::insertMainEnd(std::stringstream & OS) void VKVertexDecompilerThread::Task() { + m_device_props.emulate_conditional_rendering = vk::emulate_conditional_rendering(); + m_shader = Decompile(); vk_prog->SetInputs(inputs); } diff --git a/rpcs3/Emu/RSX/VK/VKVertexProgram.h b/rpcs3/Emu/RSX/VK/VKVertexProgram.h index 33be96ed76..32f6f66740 100644 --- a/rpcs3/Emu/RSX/VK/VKVertexProgram.h +++ b/rpcs3/Emu/RSX/VK/VKVertexProgram.h @@ -1,4 +1,4 @@ -#pragma once +#pragma once #include "../Common/VertexProgramDecompiler.h" #include "Emu/RSX/RSXVertexProgram.h" #include "Utilities/Thread.h" @@ -10,6 +10,13 @@ struct VKVertexDecompilerThread : public VertexProgramDecompiler std::string &m_shader; std::vector inputs; class VKVertexProgram *vk_prog; + + struct + { + bool emulate_conditional_rendering; + } + m_device_props; + protected: std::string getFloatTypeName(size_t elementCount) override; std::string getIntTypeName(size_t elementCount) override;