diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp
index 327b5328c9..f1f7e1d874 100644
--- a/rpcs3/Emu/RSX/RSXThread.cpp
+++ b/rpcs3/Emu/RSX/RSXThread.cpp
@@ -314,7 +314,7 @@ namespace rsx
 				verify(HERE), !cond_render_ctrl.hw_cond_active;
 
 				// Pending evaluation, use hardware test
-				begin_conditional_rendering();
+				begin_conditional_rendering(cond_render_ctrl.eval_sources);
 			}
 			else
 			{
@@ -2158,13 +2158,13 @@ namespace rsx
 	{
 		cond_render_ctrl.enable_conditional_render(this, ref);
 
-		auto result = zcull_ctrl->find_query(ref);
+		auto result = zcull_ctrl->find_query(ref, true);
 		if (result.found)
 		{
-			if (result.query)
+			if (!result.queries.empty())
 			{
-				cond_render_ctrl.set_sync_tag(result.query->sync_tag);
-				sync_hint(FIFO_hint::hint_conditional_render_eval, result.query);
+				cond_render_ctrl.set_eval_sources(result.queries);
+				sync_hint(FIFO_hint::hint_conditional_render_eval, cond_render_ctrl.eval_sources.front());
 			}
 			else
 			{
@@ -2183,9 +2183,10 @@ namespace rsx
 		cond_render_ctrl.disable_conditional_render(this);
 	}
 
-	void thread::begin_conditional_rendering()
+	void thread::begin_conditional_rendering(const std::vector<reports::occlusion_query_info*>& /*sources*/)
 	{
 		cond_render_ctrl.hw_cond_active = true;
+		cond_render_ctrl.eval_sources.clear();
 	}
 
 	void thread::end_conditional_rendering()
@@ -2709,6 +2710,12 @@ namespace rsx
 		}
 
 		ptimer->async_tasks_pending++;
+
+		if (m_statistics_map[m_statistics_tag_id] != 0)
+		{
+			// Flush guaranteed results; only one positive is needed
+			update(ptimer);
+		}
 	}
 
 	void ZCULL_control::allocate_new_query(::rsx::thread* ptimer)
@@ -2888,7 +2895,7 @@ namespace rsx
 				// No other queries in the chain, write result
 				write(&writer, ptimer->timestamp(), result);
 
-				if (query && ptimer->cond_render_ctrl.sync_tag == query->sync_tag)
+				if (query && query->sync_tag == ptimer->cond_render_ctrl.eval_sync_tag)
 				{
 					const bool eval_failed = (result == 0);
 					ptimer->cond_render_ctrl.set_eval_result(ptimer, eval_failed);
@@ -3083,7 +3090,7 @@ namespace rsx
 				// No other queries in the chain, write result
 				write(&writer, ptimer->timestamp(), result);
 
-				if (query && ptimer->cond_render_ctrl.sync_tag == query->sync_tag)
+				if (query && query->sync_tag == ptimer->cond_render_ctrl.eval_sync_tag)
 				{
 					const bool eval_failed = (result == 0);
 					ptimer->cond_render_ctrl.set_eval_result(ptimer, eval_failed);
@@ -3175,36 +3182,56 @@ namespace rsx
 		return result_zcull_intr;
 	}
 
-	query_search_result ZCULL_control::find_query(vm::addr_t sink_address)
+	query_search_result ZCULL_control::find_query(vm::addr_t sink_address, bool all)
 	{
+		query_search_result result{};
 		u32 stat_id = 0;
+
 		for (auto It = m_pending_writes.crbegin(); It != m_pending_writes.crend(); ++It)
 		{
 			if (UNLIKELY(stat_id))
 			{
 				if (It->counter_tag != stat_id)
 				{
-					// Zcull stats were cleared between this query and the required one
-					return { true, 0, nullptr };
+					if (result.found)
+					{
+						// Some result was found, return it instead
+						break;
+					}
+
+					// Zcull stats were cleared between this query and the required stats; the result can only be 0
+					return { true, 0, {} };
 				}
 
-				if (It->query)
+				if (It->query && It->query->num_draws)
 				{
-					return { true, 0, It->query };
+					result.found = true;
+					result.queries.push_back(It->query);
+
+					if (!all)
+					{
+						break;
+					}
 				}
 			}
 			else if (It->sink == sink_address)
 			{
-				if (It->query)
+				if (It->query && It->query->num_draws)
 				{
-					return { true, 0, It->query };
+					result.found = true;
+					result.queries.push_back(It->query);
+
+					if (!all)
+					{
+						break;
+					}
 				}
 
 				stat_id = It->counter_tag;
 			}
 		}
 
-		return {};
+		return result;
 	}
 
 	u32 ZCULL_control::copy_reports_to(u32 start, u32 range, u32 dest)
@@ -3228,6 +3255,15 @@ namespace rsx
 
 	// Conditional rendering helpers
+	void conditional_render_eval::reset()
+	{
+		eval_address = 0;
+		eval_sync_tag = 0;
+		eval_sources.clear();
+
+		eval_failed = false;
+	}
+
 	bool conditional_render_eval::disable_rendering() const
 	{
 		return (enabled && eval_failed);
@@ -3246,10 +3282,10 @@ namespace rsx
 			pthr->end_conditional_rendering();
 		}
 
+		reset();
+
 		enabled = true;
-		eval_failed = false;
 		eval_address = address;
-		sync_tag = 0;
 	}
 
 	void conditional_render_eval::disable_conditional_render(::rsx::thread* pthr)
@@ -3260,15 +3296,14 @@ namespace rsx
 			pthr->end_conditional_rendering();
 		}
 
+		reset();
 		enabled = false;
-		eval_failed = false;
-		eval_address = 0;
-		sync_tag = 0;
 	}
 
-	void conditional_render_eval::set_sync_tag(u64 value)
+	void conditional_render_eval::set_eval_sources(std::vector<occlusion_query_info*>& sources)
 	{
-		sync_tag = value;
+		eval_sources = std::move(sources);
+		eval_sync_tag = eval_sources.front()->sync_tag;
 	}
 
 	void conditional_render_eval::set_eval_result(::rsx::thread* pthr, bool failed)
@@ -3279,9 +3314,8 @@ namespace rsx
 			pthr->end_conditional_rendering();
 		}
 
+		reset();
 		eval_failed = failed;
-		eval_address = 0;
-		sync_tag = 0;
 	}
 
 	void conditional_render_eval::eval_result(::rsx::thread* pthr)
diff --git a/rpcs3/Emu/RSX/RSXThread.h b/rpcs3/Emu/RSX/RSXThread.h
index 7306dfca5e..9663111263 100644
--- a/rpcs3/Emu/RSX/RSXThread.h
+++ b/rpcs3/Emu/RSX/RSXThread.h
@@ -362,7 +362,7 @@ namespace rsx
 		{
 			bool found;
 			u32 raw_zpass_result;
-			occlusion_query_info* query;
+			std::vector<occlusion_query_info*> queries;
 		};
 
 		enum sync_control
@@ -443,7 +443,7 @@ namespace rsx
 			bool has_pending() const { return !m_pending_writes.empty(); }
 
 			// Search for query synchronized at address
-			query_search_result find_query(vm::addr_t sink_address);
+			query_search_result find_query(vm::addr_t sink_address, bool all);
 
 			// Copies queries in range rebased from source range to destination range
 			u32 copy_reports_to(u32 start, u32 range, u32 dest);
@@ -463,8 +463,13 @@ namespace rsx
 			bool eval_failed = false;
 			bool hw_cond_active = false;
 			bool reserved = false;
-			u32 eval_address = 0;
-			u64 sync_tag = 0;
+
+			std::vector<occlusion_query_info*> eval_sources;
+			u32 eval_sync_tag = 0;
+			u32 eval_address = 0;
+
+			// Resets common data
+			void reset();
 
 			// Returns true if rendering is disabled as per conditional render test
 			bool disable_rendering() const;
@@ -478,8 +483,8 @@ namespace rsx
 			// Disable conditional rendering
 			void disable_conditional_render(thread* pthr);
 
-			// Sets up the zcull sync tag
-			void set_sync_tag(u64 value);
+			// Sets data sources for predicate evaluation
+			void set_eval_sources(std::vector<occlusion_query_info*>& sources);
 
 			// Sets evaluation result. Result is true if conditional evaluation failed
 			void set_eval_result(thread* pthr, bool failed);
@@ -765,7 +770,7 @@ namespace rsx
 
 		void enable_conditional_rendering(vm::addr_t ref);
 		void disable_conditional_rendering();
-		virtual void begin_conditional_rendering();
+		virtual void begin_conditional_rendering(const std::vector<reports::occlusion_query_info*>& sources);
 		virtual void end_conditional_rendering();
 
 		// sync
diff --git a/rpcs3/Emu/RSX/VK/VKCompute.h b/rpcs3/Emu/RSX/VK/VKCompute.h
index dc48cd343b..39d6415f8c 100644
--- a/rpcs3/Emu/RSX/VK/VKCompute.h
+++ b/rpcs3/Emu/RSX/VK/VKCompute.h
@@ -784,6 +784,60 @@ namespace vk
 		}
 	};
 
+	struct cs_aggregator : compute_task
+	{
+		const buffer* src = nullptr;
+		const buffer* dst = nullptr;
+		u32 block_length = 0;
+		u32 word_count = 0;
+
+		cs_aggregator()
+		{
+			ssbo_count = 2;
+
+			create();
+
+			m_src =
+				"#version 450\n"
+				"layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;\n\n"
+
+				"layout(set=0, binding=0, std430) readonly buffer ssbo0{ uint src[]; };\n"
+				"layout(set=0, binding=1, std430) writeonly buffer ssbo1{ uint result; };\n\n"
+
+				"void main()\n"
+				"{\n"
+				"	if (gl_GlobalInvocationID.x < src.length())\n"
+				"	{\n"
+				"		atomicAdd(result, src[gl_GlobalInvocationID.x]);\n"
+				"	}\n"
+				"}\n";
+
+			const std::pair<std::string, std::string> syntax_replace[] =
+			{
+				{ "%ws", std::to_string(optimal_group_size) },
+			};
+
+			m_src = fmt::replace_all(m_src, syntax_replace);
+		}
+
+		void bind_resources() override
+		{
+			m_program->bind_buffer({ src->value, 0, block_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
+			m_program->bind_buffer({ dst->value, 0, 4 }, 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
+		}
+
+		void run(VkCommandBuffer cmd, const vk::buffer* dst, const vk::buffer* src, u32 num_words)
+		{
+			this->dst = dst;
+			this->src = src;
+			word_count = num_words;
+			block_length = num_words * 4;
+
+			const u32 linear_invocations = aligned_div(word_count, optimal_group_size);
+			compute_task::run(cmd, linear_invocations);
+		}
+	};
+
 	// TODO: Replace with a proper manager
 	extern std::unordered_map<u32, std::unique_ptr<compute_task>> g_compute_tasks;
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
index 7478a83bb9..91171a9406 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
@@ -643,6 +643,7 @@ VKGSRender::~VKGSRender()
 
 	//Queries
 	m_occlusion_query_pool.destroy();
+	m_cond_render_buffer.reset();
 
 	//Command buffer
 	for (auto &cb : m_primary_cb_list)
@@ -1151,6 +1152,18 @@ void VKGSRender::emit_geometry(u32 sub_index)
 		vkCmdBindPipeline(*m_current_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, m_program->pipeline);
 		update_draw_state();
 		begin_render_pass();
+
+		if (cond_render_ctrl.hw_cond_active)
+		{
+			// It is inconvenient that conditional rendering breaks other things like compute dispatch
+			// TODO: If this is heavy, refactor the resources into globals and add checks around compute dispatch
+			VkConditionalRenderingBeginInfoEXT info{};
+			info.sType = VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT;
+			info.buffer = m_cond_render_buffer->value;
+
+			m_device->cmdBeginConditionalRenderingEXT(*m_current_command_buffer, &info);
+			m_current_command_buffer->flags |= vk::command_buffer::cb_has_conditional_render;
+		}
 	}
 
 	// Bind the new set of descriptors for use with this draw call
@@ -1787,6 +1800,12 @@ void VKGSRender::end()
 	}
 	while (rsx::method_registers.current_draw_clause.next());
 
+	if (m_current_command_buffer->flags & vk::command_buffer::cb_has_conditional_render)
+	{
+		m_device->cmdEndConditionalRenderingEXT(*m_current_command_buffer);
+		m_current_command_buffer->flags &= ~(vk::command_buffer::cb_has_conditional_render);
+	}
+
 	// Close any open passes unconditionally
 	close_render_pass();
@@ -2702,7 +2721,7 @@ void VKGSRender::load_program_env()
 
 		// Vertex state
 		const auto mem = m_vertex_env_ring_info.alloc<256>(256);
-		auto buf = static_cast<u8*>(m_vertex_env_ring_info.map(mem, 144));
+		auto buf = static_cast<u8*>(m_vertex_env_ring_info.map(mem, 148));
 
 		fill_scale_offset_data(buf, false);
 		fill_user_clip_data(buf + 64);
@@ -2866,6 +2885,14 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore
 		vk::clear_status_interrupt(vk::heap_dirty);
 	}
 
+#if 0 // Currently unreachable
+	if (m_current_command_buffer->flags & vk::command_buffer::cb_has_conditional_render)
+	{
+		verify(HERE), m_render_pass_open;
+		m_device->cmdEndConditionalRenderingEXT(*m_current_command_buffer);
+	}
+#endif
+
 	// End any active renderpasses; the caller should handle reopening
 	if (m_render_pass_open)
 	{
@@ -3689,7 +3716,7 @@ void VKGSRender::get_occlusion_query_result(rsx::reports::occlusion_query_info*
 			busy_wait();
 		}
 
-		data.command_buffer_to_wait->wait();
+		data.command_buffer_to_wait->flush();
 
 		// Gather data
 		for (const auto occlusion_id : data.indices)
@@ -3734,6 +3761,124 @@ void VKGSRender::emergency_query_cleanup(vk::command_buffer* commands)
 	}
 }
 
+void VKGSRender::begin_conditional_rendering(const std::vector<rsx::reports::occlusion_query_info*>& sources)
+{
+	verify(HERE), !sources.empty();
+
+	// Flag to check whether to calculate all entries or only one
+	bool partial_eval;
+
+	// Try to avoid regenerating the data if it's a repeat/spam
+	// NOTE: The incoming list is reversed with the first entry being the newest
+	if (m_cond_render_sync_tag == sources.front()->sync_tag)
+	{
+		// Already synced; check for a subdraw, which is possible if the last sync happened while the query was active
+		if (!m_active_query_info || m_active_query_info != sources.front())
+		{
+			rsx::thread::begin_conditional_rendering(sources);
+			return;
+		}
+
+		// Partial evaluation only
+		partial_eval = true;
+	}
+	else
+	{
+		m_cond_render_sync_tag = sources.front()->sync_tag;
+		partial_eval = false;
+	}
+
+	// Time to aggregate
+	if (!m_cond_render_buffer)
+	{
+		auto& memory_props = m_device->get_memory_mapping();
+		m_cond_render_buffer = std::make_unique<vk::buffer>(
+			*m_device, 4,
+			memory_props.device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+			VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0);
+	}
+
+	if (sources.size() == 1)
+	{
+		const auto query = sources.front();
+		const auto& query_info = m_occlusion_map[query->driver_handle];
+
+		if (query_info.indices.size() == 1)
+		{
+			const auto& index = query_info.indices.front();
+			m_occlusion_query_pool.get_query_result_indirect(*m_current_command_buffer, index, m_cond_render_buffer->value, 0);
+
+			vk::insert_buffer_memory_barrier(*m_current_command_buffer, m_cond_render_buffer->value, 0, 4,
+				VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT,
+				VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT);
+
+			rsx::thread::begin_conditional_rendering(sources);
+			return;
+		}
+	}
+
+	auto scratch = vk::get_scratch_buffer();
+	u32 dst_offset = 0;
+	size_t first = 0;
+	size_t last;
+
+	if (LIKELY(!partial_eval))
+	{
+		last = sources.size();
+	}
+	else
+	{
+		last = 1;
+	}
+
+	for (size_t i = first; i < last; ++i)
+	{
+		auto& query_info = m_occlusion_map[sources[i]->driver_handle];
+		for (const auto& index : query_info.indices)
+		{
+			m_occlusion_query_pool.get_query_result_indirect(*m_current_command_buffer, index, scratch->value, dst_offset);
+			dst_offset += 4;
+		}
+	}
+
+	if (dst_offset)
+	{
+		// Fast path should have been caught above
+		verify(HERE), dst_offset > 4;
+
+		if (!partial_eval)
+		{
+			// Clear result to zero
+			vkCmdFillBuffer(*m_current_command_buffer, m_cond_render_buffer->value, 0, 4, 0);
+
+			vk::insert_buffer_memory_barrier(*m_current_command_buffer, m_cond_render_buffer->value, 0, 4,
+				VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+				VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_WRITE_BIT);
+		}
+
+		vk::insert_buffer_memory_barrier(*m_current_command_buffer, scratch->value, 0, dst_offset,
+			VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+			VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
+
+		vk::get_compute_task<vk::cs_aggregator>()->run(*m_current_command_buffer, m_cond_render_buffer.get(), scratch, dst_offset / 4);
+
+		vk::insert_buffer_memory_barrier(*m_current_command_buffer, m_cond_render_buffer->value, 0, 4,
+			VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT,
+			VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT);
+	}
+	else
+	{
+		LOG_ERROR(RSX, "Dubious query data pushed to cond render! Please report to developers (q.pending=%d)", sources.front()->pending);
+	}
+
+	rsx::thread::begin_conditional_rendering(sources);
+}
+
+void VKGSRender::end_conditional_rendering()
+{
+	thread::end_conditional_rendering();
+}
+
 bool VKGSRender::on_decompiler_task()
 {
 	return m_prog_buffer->async_update(8, *m_device, pipeline_layout).first;
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h
index 403ef76de4..f3999249f5 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.h
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.h
@@ -321,6 +321,9 @@ private:
 	std::unique_ptr<vk::attachment_clear_pass> m_attachment_clear_pass;
 	std::unique_ptr<vk::video_out_calibration_pass> m_video_output_pass;
 
+	std::unique_ptr<vk::buffer> m_cond_render_buffer;
+	u64 m_cond_render_sync_tag = 0;
+
 	shared_mutex m_sampler_mutex;
 	u64 surface_store_tag = 0;
 	std::atomic_bool m_samplers_dirty = { true };
@@ -479,6 +482,10 @@ public:
 	// External callback in case we need to suddenly submit a commandlist unexpectedly, e.g in a violation handler
 	void emergency_query_cleanup(vk::command_buffer* commands);
 
+	// Conditional rendering
+	void begin_conditional_rendering(const std::vector<rsx::reports::occlusion_query_info*>& sources) override;
+	void end_conditional_rendering() override;
+
 protected:
 	void clear_surface(u32 mask) override;
 	void begin() override;
diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.h b/rpcs3/Emu/RSX/VK/VKHelpers.h
index 1fdff8681b..a8e4df589d 100644
--- a/rpcs3/Emu/RSX/VK/VKHelpers.h
+++ b/rpcs3/Emu/RSX/VK/VKHelpers.h
@@ -545,6 +545,8 @@ namespace vk
 		gpu_shader_types_support shader_types_support{};
 		VkPhysicalDeviceDriverPropertiesKHR driver_properties{};
 		bool stencil_export_support = false;
+		bool conditional_render_support = false;
+		bool host_query_reset_support = false;
 
 		friend class render_device;
 	private:
@@ -594,6 +596,8 @@ private:
 			}
 
 			stencil_export_support = device_extensions.is_supported(VK_EXT_SHADER_STENCIL_EXPORT_EXTENSION_NAME);
+			conditional_render_support = device_extensions.is_supported(VK_EXT_CONDITIONAL_RENDERING_EXTENSION_NAME);
+			host_query_reset_support = device_extensions.is_supported(VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME);
 		}
 
 	public:
@@ -764,6 +768,12 @@ private:
 		std::unique_ptr<mem_allocator_base> m_allocator;
 		VkDevice dev = VK_NULL_HANDLE;
 
+	public:
+		// Exported device endpoints
+		PFN_vkCmdBeginConditionalRenderingEXT cmdBeginConditionalRenderingEXT = nullptr;
+		PFN_vkCmdEndConditionalRenderingEXT cmdEndConditionalRenderingEXT = nullptr;
+		PFN_vkResetQueryPoolEXT resetQueryPoolEXT = nullptr;
+
 	public:
 		render_device() = default;
 		~render_device() = default;
@@ -797,6 +807,16 @@ private:
 				requested_extensions.push_back(VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME);
 			}
 
+			if (pgpu->conditional_render_support)
+			{
+				requested_extensions.push_back(VK_EXT_CONDITIONAL_RENDERING_EXTENSION_NAME);
+			}
+
+			if (pgpu->host_query_reset_support)
+			{
+				requested_extensions.push_back(VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME);
+			}
+
 			enabled_features.robustBufferAccess = VK_TRUE;
 			enabled_features.fullDrawIndexUint32 = VK_TRUE;
 			enabled_features.independentBlend = VK_TRUE;
@@ -880,6 +900,18 @@ private:
 
 			CHECK_RESULT(vkCreateDevice(*pgpu, &device, nullptr, &dev));
 
+			// Import optional function endpoints
+			if (pgpu->conditional_render_support)
+			{
+				cmdBeginConditionalRenderingEXT = (PFN_vkCmdBeginConditionalRenderingEXT)vkGetDeviceProcAddr(dev, "vkCmdBeginConditionalRenderingEXT");
+				cmdEndConditionalRenderingEXT = (PFN_vkCmdEndConditionalRenderingEXT)vkGetDeviceProcAddr(dev, "vkCmdEndConditionalRenderingEXT");
+			}
+
+			if (pgpu->host_query_reset_support)
+			{
+				resetQueryPoolEXT = (PFN_vkResetQueryPoolEXT)vkGetDeviceProcAddr(dev, "vkResetQueryPoolEXT");
+			}
+
 			memory_map = vk::get_memory_mapping(pdev);
 			m_formats_support = vk::get_optimal_tiling_supported_formats(pdev);
@@ -979,6 +1011,16 @@ private:
 			return pgpu->features.alphaToOne != VK_FALSE;
 		}
 
+		bool get_conditional_render_support() const
+		{
+			return pgpu->conditional_render_support;
+		}
+
+		bool get_host_query_reset_support() const
+		{
+			return pgpu->host_query_reset_support;
+		}
+
 		mem_allocator_base* get_allocator() const
 		{
 			return m_allocator.get();
@@ -1097,7 +1139,8 @@ private:
 			cb_has_blit_transfer = 2,
 			cb_has_dma_transfer = 4,
 			cb_has_open_query = 8,
-			cb_load_occluson_task = 16
+			cb_load_occluson_task = 16,
+			cb_has_conditional_render = 32
 		};
 
 		u32 flags = 0;
@@ -3045,6 +3088,11 @@ public:
 			while (true);
 		}
 
+		void get_query_result_indirect(vk::command_buffer &cmd, u32 index, VkBuffer dst, VkDeviceSize dst_offset)
+		{
+			vkCmdCopyQueryPoolResults(cmd, query_pool, index, 1, dst, dst_offset, 4, VK_QUERY_RESULT_WAIT_BIT);
+		}
+
 		void reset_query(vk::command_buffer &cmd, u32 index)
 		{
 			if (query_active_status[index])
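
Reviewer note (not part of the patch): the VK_EXT_conditional_rendering flow wired up above boils down to copying a query result into a small predicate buffer and bracketing the draws with the extension's begin/end commands. A minimal sketch under those assumptions, with a hypothetical pre-created 4-byte predicate_buffer (created with VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT usage) and the extension entry points already loaded via vkGetDeviceProcAddr; identifiers here are illustrative and not taken from the patch:

	// Copy one 32-bit occlusion result into the predicate buffer.
	// VK_QUERY_RESULT_WAIT_BIT makes the copy wait until the query has finished on the GPU timeline.
	vkCmdCopyQueryPoolResults(cmd, query_pool, query_index, 1, predicate_buffer, 0, 4, VK_QUERY_RESULT_WAIT_BIT);

	// A buffer memory barrier (TRANSFER write -> CONDITIONAL_RENDERING read) belongs here,
	// which is what the patch inserts via vk::insert_buffer_memory_barrier.

	// Draws recorded between begin/end are skipped when the 32-bit predicate word is zero.
	VkConditionalRenderingBeginInfoEXT info{};
	info.sType  = VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT;
	info.buffer = predicate_buffer;
	info.offset = 0;
	vkCmdBeginConditionalRenderingEXT(cmd, &info);

	// ... record conditional draw calls ...

	vkCmdEndConditionalRenderingEXT(cmd);

When several ZCULL queries feed a single predicate, the patch instead sums their word results into the predicate buffer with the cs_aggregator compute job (any non-zero sum lets the draws through), which is why m_cond_render_buffer also carries STORAGE_BUFFER and TRANSFER_DST usage.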