From 315798b1f48cffeda8df6344f4f4617e9fd82ac7 Mon Sep 17 00:00:00 2001
From: kd-11 <karokidii@gmail.com>
Date: Mon, 5 Mar 2018 14:09:43 +0300
Subject: [PATCH] rsx: ZCULL rewrite and other improvements - ZCULL unit
 emulation rewritten - ZCULL reports are now deferred avoiding pipeline stalls
 - Minor optimizations; replaced std::mutex with shared_mutex where contention
 is rare - Silence unnecessary error message - Small improvement to out of
 memory handling for vulkan and slightly bump vertex buffer heap

---
 rpcs3/Emu/RSX/D3D12/D3D12MemoryHelpers.cpp |   8 +-
 rpcs3/Emu/RSX/D3D12/D3D12MemoryHelpers.h   |   2 +-
 rpcs3/Emu/RSX/GL/GLGSRender.cpp            |  40 +-
 rpcs3/Emu/RSX/GL/GLGSRender.h              |  14 +-
 rpcs3/Emu/RSX/GL/GLRenderTargets.cpp       |   6 +-
 rpcs3/Emu/RSX/RSXThread.cpp                | 574 ++++++++++++++++-----
 rpcs3/Emu/RSX/RSXThread.h                  | 152 +++---
 rpcs3/Emu/RSX/VK/VKGSRender.cpp            | 163 +++---
 rpcs3/Emu/RSX/VK/VKGSRender.h              |  28 +-
 rpcs3/Emu/RSX/VK/VKHelpers.cpp             |   4 +-
 rpcs3/Emu/RSX/rsx_methods.cpp              |  27 +-
 11 files changed, 697 insertions(+), 321 deletions(-)
diff --git a/rpcs3/Emu/RSX/D3D12/D3D12MemoryHelpers.cpp b/rpcs3/Emu/RSX/D3D12/D3D12MemoryHelpers.cpp
index 6c5567ec9a..92dce919e5 100644
--- a/rpcs3/Emu/RSX/D3D12/D3D12MemoryHelpers.cpp
+++ b/rpcs3/Emu/RSX/D3D12/D3D12MemoryHelpers.cpp
@@ -7,7 +7,7 @@
 
 void data_cache::store_and_protect_data(u64 key, u32 start, size_t size, u8 format, size_t w, size_t h, size_t d, size_t m, ComPtr<ID3D12Resource> data)
 {
-	std::lock_guard<std::mutex> lock(m_mut);
+	std::lock_guard<shared_mutex> lock(m_mut);
 	m_address_to_data[key] = std::make_pair(texture_entry(format, w, h, d, m), data);
 	protect_data(key, start, size);
 }
@@ -25,7 +25,7 @@ void data_cache::protect_data(u64 key, u32 start, size_t size)
 bool data_cache::invalidate_address(u32 addr)
 {
 	// In case 2 threads write to texture memory
-	std::lock_guard<std::mutex> lock(m_mut);
+	std::lock_guard<shared_mutex> lock(m_mut);
 	bool handled = false;
 	auto It = m_protected_ranges.begin(), E = m_protected_ranges.end();
 	for (; It != E;)
@@ -49,7 +49,7 @@ bool data_cache::invalidate_address(u32 addr)
 
 std::pair<texture_entry, ComPtr<ID3D12Resource> > *data_cache::find_data_if_available(u64 key)
 {
-	std::lock_guard<std::mutex> lock(m_mut);
+	std::lock_guard<shared_mutex> lock(m_mut);
 	auto It = m_address_to_data.find(key);
 	if (It == m_address_to_data.end())
 		return nullptr;
@@ -58,7 +58,7 @@ std::pair<texture_entry, ComPtr<ID3D12Resource> > *data_cache::find_data_if_avai
 
 void data_cache::unprotect_all()
 {
-	std::lock_guard<std::mutex> lock(m_mut);
+	std::lock_guard<shared_mutex> lock(m_mut);
 	for (auto &protectedTexture : m_protected_ranges)
 	{
 		u32 protectedRangeStart = std::get<1>(protectedTexture), protectedRangeSize = std::get<2>(protectedTexture);
diff --git a/rpcs3/Emu/RSX/D3D12/D3D12MemoryHelpers.h b/rpcs3/Emu/RSX/D3D12/D3D12MemoryHelpers.h
index d9d58adc38..10c8b5c7ad 100644
--- a/rpcs3/Emu/RSX/D3D12/D3D12MemoryHelpers.h
+++ b/rpcs3/Emu/RSX/D3D12/D3D12MemoryHelpers.h
@@ -98,7 +98,7 @@ private:
 	* Memory protection fault catch can be generated by any thread and
 	* modifies it.
 	*/
-	std::mutex m_mut;
+	shared_mutex m_mut;
 
 	std::unordered_map<u64, std::pair<texture_entry, ComPtr<ID3D12Resource>> > m_address_to_data; // Storage
 	std::list <std::tuple<u64, u32, u32> > m_protected_ranges; // address, start of protected range, size of protected range
diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
index 20e7ee016d..909ad66bdf 100644
--- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
@@ -209,7 +209,7 @@ void GLGSRender::end()
 	{
 		std::chrono::time_point<steady_clock> textures_start = steady_clock::now();
 
-		std::lock_guard<std::mutex> lock(m_sampler_mutex);
+		std::lock_guard<shared_mutex> lock(m_sampler_mutex);
 		void* unused = nullptr;
 		bool  update_framebuffer_sourced = false;
 
@@ -598,6 +598,7 @@ void GLGSRender::set_viewport()
 void GLGSRender::on_init_thread()
 {
 	GSRender::on_init_thread();
+	zcull_ctrl.reset(static_cast<::rsx::reports::ZCULL_control*>(this));
 
 	gl::init();
 
@@ -768,7 +769,7 @@ void GLGSRender::on_init_thread()
 	for (u32 i = 0; i < occlusion_query_count; ++i)
 	{
 		GLuint handle = 0;
-		auto &query = occlusion_query_data[i];
+		auto &query = m_occlusion_query_data[i];
 		glGenQueries(1, &handle);
 		
 		query.driver_handle = (u64)handle;
@@ -853,6 +854,8 @@ void GLGSRender::on_init_thread()
 
 void GLGSRender::on_exit()
 {
+	zcull_ctrl.release();
+
 	m_prog_buffer.clear();
 
 	if (draw_fbo)
@@ -920,7 +923,7 @@ void GLGSRender::on_exit()
 
 	for (u32 i = 0; i < occlusion_query_count; ++i)
 	{
-		auto &query = occlusion_query_data[i];
+		auto &query = m_occlusion_query_data[i];
 		query.active = false;
 		query.pending = false;
 
@@ -1424,7 +1427,7 @@ bool GLGSRender::on_access_violation(u32 address, bool is_writing)
 		return false;
 
 	{
-		std::lock_guard<std::mutex> lock(m_sampler_mutex);
+		std::lock_guard<shared_mutex> lock(m_sampler_mutex);
 		m_samplers_dirty.store(true);
 	}
 
@@ -1452,7 +1455,7 @@ void GLGSRender::on_notify_memory_unmapped(u32 address_base, u32 size)
 	{
 		m_gl_texture_cache.purge_dirty();
 		{
-			std::lock_guard<std::mutex> lock(m_sampler_mutex);
+			std::lock_guard<shared_mutex> lock(m_sampler_mutex);
 			m_samplers_dirty.store(true);
 		}
 	}
@@ -1464,7 +1467,7 @@ void GLGSRender::do_local_task(bool /*idle*/)
 
 	if (!work_queue.empty())
 	{
-		std::lock_guard<std::mutex> lock(queue_guard);
+		std::lock_guard<shared_mutex> lock(queue_guard);
 
 		work_queue.remove_if([](work_item &q) { return q.received; });
 
@@ -1505,7 +1508,7 @@ void GLGSRender::do_local_task(bool /*idle*/)
 
 work_item& GLGSRender::post_flush_request(u32 address, gl::texture_cache::thrashed_set& flush_data)
 {
-	std::lock_guard<std::mutex> lock(queue_guard);
+	std::lock_guard<shared_mutex> lock(queue_guard);
 
 	work_queue.emplace_back();
 	work_item &result = work_queue.back();
@@ -1537,31 +1540,38 @@ void GLGSRender::notify_tile_unbound(u32 tile)
 	//m_rtts.invalidate_surface_address(addr, false);
 }
 
-void GLGSRender::begin_occlusion_query(rsx::occlusion_query_info* query)
+void GLGSRender::begin_occlusion_query(rsx::reports::occlusion_query_info* query)
 {
 	query->result = 0;
 	glBeginQuery(GL_ANY_SAMPLES_PASSED, (GLuint)query->driver_handle);
 }
 
-void GLGSRender::end_occlusion_query(rsx::occlusion_query_info* query)
+void GLGSRender::end_occlusion_query(rsx::reports::occlusion_query_info* query)
 {
-	glEndQuery(GL_ANY_SAMPLES_PASSED);
+	if (query->num_draws)
+		glEndQuery(GL_ANY_SAMPLES_PASSED);
 }
 
-bool GLGSRender::check_occlusion_query_status(rsx::occlusion_query_info* query)
+bool GLGSRender::check_occlusion_query_status(rsx::reports::occlusion_query_info* query)
 {
+	if (!query->num_draws)
+		return true;
+
 	GLint status = GL_TRUE;
 	glGetQueryObjectiv((GLuint)query->driver_handle, GL_QUERY_RESULT_AVAILABLE, &status);
 
 	return status != GL_FALSE;
 }
 
-void GLGSRender::get_occlusion_query_result(rsx::occlusion_query_info* query)
+void GLGSRender::get_occlusion_query_result(rsx::reports::occlusion_query_info* query)
 {
-	GLint result;
-	glGetQueryObjectiv((GLuint)query->driver_handle, GL_QUERY_RESULT, &result);
+	if (query->num_draws)
+	{
+		GLint result;
+		glGetQueryObjectiv((GLuint)query->driver_handle, GL_QUERY_RESULT, &result);
 
-	query->result += result;
+		query->result += result;
+	}
 }
 
 void GLGSRender::shell_do_cleanup()
diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.h b/rpcs3/Emu/RSX/GL/GLGSRender.h
index e19a707942..c853d5fded 100644
--- a/rpcs3/Emu/RSX/GL/GLGSRender.h
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.h
@@ -265,7 +265,7 @@ struct driver_state
 	}
 };
 
-class GLGSRender : public GSRender
+class GLGSRender : public GSRender, public ::rsx::reports::ZCULL_control
 {
 private:
 	GLFragmentProgram m_fragment_prog;
@@ -311,7 +311,7 @@ private:
 
 	std::vector<u64> m_overlay_cleanup_requests;
 
-	std::mutex queue_guard;
+	shared_mutex queue_guard;
 	std::list<work_item> work_queue;
 
 	bool flush_draw_buffers = false;
@@ -327,7 +327,7 @@ private:
 	//vaos are mandatory for core profile
 	gl::vao m_vao;
 
-	std::mutex m_sampler_mutex;
+	shared_mutex m_sampler_mutex;
 	u64 surface_store_tag = 0;
 	std::atomic_bool m_samplers_dirty = {true};
 	std::array<std::unique_ptr<rsx::sampled_image_descriptor_base>, rsx::limits::fragment_textures_count> fs_sampler_state = {};
@@ -363,10 +363,10 @@ public:
 
 	bool scaled_image_from_memory(rsx::blit_src_info& src_info, rsx::blit_dst_info& dst_info, bool interpolate) override;
 
-	void begin_occlusion_query(rsx::occlusion_query_info* query) override;
-	void end_occlusion_query(rsx::occlusion_query_info* query) override;
-	bool check_occlusion_query_status(rsx::occlusion_query_info* query) override;
-	void get_occlusion_query_result(rsx::occlusion_query_info* query) override;
+	void begin_occlusion_query(rsx::reports::occlusion_query_info* query) override;
+	void end_occlusion_query(rsx::reports::occlusion_query_info* query) override;
+	bool check_occlusion_query_status(rsx::reports::occlusion_query_info* query) override;
+	void get_occlusion_query_result(rsx::reports::occlusion_query_info* query) override;
 
 protected:
 	void begin() override;
diff --git a/rpcs3/Emu/RSX/GL/GLRenderTargets.cpp b/rpcs3/Emu/RSX/GL/GLRenderTargets.cpp
index cc4230a46e..2c2b169e71 100644
--- a/rpcs3/Emu/RSX/GL/GLRenderTargets.cpp
+++ b/rpcs3/Emu/RSX/GL/GLRenderTargets.cpp
@@ -318,8 +318,8 @@ void GLGSRender::init_buffers(rsx::framebuffer_creation_context context, bool sk
 		}
 	}
 
-	if ((window_clip_width && window_clip_width != clip_horizontal) ||
-		(window_clip_height && window_clip_height != clip_vertical))
+	if ((window_clip_width && window_clip_width < clip_horizontal) ||
+		(window_clip_height && window_clip_height < clip_vertical))
 	{
 		LOG_ERROR(RSX, "Unexpected window clip dimensions: window_clip=%dx%d, surface_clip=%dx%d",
 			window_clip_width, window_clip_height, clip_horizontal, clip_vertical);
@@ -428,7 +428,7 @@ void GLGSRender::init_buffers(rsx::framebuffer_creation_context context, bool sk
 	framebuffer_status_valid = draw_fbo.check();
 	if (!framebuffer_status_valid) return;
 
-	check_zcull_status(true, false);
+	check_zcull_status(true);
 	set_viewport();
 
 	switch (rsx::method_registers.surface_color_target())
diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp
index 49cbb909e3..e5f8008569 100644
--- a/rpcs3/Emu/RSX/RSXThread.cpp
+++ b/rpcs3/Emu/RSX/RSXThread.cpp
@@ -349,8 +349,8 @@ namespace rsx
 
 		element_push_buffer.resize(0);
 
-		if (zcull_task_queue.active_query && zcull_task_queue.active_query->active)
-			zcull_task_queue.active_query->num_draws++;
+		if (zcull_ctrl->active)
+			zcull_ctrl->on_draw();
 
 		if (capture_current_frame)
 		{
@@ -365,6 +365,12 @@ namespace rsx
 
 		reset();
 
+		if (!zcull_ctrl)
+		{
+			//Backend did not provide an implementation, provide NULL object
+			zcull_ctrl = std::make_unique<::rsx::reports::ZCULL_control>();
+		}
+
 		last_flip_time = get_system_time() - 1000000;
 
 		thread_ctrl::spawn(m_vblank_thread, "VBlank Thread", [this]()
@@ -503,6 +509,9 @@ namespace rsx
 			//Execute backend-local tasks first
 			do_local_task(ctrl->put.load() == internal_get.load());
 
+			//Update sub-units
+			zcull_ctrl->update(this);
+
 			//Set up restore state if needed
 			if (sync_point_request)
 			{
@@ -1140,6 +1149,12 @@ namespace rsx
 
 	void thread::do_internal_task()
 	{
+		if (zcull_ctrl->has_pending())
+		{
+			zcull_ctrl->sync(this);
+			return;
+		}
+
 		if (m_internal_tasks.empty())
 		{
 			std::this_thread::yield();
@@ -1147,7 +1162,7 @@ namespace rsx
 		else
 		{
 			fmt::throw_exception("Disabled" HERE);
-			//std::lock_guard<std::mutex> lock{ m_mtx_task };
+			//std::lock_guard<shared_mutex> lock{ m_mtx_task };
 
 			//internal_task_entry &front = m_internal_tasks.front();
 
@@ -1161,7 +1176,7 @@ namespace rsx
 
 	//std::future<void> thread::add_internal_task(std::function<bool()> callback)
 	//{
-	//	std::lock_guard<std::mutex> lock{ m_mtx_task };
+	//	std::lock_guard<shared_mutex> lock{ m_mtx_task };
 	//	m_internal_tasks.emplace_back(callback);
 
 	//	return m_internal_tasks.back().promise.get_future();
@@ -2075,10 +2090,20 @@ namespace rsx
 			skip_frame = (m_skip_frame_ctr < 0);
 		}
 
+		//Reset zcull ctrl
+		zcull_ctrl->set_active(this, false);
+		zcull_ctrl->clear();
+
+		if (zcull_ctrl->has_pending())
+		{
+			LOG_ERROR(RSX, "Dangling reports found, discarding...");
+			zcull_ctrl->sync(this);
+		}
+
 		performance_counters.sampled_frames++;
 	}
 
-	void thread::check_zcull_status(bool framebuffer_swap, bool force_read)
+	void thread::check_zcull_status(bool framebuffer_swap)
 	{
 		if (g_cfg.video.disable_zcull_queries)
 			return;
@@ -2108,35 +2133,8 @@ namespace rsx
 			}
 		}
 
-		occlusion_query_info* query = nullptr;
-
-		if (zcull_task_queue.task_stack.size() > 0)
-			query = zcull_task_queue.active_query;
-
-		if (query && query->active)
-		{
-			if (force_read || (!zcull_rendering_enabled || !testing_enabled || !zcull_surface_active))
-			{
-				end_occlusion_query(query);
-				query->active = false;
-				query->pending = true;
-			}
-		}
-		else
-		{
-			if (zcull_rendering_enabled && testing_enabled && zcull_surface_active)
-			{
-				//Find query
-				u32 free_index = synchronize_zcull_stats();
-				query = &occlusion_query_data[free_index];
-				zcull_task_queue.add(query);
-
-				begin_occlusion_query(query);
-				query->active = true;
-				query->result = 0;
-				query->num_draws = 0;
-			}
-		}
+		zcull_ctrl->set_enabled(this, zcull_rendering_enabled);
+		zcull_ctrl->set_active(this, zcull_rendering_enabled && testing_enabled && zcull_surface_active);
 	}
 
 	void thread::clear_zcull_stats(u32 type)
@@ -2144,113 +2142,50 @@ namespace rsx
 		if (g_cfg.video.disable_zcull_queries)
 			return;
 
-		if (type == CELL_GCM_ZPASS_PIXEL_CNT)
-		{
-			if (zcull_task_queue.active_query &&
-				zcull_task_queue.active_query->active &&
-				zcull_task_queue.active_query->num_draws > 0)
-			{
-				//discard active query results
-				check_zcull_status(false, true);
-				zcull_task_queue.active_query->pending = false;
+		zcull_ctrl->clear();
+	}
 
-				//re-enable cull stats if stats are enabled
-				check_zcull_status(false, false);
-				zcull_task_queue.active_query->num_draws = 0;
+	void thread::get_zcull_stats(u32 type, vm::addr_t sink)
+	{
+		u32 value = 0;
+		if (!g_cfg.video.disable_zcull_queries)
+		{
+			switch (type)
+			{
+			case CELL_GCM_ZPASS_PIXEL_CNT:
+			{
+				zcull_ctrl->read_report(this, sink, type);
+				return;
 			}
-
-			current_zcull_stats.clear();
-		}
-	}
-
-	u32 thread::get_zcull_stats(u32 type)
-	{
-		if (g_cfg.video.disable_zcull_queries)
-			return 0u;
-
-		if (zcull_task_queue.active_query &&
-			zcull_task_queue.active_query->active &&
-			current_zcull_stats.zpass_pixel_cnt == 0 &&
-			type == CELL_GCM_ZPASS_PIXEL_CNT)
-		{
-			//The zcull unit is still bound as the read is happening and there are no results ready
-			check_zcull_status(false, true);  //close current query
-			check_zcull_status(false, false); //start new query since stat counting is still active
-		}
-
-		switch (type)
-		{
-		case CELL_GCM_ZPASS_PIXEL_CNT:
-		{
-			if (current_zcull_stats.zpass_pixel_cnt > 0)
-				return UINT16_MAX;
-
-			synchronize_zcull_stats(true);
-			return (current_zcull_stats.zpass_pixel_cnt > 0) ? UINT16_MAX : 0;
-		}
-		case CELL_GCM_ZCULL_STATS:
-		case CELL_GCM_ZCULL_STATS1:
-		case CELL_GCM_ZCULL_STATS2:
-			//TODO
-			return UINT16_MAX;
-		case CELL_GCM_ZCULL_STATS3:
-		{
-			//Some kind of inverse value
-			if (current_zcull_stats.zpass_pixel_cnt > 0)
-				return 0;
-
-			synchronize_zcull_stats(true);
-			return (current_zcull_stats.zpass_pixel_cnt > 0) ? 0 : UINT16_MAX;
-		}
-		default:
-			LOG_ERROR(RSX, "Unknown zcull stat type %d", type);
-			return 0;
-		}
-	}
-
-	u32 thread::synchronize_zcull_stats(bool hard_sync)
-	{
-		if (!zcull_rendering_enabled || zcull_task_queue.pending == 0)
-			return 0;
-
-		u32 result = UINT16_MAX;
-
-		for (auto &query : zcull_task_queue.task_stack)
-		{
-			if (query == nullptr || query->active)
-				continue;
-
-			bool status = check_occlusion_query_status(query);
-			if (status == false && !hard_sync)
-				continue;
-
-			get_occlusion_query_result(query);
-			current_zcull_stats.zpass_pixel_cnt += query->result;
-
-			query->pending = false;
-			query = nullptr;
-			zcull_task_queue.pending--;
-		}
-
-		for (u32 i = 0; i < occlusion_query_count; ++i)
-		{
-			auto &query = occlusion_query_data[i];
-			if (!query.pending && !query.active)
+			case CELL_GCM_ZCULL_STATS:
+			case CELL_GCM_ZCULL_STATS1:
+			case CELL_GCM_ZCULL_STATS2:
+			case CELL_GCM_ZCULL_STATS3:
 			{
-				result = i;
+				//TODO
+				value = (type != CELL_GCM_ZCULL_STATS3)? UINT16_MAX : 0;
+				break;
+			}
+			default:
+				LOG_ERROR(RSX, "Unknown zcull stat type %d", type);
 				break;
 			}
 		}
 
-		if (result == UINT16_MAX && !hard_sync)
-			return synchronize_zcull_stats(true);
+		vm::ptr<CellGcmReportData> result = sink;
+		result->value = value;
+		result->padding = 0;
+		result->timer = timestamp();
+	}
 
-		return result;
+	void thread::sync()
+	{
+		zcull_ctrl->sync(this);
 	}
 
 	void thread::notify_zcull_info_changed()
 	{
-		check_zcull_status(false, false);
+		check_zcull_status(false);
 	}
 
 	//Pause/cont wrappers for FIFO ctrl. Never call this from rsx thread itself!
@@ -2356,4 +2291,385 @@ namespace rsx
 
 		return false;
 	}
+
+	namespace reports
+	{
+		void ZCULL_control::set_enabled(class ::rsx::thread* ptimer, bool state)
+		{
+			if (state != enabled)
+			{
+				enabled = state;
+
+				if (active && !enabled)
+					set_active(ptimer, false);
+			}
+		}
+
+		void ZCULL_control::set_active(class ::rsx::thread* ptimer, bool state)
+		{
+			if (state != active)
+			{
+				active = state;
+
+				if (state)
+				{
+					verify(HERE), enabled && m_current_task == nullptr;
+					allocate_new_query(ptimer);
+					begin_occlusion_query(m_current_task);
+				}
+				else
+				{
+					verify(HERE), m_current_task;
+					if (m_current_task->num_draws)
+					{
+						end_occlusion_query(m_current_task);
+						m_current_task->active = false;
+						m_current_task->pending = true;
+
+						m_pending_writes.push_back({});
+						m_pending_writes.back().query = m_current_task;
+					}
+					else
+					{
+						discard_occlusion_query(m_current_task);
+						m_current_task->active = false;
+					}
+
+					m_current_task = nullptr;
+				}
+			}
+		}
+
+		void ZCULL_control::read_report(::rsx::thread* ptimer, vm::addr_t sink, u32 type)
+		{
+			if (m_current_task)
+			{
+				m_current_task->owned = true;
+				end_occlusion_query(m_current_task);
+				m_pending_writes.push_back({});
+
+				m_current_task->active = false;
+				m_current_task->pending = true;
+				m_pending_writes.back().query = m_current_task;
+
+				allocate_new_query(ptimer);
+				begin_occlusion_query(m_current_task);
+			}
+			else
+			{
+				//Spam; send null query down the pipeline to copy the last result
+				//Might be used to capture a timestamp (verify)
+				m_pending_writes.push_back({});
+			}
+
+			auto forwarder = &m_pending_writes.back();
+			for (auto It = m_pending_writes.rbegin(); It != m_pending_writes.rend(); It++)
+			{
+				if (!It->sink)
+				{
+					It->counter_tag = m_statistics_tag_id;
+					It->due_tsc = m_tsc + m_cycles_delay;
+					It->sink = sink;
+					It->type = type;
+
+					if (forwarder != &(*It))
+					{
+						//Not the last one in the chain, forward the writing operation to the last writer
+						It->forwarder = forwarder;
+						It->query->owned = true;
+					}
+
+					continue;
+				}
+
+				break;
+			}
+		}
+
+		void ZCULL_control::allocate_new_query(::rsx::thread* ptimer)
+		{
+			int retries = 0;
+			while (!Emu.IsStopped())
+			{
+				for (int n = 0; n < occlusion_query_count; ++n)
+				{
+					if (m_occlusion_query_data[n].pending || m_occlusion_query_data[n].active)
+						continue;
+
+					m_current_task = &m_occlusion_query_data[n];
+					m_current_task->num_draws = 0;
+					m_current_task->result = 0;
+					m_current_task->sync_timestamp = 0;
+					m_current_task->active = true;
+					m_current_task->owned = false;
+					return;
+				}
+
+				if (retries > 0)
+				{
+					LOG_ERROR(RSX, "ZCULL report queue is overflowing!!");
+					m_statistics_map[m_statistics_tag_id] = 1;
+
+					verify(HERE), m_pending_writes.front().sink == 0;
+					m_pending_writes.resize(0);
+
+					for (auto &query : m_occlusion_query_data)
+					{
+						discard_occlusion_query(&query);
+						query.pending = false;
+					}
+
+					m_current_task = &m_occlusion_query_data[0];
+					m_current_task->num_draws = 0;
+					m_current_task->result = 0;
+					m_current_task->sync_timestamp = 0;
+					m_current_task->active = true;
+					m_current_task->owned = false;
+					return;
+				}
+
+				//All slots are occupied, try to pop the earliest entry
+				m_tsc += max_zcull_cycles_delay;
+				update(ptimer);
+
+				retries++;
+			}
+		}
+
+		void ZCULL_control::clear()
+		{
+			if (!m_pending_writes.empty())
+			{
+				//Remove any dangling/unclaimed queries as the information is lost anyway
+				auto valid_size = m_pending_writes.size();
+				for (auto It = m_pending_writes.rbegin(); It != m_pending_writes.rend(); ++It)
+				{
+					if (!It->sink)
+					{
+						discard_occlusion_query(It->query);
+						It->query->pending = false;
+						valid_size--;
+						continue;
+					}
+
+					break;
+				}
+
+				m_pending_writes.resize(valid_size);
+			}
+
+			m_statistics_tag_id++;
+			m_statistics_map[m_statistics_tag_id] = 0;
+		}
+
+		void ZCULL_control::on_draw()
+		{
+			if (m_current_task)
+				m_current_task->num_draws++;
+
+			m_cycles_delay = max_zcull_cycles_delay;
+		}
+
+		void ZCULL_control::write(vm::addr_t sink, u32 timestamp, u32 value)
+		{
+			verify(HERE), sink;
+			vm::ptr<CellGcmReportData> out = sink;
+			out->value = value;
+			out->timer = timestamp;
+			out->padding = 0;
+		}
+
+		void ZCULL_control::sync(::rsx::thread* ptimer)
+		{
+			if (!m_pending_writes.empty())
+			{
+				u32 processed = 0;
+				const bool has_unclaimed = (m_pending_writes.back().sink == 0);
+
+				//Write all claimed reports unconditionally
+				for (auto &writer : m_pending_writes)
+				{
+					if (!writer.sink)
+						break;
+
+					auto query = writer.query;
+					u32 result = m_statistics_map[writer.counter_tag];
+
+					if (query)
+					{
+						verify(HERE), query->pending;
+
+						if (!result && query->num_draws)
+						{
+							get_occlusion_query_result(query);
+
+							if (query->result)
+							{
+								result += query->result;
+								m_statistics_map[writer.counter_tag] = result;
+							}
+						}
+						else
+						{
+							//Already have a hit, no need to retest
+							discard_occlusion_query(query);
+						}
+
+						query->pending = false;
+					}
+
+					if (!writer.forwarder)
+						//No other queries in the chain, write result
+						write(writer.sink, ptimer->timestamp(), result ? UINT16_MAX : 0);
+
+					processed++;
+				}
+
+				if (!has_unclaimed)
+				{
+					verify(HERE), processed == m_pending_writes.size();
+					m_pending_writes.resize(0);
+				}
+				else
+				{
+					auto remaining = m_pending_writes.size() - processed;
+					verify(HERE), remaining > 0;
+
+					if (remaining == 1)
+					{
+						m_pending_writes.front() = m_pending_writes.back();
+						m_pending_writes.resize(1);
+					}
+					else
+					{
+						std::move(m_pending_writes.begin() + processed, m_pending_writes.end(), m_pending_writes.begin());
+						m_pending_writes.resize(remaining);
+					}
+				}
+
+				//Delete all statistics caches but leave the current one
+				for (auto It = m_statistics_map.begin(); It != m_statistics_map.end(); )
+				{
+					if (It->first == m_statistics_tag_id)
+						++It;
+					else
+						It = m_statistics_map.erase(It);
+				}
+			}
+
+			//Critical, since its likely a WAIT_FOR_IDLE type has been processed, all results are considered available
+			m_cycles_delay = 2;
+		}
+
+		void ZCULL_control::update(::rsx::thread* ptimer)
+		{
+			m_tsc++;
+
+			if (m_pending_writes.empty())
+				return;
+
+			u32 stat_tag_to_remove = m_statistics_tag_id;
+			u32 processed = 0;
+			for (auto &writer : m_pending_writes)
+			{
+				if (!writer.sink)
+					break;
+
+				if (writer.counter_tag != stat_tag_to_remove &&
+					stat_tag_to_remove != m_statistics_tag_id)
+				{
+					//If the stat id is different from this stat id and the queue is advancing,
+					//its guaranteed that the previous tag has no remaining writes as the queue is ordered
+					m_statistics_map.erase(stat_tag_to_remove);
+					stat_tag_to_remove = m_statistics_tag_id;
+				}
+
+				auto query = writer.query;
+				u32 result = m_statistics_map[writer.counter_tag];
+
+				if (query)
+				{
+					verify(HERE), query->pending;
+
+					if (UNLIKELY(writer.due_tsc < m_tsc))
+					{
+						if (!result && query->num_draws)
+						{
+							get_occlusion_query_result(query);
+
+							if (query->result)
+							{
+								result += query->result;
+								m_statistics_map[writer.counter_tag] = result;
+							}
+						}
+						else
+						{
+							//No need to read this
+							discard_occlusion_query(query);
+						}
+					}
+					else
+					{
+						if (result || !query->num_draws)
+						{
+							//Not necessary to read the result anymore
+							discard_occlusion_query(query);
+						}
+						else
+						{
+							//Maybe we get lucky and results are ready
+							if (check_occlusion_query_status(query))
+							{
+								get_occlusion_query_result(query);
+								if (query->result)
+								{
+									result += query->result;
+									m_statistics_map[writer.counter_tag] = result;
+								}
+							}
+							else
+							{
+								//Too early; abort
+								break;
+							}
+						}
+					}
+
+					query->pending = false;
+				}
+
+				stat_tag_to_remove = writer.counter_tag;
+
+				//only zpass supported right now
+				if (!writer.forwarder)
+					//No other queries in the chain, write result
+					write(writer.sink, ptimer->timestamp(), result ? UINT16_MAX : 0);
+
+				processed++;
+			}
+
+			if (stat_tag_to_remove != m_statistics_tag_id)
+				m_statistics_map.erase(stat_tag_to_remove);
+
+			if (processed)
+			{
+				auto remaining = m_pending_writes.size() - processed;
+				if (remaining == 1)
+				{
+					m_pending_writes.front() = m_pending_writes.back();
+					m_pending_writes.resize(1);
+				}
+				else if (remaining)
+				{
+					std::move(m_pending_writes.begin() + processed, m_pending_writes.end(), m_pending_writes.begin());
+					m_pending_writes.resize(remaining);
+				}
+				else
+				{
+					m_pending_writes.resize(0);
+				}
+			}
+		}
+	}
 }
diff --git a/rpcs3/Emu/RSX/RSXThread.h b/rpcs3/Emu/RSX/RSXThread.h
index 0f0aded8ba..b82de85e06 100644
--- a/rpcs3/Emu/RSX/RSXThread.h
+++ b/rpcs3/Emu/RSX/RSXThread.h
@@ -157,64 +157,91 @@ namespace rsx
 		std::array<attribute_buffer_placement, 16> attribute_placement;
 	};
 
-	struct zcull_statistics
+	namespace reports
 	{
-		u32 zpass_pixel_cnt;
-		u32 zcull_stats;
-		u32 zcull_stats1;
-		u32 zcull_stats2;
-		u32 zcull_stats3;
-
-		void clear()
+		struct occlusion_query_info
 		{
-			zpass_pixel_cnt = zcull_stats = zcull_stats1 = zcull_stats2 = zcull_stats3 = 0;
-		}
-	};
+			u32 driver_handle;
+			u32 result;
+			u32 num_draws;
+			bool pending;
+			bool active;
+			bool owned;
 
-	struct occlusion_query_info
-	{
-		u32 driver_handle;
-		u32 result;
-		u32 num_draws;
-		bool pending;
-		bool active;
+			u64 sync_timestamp;
+		};
 
-		u64 sync_timestamp;
-		u64 external_flags;
-	};
-
-	struct occlusion_task
-	{
-		std::vector<occlusion_query_info*> task_stack;
-		occlusion_query_info* active_query = nullptr;
-		u32 pending = 0;
-
-		//Add one query to the task
-		void add(occlusion_query_info* query)
+		struct queued_report_write
 		{
-			active_query = query;
+			u32 type = CELL_GCM_ZPASS_PIXEL_CNT;
+			u32 counter_tag;
+			occlusion_query_info* query;
+			queued_report_write* forwarder;
+			vm::addr_t sink;
 
-			if (task_stack.size() > 0 && pending == 0)
-				task_stack.resize(0);
+			u32 due_tsc;
+		};
 
-			const auto empty_slots = task_stack.size() - pending;
-			if (empty_slots >= 4)
-			{
-				for (auto &_query : task_stack)
-				{
-					if (_query == nullptr)
-					{
-						_query = query;
-						pending++;
-						return;
-					}
-				}
-			}
+		struct ZCULL_control
+		{
+			//Delay in 'cycles' before a report update operation is forced to retire
+			//Larger values might give more performance but some engines (UE3) dont seem to wait for results and will flicker
+			//TODO: Determine the real max delay in real hardware
+			const u32 max_zcull_cycles_delay = 10;
 
-			task_stack.push_back(query);
-			pending++;
-		}
-	};
+			//Number of occlusion query slots available. Real hardware actually has far fewer units before choking
+			const u32 occlusion_query_count = 128;
+
+			bool active = false;
+			bool enabled = false;
+
+			std::array<occlusion_query_info, 128> m_occlusion_query_data = {};
+
+			occlusion_query_info* m_current_task = nullptr;
+			u32 m_statistics_tag_id = 0;
+			u32 m_tsc = 0;
+			u32 m_cycles_delay = 10;
+
+			std::vector<queued_report_write> m_pending_writes;
+			std::unordered_map<u32, u32> m_statistics_map;
+
+			ZCULL_control() {}
+			~ZCULL_control() {}
+
+			void set_enabled(class ::rsx::thread* ptimer, bool enabled);
+			void set_active(class ::rsx::thread* ptimer, bool active);
+
+			void write(vm::addr_t sink, u32 timestamp, u32 value);
+
+			//Read current zcull statistics into the address provided
+			void read_report(class ::rsx::thread* ptimer, vm::addr_t sink, u32 type);
+
+			//Sets up a new query slot and sets it to the current task
+			void allocate_new_query(class ::rsx::thread* ptimer);
+
+			//clears current stat block and increments stat_tag_id
+			void clear();
+
+			//forcefully flushes all
+			void sync(class ::rsx::thread* ptimer);
+
+			//call once every 'tick' to update
+			void update(class ::rsx::thread* ptimer);
+
+			//Draw call notification
+			void on_draw();
+
+			//Check for pending writes
+			bool has_pending() const { return (m_pending_writes.size() != 0); }
+
+			//Backend methods (optional, will return everything as always visible by default)
+			virtual void begin_occlusion_query(occlusion_query_info* /*query*/) {}
+			virtual void end_occlusion_query(occlusion_query_info* /*query*/) {}
+			virtual bool check_occlusion_query_status(occlusion_query_info* /*query*/) { return true; }
+			virtual void get_occlusion_query_result(occlusion_query_info* query) { query->result = UINT32_MAX; }
+			virtual void discard_occlusion_query(occlusion_query_info* /*query*/) {}
+		};
+	}
 
 	struct sampled_image_descriptor_base;
 
@@ -236,11 +263,7 @@ namespace rsx
 
 		//occlusion query
 		bool zcull_surface_active = false;
-		zcull_statistics current_zcull_stats;
-
-		const u32 occlusion_query_count = 128;
-		std::array<occlusion_query_info, 128> occlusion_query_data = {};
-		occlusion_task zcull_task_queue = {};
+		std::unique_ptr<reports::ZCULL_control> zcull_ctrl;
 
 		//framebuffer setup
 		rsx::gcm_framebuffer_info m_surface_info[rsx::limits::color_buffers_count];
@@ -382,17 +405,14 @@ namespace rsx
 		virtual void notify_tile_unbound(u32 /*tile*/) {}
 
 		//zcull
-		virtual void notify_zcull_info_changed();
-		virtual void clear_zcull_stats(u32 type);
-		virtual u32 get_zcull_stats(u32 type);
-		virtual void check_zcull_status(bool framebuffer_swap, bool force_read);
-		virtual u32 synchronize_zcull_stats(bool hard_sync = false);
-
-		virtual void begin_occlusion_query(occlusion_query_info* /*query*/) {}
-		virtual void end_occlusion_query(occlusion_query_info* /*query*/) {}
-		virtual bool check_occlusion_query_status(occlusion_query_info* /*query*/) { return true; }
-		virtual void get_occlusion_query_result(occlusion_query_info* query) { query->result = UINT32_MAX; }
+		void notify_zcull_info_changed();
+		void clear_zcull_stats(u32 type);
+		void check_zcull_status(bool framebuffer_swap);
+		void get_zcull_stats(u32 type, vm::addr_t sink);
 
+		//sync
+		void sync();
+		
 		gsl::span<const gsl::byte> get_raw_index_array(const std::vector<std::pair<u32, u32> >& draw_indexed_clause) const;
 		gsl::span<const gsl::byte> get_raw_vertex_buffer(const rsx::data_array_format_info&, u32 base_offset, const std::vector<std::pair<u32, u32>>& vertex_ranges) const;
 
@@ -433,7 +453,7 @@ namespace rsx
 		void write_vertex_data_to_memory(const vertex_input_layout& layout, u32 first_vertex, u32 vertex_count, void *persistent_data, void *volatile_data);
 
 	private:
-		std::mutex m_mtx_task;
+		shared_mutex m_mtx_task;
 
 		struct internal_task_entry
 		{
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
index b4e6ec66ba..acc3466a32 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
@@ -566,7 +566,7 @@ VKGSRender::VKGSRender() : GSRender()
 	//Occlusion
 	m_occlusion_query_pool.create((*m_device), DESCRIPTOR_MAX_DRAW_CALLS); //Enough for 4k draw calls per pass
 	for (int n = 0; n < 128; ++n)
-		occlusion_query_data[n].driver_handle = n;
+		m_occlusion_query_data[n].driver_handle = n;
 
 	//Generate frame contexts
 	VkDescriptorPoolSize uniform_buffer_pool = { VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER , 3 * DESCRIPTOR_MAX_DRAW_CALLS };
@@ -769,7 +769,7 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing)
 {
 	vk::texture_cache::thrashed_set result;
 	{
-		std::lock_guard<std::mutex> lock(m_secondary_cb_guard);
+		std::lock_guard<shared_mutex> lock(m_secondary_cb_guard);
 		result = std::move(m_texture_cache.invalidate_address(address, is_writing, false, m_secondary_command_buffer, m_memory_type_mapping, m_swapchain->get_graphics_queue()));
 	}
 
@@ -777,7 +777,7 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing)
 		return false;
 
 	{
-		std::lock_guard<std::mutex> lock(m_sampler_mutex);
+		std::lock_guard<shared_mutex> lock(m_sampler_mutex);
 		m_samplers_dirty.store(true);
 	}
 
@@ -795,7 +795,7 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing)
 			//Always submit primary cb to ensure state consistency (flush pending changes such as image transitions)
 			vm::temporary_unlock();
 
-			std::lock_guard<std::mutex> lock(m_flush_queue_mutex);
+			std::lock_guard<shared_mutex> lock(m_flush_queue_mutex);
 
 			m_flush_requests.post(sync_timestamp == 0ull);
 			has_queue_ref = true;
@@ -846,13 +846,13 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing)
 
 void VKGSRender::on_notify_memory_unmapped(u32 address_base, u32 size)
 {
-	std::lock_guard<std::mutex> lock(m_secondary_cb_guard);
+	std::lock_guard<shared_mutex> lock(m_secondary_cb_guard);
 	if (m_texture_cache.invalidate_range(address_base, size, true, true, false,
 		m_secondary_command_buffer, m_memory_type_mapping, m_swapchain->get_graphics_queue()).violation_handled)
 	{
 		m_texture_cache.purge_dirty();
 		{
-			std::lock_guard<std::mutex> lock(m_sampler_mutex);
+			std::lock_guard<shared_mutex> lock(m_sampler_mutex);
 			m_samplers_dirty.store(true);
 		}
 	}
@@ -866,7 +866,7 @@ void VKGSRender::notify_tile_unbound(u32 tile)
 	//m_rtts.invalidate_surface_address(addr, false);
 
 	{
-		std::lock_guard<std::mutex> lock(m_sampler_mutex);
+		std::lock_guard<shared_mutex> lock(m_sampler_mutex);
 		m_samplers_dirty.store(true);
 	}
 }
@@ -903,6 +903,7 @@ void VKGSRender::check_heap_status()
 			m_attrib_ring_info.reset_allocation_stats();
 			m_texture_upload_buffer_ring_info.reset_allocation_stats();
 			m_current_frame->reset_heap_ptrs();
+			m_last_heap_sync_time = get_system_time();
 		}
 		else
 		{
@@ -1063,7 +1064,7 @@ void VKGSRender::end()
 	std::chrono::time_point<steady_clock> textures_start = vertex_end;
 	//Load textures
 	{
-		std::lock_guard<std::mutex> lock(m_sampler_mutex);
+		std::lock_guard<shared_mutex> lock(m_sampler_mutex);
 		bool update_framebuffer_sourced = false;
 
 		if (surface_store_tag != m_rtts.cache_tag)
@@ -1356,40 +1357,15 @@ void VKGSRender::end()
 		occlusion_id = m_occlusion_query_pool.find_free_slot();
 		if (occlusion_id == UINT32_MAX)
 		{
-			bool free_slot_found = false;
-			u32 index_to_free = UINT32_MAX;
-			u64 earliest_timestamp = UINT64_MAX;
+			m_tsc += 100;
+			update(this);
 
-			//flush occlusion queries
-			for (auto It : m_occlusion_map)
+			occlusion_id = m_occlusion_query_pool.find_free_slot();
+			if (occlusion_id == UINT32_MAX)
 			{
-				u32 index = It.first;
-				auto query = &occlusion_query_data[index];
-				if (check_occlusion_query_status(query))
-				{
-					free_slot_found = true;
-					get_occlusion_query_result(query);
-					break;
-				}
-
-				if (query->sync_timestamp < earliest_timestamp)
-				{
-					index_to_free = index;
-					earliest_timestamp = query->sync_timestamp;
-				}
+				LOG_ERROR(RSX, "Occlusion pool overflow");
+				if (m_current_task) m_current_task->result = 1;
 			}
-
-			if (free_slot_found)
-			{
-				occlusion_id = m_occlusion_query_pool.find_free_slot();
-			}
-			else
-			{
-				get_occlusion_query_result(&occlusion_query_data[index_to_free]);
-				occlusion_id = m_occlusion_query_pool.find_free_slot();
-			}
-
-			verify(HERE), occlusion_id != UINT32_MAX;
 		}
 	}
 
@@ -1441,7 +1417,7 @@ void VKGSRender::end()
 	const bool is_emulated_restart = (!primitive_emulated && rsx::method_registers.restart_index_enabled() && vk::emulate_primitive_restart() && rsx::method_registers.current_draw_clause.command == rsx::draw_command::indexed);
 	const bool single_draw = !supports_multidraw || (!is_emulated_restart && (rsx::method_registers.current_draw_clause.first_count_commands.size() <= 1 || rsx::method_registers.current_draw_clause.is_disjoint_primitive));
 
-	if (m_occlusion_query_active)
+	if (m_occlusion_query_active && (occlusion_id != UINT32_MAX))
 	{
 		//Begin query
 		m_occlusion_query_pool.begin_query(*m_current_command_buffer, occlusion_id);
@@ -1500,7 +1476,7 @@ void VKGSRender::end()
 		}
 	}
 
-	if (m_occlusion_query_active)
+	if (m_occlusion_query_active && (occlusion_id != UINT32_MAX))
 	{
 		//End query
 		m_occlusion_query_pool.end_query(*m_current_command_buffer, occlusion_id);
@@ -1565,6 +1541,7 @@ void VKGSRender::on_init_thread()
 
 	GSRender::on_init_thread();
 	rsx_thread = std::this_thread::get_id();
+	zcull_ctrl.reset(static_cast<::rsx::reports::ZCULL_control*>(this));
 
 	if (!supports_native_ui)
 	{
@@ -1627,6 +1604,7 @@ void VKGSRender::on_init_thread()
 
 void VKGSRender::on_exit()
 {
+	zcull_ctrl.release();
 	return GSRender::on_exit();
 }
 
@@ -2002,7 +1980,7 @@ void VKGSRender::do_local_task(bool /*idle*/)
 {
 	if (m_flush_requests.pending())
 	{
-		std::lock_guard<std::mutex> lock(m_flush_queue_mutex);
+		std::lock_guard<shared_mutex> lock(m_flush_queue_mutex);
 
 		//TODO: Determine if a hard sync is necessary
 		//Pipeline barriers later may do a better job synchronizing than wholly stalling the pipeline
@@ -2610,8 +2588,8 @@ void VKGSRender::prepare_rtts(rsx::framebuffer_creation_context context)
 		}
 	}
 
-	if ((window_clip_width && window_clip_width != clip_width) ||
-		(window_clip_height && window_clip_height != clip_height))
+	if ((window_clip_width && window_clip_width < clip_width) ||
+		(window_clip_height && window_clip_height < clip_height))
 	{
 		LOG_ERROR(RSX, "Unexpected window clip dimensions: window_clip=%dx%d, surface_clip=%dx%d",
 			window_clip_width, window_clip_height, clip_width, clip_height);
@@ -2818,7 +2796,7 @@ void VKGSRender::prepare_rtts(rsx::framebuffer_creation_context context)
 		m_draw_fbo.reset(new vk::framebuffer_holder(*m_device, current_render_pass, fbo_width, fbo_height, std::move(fbo_images)));
 	}
 
-	check_zcull_status(true, false);
+	check_zcull_status(true);
 }
 
 void VKGSRender::reinitialize_swapchain()
@@ -3194,7 +3172,7 @@ bool VKGSRender::scaled_image_from_memory(rsx::blit_src_info& src, rsx::blit_dst
 	check_heap_status();
 
 	//Stop all parallel operations until this is finished
-	std::lock_guard<std::mutex> lock(m_secondary_cb_guard);
+	std::lock_guard<shared_mutex> lock(m_secondary_cb_guard);
 
 	auto result = m_texture_cache.blit(src, dst, interpolate, m_rtts, *m_current_command_buffer);
 	m_current_command_buffer->begin();
@@ -3240,31 +3218,32 @@ bool VKGSRender::scaled_image_from_memory(rsx::blit_src_info& src, rsx::blit_dst
 	return false;
 }
 
-void VKGSRender::clear_zcull_stats(u32 type)
-{
-	rsx::thread::clear_zcull_stats(type);
-	m_occlusion_map.clear();
-	m_occlusion_query_pool.reset_all(*m_current_command_buffer);
-}
-
-void VKGSRender::begin_occlusion_query(rsx::occlusion_query_info* query)
+void VKGSRender::begin_occlusion_query(rsx::reports::occlusion_query_info* query)
 {
 	query->result = 0;
-	query->sync_timestamp = get_system_time();
+	//query->sync_timestamp = get_system_time();
 	m_active_query_info = query;
 	m_occlusion_query_active = true;
 }
 
-void VKGSRender::end_occlusion_query(rsx::occlusion_query_info* query)
+void VKGSRender::end_occlusion_query(rsx::reports::occlusion_query_info* query)
 {
 	m_occlusion_query_active = false;
 	m_active_query_info = nullptr;
 
-	flush_command_queue();
+	//Avoid stalling later if this query is already tied to a report
+	if (query->num_draws && query->owned && !m_flush_requests.pending())
+	{
+		m_flush_requests.post(false);
+		m_flush_requests.remove_one();
+	}
 }
 
-bool VKGSRender::check_occlusion_query_status(rsx::occlusion_query_info* query)
+bool VKGSRender::check_occlusion_query_status(rsx::reports::occlusion_query_info* query)
 {
+	if (!query->num_draws)
+		return true;
+
 	auto found = m_occlusion_map.find(query->driver_handle);
 	if (found == m_occlusion_map.end())
 		return true;
@@ -3274,16 +3253,26 @@ bool VKGSRender::check_occlusion_query_status(rsx::occlusion_query_info* query)
 		return true;
 
 	if (data.command_buffer_to_wait == m_current_command_buffer)
+	{
+		if (!m_flush_requests.pending())
+		{
+			//Likely to be read at some point in the near future, submit now to avoid stalling later
+			m_flush_requests.post(false);
+			m_flush_requests.remove_one();
+		}
+
 		return false;
+	}
 
 	if (data.command_buffer_to_wait->pending)
+		//Don't bother poking the state, a flush later will likely do it for free
 		return false;
 
 	u32 oldest = data.indices.front();
 	return m_occlusion_query_pool.check_query_status(oldest);
 }
 
-void VKGSRender::get_occlusion_query_result(rsx::occlusion_query_info* query)
+void VKGSRender::get_occlusion_query_result(rsx::reports::occlusion_query_info* query)
 {
 	auto found = m_occlusion_map.find(query->driver_handle);
 	if (found == m_occlusion_map.end())
@@ -3293,20 +3282,32 @@ void VKGSRender::get_occlusion_query_result(rsx::occlusion_query_info* query)
 	if (data.indices.size() == 0)
 		return;
 
-	if (data.command_buffer_to_wait == m_current_command_buffer)
-		flush_command_queue(); //Should hard sync, but this should almost never ever happen
-
-	if (data.command_buffer_to_wait->pending)
-		data.command_buffer_to_wait->wait();
-
-	//Gather data
-	for (const auto occlusion_id : data.indices)
+	if (query->num_draws)
 	{
-		//We only need one hit
-		if (auto value = m_occlusion_query_pool.get_query_result(occlusion_id))
+		if (data.command_buffer_to_wait == m_current_command_buffer)
 		{
-			query->result = 1;
-			break;
+			flush_command_queue();
+
+			//Clear any deferred flush requests from previous call to get_query_status()
+			if (m_flush_requests.pending())
+			{
+				m_flush_requests.clear_pending_flag();
+				m_flush_requests.consumer_wait();
+			}
+		}
+
+		if (data.command_buffer_to_wait->pending)
+			data.command_buffer_to_wait->wait();
+
+		//Gather data
+		for (const auto occlusion_id : data.indices)
+		{
+			//We only need one hit
+			if (auto value = m_occlusion_query_pool.get_query_result(occlusion_id))
+			{
+				query->result = 1;
+				break;
+			}
 		}
 	}
 
@@ -3314,6 +3315,26 @@ void VKGSRender::get_occlusion_query_result(rsx::occlusion_query_info* query)
 	m_occlusion_map.erase(query->driver_handle);
 }
 
+void VKGSRender::discard_occlusion_query(rsx::reports::occlusion_query_info* query)
+{
+	if (m_active_query_info == query)
+	{
+		m_occlusion_query_active = false;
+		m_active_query_info = nullptr;
+	}
+
+	auto found = m_occlusion_map.find(query->driver_handle);
+	if (found == m_occlusion_map.end())
+		return;
+
+	auto &data = found->second;
+	if (data.indices.size() == 0)
+		return;
+
+	m_occlusion_query_pool.reset_queries(*m_current_command_buffer, data.indices);
+	m_occlusion_map.erase(query->driver_handle);
+}
+
 void VKGSRender::shell_do_cleanup()
 {
 	//TODO: Guard this
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h
index 797896733f..2596b71345 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.h
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.h
@@ -38,7 +38,7 @@ namespace vk
 
 //Heap allocation sizes in MB
 //NOTE: Texture uploads can be huge, upto 16MB for a single texture (4096x4096px)
-#define VK_ATTRIB_RING_BUFFER_SIZE_M 256
+#define VK_ATTRIB_RING_BUFFER_SIZE_M 384
 #define VK_TEXTURE_UPLOAD_RING_BUFFER_SIZE_M 256
 #define VK_UBO_RING_BUFFER_SIZE_M 128
 #define VK_INDEX_RING_BUFFER_SIZE_M 64
@@ -55,7 +55,7 @@ struct command_buffer_chunk: public vk::command_buffer
 
 	std::atomic_bool pending = { false };
 	std::atomic<u64> last_sync = { 0 };
-	std::mutex guard_mutex;
+	shared_mutex guard_mutex;
 
 	command_buffer_chunk()
 	{}
@@ -97,7 +97,7 @@ struct command_buffer_chunk: public vk::command_buffer
 	{
 		if (vkGetFenceStatus(m_device, submit_fence) == VK_SUCCESS)
 		{
-			std::lock_guard<std::mutex> lock(guard_mutex);
+			std::lock_guard<shared_mutex> lock(guard_mutex);
 
 			if (pending)
 			{
@@ -111,7 +111,7 @@ struct command_buffer_chunk: public vk::command_buffer
 
 	void wait()
 	{
-		std::lock_guard<std::mutex> lock(guard_mutex);
+		std::lock_guard<shared_mutex> lock(guard_mutex);
 
 		if (!pending)
 			return;
@@ -244,7 +244,7 @@ struct flush_request_task
 	}
 };
 
-class VKGSRender : public GSRender
+class VKGSRender : public GSRender, public ::rsx::reports::ZCULL_control
 {
 private:
 	VKFragmentProgram m_fragment_prog;
@@ -265,7 +265,7 @@ private:
 	std::unique_ptr<vk::depth_scaling_pass> m_depth_scaler;
 	std::unique_ptr<vk::ui_overlay_renderer> m_ui_renderer;
 
-	std::mutex m_sampler_mutex;
+	shared_mutex m_sampler_mutex;
 	u64 surface_store_tag = 0;
 	std::atomic_bool m_samplers_dirty = { true };
 	std::array<std::unique_ptr<rsx::sampled_image_descriptor_base>, rsx::limits::fragment_textures_count> fs_sampler_state = {};
@@ -292,10 +292,10 @@ private:
 	vk::command_pool m_command_buffer_pool;
 	vk::occlusion_query_pool m_occlusion_query_pool;
 	bool m_occlusion_query_active = false;
-	rsx::occlusion_query_info *m_active_query_info = nullptr;
+	rsx::reports::occlusion_query_info *m_active_query_info = nullptr;
 	std::unordered_map<u32, occlusion_data> m_occlusion_map;
 
-	std::mutex m_secondary_cb_guard;
+	shared_mutex m_secondary_cb_guard;
 	vk::command_pool m_secondary_command_buffer_pool;
 	vk::command_buffer m_secondary_command_buffer;  //command buffer used for setup operations
 
@@ -346,7 +346,7 @@ private:
 	bool m_flush_draw_buffers = false;
 	std::atomic<int> m_last_flushable_cb = {-1 };
 	
-	std::mutex m_flush_queue_mutex;
+	shared_mutex m_flush_queue_mutex;
 	flush_request_task m_flush_requests;
 
 	std::thread::id rsx_thread;
@@ -400,11 +400,11 @@ public:
 	void write_buffers();
 	void set_viewport();
 
-	void clear_zcull_stats(u32 type) override;
-	void begin_occlusion_query(rsx::occlusion_query_info* query) override;
-	void end_occlusion_query(rsx::occlusion_query_info* query) override;
-	bool check_occlusion_query_status(rsx::occlusion_query_info* query) override;
-	void get_occlusion_query_result(rsx::occlusion_query_info* query) override;
+	void begin_occlusion_query(rsx::reports::occlusion_query_info* query) override;
+	void end_occlusion_query(rsx::reports::occlusion_query_info* query) override;
+	bool check_occlusion_query_status(rsx::reports::occlusion_query_info* query) override;
+	void get_occlusion_query_result(rsx::reports::occlusion_query_info* query) override;
+	void discard_occlusion_query(rsx::reports::occlusion_query_info* query) override;
 
 protected:
 	void begin() override;
diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.cpp b/rpcs3/Emu/RSX/VK/VKHelpers.cpp
index 4171af71ea..a424604fb7 100644
--- a/rpcs3/Emu/RSX/VK/VKHelpers.cpp
+++ b/rpcs3/Emu/RSX/VK/VKHelpers.cpp
@@ -1,7 +1,7 @@
 #include "stdafx.h"
 #include "VKHelpers.h"
 
-#include <mutex>
+#include "Utilities/mutex.h"
 
 namespace vk
 {
@@ -24,7 +24,7 @@ namespace vk
 	u64 g_num_total_frames = 0;
 
 	//global submit guard to prevent race condition on queue submit
-	std::mutex g_submit_mutex;
+	shared_mutex g_submit_mutex;
 
 	VKAPI_ATTR void* VKAPI_CALL mem_realloc(void* pUserData, void* pOriginal, size_t size, size_t alignment, VkSystemAllocationScope allocationScope)
 	{
diff --git a/rpcs3/Emu/RSX/rsx_methods.cpp b/rpcs3/Emu/RSX/rsx_methods.cpp
index ac115cea5f..3dc556b911 100644
--- a/rpcs3/Emu/RSX/rsx_methods.cpp
+++ b/rpcs3/Emu/RSX/rsx_methods.cpp
@@ -57,6 +57,7 @@ namespace rsx
 	{
 		void set_reference(thread* rsx, u32 _reg, u32 arg)
 		{
+			rsx->sync();
 			rsx->ctrl->ref.exchange(arg);
 		}
 
@@ -112,6 +113,7 @@ namespace rsx
 
 		void semaphore_release(thread* rsx, u32 _reg, u32 arg)
 		{
+			rsx->sync();
 			rsx->sync_point_request = true;
 			const u32 addr = get_address(method_registers.semaphore_offset_406e(), method_registers.semaphore_context_dma_406e());
 
@@ -164,6 +166,8 @@ namespace rsx
 			{
 				//
 			}
+
+			rsx->sync();
 			auto& sema = vm::_ref<RsxReports>(rsx->label_addr);
 			sema.semaphore[index].val = arg;
 			sema.semaphore[index].pad = 0;
@@ -177,8 +181,9 @@ namespace rsx
 			{
 				//
 			}
-			u32 val = (arg & 0xff00ff00) | ((arg & 0xff) << 16) | ((arg >> 16) & 0xff);
 
+			rsx->sync();
+			u32 val = (arg & 0xff00ff00) | ((arg & 0xff) << 16) | ((arg >> 16) & 0xff);
 			auto& sema = vm::_ref<RsxReports>(rsx->label_addr);
 			sema.semaphore[index].val = val;
 			sema.semaphore[index].pad = 0;
@@ -433,16 +438,14 @@ namespace rsx
 			case CELL_GCM_ZCULL_STATS1:
 			case CELL_GCM_ZCULL_STATS2:
 			case CELL_GCM_ZCULL_STATS3:
-				result->value = rsx->get_zcull_stats(type);
-				LOG_WARNING(RSX, "NV4097_GET_REPORT: Unimplemented type %d", type);
+				rsx->get_zcull_stats(type, address_ptr);
 				break;
 			default:
 				LOG_ERROR(RSX, "NV4097_GET_REPORT: Bad type %d", type);
+				result->timer = rsx->timestamp();
+				result->padding = 0;
 				break;
 			}
-
-			result->timer = rsx->timestamp();
-			result->padding = 0;
 		}
 
 		void clear_report_value(thread* rsx, u32 _reg, u32 arg)
@@ -450,10 +453,7 @@ namespace rsx
 			switch (arg)
 			{
 			case CELL_GCM_ZPASS_PIXEL_CNT:
-				LOG_WARNING(RSX, "TODO: NV4097_CLEAR_REPORT_VALUE: ZPASS_PIXEL_CNT");
-				break;
 			case CELL_GCM_ZCULL_STATS:
-				LOG_WARNING(RSX, "TODO: NV4097_CLEAR_REPORT_VALUE: ZCULL_STATS");
 				break;
 			default:
 				LOG_ERROR(RSX, "NV4097_CLEAR_REPORT_VALUE: Bad type: %d", arg);
@@ -492,6 +492,7 @@ namespace rsx
 				return;
 			}
 
+			rsx->sync();
 			vm::ptr<CellGcmReportData> result = address_ptr;
 			rsx->conditional_render_test_failed = (result->value == 0);
 		}
@@ -514,6 +515,11 @@ namespace rsx
 			rsx->notify_zcull_info_changed();
 		}
 
+		void sync(thread* rsx, u32, u32)
+		{
+			rsx->sync();
+		}
+
 		void set_surface_dirty_bit(thread* rsx, u32, u32)
 		{
 			rsx->m_rtts_dirty = true;
@@ -1678,6 +1684,9 @@ namespace rsx
 		bind<NV4097_SET_STENCIL_TEST_ENABLE, nv4097::set_surface_options_dirty_bit>();
 		bind<NV4097_SET_DEPTH_MASK, nv4097::set_surface_options_dirty_bit>();
 		bind<NV4097_SET_COLOR_MASK, nv4097::set_surface_options_dirty_bit>();
+		bind<NV4097_WAIT_FOR_IDLE, nv4097::sync>();
+		bind<NV4097_ZCULL_SYNC, nv4097::sync>();
+		bind<NV4097_SET_CONTEXT_DMA_REPORT, nv4097::sync>();
 
 		//NV308A
 		bind_range<NV308A_COLOR, 1, 256, nv308a::color>();