gl: Move vertex processing to the GPU

- Significant performance gains from greatly reduced CPU work
- Also reorders command submission in end() to improve throughput

- Refactors most of the vertex buffer handling
- All vertex processing is moved GPU-side
kd-11 2017-07-31 14:38:28 +03:00
parent e668ecd453
commit d54c2dd39a
21 changed files with 1025 additions and 1149 deletions
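
The core of this change: attribute layouts are no longer baked into the generated shaders. Instead, the host packs one small descriptor per attribute into a uniform block and the vertex shader decodes the raw byte streams itself. Below is a minimal sketch of host-side packing that would line up with the fetch_desc() unpacking emitted by the GL decompiler later in this diff; the struct and helper names are illustrative, not the commit's actual code:

struct packed_attribute_desc
{
	s32 type;            // rsx::vertex_base_type, consumed by fetch_attribute()
	s32 size;            // component count, 1-4
	s32 starting_offset; // byte offset of the first element in its stream
	s32 flags;           // stride / swap / volatile / frequency, packed below
};

s32 pack_attribute_flags(u8 stride, bool swap_bytes, bool is_volatile, u8 frequency_op, bool modulo, u16 divisor)
{
	// Bit layout mirrors fetch_desc() in the GL vertex decompiler:
	// [0:7] stride, [8] swap_bytes, [9] is_volatile, [10:11] frequency, [12] modulo, [13:28] divisor
	return (stride & 0xFF) |
		(swap_bytes ? (1 << 8) : 0) |
		(is_volatile ? (1 << 9) : 0) |
		((frequency_op & 0x3) << 10) |
		(modulo ? (1 << 12) : 0) |
		(s32(divisor) << 13);
}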

View file

@ -28,7 +28,7 @@ bool vertex_program_compare::operator()(const RSXVertexProgram &binary1, const R
return false;
if (binary1.data.size() != binary2.data.size())
return false;
if (binary1.rsx_vertex_inputs != binary2.rsx_vertex_inputs)
if (!binary1.skip_vertex_input_check && binary1.rsx_vertex_inputs != binary2.rsx_vertex_inputs)
return false;
const qword *instBuffer1 = (const qword*)binary1.data.data();
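
With decoding moved into the shader, the generated GLSL no longer depends on the input layout, so the program cache can treat binaries that differ only in rsx_vertex_inputs as the same program. A sketch of how the GL backend opts in (the flag is set in GLGSRender::load_program() later in this diff):

// Decoding happens in the shader from a generic descriptor block,
// so the cached program stays valid for any input layout
current_vertex_program.skip_vertex_input_check = true;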

View file

@ -89,7 +89,7 @@ void D3D12GSRender::upload_and_bind_scale_offset_matrix(size_t descriptorIndex)
void *mapped_buffer = m_buffer_data.map<void>(CD3DX12_RANGE(heap_offset, heap_offset + 512));
fill_scale_offset_data(mapped_buffer, true);
fill_user_clip_data((char*)mapped_buffer + 64);
fill_fragment_state_buffer((char *)mapped_buffer + 128, m_fragment_program);
fill_fragment_state_buffer((char *)mapped_buffer + 128, current_fragment_program);
m_buffer_data.unmap(CD3DX12_RANGE(heap_offset, heap_offset + 512));
D3D12_CONSTANT_BUFFER_VIEW_DESC constant_buffer_view_desc = {
@ -124,7 +124,7 @@ void D3D12GSRender::upload_and_bind_vertex_shader_constants(size_t descriptor_in
D3D12_CONSTANT_BUFFER_VIEW_DESC D3D12GSRender::upload_fragment_shader_constants()
{
// Get constant from fragment program
size_t buffer_size = m_pso_cache.get_fragment_constants_buffer_size(m_fragment_program);
size_t buffer_size = m_pso_cache.get_fragment_constants_buffer_size(current_fragment_program);
// Multiple of 256, never 0
buffer_size = (buffer_size + 255) & ~255;
@ -132,7 +132,7 @@ D3D12_CONSTANT_BUFFER_VIEW_DESC D3D12GSRender::upload_fragment_shader_constants(
size_t offset = 0;
float *mapped_buffer = m_buffer_data.map<float>(CD3DX12_RANGE(heap_offset, heap_offset + buffer_size));
m_pso_cache.fill_fragment_constants_buffer({ mapped_buffer, ::narrow<int>(buffer_size) }, m_fragment_program);
m_pso_cache.fill_fragment_constants_buffer({ mapped_buffer, ::narrow<int>(buffer_size) }, current_fragment_program);
m_buffer_data.unmap(CD3DX12_RANGE(heap_offset, heap_offset + buffer_size));
return {

View file

@ -329,7 +329,7 @@ void D3D12GSRender::end()
std::chrono::time_point<steady_clock> program_load_end = steady_clock::now();
m_timers.program_load_duration += std::chrono::duration_cast<std::chrono::microseconds>(program_load_end - program_load_start).count();
if (!m_fragment_program.valid)
if (!current_fragment_program.valid)
{
rsx::thread::end();
return;

View file

@ -61,8 +61,6 @@ private:
data_cache m_texture_cache;
bool invalidate_address(u32 addr);
RSXVertexProgram m_vertex_program;
RSXFragmentProgram m_fragment_program;
PipelineStateObjectCache m_pso_cache;
std::tuple<ComPtr<ID3D12PipelineState>, size_t, size_t> m_current_pso;

View file

@ -53,10 +53,10 @@ void D3D12GSRender::load_program()
return std::make_tuple(true, native_pitch);
};
m_vertex_program = get_current_vertex_program();
m_fragment_program = get_current_fragment_program(rtt_lookup_func);
get_current_vertex_program();
get_current_fragment_program(rtt_lookup_func);
if (!m_fragment_program.valid)
if (!current_fragment_program.valid)
return;
D3D12PipelineProperties prop = {};
@ -308,12 +308,12 @@ void D3D12GSRender::load_program()
}
}
m_current_pso = m_pso_cache.getGraphicPipelineState(m_vertex_program, m_fragment_program, prop, m_device.Get(), m_shared_root_signature.Get());
m_current_pso = m_pso_cache.getGraphicPipelineState(current_vertex_program, current_fragment_program, prop, m_device.Get(), m_shared_root_signature.Get());
return;
}
std::pair<std::string, std::string> D3D12GSRender::get_programs() const
{
return std::make_pair(m_pso_cache.get_transform_program(m_vertex_program).content, m_pso_cache.get_shader_program(m_fragment_program).content);
return std::make_pair(m_pso_cache.get_transform_program(current_vertex_program).content, m_pso_cache.get_shader_program(current_fragment_program).content);
}
#endif
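
The D3D12 changes are mechanical: the backend-local m_vertex_program / m_fragment_program copies are dropped in favour of program state shared with the base rsx::thread, which the now-void get_current_*_program() calls fill in place. The shared members presumably look like this (declarations inferred from usage in this diff, not shown in it):

// In rsx::thread (inferred; exact placement and access specifiers assumed)
RSXVertexProgram current_vertex_program;
RSXFragmentProgram current_fragment_program;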

View file

@ -306,18 +306,12 @@ namespace
void GLGSRender::end()
{
std::chrono::time_point<steady_clock> program_start = steady_clock::now();
//Load program here since it is dependent on vertex state
if (skip_frame || !framebuffer_status_valid || (conditional_render_enabled && conditional_render_test_failed) || !load_program())
if (skip_frame || !framebuffer_status_valid || (conditional_render_enabled && conditional_render_test_failed) || !check_program_state())
{
rsx::thread::end();
return;
}
std::chrono::time_point<steady_clock> program_stop = steady_clock::now();
m_begin_time += (u32)std::chrono::duration_cast<std::chrono::microseconds>(program_stop - program_start).count();
if (manually_flush_ring_buffers)
{
//Use approximations to reserve space. This path is mostly for debug purposes anyway
@ -329,6 +323,32 @@ void GLGSRender::end()
m_index_ring_buffer->reserve_storage_on_heap(16 * 1024);
}
//Do vertex upload before RTT prep / texture lookups to give the driver time to push data
u32 vertex_draw_count;
u32 actual_vertex_count;
u32 vertex_base;
std::optional<std::tuple<GLenum, u32> > indexed_draw_info;
std::tie(vertex_draw_count, actual_vertex_count, vertex_base, indexed_draw_info) = set_vertex_buffer();
std::chrono::time_point<steady_clock> program_start = steady_clock::now();
//Load program here since it is dependent on vertex state
load_program(vertex_base, actual_vertex_count);
std::chrono::time_point<steady_clock> program_stop = steady_clock::now();
m_begin_time += (u32)std::chrono::duration_cast<std::chrono::microseconds>(program_stop - program_start).count();
if (manually_flush_ring_buffers)
{
m_attrib_ring_buffer->unmap();
m_index_ring_buffer->unmap();
}
else
{
//DMA push; not needed with MAP_COHERENT
//glMemoryBarrier(GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT);
}
//Check if depth buffer is bound and valid
//If ds is not initialized clear it; it seems new depth textures should have depth cleared
auto copy_rtt_contents = [](gl::render_target *surface)
@ -470,20 +490,6 @@ void GLGSRender::end()
std::chrono::time_point<steady_clock> textures_end = steady_clock::now();
m_textures_upload_time += (u32)std::chrono::duration_cast<std::chrono::microseconds>(textures_end - textures_start).count();
u32 vertex_draw_count = m_last_vertex_count;
std::optional<std::tuple<GLenum, u32> > indexed_draw_info;
bool skip_upload = false;
if (!is_probable_instanced_draw())
{
std::tie(vertex_draw_count, indexed_draw_info) = set_vertex_buffer();
m_last_vertex_count = vertex_draw_count;
}
else
{
skip_upload = true;
}
std::chrono::time_point<steady_clock> draw_start = steady_clock::now();
if (g_cfg.video.debug_output)
@ -491,45 +497,31 @@ void GLGSRender::end()
m_program->validate();
}
if (manually_flush_ring_buffers)
if (indexed_draw_info)
{
m_attrib_ring_buffer->unmap();
m_index_ring_buffer->unmap();
}
const GLenum index_type = std::get<0>(indexed_draw_info.value());
const u32 index_offset = std::get<1>(indexed_draw_info.value());
if (indexed_draw_info || (skip_upload && m_last_draw_indexed == true))
{
if (__glcheck gl_state.enable(rsx::method_registers.restart_index_enabled(), GL_PRIMITIVE_RESTART))
{
GLenum index_type = (skip_upload)? m_last_ib_type: std::get<0>(indexed_draw_info.value());
__glcheck glPrimitiveRestartIndex((index_type == GL_UNSIGNED_SHORT)? 0xffff: 0xffffffff);
}
m_last_draw_indexed = true;
if (!skip_upload)
{
m_last_ib_type = std::get<0>(indexed_draw_info.value());
m_last_index_offset = std::get<1>(indexed_draw_info.value());
}
__glcheck glDrawElements(gl::draw_mode(rsx::method_registers.current_draw_clause.primitive), vertex_draw_count, m_last_ib_type, (GLvoid *)(std::ptrdiff_t)m_last_index_offset);
__glcheck glDrawElements(gl::draw_mode(rsx::method_registers.current_draw_clause.primitive), vertex_draw_count, index_type, (GLvoid *)(uintptr_t)index_offset);
}
else
{
draw_fbo.draw_arrays(rsx::method_registers.current_draw_clause.primitive, vertex_draw_count);
m_last_draw_indexed = false;
glDrawArrays(gl::draw_mode(rsx::method_registers.current_draw_clause.primitive), 0, vertex_draw_count);
}
m_attrib_ring_buffer->notify();
m_index_ring_buffer->notify();
m_scale_offset_buffer->notify();
m_vertex_state_buffer->notify();
m_fragment_constants_buffer->notify();
m_transform_constants_buffer->notify();
std::chrono::time_point<steady_clock> draw_end = steady_clock::now();
m_draw_time += (u32)std::chrono::duration_cast<std::chrono::microseconds>(draw_end - draw_start).count();
m_draw_calls++;
if (zcull_task_queue.active_query &&
@ -537,7 +529,6 @@ void GLGSRender::end()
zcull_task_queue.active_query->num_draws++;
synchronize_buffers();
rsx::thread::end();
}
@ -620,13 +611,21 @@ void GLGSRender::on_init_thread()
const u32 texture_index_offset = rsx::limits::fragment_textures_count + rsx::limits::vertex_textures_count;
for (int index = 0; index < rsx::limits::vertex_count; ++index)
//Array stream buffer
{
auto &tex = m_gl_attrib_buffers[index];
auto &tex = m_gl_persistent_stream_buffer;
tex.create();
tex.set_target(gl::texture::target::textureBuffer);
glActiveTexture(GL_TEXTURE0 + texture_index_offset);
tex.bind();
}
glActiveTexture(GL_TEXTURE0 + texture_index_offset + index);
//Register stream buffer
{
auto &tex = m_gl_volatile_stream_buffer;
tex.create();
tex.set_target(gl::texture::target::textureBuffer);
glActiveTexture(GL_TEXTURE0 + texture_index_offset + 1);
tex.bind();
}
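
Both stream buffers live at fixed texture units directly after the fragment and vertex texture slots, matching the fixed uniform values assigned in GLTraits later in this diff (locations 0 and 1). With rsx::limits::fragment_textures_count = 16 and vertex_textures_count = 4, that is units 20 and 21:

const int stream_buffer_start = rsx::limits::fragment_textures_count + rsx::limits::vertex_textures_count;
// persistent_input_stream -> GL_TEXTURE0 + 20 (uniform location 0)
// volatile_input_stream   -> GL_TEXTURE0 + 21 (uniform location 1)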
@ -645,7 +644,7 @@ void GLGSRender::on_init_thread()
m_attrib_ring_buffer.reset(new gl::legacy_ring_buffer());
m_transform_constants_buffer.reset(new gl::legacy_ring_buffer());
m_fragment_constants_buffer.reset(new gl::legacy_ring_buffer());
m_scale_offset_buffer.reset(new gl::legacy_ring_buffer());
m_vertex_state_buffer.reset(new gl::legacy_ring_buffer());
m_index_ring_buffer.reset(new gl::legacy_ring_buffer());
}
else
@ -653,7 +652,7 @@ void GLGSRender::on_init_thread()
m_attrib_ring_buffer.reset(new gl::ring_buffer());
m_transform_constants_buffer.reset(new gl::ring_buffer());
m_fragment_constants_buffer.reset(new gl::ring_buffer());
m_scale_offset_buffer.reset(new gl::ring_buffer());
m_vertex_state_buffer.reset(new gl::ring_buffer());
m_index_ring_buffer.reset(new gl::ring_buffer());
}
@ -661,7 +660,7 @@ void GLGSRender::on_init_thread()
m_index_ring_buffer->create(gl::buffer::target::element_array, 64 * 0x100000);
m_transform_constants_buffer->create(gl::buffer::target::uniform, 16 * 0x100000);
m_fragment_constants_buffer->create(gl::buffer::target::uniform, 16 * 0x100000);
m_scale_offset_buffer->create(gl::buffer::target::uniform, 16 * 0x100000);
m_vertex_state_buffer->create(gl::buffer::target::uniform, 16 * 0x100000);
m_vao.element_array_buffer = *m_index_ring_buffer;
@ -704,7 +703,7 @@ void GLGSRender::on_init_thread()
void GLGSRender::on_exit()
{
glDisable(GL_VERTEX_PROGRAM_POINT_SIZE);
glFinish();
m_prog_buffer.clear();
@ -728,10 +727,8 @@ void GLGSRender::on_exit()
m_vao.remove();
}
for (gl::texture &tex : m_gl_attrib_buffers)
{
tex.remove();
}
m_gl_persistent_stream_buffer.remove();
m_gl_volatile_stream_buffer.remove();
for (auto &sampler : m_gl_sampler_states)
{
@ -753,9 +750,9 @@ void GLGSRender::on_exit()
m_fragment_constants_buffer->remove();
}
if (m_scale_offset_buffer)
if (m_vertex_state_buffer)
{
m_scale_offset_buffer->remove();
m_vertex_state_buffer->remove();
}
if (m_index_ring_buffer)
@ -865,7 +862,7 @@ bool GLGSRender::do_method(u32 cmd, u32 arg)
return false;
}
bool GLGSRender::load_program()
bool GLGSRender::check_program_state()
{
auto rtt_lookup_func = [this](u32 texaddr, rsx::fragment_texture &tex, bool is_depth) -> std::tuple<bool, u16>
{
@ -887,12 +884,19 @@ bool GLGSRender::load_program()
return std::make_tuple(true, surface->get_native_pitch());
};
RSXFragmentProgram fragment_program = get_current_fragment_program(rtt_lookup_func);
if (!fragment_program.valid) return false;
get_current_fragment_program(rtt_lookup_func);
RSXVertexProgram vertex_program = get_current_vertex_program();
if (current_fragment_program.valid == false)
return false;
u32 unnormalized_rtts = 0;
get_current_vertex_program();
return true;
}
void GLGSRender::load_program(u32 vertex_base, u32 vertex_count)
{
auto &fragment_program = current_fragment_program;
auto &vertex_program = current_vertex_program;
for (auto &vtx : vertex_program.rsx_vertex_inputs)
{
@ -906,12 +910,13 @@ bool GLGSRender::load_program()
}
}
vertex_program.skip_vertex_input_check = true; //not needed for us since decoding is done server side
auto old_program = m_program;
m_program = &m_prog_buffer.getGraphicPipelineState(vertex_program, fragment_program, nullptr);
m_program->use();
u8 *buf;
u32 scale_offset_offset;
u32 vertex_state_offset;
u32 vertex_constants_offset;
u32 fragment_constants_offset;
@ -920,17 +925,20 @@ bool GLGSRender::load_program()
if (manually_flush_ring_buffers)
{
m_scale_offset_buffer->reserve_storage_on_heap(512);
m_vertex_state_buffer->reserve_storage_on_heap(512);
m_fragment_constants_buffer->reserve_storage_on_heap(align(fragment_buffer_size, 256));
if (m_transform_constants_dirty) m_transform_constants_buffer->reserve_storage_on_heap(8192);
}
// Scale offset
auto mapping = m_scale_offset_buffer->alloc_from_heap(512, m_uniform_buffer_offset_align);
// Vertex state
auto mapping = m_vertex_state_buffer->alloc_from_heap(512, m_uniform_buffer_offset_align);
buf = static_cast<u8*>(mapping.first);
scale_offset_offset = mapping.second;
vertex_state_offset = mapping.second;
fill_scale_offset_data(buf, false);
fill_user_clip_data((char *)buf + 64);
fill_user_clip_data(buf + 64);
*(reinterpret_cast<u32*>(buf + 128)) = rsx::method_registers.transform_branch_bits();
*(reinterpret_cast<u32*>(buf + 132)) = vertex_base;
fill_vertex_layout_state(m_vertex_layout, vertex_count, reinterpret_cast<s32*>(buf + 144));
if (m_transform_constants_dirty)
{
@ -939,7 +947,6 @@ bool GLGSRender::load_program()
buf = static_cast<u8*>(mapping.first);
vertex_constants_offset = mapping.second;
fill_vertex_program_constants_data(buf);
*(reinterpret_cast<u32*>(buf + (468 * 4 * sizeof(float)))) = rsx::method_registers.transform_branch_bits();
}
// Fragment constants
@ -952,21 +959,20 @@ bool GLGSRender::load_program()
// Fragment state
fill_fragment_state_buffer(buf+fragment_constants_size, fragment_program);
m_scale_offset_buffer->bind_range(0, scale_offset_offset, 512);
m_vertex_state_buffer->bind_range(0, vertex_state_offset, 512);
m_fragment_constants_buffer->bind_range(2, fragment_constants_offset, fragment_buffer_size);
if (m_transform_constants_dirty) m_transform_constants_buffer->bind_range(1, vertex_constants_offset, 8192);
if (manually_flush_ring_buffers)
{
m_scale_offset_buffer->unmap();
m_vertex_state_buffer->unmap();
m_fragment_constants_buffer->unmap();
if (m_transform_constants_dirty) m_transform_constants_buffer->unmap();
}
m_transform_constants_dirty = false;
return true;
}
void GLGSRender::flip(int buffer)
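
Taken together, the writes above define the 512-byte vertex state block backing the decompiler's std140 VertexContextBuffer shown later in this diff. The offsets are inferred from the fill_* calls; the struct itself is illustrative:

struct vertex_context_layout       // std140, binding = 0
{
	float scale_offset_mat[16];    // bytes 0..63, fill_scale_offset_data()
	s32 user_clip_enabled[2][4];   // bytes 64..95, fill_user_clip_data()
	float user_clip_factor[2][4];  // bytes 96..127, fill_user_clip_data()
	u32 transform_branch_bits;     // byte 128
	u32 vertex_base_index;         // byte 132
	u8 pad[8];                     // bytes 136..143; the next ivec4[] aligns to 16
	s32 input_attributes[16][4];   // bytes 144..399, fill_vertex_layout_state()
};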

View file

@ -50,12 +50,13 @@ private:
gl::texture_cache m_gl_texture_cache;
gl::texture m_gl_attrib_buffers[rsx::limits::vertex_count];
gl::texture m_gl_persistent_stream_buffer;
gl::texture m_gl_volatile_stream_buffer;
std::unique_ptr<gl::ring_buffer> m_attrib_ring_buffer;
std::unique_ptr<gl::ring_buffer> m_fragment_constants_buffer;
std::unique_ptr<gl::ring_buffer> m_transform_constants_buffer;
std::unique_ptr<gl::ring_buffer> m_scale_offset_buffer;
std::unique_ptr<gl::ring_buffer> m_vertex_state_buffer;
std::unique_ptr<gl::ring_buffer> m_index_ring_buffer;
u32 m_draw_calls = 0;
@ -83,11 +84,6 @@ private:
bool flush_draw_buffers = false;
bool m_last_draw_indexed;
GLenum m_last_ib_type;
size_t m_last_index_offset;
u32 m_last_vertex_count;
public:
gl::fbo draw_fbo;
@ -391,13 +387,16 @@ private:
gl_state;
// Return element to draw and in case of indexed draw index type and offset in index buffer
std::tuple<u32, std::optional<std::tuple<GLenum, u32> > > set_vertex_buffer();
std::tuple<u32, u32, u32, std::optional<std::tuple<GLenum, u32> > > set_vertex_buffer();
rsx::vertex_input_layout m_vertex_layout = {};
void clear_surface(u32 arg);
void init_buffers(bool skip_reading = false);
bool check_program_state();
void load_program(u32 vertex_base, u32 vertex_count);
public:
bool load_program();
void init_buffers(bool skip_reading = false);
void read_buffers();
void write_buffers();
void set_viewport();

View file

@ -154,8 +154,17 @@ namespace gl
}
//Workaround for intel drivers which have terrible capability reporting
std::string vendor_string = (const char*)glGetString(GL_VENDOR);
std::transform(vendor_string.begin(), vendor_string.end(), vendor_string.begin(), ::tolower);
std::string vendor_string;
if (const char* raw_string = (const char*)glGetString(GL_VENDOR))
{
vendor_string = raw_string;
std::transform(vendor_string.begin(), vendor_string.end(), vendor_string.begin(), ::tolower);
}
else
{
LOG_ERROR(RSX, "Failed to get vendor string from driver. Are we missing a context?");
vendor_string = "intel"; //lowest acceptable value
}
if (vendor_string.find("intel") != std::string::npos)
{
@ -826,7 +835,7 @@ namespace gl
buffer::create();
glBindBuffer((GLenum)m_target, m_id);
glBufferStorage((GLenum)m_target, size, data, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT);
glBufferStorage((GLenum)m_target, size, data, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_CLIENT_STORAGE_BIT | GL_MAP_COHERENT_BIT);
m_memory_mapping = glMapBufferRange((GLenum)m_target, 0, size, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT);
verify(HERE), m_memory_mapping != nullptr;
@ -2538,7 +2547,7 @@ namespace gl
error_msg = buf.get();
}
throw validation_exception(error_msg);
LOG_ERROR(RSX, "Validation failed: %s", error_msg.c_str());
}
}

View file

@ -54,24 +54,11 @@ struct GLTraits
result.uniforms[location] = (i + rsx::limits::fragment_textures_count);
}
//We use texture buffers for vertex attributes. Bind these here as well
//as they are guaranteed to be fixed (1 to 1 mapping)
std::array<const char*, 16> s_reg_table =
{
"in_pos_buffer", "in_weight_buffer", "in_normal_buffer",
"in_diff_color_buffer", "in_spec_color_buffer",
"in_fog_buffer",
"in_point_size_buffer", "in_7_buffer",
"in_tc0_buffer", "in_tc1_buffer", "in_tc2_buffer", "in_tc3_buffer",
"in_tc4_buffer", "in_tc5_buffer", "in_tc6_buffer", "in_tc7_buffer"
};
const int stream_buffer_start = rsx::limits::fragment_textures_count + rsx::limits::vertex_textures_count;
for (int i = 0; i < rsx::limits::vertex_count; ++i)
{
int location;
if (result.uniforms.has_location(s_reg_table[i], &location))
result.uniforms[location] = (i + rsx::limits::fragment_textures_count + rsx::limits::vertex_textures_count);
}
//Bind locations 0 and 1 to the stream buffers
result.uniforms[0] = stream_buffer_start;
result.uniforms[1] = stream_buffer_start + 1;
LOG_NOTICE(RSX, "*** prog id = %d", result.id());
LOG_NOTICE(RSX, "*** vp id = %d", vertexProgramData.id);

View file

@ -19,130 +19,6 @@ namespace
namespace
{
u32 to_gl_internal_type(rsx::vertex_base_type type, u8 size)
{
/**
* NOTE 1. The buffer texture spec only allows fetches aligned to 8, 16, 32, etc...
* This rules out most 3-component formats, except for the 32-wide RGB32F, RGB32I, RGB32UI
*
* NOTE 2. While s1 & cmp types are signed normalized 16-bit integers, some GPU vendors don't support texture buffer access
* using these formats. Pass a 16 bit unnormalized integer and convert it in the vertex shader
*/
const u32 vec1_types[] = { GL_R16I, GL_R32F, GL_R16F, GL_R8, GL_R16I, GL_RGBA16I, GL_R8UI };
const u32 vec2_types[] = { GL_RG16I, GL_RG32F, GL_RG16F, GL_RG8, GL_RG16I, GL_RGBA16I, GL_RG8UI };
const u32 vec3_types[] = { GL_RGBA16I, GL_RGB32F, GL_RGBA16F, GL_RGBA8, GL_RGBA16I, GL_RGBA16I, GL_RGBA8UI }; //VEC3 COMPONENTS NOT SUPPORTED!
const u32 vec4_types[] = { GL_RGBA16I, GL_RGBA32F, GL_RGBA16F, GL_RGBA8, GL_RGBA16I, GL_RGBA16I, GL_RGBA8UI };
const u32* vec_selectors[] = { 0, vec1_types, vec2_types, vec3_types, vec4_types };
if (type > rsx::vertex_base_type::ub256)
fmt::throw_exception("OpenGL error: unknown vertex base type 0x%x" HERE, (u32)type);
return vec_selectors[size][(int)type];
}
void prepare_buffer_for_writing(void *data, rsx::vertex_base_type type, u8 vertex_size, u32 vertex_count)
{
switch (type)
{
case rsx::vertex_base_type::sf:
{
if (vertex_size == 3)
{
/**
* Pad the 4th component for half-float arrays to 1, since texelfetch does not mask components
*/
u16 *dst = reinterpret_cast<u16*>(data);
for (u32 i = 0, idx = 3; i < vertex_count; ++i, idx += 4)
dst[idx] = 0x3c00;
}
break;
}
}
}
template<typename T, int count>
struct apply_attrib_t;
template<typename T>
struct apply_attrib_t<T, 1>
{
static void func(gl::glsl::program& program, int location, const T* data)
{
program.attribs[location] = data[0];
}
};
template<typename T>
struct apply_attrib_t<T, 2>
{
static void func(gl::glsl::program& program, int location, const T* data)
{
program.attribs[location] = color2_base<T>{ data[0], data[1] };
}
};
template<typename T>
struct apply_attrib_t<T, 3>
{
static void func(gl::glsl::program& program, int location, const T* data)
{
program.attribs[location] = color3_base<T>{ data[0], data[1], data[2] };
}
};
template<typename T>
struct apply_attrib_t<T, 4>
{
static void func(gl::glsl::program& program, int location, const T* data)
{
program.attribs[location] = color4_base<T>{ data[0], data[1], data[2], data[3] };
}
};
template<typename T, int count>
void apply_attrib_array(gl::glsl::program& program, int location, const std::vector<u8>& data)
{
for (size_t offset = 0; offset < data.size(); offset += count * sizeof(T))
{
apply_attrib_t<T, count>::func(program, location, (T*)(data.data() + offset));
}
}
gl::buffer_pointer::type gl_types(rsx::vertex_base_type type)
{
switch (type)
{
case rsx::vertex_base_type::s1: return gl::buffer_pointer::type::s16;
case rsx::vertex_base_type::f: return gl::buffer_pointer::type::f32;
case rsx::vertex_base_type::sf: return gl::buffer_pointer::type::f16;
case rsx::vertex_base_type::ub: return gl::buffer_pointer::type::u8;
case rsx::vertex_base_type::s32k: return gl::buffer_pointer::type::s32;
case rsx::vertex_base_type::cmp: return gl::buffer_pointer::type::s16; // Needs conversion
case rsx::vertex_base_type::ub256: gl::buffer_pointer::type::u8;
}
fmt::throw_exception("unknown vertex type" HERE);
}
bool gl_normalized(rsx::vertex_base_type type)
{
switch (type)
{
case rsx::vertex_base_type::s1:
case rsx::vertex_base_type::ub:
case rsx::vertex_base_type::cmp:
return true;
case rsx::vertex_base_type::f:
case rsx::vertex_base_type::sf:
case rsx::vertex_base_type::ub256:
case rsx::vertex_base_type::s32k:
return false;
}
fmt::throw_exception("unknown vertex type" HERE);
}
// return vertex count if primitive type is not native (empty array otherwise)
std::tuple<u32, u32> get_index_array_for_emulated_non_indexed_draw(const std::vector<std::pair<u32, u32>> &first_count_commands, rsx::primitive_type primitive_mode, gl::ring_buffer &dst)
{
@ -195,85 +71,13 @@ namespace
throw;
}
struct vertex_buffer_visitor
struct vertex_input_state
{
vertex_buffer_visitor(u32 vtx_cnt, gl::ring_buffer& heap, gl::glsl::program* prog, gl::texture* attrib_buffer, u32 min_texbuffer_offset, gl::vertex_cache* vertex_cache)
: vertex_count(vtx_cnt)
, m_attrib_ring_info(heap)
, m_program(prog)
, m_gl_attrib_buffers(attrib_buffer)
, m_min_texbuffer_alignment(min_texbuffer_offset)
, m_vertex_cache(vertex_cache)
{
}
void operator()(const rsx::vertex_array_buffer& vertex_array)
{
int location;
if (!m_program->uniforms.has_location(s_reg_table[vertex_array.index], &location))
return;
GLenum gl_type = to_gl_internal_type(vertex_array.type, vertex_array.attribute_size);
auto& texture = m_gl_attrib_buffers[vertex_array.index];
const u32 element_size = rsx::get_vertex_type_size_on_host(vertex_array.type, vertex_array.attribute_size);
const u32 data_size = vertex_count * element_size;
const uintptr_t local_addr = (uintptr_t)vertex_array.data.data();
u32 buffer_offset = 0;
if (auto uploaded = m_vertex_cache->find_vertex_range(local_addr, gl_type, data_size))
{
buffer_offset = uploaded->offset_in_heap;
}
else
{
// Fill vertex_array
auto mapping = m_attrib_ring_info.alloc_from_heap(data_size, m_min_texbuffer_alignment);
gsl::byte* dst = static_cast<gsl::byte*>(mapping.first);
buffer_offset = mapping.second;
gsl::span<gsl::byte> dest_span(dst, data_size);
m_vertex_cache->store_range(local_addr, gl_type, data_size, buffer_offset);
write_vertex_array_data_to_buffer(dest_span, vertex_array.data, vertex_count, vertex_array.type, vertex_array.attribute_size, vertex_array.stride, rsx::get_vertex_type_size_on_host(vertex_array.type, vertex_array.attribute_size));
prepare_buffer_for_writing(dst, vertex_array.type, vertex_array.attribute_size, vertex_count);
}
texture.copy_from(m_attrib_ring_info, gl_type, buffer_offset, data_size);
}
void operator()(const rsx::vertex_array_register& vertex_register)
{
int location;
if (!m_program->uniforms.has_location(s_reg_table[vertex_register.index], &location))
return;
const u32 element_size = rsx::get_vertex_type_size_on_host(vertex_register.type, vertex_register.attribute_size);
const u32 gl_type = to_gl_internal_type(vertex_register.type, vertex_register.attribute_size);
const u32 data_size = element_size;
auto& texture = m_gl_attrib_buffers[vertex_register.index];
auto mapping = m_attrib_ring_info.alloc_from_heap(data_size, m_min_texbuffer_alignment);
u8 *dst = static_cast<u8*>(mapping.first);
memcpy(dst, vertex_register.data.data(), element_size);
prepare_buffer_for_writing(dst, vertex_register.type, vertex_register.attribute_size, vertex_count);
texture.copy_from(m_attrib_ring_info, gl_type, mapping.second, data_size);
}
void operator()(const rsx::empty_vertex_array& vbo)
{
}
protected:
u32 vertex_count;
gl::ring_buffer& m_attrib_ring_info;
gl::glsl::program* m_program;
gl::texture* m_gl_attrib_buffers;
GLint m_min_texbuffer_alignment;
gl::vertex_cache* m_vertex_cache;
u32 vertex_draw_count;
u32 allocated_vertex_count;
u32 vertex_data_base;
u32 vertex_index_base;
std::optional<std::tuple<GLenum, u32>> index_info;
};
struct draw_command_visitor
@ -281,54 +85,32 @@ namespace
using attribute_storage = std::vector<
std::variant<rsx::vertex_array_buffer, rsx::vertex_array_register, rsx::empty_vertex_array>>;
draw_command_visitor(gl::ring_buffer& index_ring_buffer, gl::ring_buffer& attrib_ring_buffer,
gl::texture* gl_attrib_buffers, gl::glsl::program* program, GLint min_texbuffer_alignment,
gl::vertex_cache* vertex_cache,
std::function<attribute_storage(rsx::rsx_state, std::vector<std::pair<u32, u32>>)> gvb)
draw_command_visitor(gl::ring_buffer& index_ring_buffer, rsx::vertex_input_layout& vertex_layout)
: m_index_ring_buffer(index_ring_buffer)
, m_attrib_ring_buffer(attrib_ring_buffer)
, m_gl_attrib_buffers(gl_attrib_buffers)
, m_program(program)
, m_min_texbuffer_alignment(min_texbuffer_alignment)
, get_vertex_buffers(gvb)
, m_vertex_cache(vertex_cache)
{
for (u8 index = 0; index < rsx::limits::vertex_count; ++index) {
if (rsx::method_registers.vertex_arrays_info[index].size() ||
rsx::method_registers.register_vertex_info[index].size)
{
max_vertex_attrib_size += 16;
}
}
}
, m_vertex_layout(vertex_layout)
{}
std::tuple<u32, std::optional<std::tuple<GLenum, u32>>> operator()(
const rsx::draw_array_command& command)
vertex_input_state operator()(const rsx::draw_array_command& command)
{
u32 vertex_count = rsx::method_registers.current_draw_clause.get_elements_count();
u32 min_index = rsx::method_registers.current_draw_clause.first_count_commands.front().first;
u32 max_index = vertex_count - 1 + min_index;
if (!gl::is_primitive_native(rsx::method_registers.current_draw_clause.primitive)) {
if (!gl::is_primitive_native(rsx::method_registers.current_draw_clause.primitive))
{
u32 index_count;
u32 offset_in_index_buffer;
std::tie(index_count, offset_in_index_buffer) = get_index_array_for_emulated_non_indexed_draw(
rsx::method_registers.current_draw_clause.first_count_commands,
rsx::method_registers.current_draw_clause.primitive, m_index_ring_buffer);
upload_vertex_buffers(min_index, max_index, max_vertex_attrib_size);
return std::make_tuple(index_count,
std::make_tuple(static_cast<GLenum>(GL_UNSIGNED_SHORT), offset_in_index_buffer));
return{ index_count, vertex_count, min_index, 0, std::make_tuple(static_cast<GLenum>(GL_UNSIGNED_SHORT), offset_in_index_buffer) };
}
upload_vertex_buffers(min_index, max_index, max_vertex_attrib_size);
return std::make_tuple(vertex_count, std::optional<std::tuple<GLenum, u32>>());
return{ vertex_count, vertex_count, min_index, 0, std::optional<std::tuple<GLenum, u32>>() };
}
std::tuple<u32, std::optional<std::tuple<GLenum, u32>>> operator()(
const rsx::draw_indexed_array_command& command)
vertex_input_state operator()(const rsx::draw_indexed_array_command& command)
{
u32 min_index = 0, max_index = 0;
@ -338,7 +120,7 @@ namespace
u32 type_size = ::narrow<u32>(get_index_type_size(type));
u32 vertex_count = rsx::method_registers.current_draw_clause.get_elements_count();
const u32 vertex_count = rsx::method_registers.current_draw_clause.get_elements_count();
u32 index_count = vertex_count;
if (!gl::is_primitive_native(rsx::method_registers.current_draw_clause.primitive))
@ -352,129 +134,114 @@ namespace
std::tie(min_index, max_index, index_count) = upload_index_buffer(
command.raw_index_buffer, ptr, type, rsx::method_registers.current_draw_clause.primitive,
rsx::method_registers.current_draw_clause.first_count_commands, vertex_count);
upload_vertex_buffers(0, max_index, max_vertex_attrib_size);
return std::make_tuple(index_count, std::make_tuple(get_index_type(type), offset_in_index_buffer));
//Check for vertex arrays with frequency modifiers
for (auto &block : m_vertex_layout.interleaved_blocks)
{
if (block.min_divisor > 1)
{
//Ignore base offsets and return real results
//The upload function will optimize the uploaded range anyway
return{ index_count, max_index, 0, 0, std::make_tuple(get_index_type(type), offset_in_index_buffer) };
}
}
//Prefer only reading the vertices that are referenced in the index buffer itself
//Offset data source by min_index verts, but also notify the shader to offset the vertexID
return{ index_count, (max_index - min_index + 1), min_index, min_index, std::make_tuple(get_index_type(type), offset_in_index_buffer) };
}
std::tuple<u32, std::optional<std::tuple<GLenum, u32>>> operator()(
const rsx::draw_inlined_array& command)
vertex_input_state operator()(const rsx::draw_inlined_array& command)
{
// We need to go through the array to determine the vertex count, so upload it
u32 vertex_count = upload_inline_array(max_vertex_attrib_size);
u32 vertex_count = (u32)command.inline_vertex_array.size() * sizeof(u32) / m_vertex_layout.interleaved_blocks[0].attribute_stride;
if (!gl::is_primitive_native(rsx::method_registers.current_draw_clause.primitive)) {
if (!gl::is_primitive_native(rsx::method_registers.current_draw_clause.primitive))
{
u32 offset_in_index_buffer;
u32 index_count;
std::tie(index_count, offset_in_index_buffer) = get_index_array_for_emulated_non_indexed_draw(
{ std::make_pair(0, vertex_count) },
rsx::method_registers.current_draw_clause.primitive, m_index_ring_buffer);
return std::make_tuple(index_count,
std::make_tuple(static_cast<GLenum>(GL_UNSIGNED_SHORT), offset_in_index_buffer));
rsx::method_registers.current_draw_clause.primitive, m_index_ring_buffer);
return{ index_count, vertex_count, 0, 0, std::make_tuple(static_cast<GLenum>(GL_UNSIGNED_SHORT), offset_in_index_buffer) };
}
return std::make_tuple(vertex_count, std::optional<std::tuple<GLenum, u32>>());
return{ vertex_count, vertex_count, 0, 0, std::optional<std::tuple<GLenum, u32>>() };
}
private:
u32 max_vertex_attrib_size = 0;
gl::ring_buffer& m_index_ring_buffer;
gl::ring_buffer& m_attrib_ring_buffer;
gl::texture* m_gl_attrib_buffers;
gl::vertex_cache* m_vertex_cache;
gl::glsl::program* m_program;
GLint m_min_texbuffer_alignment;
std::function<attribute_storage(rsx::rsx_state, std::vector<std::pair<u32, u32>>)>
get_vertex_buffers;
void upload_vertex_buffers(u32 min_index, u32 max_index, const u32& max_vertex_attrib_size)
{
u32 verts_allocated = max_index - min_index + 1;
vertex_buffer_visitor visitor(verts_allocated, m_attrib_ring_buffer,
m_program, m_gl_attrib_buffers, m_min_texbuffer_alignment, m_vertex_cache);
const auto& vertex_buffers =
get_vertex_buffers(rsx::method_registers, {{min_index, verts_allocated}});
for (const auto& vbo : vertex_buffers) std::apply_visitor(visitor, vbo);
}
u32 upload_inline_array(const u32& max_vertex_attrib_size)
{
u32 stride = 0;
u32 offsets[rsx::limits::vertex_count] = {0};
for (u32 i = 0; i < rsx::limits::vertex_count; ++i) {
const auto& info = rsx::method_registers.vertex_arrays_info[i];
if (!info.size()) continue;
offsets[i] = stride;
stride += rsx::get_vertex_type_size_on_host(info.type(), info.size());
}
u32 vertex_draw_count =
(u32)(rsx::method_registers.current_draw_clause.inline_vertex_array.size() * sizeof(u32)) /
stride;
for (int index = 0; index < rsx::limits::vertex_count; ++index) {
auto& vertex_info = rsx::method_registers.vertex_arrays_info[index];
int location;
if (!m_program->uniforms.has_location(s_reg_table[index], &location)) continue;
if (!vertex_info.size())
continue;
const u32 element_size =
rsx::get_vertex_type_size_on_host(vertex_info.type(), vertex_info.size());
u32 data_size = element_size * vertex_draw_count;
u32 gl_type = to_gl_internal_type(vertex_info.type(), vertex_info.size());
auto& texture = m_gl_attrib_buffers[index];
u8* src =
reinterpret_cast<u8*>(rsx::method_registers.current_draw_clause.inline_vertex_array.data());
auto mapping = m_attrib_ring_buffer.alloc_from_heap(data_size, m_min_texbuffer_alignment);
u8* dst = static_cast<u8*>(mapping.first);
src += offsets[index];
prepare_buffer_for_writing(dst, vertex_info.type(), vertex_info.size(), vertex_draw_count);
// TODO: properly handle compressed data
for (u32 i = 0; i < vertex_draw_count; ++i) {
if (vertex_info.type() == rsx::vertex_base_type::ub && vertex_info.size() == 4) {
dst[0] = src[3];
dst[1] = src[2];
dst[2] = src[1];
dst[3] = src[0];
}
else
memcpy(dst, src, element_size);
src += stride;
dst += element_size;
}
texture.copy_from(m_attrib_ring_buffer, gl_type, mapping.second, data_size);
}
return vertex_draw_count;
}
rsx::vertex_input_layout& m_vertex_layout;
};
}
std::tuple<u32, std::optional<std::tuple<GLenum, u32>>> GLGSRender::set_vertex_buffer()
std::tuple<u32, u32, u32, std::optional<std::tuple<GLenum, u32>>> GLGSRender::set_vertex_buffer()
{
std::chrono::time_point<steady_clock> then = steady_clock::now();
auto result = std::apply_visitor(draw_command_visitor(*m_index_ring_buffer, *m_attrib_ring_buffer,
m_gl_attrib_buffers, m_program, m_min_texbuffer_alignment,
m_vertex_cache.get(),
[this](const auto& state, const auto& list) {
return this->get_vertex_buffers(state, list, 0);
}),
get_draw_command(rsx::method_registers));
m_vertex_layout = analyse_inputs_interleaved();
//Write index buffers and count verts
auto result = std::apply_visitor(draw_command_visitor(*m_index_ring_buffer, m_vertex_layout), get_draw_command(rsx::method_registers));
auto &vertex_count = result.allocated_vertex_count;
auto &vertex_base = result.vertex_data_base;
//Do actual vertex upload
auto &required = calculate_memory_requirements(m_vertex_layout, vertex_count);
std::pair<void*, u32> persistent_mapping = {}, volatile_mapping = {};
if (required.first > 0)
{
//Check if cacheable
//Only data in the 'persistent' block may be cached
//TODO: make vertex cache keep local data beyond frame boundaries and hook notify command
bool in_cache = false;
bool to_store = false;
u32 storage_address = UINT32_MAX;
if (m_vertex_layout.interleaved_blocks.size() == 1 &&
rsx::method_registers.current_draw_clause.command != rsx::draw_command::inlined_array)
{
storage_address = m_vertex_layout.interleaved_blocks[0].real_offset_address + vertex_base;
if (auto cached = m_vertex_cache->find_vertex_range(storage_address, GL_R8UI, required.first))
{
in_cache = true;
m_gl_persistent_stream_buffer.copy_from(*m_attrib_ring_buffer, GL_R8UI, cached->offset_in_heap, required.first);
}
else
{
to_store = true;
}
}
if (!in_cache)
{
persistent_mapping = m_attrib_ring_buffer->alloc_from_heap(required.first, m_min_texbuffer_alignment);
m_gl_persistent_stream_buffer.copy_from(*m_attrib_ring_buffer, GL_R8UI, persistent_mapping.second, required.first);
if (to_store)
{
//store ref in vertex cache
m_vertex_cache->store_range(storage_address, GL_R8UI, required.first, persistent_mapping.second);
}
}
}
if (required.second > 0)
{
volatile_mapping = m_attrib_ring_buffer->alloc_from_heap(required.second, m_min_texbuffer_alignment);
m_gl_volatile_stream_buffer.copy_from(*m_attrib_ring_buffer, GL_R8UI, volatile_mapping.second, required.second);
}
//Write all the data
write_vertex_data_to_memory(m_vertex_layout, vertex_base, vertex_count, persistent_mapping.first, volatile_mapping.first);
std::chrono::time_point<steady_clock> now = steady_clock::now();
m_vertex_upload_time += std::chrono::duration_cast<std::chrono::microseconds>(now - then).count();
return result;
return std::make_tuple(result.vertex_draw_count, result.allocated_vertex_count, result.vertex_index_base, result.index_info);
}
namespace
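
For reference, the four values handed back to GLGSRender::end() unpack as follows (names taken from vertex_input_state above; the comments paraphrase how each value is used in this diff):

u32 vertex_draw_count;      // count passed to glDrawArrays/glDrawElements (index count when indexed)
u32 allocated_vertex_count; // vertices actually written to the attribute streams
u32 vertex_index_base;      // first vertex sourced; the shader subtracts this from gl_VertexID
std::optional<std::tuple<GLenum, u32>> index_info; // index type and heap offset for indexed draws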

View file

@ -31,57 +31,21 @@ std::string GLVertexDecompilerThread::compareFunction(COMPARE f, const std::stri
void GLVertexDecompilerThread::insertHeader(std::stringstream &OS)
{
OS << "#version 430\n\n";
OS << "layout(std140, binding = 0) uniform ScaleOffsetBuffer\n";
OS << "layout(std140, binding = 0) uniform VertexContextBuffer\n";
OS << "{\n";
OS << " mat4 scaleOffsetMat;\n";
OS << " ivec4 userClipEnabled[2];\n";
OS << " vec4 userClipFactor[2];\n";
OS << "};\n";
OS << " mat4 scale_offset_mat;\n";
OS << " ivec4 user_clip_enabled[2];\n";
OS << " vec4 user_clip_factor[2];\n";
OS << " uint transform_branch_bits;\n";
OS << " uint vertex_base_index;\n";
OS << " ivec4 input_attributes[16];\n";
OS << "};\n\n";
}
void GLVertexDecompilerThread::insertInputs(std::stringstream & OS, const std::vector<ParamType>& inputs)
{
std::vector<std::tuple<size_t, std::string>> input_data;
for (const ParamType &PT : inputs)
{
for (const ParamItem &PI : PT.items)
{
input_data.push_back(std::make_tuple(PI.location, PI.name));
}
}
/**
* It is important that the locations are in the order that vertex attributes are expected.
* If order is not adhered to, channels may be swapped leading to corruption
*/
std::sort(input_data.begin(), input_data.end());
int location = 1;
for (const std::tuple<size_t, std::string>& item : input_data)
{
for (const ParamType &PT : inputs)
{
for (const ParamItem &PI : PT.items)
{
if (PI.name == std::get<1>(item))
{
bool is_int = false;
for (const auto &attrib : rsx_vertex_program.rsx_vertex_inputs)
{
if (attrib.location == std::get<0>(item))
{
if (attrib.int_type || attrib.flags & GL_VP_SINT_MASK) is_int = true;
break;
}
}
std::string samplerType = is_int ? "isamplerBuffer" : "samplerBuffer";
OS << "layout(location=" << location++ << ")" << " uniform " << samplerType << " " << PI.name << "_buffer;\n";
}
}
}
}
OS << "layout(location=0) uniform usamplerBuffer persistent_input_stream;\n"; //Data stream with persistent vertex data (cacheable)
OS << "layout(location=1) uniform usamplerBuffer volatile_input_stream;\n"; //Data stream with per-draw data (registers and immediate draw data)
}
void GLVertexDecompilerThread::insertConstants(std::stringstream & OS, const std::vector<ParamType> & constants)
@ -89,7 +53,6 @@ void GLVertexDecompilerThread::insertConstants(std::stringstream & OS, const std
OS << "layout(std140, binding = 1) uniform VertexConstantsBuffer\n";
OS << "{\n";
OS << " vec4 vc[468];\n";
OS << " uint transform_branch_bits;\n";
OS << "};\n\n";
for (const ParamType &PT: constants)
@ -115,13 +78,13 @@ static const vertex_reg_info reg_table[] =
//Fog output shares a data source register with clip planes 0-2 so only declare when specified
{ "fog_c", true, "dst_reg5", ".xxxx", true, "", "", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_FOG },
//Warning: Always define all 3 clip plane groups together to avoid flickering with OpenGL
{ "gl_ClipDistance[0]", false, "dst_reg5", ".y * userClipFactor[0].x", false, "userClipEnabled[0].x > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC0 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC1 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC2 },
{ "gl_ClipDistance[1]", false, "dst_reg5", ".z * userClipFactor[0].y", false, "userClipEnabled[0].y > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC0 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC1 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC2 },
{ "gl_ClipDistance[2]", false, "dst_reg5", ".w * userClipFactor[0].z", false, "userClipEnabled[0].z > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC0 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC1 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC2 },
{ "gl_ClipDistance[0]", false, "dst_reg5", ".y * user_clip_factor[0].x", false, "user_clip_enabled[0].x > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC0 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC1 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC2 },
{ "gl_ClipDistance[1]", false, "dst_reg5", ".z * user_clip_factor[0].y", false, "user_clip_enabled[0].y > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC0 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC1 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC2 },
{ "gl_ClipDistance[2]", false, "dst_reg5", ".w * user_clip_factor[0].z", false, "user_clip_enabled[0].z > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC0 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC1 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC2 },
{ "gl_PointSize", false, "dst_reg6", ".x", false },
{ "gl_ClipDistance[3]", false, "dst_reg6", ".y * userClipFactor[0].w", false, "userClipEnabled[0].w > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC3 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC4 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC5 },
{ "gl_ClipDistance[4]", false, "dst_reg6", ".z * userClipFactor[1].x", false, "userClipEnabled[1].x > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC3 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC4 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC5 },
{ "gl_ClipDistance[5]", false, "dst_reg6", ".w * userClipFactor[1].y", false, "userClipEnabled[1].y > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC3 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC4 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC5 },
{ "gl_ClipDistance[3]", false, "dst_reg6", ".y * user_clip_factor[0].w", false, "user_clip_enabled[0].w > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC3 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC4 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC5 },
{ "gl_ClipDistance[4]", false, "dst_reg6", ".z * user_clip_factor[1].x", false, "user_clip_enabled[1].x > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC3 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC4 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC5 },
{ "gl_ClipDistance[5]", false, "dst_reg6", ".w * user_clip_factor[1].y", false, "user_clip_enabled[1].y > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC3 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC4 | CELL_GCM_ATTRIB_OUTPUT_MASK_UC5 },
{ "tc0", true, "dst_reg7", "", false, "", "", "", false, CELL_GCM_ATTRIB_OUTPUT_MASK_TEX0 },
{ "tc1", true, "dst_reg8", "", false, "", "", "", false, CELL_GCM_ATTRIB_OUTPUT_MASK_TEX1 },
{ "tc2", true, "dst_reg9", "", false, "", "", "", false, CELL_GCM_ATTRIB_OUTPUT_MASK_TEX2 },
@ -206,57 +169,175 @@ namespace
}
}
void add_input(std::stringstream & OS, const ParamItem &PI, const std::vector<rsx_vertex_input> &inputs)
void insert_vertex_input_fetch(std::stringstream& OS)
{
for (const auto &real_input : inputs)
{
if (real_input.location != PI.location)
continue;
//Actually decode a vertex attribute from a raw byte stream
OS << "struct attribute_desc\n";
OS << "{\n";
OS << " int type;\n";
OS << " int attribute_size;\n";
OS << " int starting_offset;\n";
OS << " int stride;\n";
OS << " int swap_bytes;\n";
OS << " int is_volatile;\n";
OS << " int frequency;\n";
OS << " int divisor;\n";
OS << " int modulo;\n";
OS << "};\n\n";
std::string vecType = " vec4 ";
if (real_input.int_type)
vecType = " ivec4 ";
OS << "uint get_bits(uvec4 v, int swap)\n";
OS << "{\n";
OS << " if (swap != 0) return (v.w | v.z << 8 | v.y << 16 | v.x << 24);\n";
OS << " return (v.x | v.y << 8 | v.z << 16 | v.w << 24);\n";
OS << "}\n\n";
std::string scale = "";
if (real_input.flags & GL_VP_SINT_MASK)
{
if (real_input.flags & GL_VP_ATTRIB_S16_INT)
scale = " / " + expand_to_vec4("32767.", real_input.size);
else
scale = " / " + expand_to_vec4("2147483647.", real_input.size);
}
OS << "uint get_bits(uvec2 v, int swap)\n";
OS << "{\n";
OS << " if (swap != 0) return (v.y | v.x << 8);\n";
OS << " return (v.x | v.y << 8);\n";
OS << "}\n\n";
if (!real_input.is_array)
{
OS << vecType << PI.name << " = texelFetch(" << PI.name << "_buffer, 0)" << scale << ";\n";
return;
}
OS << "int preserve_sign_s16(uint bits)\n";
OS << "{\n";
OS << " //convert raw 16 bit value into signed 32-bit integer counterpart\n";
OS << " uint sign = bits & 0x8000;\n";
OS << " if (sign != 0) return int(bits | 0xFFFF0000);\n";
OS << " return int(bits);\n";
OS << "}\n\n";
if (real_input.frequency > 1)
{
if (real_input.is_modulo)
{
OS << vecType << PI.name << "= texelFetch(" << PI.name << "_buffer, gl_VertexID %" << real_input.frequency << ")" << scale << ";\n";
return;
}
OS << "float convert_to_f32(uint bits)\n";
OS << "{\n";
OS << " uint sign = (bits >> 31) & 1;\n";
OS << " uint exp = (bits >> 23) & 0xff;\n";
OS << " uint mantissa = bits & 0x7fffff;\n";
OS << " float base = (sign != 0)? -1.f: 1.f;\n";
OS << " base *= exp2(exp - 127);\n";
OS << " float scale = 0.f;\n\n";
OS << " for (int x = 0; x < 23; x++)\n";
OS << " {\n";
OS << " int inv = (22 - x);\n";
OS << " if ((mantissa & (1 << inv)) == 0) continue;\n";
OS << " scale += 1.f / pow(2.f, float(inv));\n";
OS << " }\n";
OS << " return base * scale;\n";
OS << "}\n";
OS << vecType << PI.name << "= texelFetch(" << PI.name << "_buffer, gl_VertexID /" << real_input.frequency << ")" << scale << ";\n";
return;
}
OS << "#define get_s16(v, s) preserve_sign_s16(get_bits(v, s))\n\n";
OS << vecType << PI.name << "= texelFetch(" << PI.name << "_buffer, gl_VertexID)" << scale << ";\n";
return;
}
OS << "vec4 fetch_attribute(attribute_desc desc, int vertex_id, usamplerBuffer input_stream)\n";
OS << "{\n";
OS << " vec4 result = vec4(0., 0., 0., 1.);\n";
OS << " vec4 scale = vec4(1.);\n";
OS << " uvec4 tmp;\n";
OS << " uint bits;\n";
OS << " bool reverse_order = false;\n";
OS << "\n";
OS << " int first_byte = (vertex_id * desc.stride) + desc.starting_offset;\n";
OS << " for (int n = 0; n < desc.attribute_size; n++)\n";
OS << " {\n";
OS << " switch (desc.type)\n";
OS << " {\n";
OS << " case 0:\n";
OS << " //signed normalized 16-bit\n";
OS << " tmp[0] = texelFetch(input_stream, first_byte++).x;\n";
OS << " tmp[1] = texelFetch(input_stream, first_byte++).x;\n";
OS << " result[n] = get_s16(tmp.xy, desc.swap_bytes);\n";
OS << " scale[n] = 32767.;\n";
OS << " break;\n";
OS << " case 1:\n";
OS << " //float\n";
OS << " tmp[0] = texelFetch(input_stream, first_byte++).x;\n";
OS << " tmp[1] = texelFetch(input_stream, first_byte++).x;\n";
OS << " tmp[2] = texelFetch(input_stream, first_byte++).x;\n";
OS << " tmp[3] = texelFetch(input_stream, first_byte++).x;\n";
OS << " result[n] = uintBitsToFloat(get_bits(tmp, desc.swap_bytes));\n";
OS << " break;\n";
OS << " case 2:\n";
OS << " //half\n";
OS << " tmp[0] = texelFetch(input_stream, first_byte++).x;\n";
OS << " tmp[1] = texelFetch(input_stream, first_byte++).x;\n";
OS << " result[n] = unpackHalf2x16(uint(get_bits(tmp.xy, desc.swap_bytes))).x;\n";
OS << " break;\n";
OS << " case 3:\n";
OS << " //unsigned byte\n";
OS << " result[n] = texelFetch(input_stream, first_byte++).x;\n";
OS << " scale[n] = 255.;\n";
OS << " reverse_order = (desc.swap_bytes != 0);\n";
OS << " break;\n";
OS << " case 4:\n";
OS << " //signed word\n";
OS << " tmp[0] = texelFetch(input_stream, first_byte++).x;\n";
OS << " tmp[1] = texelFetch(input_stream, first_byte++).x;\n";
OS << " result[n] = get_s16(tmp.xy, desc.swap_bytes);\n";
OS << " break;\n";
OS << " case 5:\n";
OS << " //cmp\n";
OS << " tmp[0] = texelFetch(input_stream, first_byte++).x;\n";
OS << " tmp[1] = texelFetch(input_stream, first_byte++).x;\n";
OS << " tmp[2] = texelFetch(input_stream, first_byte++).x;\n";
OS << " tmp[3] = texelFetch(input_stream, first_byte++).x;\n";
OS << " bits = get_bits(tmp, desc.swap_bytes);\n";
OS << " result.x = preserve_sign_s16((bits & 0x7FF) << 5);\n";
OS << " result.y = preserve_sign_s16(((bits >> 11) & 0x7FF) << 5);\n";
OS << " result.z = preserve_sign_s16(((bits >> 22) & 0x3FF) << 6);\n";
OS << " result.w = 1.;\n";
OS << " scale = vec4(32767., 32767., 32767., 1.);\n";
OS << " break;\n";
OS << " case 6:\n";
OS << " //ub256\n";
OS << " result[n] = float(texelFetch(input_stream, first_byte++).x);\n";
OS << " reverse_order = (desc.swap_bytes != 0);\n";
OS << " break;\n";
OS << " }\n";
OS << " }\n\n";
OS << " result /= scale;\n";
OS << " return (reverse_order)? result.wzyx: result;\n";
OS << "}\n\n";
LOG_WARNING(RSX, "Vertex input %s does not have a matching vertex_input declaration", PI.name.c_str());
OS << "attribute_desc fetch_desc(int location)\n";
OS << "{\n";
OS << " attribute_desc result;\n";
OS << " int attribute_flags = input_attributes[location].w;\n";
OS << " result.type = input_attributes[location].x;\n";
OS << " result.attribute_size = input_attributes[location].y;\n";
OS << " result.starting_offset = input_attributes[location].z;\n";
OS << " result.stride = attribute_flags & 0xFF;\n";
OS << " result.swap_bytes = (attribute_flags >> 8) & 0x1;\n";
OS << " result.is_volatile = (attribute_flags >> 9) & 0x1;\n";
OS << " result.frequency = (attribute_flags >> 10) & 0x3;\n";
OS << " result.modulo = (attribute_flags >> 12) & 0x1;\n";
OS << " result.divisor = (attribute_flags >> 13) & 0xFFFF;\n";
OS << " return result;\n";
OS << "}\n\n";
OS << " vec4 " << PI.name << "= texelFetch(" << PI.name << "_buffer, gl_VertexID);\n";
OS << "vec4 read_location(int location)\n";
OS << "{\n";
OS << " attribute_desc desc = fetch_desc(location);\n";
OS << "\n";
OS << " int vertex_id = gl_VertexID - int(vertex_base_index);\n";
OS << " if (desc.frequency == 0)\n";
OS << " vertex_id = 0;\n";
OS << " else if (desc.frequency > 1)\n";
OS << " {\n";
OS << " //if a vertex modifier is active; vertex_base must be 0 and is ignored\n";
OS << " if (desc.modulo != 0)\n";
OS << " vertex_id = gl_VertexID % desc.divisor;\n";
OS << " else\n";
OS << " vertex_id = gl_VertexID / desc.divisor;\n";
OS << " }\n";
OS << "\n";
OS << " if (desc.is_volatile != 0)\n";
OS << " return fetch_attribute(desc, vertex_id, volatile_input_stream);\n";
OS << " else\n";
OS << " return fetch_attribute(desc, vertex_id, persistent_input_stream);\n";
OS << "}\n\n";
}
}
void GLVertexDecompilerThread::insertMainStart(std::stringstream & OS)
{
insert_glsl_legacy_function(OS, gl::glsl::glsl_vertex_program);
insert_vertex_input_fetch(OS);
std::string parameters = "";
for (int i = 0; i < 16; ++i)
@ -293,7 +374,9 @@ void GLVertexDecompilerThread::insertMainStart(std::stringstream & OS)
for (const ParamType &PT : m_parr.params[PF_PARAM_IN])
{
for (const ParamItem &PI : PT.items)
add_input(OS, PI, rsx_vertex_program.rsx_vertex_inputs);
{
OS << " vec4 " << PI.name << "= read_location(" << std::to_string(PI.location) << ");\n";
}
}
for (const ParamType &PT : m_parr.params[PF_PARAM_UNIFORM])
@ -401,7 +484,7 @@ void GLVertexDecompilerThread::insertMainEnd(std::stringstream & OS)
if (m_parr.HasParam(PF_PARAM_NONE, "vec4", "dst_reg2"))
OS << " front_spec_color = dst_reg2;\n";
OS << " gl_Position = gl_Position * scaleOffsetMat;\n";
OS << " gl_Position = gl_Position * scale_offset_mat;\n";
//Since our clip_space is symmetrical [-1, 1] we map it to linear space using the eqn:
//ln = (clip * 2) - 1 to fully utilize the 0-1 range of the depth buffer

View file

@ -30,6 +30,7 @@ protected:
virtual void insertMainEnd(std::stringstream &OS) override;
const RSXVertexProgram &rsx_vertex_program;
std::unordered_map<std::string, int> input_locations;
public:
GLVertexDecompilerThread(const RSXVertexProgram &prog, std::string& shader, ParamArray&)
: VertexProgramDecompiler(prog)
@ -50,6 +51,7 @@ public:
ParamArray parr;
u32 id = 0;
std::string shader;
bool interleaved;
void Decompile(const RSXVertexProgram& prog);
void Compile();

View file

@ -125,7 +125,7 @@ namespace rsx
return sizeof(u8) * 4;
}
fmt::throw_exception("Wrong vector size" HERE);
case vertex_base_type::cmp: return sizeof(u16) * 4;
case vertex_base_type::cmp: return 4;
case vertex_base_type::ub256: verify(HERE), (size == 4); return sizeof(u8) * 4;
}
fmt::throw_exception("RSXVertexData::GetTypeSize: Bad vertex data type (%d)!" HERE, (u8)type);
@ -309,32 +309,6 @@ namespace rsx
return (u32)element_push_buffer.size();
}
bool thread::is_probable_instanced_draw()
{
if (!g_cfg.video.batch_instanced_geometry)
return false;
//If the array registers have not been touched, the index array has also not been touched via notify or via register set; it's likely an instanced draw
//gcm lib will set the registers once and then call the same draw command over and over with different transform params to achieve this
if (m_index_buffer_changed || m_vertex_attribs_changed)
return false;
auto& draw_clause = rsx::method_registers.current_draw_clause;
if (draw_clause.command != m_last_command)
return false;
if (draw_clause.command != rsx::draw_command::inlined_array)
{
if (draw_clause.first_count_commands.back().second != m_last_first_count.second ||
draw_clause.first_count_commands.front().first != m_last_first_count.first)
return false;
}
else if (m_last_first_count.second != draw_clause.inline_vertex_array.size())
return false;
return true;
}
void thread::end()
{
rsx::method_registers.transform_constants.clear();
@ -355,17 +329,6 @@ namespace rsx
u32 element_count = rsx::method_registers.current_draw_clause.get_elements_count();
capture_frame("Draw " + rsx::to_string(rsx::method_registers.current_draw_clause.primitive) + std::to_string(element_count));
}
auto& clause = rsx::method_registers.current_draw_clause;
m_last_command = clause.command;
if (m_last_command == rsx::draw_command::inlined_array)
m_last_first_count = std::make_pair(0, (u32)clause.inline_vertex_array.size());
else
m_last_first_count = std::make_pair(clause.first_count_commands.front().first, clause.first_count_commands.back().second);
m_index_buffer_changed = false;
m_vertex_attribs_changed = false;
}
void thread::on_task()
@ -543,20 +506,6 @@ namespace rsx
m_vblank_thread->join();
m_vblank_thread.reset();
}
if (m_vertex_streaming_task.available_threads > 0)
{
for (auto &task : m_vertex_streaming_task.worker_threads)
{
if (!task.worker_thread)
break;
task.worker_thread->join();
task.worker_thread.reset();
}
m_vertex_streaming_task.available_threads = 0;
}
}
std::string thread::get_name() const
@ -920,9 +869,10 @@ namespace rsx
return rsx::get_address(offset_zeta, m_context_dma_z);
}
RSXVertexProgram thread::get_current_vertex_program() const
void thread::get_current_vertex_program()
{
RSXVertexProgram result = {};
auto &result = current_vertex_program = {};
const u32 transform_program_start = rsx::method_registers.transform_program_start();
result.data.reserve((512 - transform_program_start) * 4);
result.rsx_vertex_inputs.reserve(rsx::limits::vertex_count);
@ -980,16 +930,144 @@ namespace rsx
is_int_type(rsx::method_registers.vertex_arrays_info[index].type()), 0});
}
}
}
vertex_input_layout thread::analyse_inputs_interleaved() const
{
const rsx_state& state = rsx::method_registers;
const u32 input_mask = state.vertex_attrib_input_mask();
if (state.current_draw_clause.command == rsx::draw_command::inlined_array)
{
vertex_input_layout result = {};
interleaved_range_info info = {};
info.interleaved = true;
info.locations.reserve(8);
for (u8 index = 0; index < rsx::limits::vertex_count; ++index)
{
const u32 mask = (1u << index);
auto &vinfo = state.vertex_arrays_info[index];
if (vinfo.size() > 0)
{
info.locations.push_back(index);
info.attribute_stride += rsx::get_vertex_type_size_on_host(vinfo.type(), vinfo.size());
result.attribute_placement[index] = attribute_buffer_placement::transient;
}
}
result.interleaved_blocks.push_back(info);
return result;
}
const u32 frequency_divider_mask = rsx::method_registers.frequency_divider_operation_mask();
vertex_input_layout result = {};
for (u8 index = 0; index < rsx::limits::vertex_count; ++index)
{
const bool enabled = !!(input_mask & (1 << index));
if (!enabled)
continue;
if (vertex_push_buffers[index].size > 0)
{
std::pair<u8, u32> volatile_range_info = std::make_pair(index, vertex_push_buffers[index].data.size() * (u32)sizeof(u32));
result.volatile_blocks.push_back(volatile_range_info);
result.attribute_placement[index] = attribute_buffer_placement::transient;
continue;
}
//Check for interleaving
auto &info = state.vertex_arrays_info[index];
if (info.size() == 0 && state.register_vertex_info[index].size > 0)
{
//Reads from register
result.referenced_registers.push_back(index);
result.attribute_placement[index] = attribute_buffer_placement::transient;
continue;
}
if (info.size() > 0)
{
result.attribute_placement[index] = attribute_buffer_placement::persistent;
const u32 base_address = info.offset() & 0x7fffffff;
bool alloc_new_block = true;
for (auto &block : result.interleaved_blocks)
{
if (block.attribute_stride != info.stride())
{
//Stride does not match, continue
continue;
}
if (base_address > block.base_offset)
{
const u32 diff = base_address - block.base_offset;
if (diff > info.stride())
{
//Not interleaved, continue
continue;
}
}
else
{
const u32 diff = block.base_offset - base_address;
if (diff > info.stride())
{
//Not interleaved, continue
continue;
}
//Matches, and this address is lower than the existing block base
block.base_offset = base_address;
}
alloc_new_block = false;
block.locations.push_back(index);
block.interleaved = true;
block.min_divisor = std::min(block.min_divisor, info.frequency());
if (block.all_modulus)
block.all_modulus = !!(frequency_divider_mask & (1 << index));
break;
}
if (alloc_new_block)
{
interleaved_range_info block = {};
block.base_offset = base_address;
block.attribute_stride = info.stride();
block.memory_location = info.offset() >> 31;
block.locations.reserve(4);
block.locations.push_back(index);
block.min_divisor = info.frequency();
block.all_modulus = !!(frequency_divider_mask & (1 << index));
result.interleaved_blocks.push_back(block);
}
}
}
for (auto &info : result.interleaved_blocks)
{
//Calculate real data address to be used during upload
info.real_offset_address = state.vertex_data_base_offset() + rsx::get_address(info.base_offset, info.memory_location);
}
return result;
}
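For clarity, here is a self-contained sketch of the grouping rule above: two attributes fold into one interleaved block when their strides match and their base offsets lie within one stride of each other, keeping the lowest address as the block base. The offsets and strides below are hypothetical values, not actual RSX register state.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>
struct attr { uint32_t offset; uint32_t stride; };
int main()
{
	//Hypothetical attribute registers
	const std::vector<attr> attrs = { { 0x1000, 32 }, { 0x100C, 32 }, { 0x2000, 16 } };
	std::vector<attr> blocks;
	for (const auto &a : attrs)
	{
		bool merged = false;
		for (auto &b : blocks)
		{
			if (b.stride != a.stride)
				continue; //Stride mismatch, cannot interleave
			const uint32_t lo = std::min(a.offset, b.offset);
			const uint32_t hi = std::max(a.offset, b.offset);
			if (hi - lo > a.stride)
				continue; //More than one stride apart, separate stream
			b.offset = lo; //Keep the lowest address as the block base
			merged = true;
			break;
		}
		if (!merged)
			blocks.push_back(a);
	}
	for (const auto &b : blocks)
		std::printf("block base=0x%X stride=%u\n", b.offset, b.stride);
	//Prints two blocks: base=0x1000 stride=32 and base=0x2000 stride=16
}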
RSXFragmentProgram thread::get_current_fragment_program(std::function<std::tuple<bool, u16>(u32, fragment_texture&, bool)> get_surface_info) const
void thread::get_current_fragment_program(std::function<std::tuple<bool, u16>(u32, fragment_texture&, bool)> get_surface_info)
{
RSXFragmentProgram result = {};
auto &result = current_fragment_program = {};
const u32 shader_program = rsx::method_registers.shader_program_address();
if (shader_program == 0)
return result;
return;
const u32 program_location = (shader_program & 0x3) - 1;
const u32 program_offset = (shader_program & ~0x3);
@ -1064,8 +1142,6 @@ namespace rsx
}
result.set_texture_dimension(texture_dimensions);
return result;
}
void thread::reset()
@ -1141,121 +1217,282 @@ namespace rsx
}
}
void thread::post_vertex_stream_to_upload(gsl::span<const gsl::byte> src, gsl::span<gsl::byte> dst, rsx::vertex_base_type type, u32 vector_element_count,
u32 attribute_src_stride, u8 dst_stride, u32 vertex_count, std::function<void(void *, rsx::vertex_base_type, u8, u32)> callback)
std::pair<u32, u32> thread::calculate_memory_requirements(vertex_input_layout& layout, const u32 vertex_count)
{
upload_stream_packet packet;
packet.dst_span = dst;
packet.src_span = src;
packet.src_stride = attribute_src_stride;
packet.type = type;
packet.dst_stride = dst_stride;
packet.vector_width = vector_element_count;
packet.post_upload_func = callback;
packet.vertex_count = vertex_count;
u32 persistent_memory_size = 0;
u32 volatile_memory_size = 0;
if (m_vertex_streaming_task.available_threads == 0)
volatile_memory_size += (u32)layout.referenced_registers.size() * 16u;
if (rsx::method_registers.current_draw_clause.is_immediate_draw)
{
const u32 streaming_thread_count = (u32)g_cfg.video.vertex_upload_threads;
m_vertex_streaming_task.available_threads = streaming_thread_count;
for (u32 n = 0; n < streaming_thread_count; ++n)
for (auto &info : layout.volatile_blocks)
{
thread_ctrl::spawn(m_vertex_streaming_task.worker_threads[n].worker_thread, "Vertex Stream " + std::to_string(n), [this, n]()
volatile_memory_size += info.second;
}
}
else if (rsx::method_registers.current_draw_clause.command == rsx::draw_command::inlined_array)
{
for (auto &block : layout.interleaved_blocks)
{
volatile_memory_size += block.attribute_stride * vertex_count;
}
}
else
{
for (auto &block : layout.interleaved_blocks)
{
u32 unique_verts;
if (block.min_divisor > 1)
{
auto &owner = m_vertex_streaming_task;
auto &task = m_vertex_streaming_task.worker_threads[n];
const u32 index = n;
while (!Emu.IsStopped())
if (block.all_modulus)
unique_verts = block.min_divisor;
else
{
if (task.thread_status.load(std::memory_order_consume) != 0)
{
for (auto &packet: task.packets)
{
write_vertex_array_data_to_buffer(packet.dst_span, packet.src_span, packet.vertex_count, packet.type, packet.vector_width, packet.src_stride, packet.dst_stride);
if (packet.post_upload_func)
packet.post_upload_func(packet.dst_span.data(), packet.type, (u8)packet.vector_width, packet.vertex_count);
owner.remaining_tasks--;
}
task.packets.resize(0);
task.thread_status.store(0);
_mm_sfence();
}
std::this_thread::yield();
unique_verts = vertex_count / block.min_divisor;
if (vertex_count % block.min_divisor) unique_verts++;
}
});
}
else
unique_verts = vertex_count;
persistent_memory_size += block.attribute_stride * unique_verts;
}
}
//Increment job counter..
m_vertex_streaming_task.remaining_tasks++;
return std::make_pair(persistent_memory_size, volatile_memory_size);
}
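The unique-vertex computation above is repeated verbatim in fill_vertex_layout_state and write_vertex_data_to_memory below; it deduplicates per-instance data: with modulo addressing only min_divisor distinct vertices exist, otherwise the count is divided by the frequency and rounded up. A condensed sketch of the same math using the codebase's u32/u16 aliases; the helper name is illustrative:
//Worked example: vertex_count = 100, min_divisor = 4
//modulo mode: 4 unique vertices (the data repeats every 4)
//divide mode: ceil(100 / 4) = 25 unique vertices
static u32 unique_vertex_count(u32 vertex_count, u16 min_divisor, bool all_modulus)
{
	if (min_divisor <= 1)
		return vertex_count;
	if (all_modulus)
		return min_divisor;
	return (vertex_count + min_divisor - 1) / min_divisor; //Ceiling division
}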
//Assign this packet to a thread
//Simple round robin based on first available thread
upload_stream_worker *best_fit = nullptr;
for (auto &worker : m_vertex_streaming_task.worker_threads)
void thread::fill_vertex_layout_state(vertex_input_layout& layout, const u32 vertex_count, s32* buffer)
{
std::array<u32, 16> offset_in_block = {};
u32 volatile_offset = 0;
u32 persistent_offset = 0;
if (rsx::method_registers.current_draw_clause.is_immediate_draw)
{
if (!worker.worker_thread)
break;
if (worker.thread_status.load(std::memory_order_consume) == 0)
for (auto &info : layout.volatile_blocks)
{
if (worker.packets.size() == 0)
offset_in_block[info.first] = volatile_offset;
volatile_offset += info.second;
}
}
for (u8 index : layout.referenced_registers)
{
offset_in_block[index] = volatile_offset;
volatile_offset += 16;
}
if (rsx::method_registers.current_draw_clause.command == rsx::draw_command::inlined_array)
{
auto &block = layout.interleaved_blocks[0];
for (u8 index : block.locations)
{
auto &info = rsx::method_registers.vertex_arrays_info[index];
offset_in_block[index] = persistent_offset; //persistent_offset is still 0 at this point; inlined arrays are uploaded as transient memory
persistent_offset += rsx::get_vertex_type_size_on_host(info.type(), info.size());
}
}
else
{
for (auto &block : layout.interleaved_blocks)
{
for (u8 index : block.locations)
{
worker.packets.push_back(packet);
return;
const u32 local_address = (rsx::method_registers.vertex_arrays_info[index].offset() & 0x7fffffff);
offset_in_block[index] = persistent_offset + (local_address - block.base_offset);
}
if (best_fit == nullptr)
best_fit = &worker;
else if (best_fit->packets.size() > worker.packets.size())
best_fit = &worker;
u32 unique_verts;
if (block.min_divisor > 1)
{
if (block.all_modulus)
unique_verts = block.min_divisor;
else
{
unique_verts = vertex_count / block.min_divisor;
if (vertex_count % block.min_divisor) unique_verts++;
}
}
else
unique_verts = vertex_count;
persistent_offset += block.attribute_stride * unique_verts;
}
}
best_fit->packets.push_back(packet);
}
//Fill the data
memset(buffer, 0, 256);
void thread::start_vertex_upload_task()
{
for (auto &worker : m_vertex_streaming_task.worker_threads)
const s32 swap_storage_mask = (1 << 8);
const s32 volatile_storage_mask = (1 << 9);
const s32 default_frequency_mask = (1 << 10);
const s32 repeating_frequency_mask = (3 << 10);
const s32 input_function_modulo_mask = (1 << 12);
const s32 input_divisor_mask = (0xFFFF << 13);
const u32 modulo_mask = rsx::method_registers.frequency_divider_operation_mask();
for (u8 index = 0; index < rsx::limits::vertex_count; ++index)
{
if (!worker.worker_thread)
break;
if (layout.attribute_placement[index] == attribute_buffer_placement::none)
continue;
if (worker.thread_status.load(std::memory_order_consume) == 0 && worker.packets.size() > 0)
rsx::vertex_base_type type = {};
s32 size = 0;
s32 attributes = 0;
bool is_be_type = true;
if (layout.attribute_placement[index] == attribute_buffer_placement::transient)
{
worker.thread_status.store(1);
if (rsx::method_registers.current_draw_clause.command == rsx::draw_command::inlined_array)
{
auto &info = rsx::method_registers.vertex_arrays_info[index];
type = info.type();
size = (s32)info.size();
attributes = layout.interleaved_blocks[0].attribute_stride;
attributes |= default_frequency_mask | volatile_storage_mask;
is_be_type = false;
}
else if (rsx::method_registers.current_draw_clause.is_immediate_draw)
{
auto &info = rsx::method_registers.register_vertex_info[index];
type = info.type;
size = (s32)info.size;
attributes = rsx::get_vertex_type_size_on_host(type, size);
attributes |= default_frequency_mask | volatile_storage_mask;
is_be_type = true;
}
else
{
//Register
auto& info = rsx::method_registers.register_vertex_info[index];
type = info.type;
size = (s32)info.size;
attributes = rsx::get_vertex_type_size_on_host(type, size);
attributes |= volatile_storage_mask;
is_be_type = false;
}
}
else
{
auto &info = rsx::method_registers.vertex_arrays_info[index];
type = info.type();
size = info.size();
const u32 frequency = info.frequency();
switch (frequency)
{
case 0:
case 1:
attributes |= default_frequency_mask;
break;
default:
{
if (modulo_mask & (1 << index))
attributes |= input_function_modulo_mask;
attributes |= repeating_frequency_mask;
attributes |= (frequency << 13) & input_divisor_mask;
}
}
attributes |= info.stride();
} //end attribute placement check
switch (type)
{
case rsx::vertex_base_type::cmp:
size = 1;
//fall through
default:
if (is_be_type) attributes |= swap_storage_mask;
break;
case rsx::vertex_base_type::ub:
case rsx::vertex_base_type::ub256:
if (!is_be_type) attributes |= swap_storage_mask;
break;
}
buffer[index * 4 + 0] = (s32)type;
buffer[index * 4 + 1] = size;
buffer[index * 4 + 2] = (s32)offset_in_block[index];
buffer[index * 4 + 3] = attributes;
}
}
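The packed flags word written to buffer[index * 4 + 3] is intended to be unpacked during GPU-side vertex fetch; the sketch below shows a host-side decode mirroring the masks defined above. The struct and helper are illustrative only, not part of the actual interface (the frequency-mode bits 10-11 are omitted for brevity):
struct attribute_flags
{
	u8 stride;
	bool swap_bytes;
	bool volatile_storage;
	bool modulo_op;
	u16 divisor;
};
static attribute_flags decode_attribute_flags(s32 attributes)
{
	attribute_flags result;
	result.stride = attributes & 0xFF;                   //Low byte carries the stride
	result.swap_bytes = !!(attributes & (1 << 8));       //swap_storage_mask
	result.volatile_storage = !!(attributes & (1 << 9)); //volatile_storage_mask
	result.modulo_op = !!(attributes & (1 << 12));       //input_function_modulo_mask
	result.divisor = (attributes >> 13) & 0xFFFF;        //input_divisor_mask
	return result;
}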
void thread::write_vertex_data_to_memory(vertex_input_layout &layout, const u32 first_vertex, const u32 vertex_count, void *persistent_data, void *volatile_data)
{
char *transient = (char *)volatile_data;
char *persistent = (char *)persistent_data;
auto &draw_call = rsx::method_registers.current_draw_clause;
if (transient != nullptr)
{
if (draw_call.command == rsx::draw_command::inlined_array)
{
memcpy(transient, draw_call.inline_vertex_array.data(), draw_call.inline_vertex_array.size() * sizeof(u32));
//Is it possible to reference data outside of the inlined array?
return;
}
for (u8 index : layout.referenced_registers)
{
memcpy(transient, rsx::method_registers.register_vertex_info[index].data.data(), 16);
transient += 16;
}
if (draw_call.is_immediate_draw)
{
for (auto &info : layout.volatile_blocks)
{
memcpy(transient, vertex_push_buffers[info.first].data.data(), info.second);
transient += info.second;
}
return;
}
}
}
void thread::wait_for_vertex_upload_task()
{
while (m_vertex_streaming_task.remaining_tasks.load(std::memory_order_consume) != 0 && !Emu.IsStopped())
if (persistent != nullptr)
{
_mm_pause();
for (auto &block : layout.interleaved_blocks)
{
u32 unique_verts;
u32 vertex_base = first_vertex * block.attribute_stride;
if (block.min_divisor > 1)
{
if (block.all_modulus)
unique_verts = block.min_divisor;
else
{
unique_verts = vertex_count / block.min_divisor;
if (vertex_count % block.min_divisor) unique_verts++;
}
}
else
unique_verts = vertex_count;
const u32 data_size = block.attribute_stride * unique_verts;
memcpy(persistent, (char*)vm::base(block.real_offset_address) + vertex_base, data_size);
persistent += data_size;
}
}
}
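For orientation, a hedged sketch of how a renderer backend might chain the helpers above for a single draw call; hypothetical_backend and ring_alloc are illustrative placeholders, not actual RPCS3 names, and mapping/error handling is omitted:
void hypothetical_backend::upload_vertex_state(u32 first_vertex, u32 vertex_count, s32 *descriptor_buffer)
{
	vertex_input_layout layout = analyse_inputs_interleaved();
	const auto required = calculate_memory_requirements(layout, vertex_count);
	void *persistent = required.first ? ring_alloc(required.first) : nullptr;      //Hypothetical heap allocator
	void *volatile_data = required.second ? ring_alloc(required.second) : nullptr; //Hypothetical heap allocator
	write_vertex_data_to_memory(layout, first_vertex, vertex_count, persistent, volatile_data);
	//16 attributes x 4 s32 words each; uploaded as a uniform block so the
	//vertex shader can fetch and unpack attributes itself
	fill_vertex_layout_state(layout, vertex_count, descriptor_buffer);
}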
bool thread::vertex_upload_task_ready()
{
if (g_cfg.video.vertex_upload_threads < 2)
return false;
//Not initialized
if (m_vertex_streaming_task.available_threads == 0)
return true;
//At least two threads are available
return (m_vertex_streaming_task.remaining_tasks < (m_vertex_streaming_task.available_threads - 1));
}
void thread::flip(int buffer)
{
if (g_cfg.video.frame_skip_enabled)

View file

@ -105,6 +105,35 @@ namespace rsx
std::vector<u32> inline_vertex_array;
};
struct interleaved_range_info
{
bool interleaved = false;
bool all_modulus = false;
u32 base_offset = 0;
u32 real_offset_address = 0;
u8 memory_location = 0;
u8 attribute_stride = 0;
u16 min_divisor = 0;
std::vector<u8> locations;
};
enum attribute_buffer_placement : u8
{
none = 0,
persistent = 1,
transient = 2
};
struct vertex_input_layout
{
std::vector<interleaved_range_info> interleaved_blocks; //Interleaved blocks to be uploaded as-is
std::vector<std::pair<u8, u32>> volatile_blocks; //Volatile data blocks (immediate draw vertex data for example)
std::vector<u8> referenced_registers; //Volatile register data
std::array<attribute_buffer_placement, 16> attribute_placement;
};
class thread : public named_thread
{
std::shared_ptr<thread_ctrl> m_vblank_thread;
@ -152,8 +181,6 @@ namespace rsx
bool m_rtts_dirty;
bool m_transform_constants_dirty;
bool m_textures_dirty[16];
bool m_vertex_attribs_changed;
bool m_index_buffer_changed;
protected:
s32 m_skip_frame_ctr = 0;
@ -161,14 +188,23 @@ namespace rsx
protected:
std::array<u32, 4> get_color_surface_addresses() const;
u32 get_zeta_surface_address() const;
RSXVertexProgram get_current_vertex_program() const;
/**
* Analyze vertex inputs and group all interleaved blocks
*/
vertex_input_layout analyse_inputs_interleaved() const;
RSXVertexProgram current_vertex_program = {};
RSXFragmentProgram current_fragment_program = {};
void get_current_vertex_program();
/**
* Gets current fragment program and associated fragment state
* get_surface_info is a helper that takes two parameters: rsx_texture_address and surface_is_depth
* and returns whether the surface is a render target together with its pitch in native format
*/
RSXFragmentProgram get_current_fragment_program(std::function<std::tuple<bool, u16>(u32, fragment_texture&, bool)> get_surface_info) const;
void get_current_fragment_program(std::function<std::tuple<bool, u16>(u32, fragment_texture&, bool)> get_surface_info);
public:
double fps_limit = 59.94;
@ -232,7 +268,7 @@ namespace rsx
std::variant<draw_array_command, draw_indexed_array_command, draw_inlined_array>
get_draw_command(const rsx::rsx_state& state) const;
/*
/**
* Immediate mode rendering requires a temp push buffer to hold attrib values
* Appends a value to the push buffer (currently only supports 32-wide types)
*/
@ -243,47 +279,24 @@ namespace rsx
u32 get_push_buffer_index_count() const;
protected:
//Save draw call parameters to detect instanced renders
std::pair<u32, u32> m_last_first_count;
rsx::draw_command m_last_command;
bool is_probable_instanced_draw();
/**
* Computes VRAM requirements needed to upload raw vertex streams
* result.first contains persistent memory requirements
* result.second contains volatile memory requirements
*/
std::pair<u32, u32> calculate_memory_requirements(vertex_input_layout& layout, const u32 vertex_count);
public:
//MT vertex streaming
struct upload_stream_packet
{
std::function<void(void *, rsx::vertex_base_type, u8, u32)> post_upload_func;
gsl::span<const gsl::byte> src_span;
gsl::span<gsl::byte> dst_span;
rsx::vertex_base_type type;
u32 vector_width;
u32 vertex_count;
u32 src_stride;
u8 dst_stride;
};
/**
* Generates vertex input descriptors as an array of 16x4 s32s
*/
void fill_vertex_layout_state(vertex_input_layout& layout, const u32 vertex_count, s32* buffer);
struct upload_stream_worker
{
std::shared_ptr<thread_ctrl> worker_thread;
std::vector<upload_stream_packet> packets;
std::atomic<int> thread_status = { 0 };
};
struct upload_stream_task
{
std::array<upload_stream_worker, 16> worker_threads;
int available_threads = 0;
std::atomic<int> remaining_tasks = {0};
};
upload_stream_task m_vertex_streaming_task;
void post_vertex_stream_to_upload(gsl::span<const gsl::byte> src, gsl::span<gsl::byte> dst, rsx::vertex_base_type type,
u32 vector_element_count, u32 attribute_src_stride, u8 dst_stride, u32 vertex_count,
std::function<void(void *, rsx::vertex_base_type, u8, u32)> callback);
void start_vertex_upload_task();
void wait_for_vertex_upload_task();
bool vertex_upload_task_ready();
/**
* Uploads vertex data described in the layout descriptor
* Copies from local memory to the write-only output buffers provided in a sequential manner
*/
void write_vertex_data_to_memory(vertex_input_layout &layout, const u32 first_vertex, const u32 vertex_count, void *persistent_data, void *volatile_data);
private:
std::mutex m_mtx_task;

View file

@ -228,4 +228,5 @@ struct RSXVertexProgram
std::vector<u32> data;
std::vector<rsx_vertex_input> rsx_vertex_inputs;
u32 output_mask;
bool skip_vertex_input_check;
};

View file

@ -736,6 +736,8 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing)
if (!flushable)
return false;
close_render_pass();
if (synchronized)
{
if (m_last_flushable_cb >= 0)
@ -817,7 +819,6 @@ void VKGSRender::begin()
m_vertex_cache->purge();
CHECK_RESULT(vkResetDescriptorPool(*m_device, descriptor_pool, 0));
m_last_descriptor_set = VK_NULL_HANDLE;
m_used_descriptors = 0;
m_uniform_buffer_ring_info.reset_allocation_stats();
@ -858,20 +859,6 @@ void VKGSRender::begin()
m_setup_time += std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
}
void VKGSRender::emit_geometry_instance(u32/* instance_count*/)
{
begin_render_pass();
m_instanced_draws++;
//Repeat last command
if (!m_last_draw_indexed)
vkCmdDraw(*m_current_command_buffer, m_last_vertex_count, 1, 0, 0);
else
{
vkCmdBindIndexBuffer(*m_current_command_buffer, m_index_buffer_ring_info.heap->value, m_last_ib_offset, m_last_ib_type);
vkCmdDrawIndexed(*m_current_command_buffer, m_last_vertex_count, 1, 0, 0, 0);
}
}
void VKGSRender::begin_render_pass()
{
@ -916,39 +903,8 @@ void VKGSRender::end()
std::chrono::time_point<steady_clock> program_start = steady_clock::now();
const bool is_instanced = is_probable_instanced_draw() && m_last_descriptor_set != VK_NULL_HANDLE && m_program != nullptr;
if (is_instanced)
{
//Copy descriptor set
VkCopyDescriptorSet copy_info[39];
u8 descriptors_count = 0;
for (u8 i = 0; i < 39; ++i)
{
if ((m_program->attribute_location_mask & (1ull << i)) == 0)
continue;
const u8 n = descriptors_count;
copy_info[n] = {};
copy_info[n].sType = VK_STRUCTURE_TYPE_COPY_DESCRIPTOR_SET;
copy_info[n].srcSet = m_last_descriptor_set;
copy_info[n].dstSet = descriptor_sets;
copy_info[n].srcBinding = i;
copy_info[n].dstBinding = i;
copy_info[n].srcArrayElement = 0;
copy_info[n].dstArrayElement = 0;
copy_info[n].descriptorCount = 1;
descriptors_count++;
}
vkUpdateDescriptorSets(*m_device, 0, nullptr, descriptors_count, (const VkCopyDescriptorSet*)&copy_info);
}
//Load program here since it is dependent on vertex state
if (!load_program(is_instanced))
if (!load_program())
{
LOG_ERROR(RSX, "No valid program bound to pipeline. Skipping draw");
rsx::thread::end();
@ -958,18 +914,6 @@ void VKGSRender::end()
std::chrono::time_point<steady_clock> program_stop = steady_clock::now();
//m_setup_time += std::chrono::duration_cast<std::chrono::microseconds>(program_stop - program_start).count();
if (is_instanced)
{
//Only the program constants descriptors should have changed
vkCmdBindPipeline(*m_current_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, m_program->pipeline);
vkCmdBindDescriptorSets(*m_current_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout, 0, 1, &descriptor_sets, 0, nullptr);
emit_geometry_instance(1);
m_last_instanced_cb_index = m_current_cb_index;
rsx::thread::end();
return;
}
close_render_pass(); //Texture upload stuff conflicts active RPs
if (g_cfg.video.strict_rendering_mode)
@ -1173,9 +1117,6 @@ void VKGSRender::end()
{
const auto vertex_count = std::get<1>(upload_info);
vkCmdDraw(*m_current_command_buffer, vertex_count, 1, 0, 0);
m_last_vertex_count = vertex_count;
m_last_draw_indexed = false;
}
else
{
@ -1187,16 +1128,8 @@ void VKGSRender::end()
vkCmdBindIndexBuffer(*m_current_command_buffer, m_index_buffer_ring_info.heap->value, offset, index_type);
vkCmdDrawIndexed(*m_current_command_buffer, index_count, 1, 0, 0, 0);
m_last_draw_indexed = false;
m_last_ib_type = index_type;
m_last_ib_offset = offset;
m_last_vertex_count = index_count;
}
m_last_instanced_cb_index = ~0;
m_last_descriptor_set = descriptor_sets;
vk::leave_uninterruptible();
std::chrono::time_point<steady_clock> draw_end = steady_clock::now();
@ -1207,15 +1140,17 @@ void VKGSRender::end()
if (g_cfg.video.overlay)
{
if (m_last_vertex_count < 1024)
const auto vertex_count = std::get<1>(upload_info);
if (vertex_count < 1024)
m_uploads_small++;
else if (m_last_vertex_count < 2048)
else if (vertex_count < 2048)
m_uploads_1k++;
else if (m_last_vertex_count < 4096)
else if (vertex_count < 4096)
m_uploads_2k++;
else if (m_last_vertex_count < 8192)
else if (vertex_count < 8192)
m_uploads_4k++;
else if (m_last_vertex_count < 16384)
else if (vertex_count < 16384)
m_uploads_8k++;
else
m_uploads_16k++;
@ -1433,12 +1368,6 @@ void VKGSRender::copy_render_targets_to_dma_location()
void VKGSRender::flush_command_queue(bool hard_sync)
{
if (m_attrib_ring_info.mapped)
{
wait_for_vertex_upload_task();
m_attrib_ring_info.unmap();
}
close_render_pass();
close_and_submit_command_buffer({}, m_current_command_buffer->submit_fence);
@ -1580,187 +1509,182 @@ bool VKGSRender::do_method(u32 cmd, u32 arg)
}
}
bool VKGSRender::load_program(bool fast_update)
bool VKGSRender::load_program(bool)
{
RSXVertexProgram vertex_program;
RSXFragmentProgram fragment_program;
auto &vertex_program = current_vertex_program;
auto &fragment_program = current_fragment_program;
if (!fast_update)
auto rtt_lookup_func = [this](u32 texaddr, rsx::fragment_texture&, bool is_depth) -> std::tuple<bool, u16>
{
auto rtt_lookup_func = [this](u32 texaddr, rsx::fragment_texture&, bool is_depth) -> std::tuple<bool, u16>
{
vk::render_target *surface = nullptr;
if (!is_depth)
surface = m_rtts.get_texture_from_render_target_if_applicable(texaddr);
else
surface = m_rtts.get_texture_from_depth_stencil_if_applicable(texaddr);
if (!surface) return std::make_tuple(false, 0);
return std::make_tuple(true, surface->native_pitch);
};
fragment_program = get_current_fragment_program(rtt_lookup_func);
if (!fragment_program.valid) return false;
vertex_program = get_current_vertex_program();
vk::pipeline_props properties = {};
properties.ia.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO;
bool unused;
properties.ia.topology = vk::get_appropriate_topology(rsx::method_registers.current_draw_clause.primitive, unused);
if (rsx::method_registers.restart_index_enabled())
{
properties.ia.primitiveRestartEnable = VK_TRUE;
}
vk::render_target *surface = nullptr;
if (!is_depth)
surface = m_rtts.get_texture_from_render_target_if_applicable(texaddr);
else
properties.ia.primitiveRestartEnable = VK_FALSE;
surface = m_rtts.get_texture_from_depth_stencil_if_applicable(texaddr);
if (!surface) return std::make_tuple(false, 0);
return std::make_tuple(true, surface->native_pitch);
};
get_current_fragment_program(rtt_lookup_func);
if (!fragment_program.valid) return false;
get_current_vertex_program();
vk::pipeline_props properties = {};
properties.ia.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO;
bool unused;
properties.ia.topology = vk::get_appropriate_topology(rsx::method_registers.current_draw_clause.primitive, unused);
if (rsx::method_registers.restart_index_enabled())
{
properties.ia.primitiveRestartEnable = VK_TRUE;
}
else
properties.ia.primitiveRestartEnable = VK_FALSE;
for (int i = 0; i < 4; ++i)
{
properties.att_state[i].colorWriteMask = 0xf;
properties.att_state[i].blendEnable = VK_FALSE;
}
for (int i = 0; i < 4; ++i)
{
properties.att_state[i].colorWriteMask = 0xf;
properties.att_state[i].blendEnable = VK_FALSE;
}
VkColorComponentFlags mask = 0;
if (rsx::method_registers.color_mask_a()) mask |= VK_COLOR_COMPONENT_A_BIT;
if (rsx::method_registers.color_mask_b()) mask |= VK_COLOR_COMPONENT_B_BIT;
if (rsx::method_registers.color_mask_g()) mask |= VK_COLOR_COMPONENT_G_BIT;
if (rsx::method_registers.color_mask_r()) mask |= VK_COLOR_COMPONENT_R_BIT;
VkColorComponentFlags mask = 0;
if (rsx::method_registers.color_mask_a()) mask |= VK_COLOR_COMPONENT_A_BIT;
if (rsx::method_registers.color_mask_b()) mask |= VK_COLOR_COMPONENT_B_BIT;
if (rsx::method_registers.color_mask_g()) mask |= VK_COLOR_COMPONENT_G_BIT;
if (rsx::method_registers.color_mask_r()) mask |= VK_COLOR_COMPONENT_R_BIT;
VkColorComponentFlags color_masks[4] = { mask };
VkColorComponentFlags color_masks[4] = { mask };
u8 render_targets[] = { 0, 1, 2, 3 };
u8 render_targets[] = { 0, 1, 2, 3 };
for (u8 idx = 0; idx < m_draw_buffers_count; ++idx)
{
properties.att_state[render_targets[idx]].colorWriteMask = mask;
}
if (rsx::method_registers.blend_enabled())
{
VkBlendFactor sfactor_rgb = vk::get_blend_factor(rsx::method_registers.blend_func_sfactor_rgb());
VkBlendFactor sfactor_a = vk::get_blend_factor(rsx::method_registers.blend_func_sfactor_a());
VkBlendFactor dfactor_rgb = vk::get_blend_factor(rsx::method_registers.blend_func_dfactor_rgb());
VkBlendFactor dfactor_a = vk::get_blend_factor(rsx::method_registers.blend_func_dfactor_a());
VkBlendOp equation_rgb = vk::get_blend_op(rsx::method_registers.blend_equation_rgb());
VkBlendOp equation_a = vk::get_blend_op(rsx::method_registers.blend_equation_a());
for (u8 idx = 0; idx < m_draw_buffers_count; ++idx)
{
properties.att_state[render_targets[idx]].colorWriteMask = mask;
properties.att_state[render_targets[idx]].blendEnable = VK_TRUE;
properties.att_state[render_targets[idx]].srcColorBlendFactor = sfactor_rgb;
properties.att_state[render_targets[idx]].dstColorBlendFactor = dfactor_rgb;
properties.att_state[render_targets[idx]].srcAlphaBlendFactor = sfactor_a;
properties.att_state[render_targets[idx]].dstAlphaBlendFactor = dfactor_a;
properties.att_state[render_targets[idx]].colorBlendOp = equation_rgb;
properties.att_state[render_targets[idx]].alphaBlendOp = equation_a;
}
if (rsx::method_registers.blend_enabled())
{
VkBlendFactor sfactor_rgb = vk::get_blend_factor(rsx::method_registers.blend_func_sfactor_rgb());
VkBlendFactor sfactor_a = vk::get_blend_factor(rsx::method_registers.blend_func_sfactor_a());
VkBlendFactor dfactor_rgb = vk::get_blend_factor(rsx::method_registers.blend_func_dfactor_rgb());
VkBlendFactor dfactor_a = vk::get_blend_factor(rsx::method_registers.blend_func_dfactor_a());
VkBlendOp equation_rgb = vk::get_blend_op(rsx::method_registers.blend_equation_rgb());
VkBlendOp equation_a = vk::get_blend_op(rsx::method_registers.blend_equation_a());
for (u8 idx = 0; idx < m_draw_buffers_count; ++idx)
{
properties.att_state[render_targets[idx]].blendEnable = VK_TRUE;
properties.att_state[render_targets[idx]].srcColorBlendFactor = sfactor_rgb;
properties.att_state[render_targets[idx]].dstColorBlendFactor = dfactor_rgb;
properties.att_state[render_targets[idx]].srcAlphaBlendFactor = sfactor_a;
properties.att_state[render_targets[idx]].dstAlphaBlendFactor = dfactor_a;
properties.att_state[render_targets[idx]].colorBlendOp = equation_rgb;
properties.att_state[render_targets[idx]].alphaBlendOp = equation_a;
}
auto blend_colors = rsx::get_constant_blend_colors();
properties.cs.blendConstants[0] = blend_colors[0];
properties.cs.blendConstants[1] = blend_colors[1];
properties.cs.blendConstants[2] = blend_colors[2];
properties.cs.blendConstants[3] = blend_colors[3];
}
else
{
for (u8 idx = 0; idx < m_draw_buffers_count; ++idx)
{
properties.att_state[render_targets[idx]].blendEnable = VK_FALSE;
}
}
properties.cs.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO;
properties.cs.attachmentCount = m_draw_buffers_count;
properties.cs.pAttachments = properties.att_state;
if (rsx::method_registers.logic_op_enabled())
{
properties.cs.logicOpEnable = true;
properties.cs.logicOp = vk::get_logic_op(rsx::method_registers.logic_operation());
}
properties.ds.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO;
properties.ds.depthWriteEnable = rsx::method_registers.depth_write_enabled() ? VK_TRUE : VK_FALSE;
if (rsx::method_registers.depth_bounds_test_enabled())
{
properties.ds.depthBoundsTestEnable = VK_TRUE;
properties.ds.minDepthBounds = rsx::method_registers.depth_bounds_min();
properties.ds.maxDepthBounds = rsx::method_registers.depth_bounds_max();
}
else
properties.ds.depthBoundsTestEnable = VK_FALSE;
if (rsx::method_registers.stencil_test_enabled())
{
properties.ds.stencilTestEnable = VK_TRUE;
properties.ds.front.writeMask = rsx::method_registers.stencil_mask();
properties.ds.front.compareMask = rsx::method_registers.stencil_func_mask();
properties.ds.front.reference = rsx::method_registers.stencil_func_ref();
properties.ds.front.failOp = vk::get_stencil_op(rsx::method_registers.stencil_op_fail());
properties.ds.front.passOp = vk::get_stencil_op(rsx::method_registers.stencil_op_zpass());
properties.ds.front.depthFailOp = vk::get_stencil_op(rsx::method_registers.stencil_op_zfail());
properties.ds.front.compareOp = vk::get_compare_func(rsx::method_registers.stencil_func());
if (rsx::method_registers.two_sided_stencil_test_enabled())
{
properties.ds.back.writeMask = rsx::method_registers.back_stencil_mask();
properties.ds.back.compareMask = rsx::method_registers.back_stencil_func_mask();
properties.ds.back.reference = rsx::method_registers.back_stencil_func_ref();
properties.ds.back.failOp = vk::get_stencil_op(rsx::method_registers.back_stencil_op_fail());
properties.ds.back.passOp = vk::get_stencil_op(rsx::method_registers.back_stencil_op_zpass());
properties.ds.back.depthFailOp = vk::get_stencil_op(rsx::method_registers.back_stencil_op_zfail());
properties.ds.back.compareOp = vk::get_compare_func(rsx::method_registers.back_stencil_func());
}
else
properties.ds.back = properties.ds.front;
}
else
properties.ds.stencilTestEnable = VK_FALSE;
if (rsx::method_registers.depth_test_enabled())
{
properties.ds.depthTestEnable = VK_TRUE;
properties.ds.depthCompareOp = vk::get_compare_func(rsx::method_registers.depth_func());
}
else
properties.ds.depthTestEnable = VK_FALSE;
properties.rs.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO;
properties.rs.polygonMode = VK_POLYGON_MODE_FILL;
properties.rs.depthClampEnable = VK_FALSE;
properties.rs.rasterizerDiscardEnable = VK_FALSE;
properties.rs.depthBiasEnable = VK_FALSE;
if (rsx::method_registers.cull_face_enabled())
properties.rs.cullMode = vk::get_cull_face(rsx::method_registers.cull_face_mode());
else
properties.rs.cullMode = VK_CULL_MODE_NONE;
properties.rs.frontFace = vk::get_front_face(rsx::method_registers.front_face_mode());
size_t idx = vk::get_render_pass_location(
vk::get_compatible_surface_format(rsx::method_registers.surface_color()).first,
vk::get_compatible_depth_surface_format(m_optimal_tiling_supported_formats, rsx::method_registers.surface_depth_fmt()),
(u8)m_draw_buffers_count);
properties.render_pass = m_render_passes[idx];
properties.num_targets = m_draw_buffers_count;
vk::enter_uninterruptible();
//Load current program from buffer
m_program = m_prog_buffer.getGraphicPipelineState(vertex_program, fragment_program, properties, *m_device, pipeline_layout).get();
auto blend_colors = rsx::get_constant_blend_colors();
properties.cs.blendConstants[0] = blend_colors[0];
properties.cs.blendConstants[1] = blend_colors[1];
properties.cs.blendConstants[2] = blend_colors[2];
properties.cs.blendConstants[3] = blend_colors[3];
}
else
vk::enter_uninterruptible();
{
for (u8 idx = 0; idx < m_draw_buffers_count; ++idx)
{
properties.att_state[render_targets[idx]].blendEnable = VK_FALSE;
}
}
properties.cs.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO;
properties.cs.attachmentCount = m_draw_buffers_count;
properties.cs.pAttachments = properties.att_state;
if (rsx::method_registers.logic_op_enabled())
{
properties.cs.logicOpEnable = true;
properties.cs.logicOp = vk::get_logic_op(rsx::method_registers.logic_operation());
}
properties.ds.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO;
properties.ds.depthWriteEnable = rsx::method_registers.depth_write_enabled() ? VK_TRUE : VK_FALSE;
if (rsx::method_registers.depth_bounds_test_enabled())
{
properties.ds.depthBoundsTestEnable = VK_TRUE;
properties.ds.minDepthBounds = rsx::method_registers.depth_bounds_min();
properties.ds.maxDepthBounds = rsx::method_registers.depth_bounds_max();
}
else
properties.ds.depthBoundsTestEnable = VK_FALSE;
if (rsx::method_registers.stencil_test_enabled())
{
properties.ds.stencilTestEnable = VK_TRUE;
properties.ds.front.writeMask = rsx::method_registers.stencil_mask();
properties.ds.front.compareMask = rsx::method_registers.stencil_func_mask();
properties.ds.front.reference = rsx::method_registers.stencil_func_ref();
properties.ds.front.failOp = vk::get_stencil_op(rsx::method_registers.stencil_op_fail());
properties.ds.front.passOp = vk::get_stencil_op(rsx::method_registers.stencil_op_zpass());
properties.ds.front.depthFailOp = vk::get_stencil_op(rsx::method_registers.stencil_op_zfail());
properties.ds.front.compareOp = vk::get_compare_func(rsx::method_registers.stencil_func());
if (rsx::method_registers.two_sided_stencil_test_enabled())
{
properties.ds.back.writeMask = rsx::method_registers.back_stencil_mask();
properties.ds.back.compareMask = rsx::method_registers.back_stencil_func_mask();
properties.ds.back.reference = rsx::method_registers.back_stencil_func_ref();
properties.ds.back.failOp = vk::get_stencil_op(rsx::method_registers.back_stencil_op_fail());
properties.ds.back.passOp = vk::get_stencil_op(rsx::method_registers.back_stencil_op_zpass());
properties.ds.back.depthFailOp = vk::get_stencil_op(rsx::method_registers.back_stencil_op_zfail());
properties.ds.back.compareOp = vk::get_compare_func(rsx::method_registers.back_stencil_func());
}
else
properties.ds.back = properties.ds.front;
}
else
properties.ds.stencilTestEnable = VK_FALSE;
if (rsx::method_registers.depth_test_enabled())
{
properties.ds.depthTestEnable = VK_TRUE;
properties.ds.depthCompareOp = vk::get_compare_func(rsx::method_registers.depth_func());
}
else
properties.ds.depthTestEnable = VK_FALSE;
properties.rs.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO;
properties.rs.polygonMode = VK_POLYGON_MODE_FILL;
properties.rs.depthClampEnable = VK_FALSE;
properties.rs.rasterizerDiscardEnable = VK_FALSE;
properties.rs.depthBiasEnable = VK_FALSE;
if (rsx::method_registers.cull_face_enabled())
properties.rs.cullMode = vk::get_cull_face(rsx::method_registers.cull_face_mode());
else
properties.rs.cullMode = VK_CULL_MODE_NONE;
properties.rs.frontFace = vk::get_front_face(rsx::method_registers.front_face_mode());
size_t idx = vk::get_render_pass_location(
vk::get_compatible_surface_format(rsx::method_registers.surface_color()).first,
vk::get_compatible_depth_surface_format(m_optimal_tiling_supported_formats, rsx::method_registers.surface_depth_fmt()),
(u8)m_draw_buffers_count);
properties.render_pass = m_render_passes[idx];
properties.num_targets = m_draw_buffers_count;
vk::enter_uninterruptible();
//Load current program from buffer
m_program = m_prog_buffer.getGraphicPipelineState(vertex_program, fragment_program, properties, *m_device, pipeline_layout).get();
//TODO: Update constant buffers..
//1. Update scale-offset matrix
@ -1781,7 +1705,7 @@ bool VKGSRender::load_program(bool fast_update)
m_program->bind_uniform({ m_uniform_buffer_ring_info.heap->value, scale_offset_offset, 256 }, SCALE_OFFSET_BIND_SLOT, descriptor_sets);
if (!fast_update || m_transform_constants_dirty)
if (true)//m_transform_constants_dirty)
{
const size_t vertex_constants_offset = m_uniform_buffer_ring_info.alloc<256>(512 * 4 * sizeof(float));
buf = (u8*)m_uniform_buffer_ring_info.map(vertex_constants_offset, 512 * 4 * sizeof(float));
@ -1793,21 +1717,18 @@ bool VKGSRender::load_program(bool fast_update)
m_transform_constants_dirty = false;
}
if (!fast_update)
{
const size_t fragment_constants_sz = m_prog_buffer.get_fragment_constants_buffer_size(fragment_program);
const size_t fragment_buffer_sz = fragment_constants_sz + (17 * 4 * sizeof(float));
const size_t fragment_constants_offset = m_uniform_buffer_ring_info.alloc<256>(fragment_buffer_sz);
const size_t fragment_constants_sz = m_prog_buffer.get_fragment_constants_buffer_size(fragment_program);
const size_t fragment_buffer_sz = fragment_constants_sz + (17 * 4 * sizeof(float));
const size_t fragment_constants_offset = m_uniform_buffer_ring_info.alloc<256>(fragment_buffer_sz);
buf = (u8*)m_uniform_buffer_ring_info.map(fragment_constants_offset, fragment_buffer_sz);
if (fragment_constants_sz)
m_prog_buffer.fill_fragment_constants_buffer({ reinterpret_cast<float*>(buf), ::narrow<int>(fragment_constants_sz) }, fragment_program);
buf = (u8*)m_uniform_buffer_ring_info.map(fragment_constants_offset, fragment_buffer_sz);
if (fragment_constants_sz)
m_prog_buffer.fill_fragment_constants_buffer({ reinterpret_cast<float*>(buf), ::narrow<int>(fragment_constants_sz) }, fragment_program);
fill_fragment_state_buffer(buf + fragment_constants_sz, fragment_program);
m_uniform_buffer_ring_info.unmap();
fill_fragment_state_buffer(buf + fragment_constants_sz, fragment_program);
m_uniform_buffer_ring_info.unmap();
m_program->bind_uniform({ m_uniform_buffer_ring_info.heap->value, fragment_constants_offset, fragment_buffer_sz }, FRAGMENT_CONSTANT_BUFFERS_BIND_SLOT, descriptor_sets);
}
m_program->bind_uniform({ m_uniform_buffer_ring_info.heap->value, fragment_constants_offset, fragment_buffer_sz }, FRAGMENT_CONSTANT_BUFFERS_BIND_SLOT, descriptor_sets);
vk::leave_uninterruptible();

View file

@ -200,14 +200,6 @@ private:
std::thread::id rsx_thread;
VkPrimitiveTopology m_last_primititve_type;
VkIndexType m_last_ib_type;
VkDescriptorSet m_last_descriptor_set;
size_t m_last_ib_offset;
u32 m_last_vertex_count;
bool m_last_draw_indexed;
u32 m_last_instanced_cb_index;
bool render_pass_open = false;
#ifdef __linux__
@ -233,8 +225,6 @@ private:
void begin_render_pass();
void close_render_pass();
void emit_geometry_instance(u32 instance_count);
/// returns primitive topology, is_indexed, index_count, offset in index buffer, index type
std::tuple<VkPrimitiveTopology, u32, std::optional<std::tuple<VkDeviceSize, VkIndexType> > > upload_vertex_data();
public:

View file

@ -265,10 +265,18 @@ namespace
return;
// Fill vertex_array
u32 element_size = rsx::get_vertex_type_size_on_host(vertex_array.type, vertex_array.attribute_size);
u32 real_element_size = vk::get_suitable_vk_size(vertex_array.type, vertex_array.attribute_size);
const u32 element_size = rsx::get_vertex_type_size_on_host(vertex_array.type, vertex_array.attribute_size);
const u32 real_element_size = vk::get_suitable_vk_size(vertex_array.type, vertex_array.attribute_size);
const u32 upload_size = real_element_size * vertex_count;
const VkFormat format = vk::get_suitable_vk_format(vertex_array.type, vertex_array.attribute_size);
const uintptr_t local_addr = (uintptr_t)vertex_array.data.data();
u32 upload_size = real_element_size * vertex_count;
if (auto found = vertex_cache->find_vertex_range(local_addr, format, upload_size))
{
m_buffer_view_to_clean.push_back(std::make_unique<vk::buffer_view>(device, m_attrib_ring_info.heap->value, format, found->offset_in_heap, upload_size));
m_program->bind_uniform(m_buffer_view_to_clean.back()->value, s_reg_table[vertex_array.index], descriptor_sets);
return;
}
VkDeviceSize offset_in_attrib_buffer = m_attrib_ring_info.alloc<256>(upload_size);
void *dst = m_attrib_ring_info.map(offset_in_attrib_buffer, upload_size);
@ -281,9 +289,7 @@ namespace
vk::prepare_buffer_for_writing(dst, vertex_array.type, vertex_array.attribute_size, vertex_count);
m_attrib_ring_info.unmap();
const VkFormat format = vk::get_suitable_vk_format(vertex_array.type, vertex_array.attribute_size);
const uintptr_t local_addr = (uintptr_t)vertex_array.data.data();
vertex_cache->store_range(local_addr, format, upload_size, (u32)offset_in_attrib_buffer);
m_buffer_view_to_clean.push_back(std::make_unique<vk::buffer_view>(device, m_attrib_ring_info.heap->value, format, offset_in_attrib_buffer, upload_size));
@ -474,137 +480,9 @@ namespace
const auto& vertex_buffers = get_vertex_buffers(
rsx::method_registers, {{min_index, vertex_max_index - min_index + 1}});
//1. Check if we can get all these allocations at once
std::vector<size_t> memory_allocations(16);
std::vector<u32> allocated_sizes(16);
std::vector<int> upload_jobs(16);
memory_allocations.resize(0);
allocated_sizes.resize(0);
upload_jobs.resize(0);
for (int i = 0; i < vertex_buffers.size(); ++i)
{
const auto &vbo = vertex_buffers[i];
bool can_multithread = false;
if (vbo.which() == 0)
{
//vertex array buffer. We can thread this thing heavily
const auto& v = vbo.get<rsx::vertex_array_buffer>();
const u32 element_size = rsx::get_vertex_type_size_on_host(v.type, v.attribute_size);
const u32 real_element_size = vk::get_suitable_vk_size(v.type, v.attribute_size);
const u32 upload_size = real_element_size * vertex_count;
const VkFormat format = vk::get_suitable_vk_format(v.type, v.attribute_size);
const uintptr_t local_addr = (uintptr_t)v.data.data();
const auto cached = rsxthr->m_vertex_cache->find_vertex_range(local_addr, format, upload_size);
if (cached)
{
m_buffer_view_to_clean.push_back(std::make_unique<vk::buffer_view>(m_device, m_attrib_ring_info.heap->value, format, cached->offset_in_heap, upload_size));
m_program->bind_uniform(m_buffer_view_to_clean.back()->value, s_reg_table[v.index], m_descriptor_sets);
continue;
}
if (v.attribute_size > 1 && vertex_count >= (u32)g_cfg.video.mt_vertex_upload_threshold && rsxthr->vertex_upload_task_ready())
{
can_multithread = true;
size_t offset = m_attrib_ring_info.alloc<256>(upload_size);
memory_allocations.push_back(offset);
allocated_sizes.push_back(upload_size);
upload_jobs.push_back(i);
const uintptr_t local_addr = (uintptr_t)v.data.data();
rsxthr->m_vertex_cache->store_range(local_addr, format, upload_size, (u32)offset);
m_buffer_view_to_clean.push_back(std::make_unique<vk::buffer_view>(m_device, m_attrib_ring_info.heap->value, format, offset, upload_size));
m_program->bind_uniform(m_buffer_view_to_clean.back()->value, s_reg_table[v.index], m_descriptor_sets);
}
}
if (!can_multithread)
std::apply_visitor(visitor, vbo);
}
if (memory_allocations.size() > 0)
{
if (memory_allocations.size() > 1)
{
//2 sets in case the blocks don't fit
u8 available_jobs[2] = {};
u32 allocated_block[2] = {};
size_t last_offset = memory_allocations[0];
u8 current_index = 0;
for (int n = 0; n < memory_allocations.size(); ++n)
{
if (memory_allocations[n] < last_offset)
{
//queue went around
current_index = 1;
}
available_jobs[current_index] ++;
allocated_block[current_index] += allocated_sizes[n];
}
int n = 0;
for (int task = 0; task < 2; ++task)
{
if (available_jobs[task])
{
if (m_attrib_ring_info.mapped)
{
rsxthr->wait_for_vertex_upload_task();
m_attrib_ring_info.unmap();
}
size_t space_remaining = allocated_block[task];
size_t offset_base = memory_allocations[n];
gsl::byte* dst = (gsl::byte*)m_attrib_ring_info.map(memory_allocations[n], space_remaining);
while (true)
{
if (space_remaining == 0)
break;
const auto& vertex_array = vertex_buffers[upload_jobs[n]].get<rsx::vertex_array_buffer>();
const u32 real_element_size = vk::get_suitable_vk_size(vertex_array.type, vertex_array.attribute_size);
gsl::span<gsl::byte> dest_span(dst + (memory_allocations[n] - offset_base), allocated_sizes[n]);
rsxthr->post_vertex_stream_to_upload(vertex_array.data, dest_span, vertex_array.type, vertex_array.attribute_size, vertex_array.stride, real_element_size, vertex_count, vk::prepare_buffer_for_writing);
space_remaining -= allocated_sizes[n];
n++;
}
rsxthr->start_vertex_upload_task();
}
}
}
else
{
const size_t offset_in_attrib_buffer = memory_allocations[0];
const u32 upload_size = allocated_sizes[0];
const auto& vertex_array = vertex_buffers[upload_jobs[0]].get<rsx::vertex_array_buffer>();
const u32 real_element_size = vk::get_suitable_vk_size(vertex_array.type, vertex_array.attribute_size);
void *dst = m_attrib_ring_info.map(offset_in_attrib_buffer, upload_size);
gsl::span<gsl::byte> dest_span(static_cast<gsl::byte*>(dst), upload_size);
write_vertex_array_data_to_buffer(dest_span, vertex_array.data, vertex_count, vertex_array.type, vertex_array.attribute_size, vertex_array.stride, real_element_size);
vk::prepare_buffer_for_writing(dst, vertex_array.type, vertex_array.attribute_size, vertex_count);
m_attrib_ring_info.unmap();
}
}
for (auto &vbo: vertex_buffers)
std::apply_visitor(visitor, vbo);
}
u32 upload_inlined_array()

View file

@ -9,7 +9,7 @@ namespace rsx
f, ///< float
sf, ///< half float
ub, ///< unsigned byte interpreted as 0.f and 1.f
s32k, ///< signed 32bits int
s32k, ///< signed 16-bit int
cmp, ///< compressed aka X11G11Z10 and always 1. W.
ub256, ///< unsigned byte interpreted as between 0 and 255.
};

View file

@ -459,20 +459,6 @@ namespace rsx
rsx->m_textures_dirty[index] = true;
}
};
template<u32 index>
struct set_vertex_array_dirty_bit
{
static void impl(thread* rsx, u32, u32)
{
rsx->m_vertex_attribs_changed = true;
}
};
void set_idbuf_dirty_bit(thread* rsx, u32, u32)
{
rsx->m_index_buffer_changed = true;
}
}
namespace nv308a
@ -1544,8 +1530,6 @@ namespace rsx
bind_range<NV4097_SET_TEXTURE_FILTER, 8, 16, nv4097::set_texture_dirty_bit>();
bind_range<NV4097_SET_TEXTURE_IMAGE_RECT, 8, 16, nv4097::set_texture_dirty_bit>();
bind_range<NV4097_SET_TEXTURE_BORDER_COLOR, 8, 16, nv4097::set_texture_dirty_bit>();
bind_range<NV4097_SET_VERTEX_DATA_ARRAY_OFFSET, 1, 16, nv4097::set_vertex_array_dirty_bit>();
bind<NV4097_SET_INDEX_ARRAY_ADDRESS, nv4097::set_idbuf_dirty_bit>();
bind<NV4097_SET_RENDER_ENABLE, nv4097::set_render_mode>();
bind<NV4097_SET_ZCULL_EN, nv4097::set_zcull_render_enable>();
bind<NV4097_SET_ZCULL_STATS_ENABLE, nv4097::set_zcull_stats_enable>();

View file

@ -60,8 +60,8 @@ public:
struct push_buffer_vertex_info
{
u8 size;
vertex_base_type type;
u8 size = 0;
vertex_base_type type = vertex_base_type::f;
u32 vertex_count = 0;
u32 attribute_mask = ~0;
@ -72,6 +72,7 @@ struct push_buffer_vertex_info
data.resize(0);
attribute_mask = ~0;
vertex_count = 0;
size = 0;
}
u8 get_vertex_size_in_dwords(vertex_base_type type)