diff --git a/rpcs3/Emu/RSX/GL/GLCompute.h b/rpcs3/Emu/RSX/GL/GLCompute.h
index 0d6021365e..23752997af 100644
--- a/rpcs3/Emu/RSX/GL/GLCompute.h
+++ b/rpcs3/Emu/RSX/GL/GLCompute.h
@@ -300,7 +300,7 @@ namespace gl
 			m_src = fmt::replace_all(m_src, syntax_replace);
 
-			param_buffer.create(gl::buffer::target::uniform, 32, nullptr, gl::buffer::memory_type::local, GL_DYNAMIC_COPY);
+			param_buffer.create(gl::buffer::target::uniform, 32, nullptr, gl::buffer::memory_type::local, gl::buffer::usage::dynamic_update);
 		}
 
 		~cs_deswizzle_3d()
diff --git a/rpcs3/Emu/RSX/GL/GLDMA.cpp b/rpcs3/Emu/RSX/GL/GLDMA.cpp
index a1373e4a83..af089f3c7d 100644
--- a/rpcs3/Emu/RSX/GL/GLDMA.cpp
+++ b/rpcs3/Emu/RSX/GL/GLDMA.cpp
@@ -19,8 +19,11 @@ namespace gl
 		void* userptr = vm::get_super_ptr(base_address);
 
 		m_data = std::make_unique<gl::buffer>();
-		m_data->create(buffer::target::userptr, block_size, userptr);
+		m_data->create(buffer::target::array, block_size, userptr, buffer::memory_type::userptr, 0);
 		m_base_address = base_address;
+
+		// Some drivers may reject userptr input for whatever reason. Check that the state is still valid.
+		gl::check_state();
 	}
 
 	void* dma_block::map(const utils::address_range& range) const
@@ -69,8 +72,8 @@ namespace gl
 	utils::address_range to_dma_block_range(u32 start, u32 length)
 	{
-		const auto start_block_address = start & ~s_dma_block_size;
-		const auto end_block_address = (start + length - 1) & ~s_dma_block_size;
+		const auto start_block_address = start & -s_dma_block_size;
+		const auto end_block_address = (start + length + s_dma_block_size - 1) & -s_dma_block_size;
 		return utils::address_range::start_end(start_block_address, end_block_address);
 	}
@@ -81,7 +84,7 @@ namespace gl
 		if (!block)
 		{
 			block = std::make_unique<dma_block>();
-			block->allocate(block_range.start, length);
+			block->allocate(block_range.start, block_range.length());
 			return *block;
 		}
@@ -96,6 +99,7 @@ namespace gl
 		const auto search_end = (block_range.end + 1);
 
 		// 1. Resize to new length
+		ensure((new_length & -s_dma_block_size) == new_length);
 		auto new_owner = std::make_unique<dma_block>();
 		new_owner->allocate(owner->base_addr(), new_length);
diff --git a/rpcs3/Emu/RSX/GL/GLDMA.h b/rpcs3/Emu/RSX/GL/GLDMA.h
index 9a1d1289fa..1e4b31bae0 100644
--- a/rpcs3/Emu/RSX/GL/GLDMA.h
+++ b/rpcs3/Emu/RSX/GL/GLDMA.h
@@ -24,7 +24,7 @@ namespace gl
 		void* map(const utils::address_range& range) const;
 
 		void set_parent(const dma_block* other);
-		const dma_block* head() const { return m_parent; }
+		const dma_block* head() const { return m_parent ? m_parent : this; }
 		bool can_map(const utils::address_range& range) const;
 
 		u32 base_addr() const { return m_base_address; }
diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
index 1d083fce7c..ac3179fcba 100644
--- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
@@ -181,18 +181,18 @@ void GLGSRender::on_init_thread()
 		backend_config.supports_normalized_barycentrics = false;
 	}
 
-	if (gl_caps.AMD_pinned_memory)
+	if (gl_caps.AMD_pinned_memory && g_cfg.video.host_label_synchronization)
 	{
 		backend_config.supports_host_gpu_labels = true;
 
-		if (g_cfg.video.host_label_synchronization)
-		{
-			m_host_gpu_context_data = std::make_unique<gl::buffer>();
-			m_host_gpu_context_data->create(gl::buffer::target::array, 4096);
+		m_host_gpu_context_data = std::make_unique<gl::buffer>();
+		m_host_gpu_context_data->create(gl::buffer::target::array, 4096, nullptr, gl::buffer::memory_type::host_visible,
+			gl::buffer::usage::host_read | gl::buffer::usage::host_write | gl::buffer::usage::persistent_map);
 
-			auto host_context_ptr = reinterpret_cast<rsx::host_gpu_context_t*>(m_host_gpu_context_data->map(0, 4096, gl::buffer::access::read));
-			m_host_dma_ctrl = std::make_unique<rsx::RSXDMAWriter>(host_context_ptr);
-		}
+		auto host_context_ptr = reinterpret_cast<rsx::host_gpu_context_t*>(m_host_gpu_context_data->map(0, 4096, gl::buffer::access::persistent_rw));
+		m_host_dma_ctrl = std::make_unique<rsx::RSXDMAWriter>(host_context_ptr);
+
+		m_enqueued_host_write_buffer = std::make_unique<gl::scratch_ring_buffer>();
+		m_enqueued_host_write_buffer->create(gl::buffer::target::array, 64 * 0x100000, gl::buffer::usage::dynamic_update);
 	}
 
 	// Use industry standard resource alignment values as defaults
@@ -425,6 +425,7 @@ void GLGSRender::on_exit()
 
 	m_host_dma_ctrl.reset();
 	m_host_gpu_context_data.reset();
+	m_enqueued_host_write_buffer.reset();
 
 	for (auto &fbo : m_framebuffer_cache)
 	{
@@ -1222,6 +1223,66 @@ void GLGSRender::notify_tile_unbound(u32 tile)
 	}
 }
 
+bool GLGSRender::release_GCM_label(u32 address, u32 args)
+{
+	if (!backend_config.supports_host_gpu_labels)
+	{
+		return false;
+	}
+
+	auto host_ctx = ensure(m_host_dma_ctrl->host_ctx());
+
+	if (host_ctx->texture_loads_completed())
+	{
+		// We're about to poll while waiting for GPU state; ensure the context is still valid.
+		gl::check_state();
+
+		// All texture loads already seen by the host GPU
+		// Wait for all previously submitted labels to be flushed
+		m_host_dma_ctrl->drain_label_queue();
+		return false;
+	}
+
+	const auto mapping = gl::map_dma(address, 4);
+	const auto write_data = std::bit_cast<u32, be_t<u32>>(args);
+	const auto release_event_id = host_ctx->on_label_acquire();
+
+	// We don't have async texture loads yet, so just release both the label and the commands-complete event
+	u64 write_buf[2] = { write_data, release_event_id };
+	const auto host_read_offset = m_enqueued_host_write_buffer->alloc(16, 16);
+	m_enqueued_host_write_buffer->get().sub_data(host_read_offset, 16, write_buf);
+
+	// Now write to DMA and then to the host context
+	m_enqueued_host_write_buffer->get().copy_to(mapping.second, host_read_offset, mapping.first, 4);
+	m_enqueued_host_write_buffer->get().copy_to(m_host_gpu_context_data.get(), host_read_offset + 8, ::offset32(&rsx::host_gpu_context_t::commands_complete_event), 8);
+	m_enqueued_host_write_buffer->push_barrier(host_read_offset, 16);
+
+	host_ctx->on_label_release();
+	return true;
+}
+
+void GLGSRender::enqueue_host_context_write(u32 offset, u32 size, const void* data)
+{
+	ensure(size <= 8);
+	const u32 host_read_offset = m_enqueued_host_write_buffer->alloc(8, 16);
+	m_enqueued_host_write_buffer->get().sub_data(host_read_offset, size, data);
+	m_enqueued_host_write_buffer->get().copy_to(m_host_gpu_context_data.get(), host_read_offset, offset, size);
+	m_enqueued_host_write_buffer->push_barrier(host_read_offset, 16);
+}
+
+void GLGSRender::on_guest_texture_read()
+{
+	if (!backend_config.supports_host_gpu_labels)
+	{
+		return;
+	}
+
+	// Tag the read as being in progress
+	u64 event_id = m_host_dma_ctrl->host_ctx()->inc_counter();
+	m_host_dma_ctrl->host_ctx()->texture_load_request_event = event_id;
+	enqueue_host_context_write(::offset32(&rsx::host_gpu_context_t::texture_load_complete_event), 8, &event_id);
+}
+
 void GLGSRender::begin_occlusion_query(rsx::reports::occlusion_query_info* query)
 {
 	query->result = 0;
diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.h b/rpcs3/Emu/RSX/GL/GLGSRender.h
index 8ea87f8e5d..866fe288e6 100644
--- a/rpcs3/Emu/RSX/GL/GLGSRender.h
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.h
@@ -152,6 +152,7 @@ class GLGSRender : public GSRender, public ::rsx::reports::ZCULL_control
 
 	// Host context for GPU-driven work
 	std::unique_ptr<gl::buffer> m_host_gpu_context_data;
+	std::unique_ptr<gl::scratch_ring_buffer> m_enqueued_host_write_buffer;
 
 public:
 	u64 get_cycles() final;
@@ -196,6 +197,11 @@ public:
 	void get_occlusion_query_result(rsx::reports::occlusion_query_info* query) override;
 	void discard_occlusion_query(rsx::reports::occlusion_query_info* query) override;
 
+	// DMA
+	bool release_GCM_label(u32 address, u32 data) override;
+	void enqueue_host_context_write(u32 offset, u32 size, const void* data);
+	void on_guest_texture_read();
+
 	// GRAPH backend
 	void patch_transform_constants(rsx::context* ctx, u32 index, u32 count) override;
diff --git a/rpcs3/Emu/RSX/GL/GLTexture.cpp b/rpcs3/Emu/RSX/GL/GLTexture.cpp
index 9a11e08c18..3847ca0e3a 100644
--- a/rpcs3/Emu/RSX/GL/GLTexture.cpp
+++ b/rpcs3/Emu/RSX/GL/GLTexture.cpp
@@ -3,6 +3,7 @@
 #include "GLCompute.h"
 #include "GLRenderTargets.h"
 #include "GLOverlays.h"
+#include "GLGSRender.h"
 
 #include "glutils/blitter.h"
 #include "glutils/ring_buffer.h"
@@ -285,7 +286,7 @@ namespace gl
 		if (!(*dst) || max_mem > static_cast<u64>(dst->size()))
 		{
 			if (*dst) dst->remove();
-			dst->create(buffer::target::ssbo, max_mem, nullptr, buffer::memory_type::local, GL_STATIC_COPY);
+			dst->create(buffer::target::ssbo, max_mem, nullptr, buffer::memory_type::local, 0);
 		}
 
 		if (auto as_vi = dynamic_cast<const gl::viewable_image*>(src);
@@ -400,7 +401,7 @@ namespace gl
 			return;
 		}
 
-		scratch_mem.create(buffer::target::pixel_pack, max_mem, nullptr, buffer::memory_type::local, GL_STATIC_COPY);
+		scratch_mem.create(buffer::target::pixel_pack, max_mem, nullptr, buffer::memory_type::local, 0);
 
 		glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
 		src->copy_to(&scratch_mem, in_offset, 0, mem_info->image_size_in_bytes);
@@ -835,6 +836,10 @@ namespace gl
 		const GLenum gl_format = std::get<0>(format_type);
 		const GLenum gl_type = std::get<1>(format_type);
 		fill_texture(cmd, dst, gcm_format, subresources_layout, is_swizzled, gl_format, gl_type, data_upload_buf);
+
+		// Notify the renderer of the upload
+		auto renderer = static_cast<GLGSRender*>(rsx::get_current_renderer());
+		renderer->on_guest_texture_read();
 	}
 
 	u32 get_format_texel_width(GLenum format)
diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.h b/rpcs3/Emu/RSX/GL/GLTextureCache.h
index 36146d813d..fbb9b27a04 100644
--- a/rpcs3/Emu/RSX/GL/GLTextureCache.h
+++ b/rpcs3/Emu/RSX/GL/GLTextureCache.h
@@ -59,7 +59,7 @@ namespace gl
 				pbo.remove();
 			}
 
-			pbo.create(buffer::target::pixel_pack, buffer_size, nullptr, buffer::memory_type::host_visible, GL_STREAM_READ);
+			pbo.create(buffer::target::pixel_pack, buffer_size, nullptr, buffer::memory_type::host_visible, buffer::usage::host_read);
 
 			glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE);
 		}
diff --git a/rpcs3/Emu/RSX/GL/glutils/buffer_object.cpp b/rpcs3/Emu/RSX/GL/glutils/buffer_object.cpp
index 465ace0eee..284014f613 100644
--- a/rpcs3/Emu/RSX/GL/glutils/buffer_object.cpp
+++ b/rpcs3/Emu/RSX/GL/glutils/buffer_object.cpp
@@ -3,38 +3,35 @@
 namespace gl
 {
-	void buffer::allocate(GLsizeiptr size, const void* data_, memory_type type, GLenum usage)
+	void buffer::allocate(GLsizeiptr size, const void* data_, memory_type type, GLuint usage_flags)
 	{
+		m_memory_type = type;
+
 		if (const auto& caps = get_driver_caps();
-			m_target != target::userptr && caps.ARB_buffer_storage_supported)
+			type != memory_type::userptr && caps.ARB_buffer_storage_supported)
 		{
 			GLenum flags = 0;
-			if (type == memory_type::host_visible)
+			if (usage_flags & usage::host_write)
 			{
-				switch (usage)
-				{
-				case GL_STREAM_DRAW:
-				case GL_STATIC_DRAW:
-				case GL_DYNAMIC_DRAW:
-					flags |= GL_MAP_WRITE_BIT;
-					break;
-				case GL_STREAM_READ:
-				case GL_STATIC_READ:
-				case GL_DYNAMIC_READ:
-					flags |= GL_MAP_READ_BIT;
-					break;
-				default:
-					fmt::throw_exception("Unsupported buffer usage 0x%x", usage);
-				}
+				flags |= GL_MAP_WRITE_BIT;
 			}
-			else
+			if (usage_flags & usage::host_read)
 			{
-				// Local memory hints
-				if (usage == GL_DYNAMIC_COPY)
-				{
-					flags |= GL_DYNAMIC_STORAGE_BIT;
-				}
+				flags |= GL_MAP_READ_BIT;
 			}
+			if (usage_flags & usage::persistent_map)
+			{
+				flags |= GL_MAP_PERSISTENT_BIT;
+			}
+			if (usage_flags & usage::dynamic_update)
+			{
+				flags |= GL_DYNAMIC_STORAGE_BIT;
+			}
+
+			ensure((flags & (GL_MAP_PERSISTENT_BIT | GL_DYNAMIC_STORAGE_BIT)) != (GL_MAP_PERSISTENT_BIT | GL_DYNAMIC_STORAGE_BIT),
+				"Mutually exclusive usage flags set!");
+
+			ensure(type == memory_type::local || flags != 0, "Host-visible memory must have usage flags set!");
 
 			if ((flags & GL_MAP_READ_BIT) && !caps.vendor_AMD)
 			{
@@ -51,10 +48,8 @@ namespace gl
 		}
 		else
 		{
-			data(size, data_, usage);
+			data(size, data_, GL_STREAM_COPY);
 		}
-
-		m_memory_type = type;
 	}
 
 	buffer::~buffer()
@@ -89,18 +84,18 @@ namespace gl
 		save_binding_state save(current_target(), *this);
 	}
 
-	void buffer::create(GLsizeiptr size, const void* data_, memory_type type, GLenum usage)
+	void buffer::create(GLsizeiptr size, const void* data_, memory_type type, GLuint usage_bits)
 	{
 		create();
-		allocate(size, data_, type, usage);
+		allocate(size, data_, type, usage_bits);
 	}
 
-	void buffer::create(target target_, GLsizeiptr size, const void* data_, memory_type type, GLenum usage)
+	void buffer::create(target target_, GLsizeiptr size, const void* data_, memory_type type, GLuint usage_bits)
 	{
 		m_target = target_;
 		create();
-		allocate(size, data_, type, usage);
+		allocate(size, data_, type, usage_bits);
 	}
 
 	void buffer::remove()
@@ -117,11 +112,19 @@ namespace gl
 	{
 		ensure(m_memory_type != memory_type::local);
 
-		DSA_CALL2(NamedBufferData, m_id, size, data_, usage);
 		m_size = size;
+
+		if (m_memory_type == memory_type::userptr)
+		{
+			glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, m_id);
+			glBufferData(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, size, data_, usage);
+			return;
+		}
+
+		DSA_CALL2(NamedBufferData, m_id, size, data_, usage);
 	}
 
-	void buffer::sub_data(GLsizeiptr offset, GLsizeiptr length, GLvoid* data)
+	void buffer::sub_data(GLsizeiptr offset, GLsizeiptr length, const GLvoid* data)
 	{
 		ensure(m_memory_type == memory_type::local);
 		DSA_CALL2(NamedBufferSubData, m_id, offset, length, data);
diff --git a/rpcs3/Emu/RSX/GL/glutils/buffer_object.h b/rpcs3/Emu/RSX/GL/glutils/buffer_object.h
index 450ce37d9e..e559318be9 100644
--- a/rpcs3/Emu/RSX/GL/glutils/buffer_object.h
+++ b/rpcs3/Emu/RSX/GL/glutils/buffer_object.h
@@ -15,28 +15,37 @@ namespace gl
 			element_array = GL_ELEMENT_ARRAY_BUFFER,
 			uniform = GL_UNIFORM_BUFFER,
 			texture = GL_TEXTURE_BUFFER,
-			ssbo = GL_SHADER_STORAGE_BUFFER,
-			userptr = GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD
+			ssbo = GL_SHADER_STORAGE_BUFFER
 		};
 
 		enum class access
 		{
 			read = GL_MAP_READ_BIT,
 			write = GL_MAP_WRITE_BIT,
-			read_write = GL_MAP_READ_BIT | GL_MAP_WRITE_BIT
+			rw = GL_MAP_READ_BIT | GL_MAP_WRITE_BIT,
+			persistent_rw = GL_MAP_READ_BIT | GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT
 		};
 
 		enum class memory_type
 		{
 			undefined = 0,
 			local = 1,
-			host_visible = 2
+			host_visible = 2,
+			userptr = 4
 		};
 
+		enum usage
+		{
+			host_write = (1 << 0),
+			host_read = (1 << 1),
+			persistent_map = (1 << 2),
+			dynamic_update = (1 << 3),
+		};
+
 		class save_binding_state
 		{
-			GLint m_last_binding;
-			GLenum m_target;
+			GLint m_last_binding = GL_ZERO;
+			GLenum m_target = GL_NONE;
 
 		public:
 			save_binding_state(target target_, const buffer& new_state)
 				: save_binding_state(target_)
@@ -65,6 +74,11 @@ namespace gl
 
 			~save_binding_state()
 			{
+				if (!m_target)
+				{
+					return;
+				}
+
 				glBindBuffer(m_target, m_last_binding);
 			}
 		};
@@ -78,7 +92,7 @@ namespace gl
 		// Metadata
 		mutable std::pair<u32, u32> m_bound_range{};
 
-		void allocate(GLsizeiptr size, const void* data_, memory_type type, GLenum usage);
+		void allocate(GLsizeiptr size, const void* data_, memory_type type, GLuint usage_bits);
 
 	public:
 		buffer() = default;
@@ -89,8 +103,8 @@ namespace gl
 		void recreate(GLsizeiptr size, const void* data = nullptr);
 
 		void create();
-		void create(GLsizeiptr size, const void* data_ = nullptr, memory_type type = memory_type::local, GLenum usage = GL_STREAM_DRAW);
-		void create(target target_, GLsizeiptr size, const void* data_ = nullptr, memory_type type = memory_type::local, GLenum usage = GL_STREAM_DRAW);
+		void create(GLsizeiptr size, const void* data_ = nullptr, memory_type type = memory_type::local, GLuint usage_bits = 0);
+		void create(target target_, GLsizeiptr size, const void* data_ = nullptr, memory_type type = memory_type::local, GLuint usage_bits = 0);
 
 		void remove();
@@ -98,7 +112,7 @@ namespace gl
 		void bind() const { bind(current_target()); }
 
 		void data(GLsizeiptr size, const void* data_ = nullptr, GLenum usage = GL_STREAM_DRAW);
-		void sub_data(GLsizeiptr offset, GLsizeiptr length, GLvoid* data);
+		void sub_data(GLsizeiptr offset, GLsizeiptr length, const GLvoid* data);
 
 		GLubyte* map(GLsizeiptr offset, GLsizeiptr length, access access_);
 		void unmap();
diff --git a/rpcs3/Emu/RSX/GL/glutils/common.h b/rpcs3/Emu/RSX/GL/glutils/common.h
index 63e03cbab2..177de0d26f 100644
--- a/rpcs3/Emu/RSX/GL/glutils/common.h
+++ b/rpcs3/Emu/RSX/GL/glutils/common.h
@@ -79,4 +79,12 @@ namespace gl
 	{
 		glInsertEventMarkerEXT(static_cast<GLsizei>(strlen(label)), label);
 	}
+
+	// Checks if the GL state is still valid
+	static inline void check_state()
+	{
+		// GL_OUT_OF_MEMORY invalidates the OpenGL context and is effectively the GL version of DEVICE_LOST.
+		// The spec allows it to be abused by ISVs as a way to signal a broken GL context.
+		ensure(glGetError() != GL_OUT_OF_MEMORY);
+	}
 }
diff --git a/rpcs3/Emu/RSX/GL/glutils/ring_buffer.cpp b/rpcs3/Emu/RSX/GL/glutils/ring_buffer.cpp
index 553ead61f6..da77b50cbe 100644
--- a/rpcs3/Emu/RSX/GL/glutils/ring_buffer.cpp
+++ b/rpcs3/Emu/RSX/GL/glutils/ring_buffer.cpp
@@ -242,14 +242,14 @@ namespace gl
 		}
 	}
 
-	void scratch_ring_buffer::create(buffer::target target_, u64 size)
+	void scratch_ring_buffer::create(buffer::target target_, u64 size, u32 usage_flags)
 	{
 		if (m_storage)
 		{
 			remove();
 		}
 
-		m_storage.create(target_, size, nullptr, gl::buffer::memory_type::local, GL_STATIC_COPY);
+		m_storage.create(target_, size, nullptr, gl::buffer::memory_type::local, usage_flags);
 	}
 
 	void scratch_ring_buffer::remove()
diff --git a/rpcs3/Emu/RSX/GL/glutils/ring_buffer.h b/rpcs3/Emu/RSX/GL/glutils/ring_buffer.h
index b66f073de3..97f802ddf2 100644
--- a/rpcs3/Emu/RSX/GL/glutils/ring_buffer.h
+++ b/rpcs3/Emu/RSX/GL/glutils/ring_buffer.h
@@ -103,7 +103,7 @@ namespace gl
 		scratch_ring_buffer(const scratch_ring_buffer&) = delete;
 		~scratch_ring_buffer();
 
-		void create(buffer::target _target, u64 size);
+		void create(buffer::target _target, u64 size, u32 usage_flags = 0);
 		void remove();
 
 		u32 alloc(u32 size, u32 alignment);
diff --git a/rpcs3/Emu/RSX/GL/upscalers/fsr1/fsr_pass.cpp b/rpcs3/Emu/RSX/GL/upscalers/fsr1/fsr_pass.cpp
index 7c9a8deff7..adf25b3f12 100644
--- a/rpcs3/Emu/RSX/GL/upscalers/fsr1/fsr_pass.cpp
+++ b/rpcs3/Emu/RSX/GL/upscalers/fsr1/fsr_pass.cpp
@@ -80,7 +80,7 @@ namespace gl
 		if (!m_ubo)
 		{
 			ensure(compiled);
-			m_ubo.create(gl::buffer::target::uniform, push_buffer_size, nullptr, gl::buffer::memory_type::local, GL_DYNAMIC_COPY);
+			m_ubo.create(gl::buffer::target::uniform, push_buffer_size, nullptr, gl::buffer::memory_type::local, gl::buffer::usage::dynamic_update);
 
 			// Statically bind the image sources
 			m_program.uniforms["InputTexture"] = GL_TEMP_IMAGE_SLOT(0);
diff --git a/rpcs3/Emu/RSX/Host/RSXDMAWriter.h b/rpcs3/Emu/RSX/Host/RSXDMAWriter.h
index 18d232bfda..aeae34080e 100644
--- a/rpcs3/Emu/RSX/Host/RSXDMAWriter.h
+++ b/rpcs3/Emu/RSX/Host/RSXDMAWriter.h
@@ -27,7 +27,7 @@ namespace rsx
 		inline bool in_flight_commands_completed() const volatile
 		{
-			return last_label_release2_event == commands_complete_event;
+			return last_label_release2_event <= commands_complete_event;
 		}
 
 		inline bool texture_loads_completed() const volatile
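
Below is a minimal, self-contained sketch of how the new gl::buffer::usage bits are intended to map onto glBufferStorage() flags, mirroring the translation added in buffer::allocate() above. The demo namespace, the to_storage_flags() helper, and the inlined GL constants are illustrative stand-ins only, not part of this patch.

// Standalone sketch of the usage -> storage-flag translation (assumed semantics).
// GL constants are inlined so this compiles without GL headers.
#include <cassert>
#include <cstdint>
#include <cstdio>

namespace demo
{
    // Mirrors gl::buffer::usage from buffer_object.h
    enum usage : uint32_t
    {
        host_write     = (1 << 0), // CPU writes through a mapped pointer
        host_read      = (1 << 1), // CPU reads through a mapped pointer
        persistent_map = (1 << 2), // mapping stays alive across submissions
        dynamic_update = (1 << 3), // server-side updates via glBufferSubData
    };

    // Values copied from the GL headers
    constexpr uint32_t MAP_READ_BIT        = 0x0001; // GL_MAP_READ_BIT
    constexpr uint32_t MAP_WRITE_BIT       = 0x0002; // GL_MAP_WRITE_BIT
    constexpr uint32_t MAP_PERSISTENT_BIT  = 0x0040; // GL_MAP_PERSISTENT_BIT
    constexpr uint32_t DYNAMIC_STORAGE_BIT = 0x0100; // GL_DYNAMIC_STORAGE_BIT

    uint32_t to_storage_flags(uint32_t usage_flags)
    {
        uint32_t flags = 0;
        if (usage_flags & host_write)     flags |= MAP_WRITE_BIT;
        if (usage_flags & host_read)      flags |= MAP_READ_BIT;
        if (usage_flags & persistent_map) flags |= MAP_PERSISTENT_BIT;
        if (usage_flags & dynamic_update) flags |= DYNAMIC_STORAGE_BIT;

        // Same mutual-exclusion rule as the ensure() added in buffer::allocate()
        assert((flags & (MAP_PERSISTENT_BIT | DYNAMIC_STORAGE_BIT)) !=
               (MAP_PERSISTENT_BIT | DYNAMIC_STORAGE_BIT));
        return flags;
    }
}

int main()
{
    // The host label context buffer: persistently mapped, host read/write
    const uint32_t label_flags = demo::to_storage_flags(
        demo::host_read | demo::host_write | demo::persistent_map);
    std::printf("label buffer storage flags: 0x%x\n", static_cast<unsigned>(label_flags));
    return 0;
}

Note that core GL does not forbid combining GL_MAP_PERSISTENT_BIT with GL_DYNAMIC_STORAGE_BIT, so the exclusivity enforced by the ensure() reads as a policy choice of this patch rather than an API requirement.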