diff --git a/rpcs3/Emu/RSX/Common/ring_buffer_helper.h b/rpcs3/Emu/RSX/Common/ring_buffer_helper.h index cca89f5c86..446a3b749a 100644 --- a/rpcs3/Emu/RSX/Common/ring_buffer_helper.h +++ b/rpcs3/Emu/RSX/Common/ring_buffer_helper.h @@ -109,7 +109,7 @@ public: bool is_critical() { const size_t guard_length = std::max(m_min_guard_size, m_largest_allocated_pool); - return (m_current_allocated_size + guard_length) > m_size; + return (m_current_allocated_size + guard_length) >= m_size; } void reset_allocation_stats() diff --git a/rpcs3/Emu/RSX/GL/GLHelpers.cpp b/rpcs3/Emu/RSX/GL/GLHelpers.cpp index b01e042ec1..588aee1492 100644 --- a/rpcs3/Emu/RSX/GL/GLHelpers.cpp +++ b/rpcs3/Emu/RSX/GL/GLHelpers.cpp @@ -35,10 +35,8 @@ namespace gl switch (type) { case GL_DEBUG_TYPE_ERROR: - { LOG_ERROR(RSX, "%s", message); return; - } default: LOG_WARNING(RSX, "%s", message); return; diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp index 3f14b5dca9..08e8ec8745 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp +++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp @@ -603,7 +603,7 @@ VKGSRender::VKGSRender() : GSRender() m_uniform_buffer_ring_info.heap.reset(new vk::buffer(*m_device, VK_UBO_RING_BUFFER_SIZE_M * 0x100000, m_memory_type_mapping.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, 0)); m_index_buffer_ring_info.init(VK_INDEX_RING_BUFFER_SIZE_M * 0x100000, "index buffer"); m_index_buffer_ring_info.heap.reset(new vk::buffer(*m_device, VK_INDEX_RING_BUFFER_SIZE_M * 0x100000, m_memory_type_mapping.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_INDEX_BUFFER_BIT, 0)); - m_texture_upload_buffer_ring_info.init(VK_TEXTURE_UPLOAD_RING_BUFFER_SIZE_M * 0x100000, "texture upload buffer", 0x400000); + m_texture_upload_buffer_ring_info.init(VK_TEXTURE_UPLOAD_RING_BUFFER_SIZE_M * 0x100000, "texture upload buffer", 32 * 0x100000); m_texture_upload_buffer_ring_info.heap.reset(new vk::buffer(*m_device, VK_TEXTURE_UPLOAD_RING_BUFFER_SIZE_M * 0x100000, m_memory_type_mapping.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, 0)); for (auto &ctx : frame_context_storage) @@ -627,6 +627,9 @@ VKGSRender::VKGSRender() : GSRender() m_depth_converter.reset(new vk::depth_convert_pass()); m_depth_converter->create(*m_device); + m_depth_scaler.reset(new vk::depth_scaling_pass()); + m_depth_scaler->create(*m_device); + m_prog_buffer.reset(new VKProgramBuffer(m_render_passes.data())); if (g_cfg.video.disable_vertex_cache) @@ -750,6 +753,10 @@ VKGSRender::~VKGSRender() m_depth_converter->destroy(); m_depth_converter.reset(); + //Depth surface blitter + m_depth_scaler->destroy(); + m_depth_scaler.reset(); + //Pipeline descriptors vkDestroyPipelineLayout(*m_device, pipeline_layout, nullptr); vkDestroyDescriptorSetLayout(*m_device, descriptor_layouts, nullptr); @@ -884,29 +891,8 @@ void VKGSRender::notify_tile_unbound(u32 tile) } } -void VKGSRender::begin() +void VKGSRender::check_heap_status() { - rsx::thread::begin(); - - if (skip_frame || renderer_unavailable || - (conditional_render_enabled && conditional_render_test_failed)) - return; - - init_buffers(rsx::framebuffer_creation_context::context_draw); - - if (!framebuffer_status_valid) - return; - - //Ease resource pressure if the number of draw calls becomes too high or we are running low on memory resources - if (m_current_frame->used_descriptors >= DESCRIPTOR_MAX_DRAW_CALLS) - { - //No need to stall if we have more than one frame queue anyway - flush_command_queue(); - - CHECK_RESULT(vkResetDescriptorPool(*m_device, m_current_frame->descriptor_pool, 0)); - m_current_frame->used_descriptors = 0; - } - if (m_attrib_ring_info.is_critical() || m_texture_upload_buffer_ring_info.is_critical() || m_uniform_buffer_ring_info.is_critical() || @@ -953,6 +939,32 @@ void VKGSRender::begin() std::chrono::time_point submit_end = steady_clock::now(); m_flip_time += std::chrono::duration_cast(submit_end - submit_start).count(); } +} + +void VKGSRender::begin() +{ + rsx::thread::begin(); + + if (skip_frame || renderer_unavailable || + (conditional_render_enabled && conditional_render_test_failed)) + return; + + init_buffers(rsx::framebuffer_creation_context::context_draw); + + if (!framebuffer_status_valid) + return; + + //Ease resource pressure if the number of draw calls becomes too high or we are running low on memory resources + if (m_current_frame->used_descriptors >= DESCRIPTOR_MAX_DRAW_CALLS) + { + //No need to stall if we have more than one frame queue anyway + flush_command_queue(); + + CHECK_RESULT(vkResetDescriptorPool(*m_device, m_current_frame->descriptor_pool, 0)); + m_current_frame->used_descriptors = 0; + } + + check_heap_status(); VkDescriptorSetAllocateInfo alloc_info = {}; alloc_info.descriptorPool = m_current_frame->descriptor_pool; @@ -1994,6 +2006,7 @@ void VKGSRender::process_swap_request(frame_context_t *ctx, bool free_resources) } m_depth_converter->free_resources(); + m_depth_scaler->free_resources(); m_ui_renderer->free_resources(); ctx->buffer_views_to_clean.clear(); @@ -2736,7 +2749,7 @@ void VKGSRender::prepare_rtts(rsx::framebuffer_creation_context context) const u32 range = pitch * m_depth_surface_info.height * aa_factor; m_texture_cache.lock_memory_region(std::get<1>(m_rtts.m_bound_depth_stencil), m_depth_surface_info.address, range, - m_depth_surface_info.width, m_depth_surface_info.height, m_depth_surface_info.pitch, gcm_format, true); + m_depth_surface_info.width, m_depth_surface_info.height, m_depth_surface_info.pitch, gcm_format, false); } } @@ -3165,11 +3178,39 @@ bool VKGSRender::scaled_image_from_memory(rsx::blit_src_info& src, rsx::blit_dst if (renderer_unavailable) return false; + //Verify enough memory exists before attempting to handle data transfer + check_heap_status(); + + //Stop all parallel operations until this is finished + std::lock_guard lock(m_secondary_cb_guard); + auto result = m_texture_cache.blit(src, dst, interpolate, m_rtts, *m_current_command_buffer); m_current_command_buffer->begin(); + if (auto deferred_op_dst = std::get<1>(result)) + { + //Requires manual scaling; depth/stencil surface + auto deferred_op_src = std::get<2>(result); + auto src_view = std::get<3>(result); + + auto rp = vk::get_render_pass_location(VK_FORMAT_UNDEFINED, deferred_op_dst->info.format, 0); + auto render_pass = m_render_passes[rp]; + + auto old_src_layout = deferred_op_src->current_layout; + auto old_dst_layout = deferred_op_dst->current_layout; + + vk::change_image_layout(*m_current_command_buffer, deferred_op_src, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); + vk::change_image_layout(*m_current_command_buffer, deferred_op_dst, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL); + + m_depth_scaler->run(*m_current_command_buffer, deferred_op_dst->width(), deferred_op_dst->height(), deferred_op_dst, + src_view, render_pass, m_framebuffers_to_clean); + + vk::change_image_layout(*m_current_command_buffer, deferred_op_src, old_src_layout); + vk::change_image_layout(*m_current_command_buffer, deferred_op_dst, old_dst_layout); + } + m_samplers_dirty.store(true); - return result; + return std::get<0>(result); } void VKGSRender::clear_zcull_stats(u32 type) diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h index 4ecb9daa2c..063648c228 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.h +++ b/rpcs3/Emu/RSX/VK/VKGSRender.h @@ -252,6 +252,7 @@ private: std::unique_ptr m_text_writer; std::unique_ptr m_depth_converter; + std::unique_ptr m_depth_scaler; std::unique_ptr m_ui_renderer; std::mutex m_sampler_mutex; @@ -376,6 +377,8 @@ private: void update_draw_state(); + void check_heap_status(); + /// returns primitive topology, index_count, allocated_verts, vertex_base_index, (offset in index buffer, index type) std::tuple > > upload_vertex_data(); public: diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.cpp b/rpcs3/Emu/RSX/VK/VKHelpers.cpp index 3cca60a1bb..6a14d37ce9 100644 --- a/rpcs3/Emu/RSX/VK/VKHelpers.cpp +++ b/rpcs3/Emu/RSX/VK/VKHelpers.cpp @@ -388,6 +388,26 @@ namespace vk image->current_layout = new_layout; } + void change_image_layout(VkCommandBuffer cmd, vk::image *image, VkImageLayout new_layout) + { + if (image->current_layout == new_layout) return; + + VkImageAspectFlags flags = VK_IMAGE_ASPECT_COLOR_BIT; + switch (image->info.format) + { + case VK_FORMAT_D16_UNORM: + flags = VK_IMAGE_ASPECT_DEPTH_BIT; + break; + case VK_FORMAT_D24_UNORM_S8_UINT: + case VK_FORMAT_D32_SFLOAT_S8_UINT: + flags = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + break; + } + + change_image_layout(cmd, image->value, image->current_layout, new_layout, { flags, 0, 1, 0, 1 }); + image->current_layout = new_layout; + } + void insert_texture_barrier(VkCommandBuffer cmd, VkImage image, VkImageLayout layout, VkImageSubresourceRange range) { VkImageMemoryBarrier barrier = {}; @@ -419,7 +439,9 @@ namespace vk { if (image->info.usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) { - insert_texture_barrier(cmd, image->value, image->current_layout, { VK_IMAGE_ASPECT_DEPTH_BIT, 0, 1, 0, 1 }); + VkImageAspectFlags aspect = VK_IMAGE_ASPECT_DEPTH_BIT; + if (image->info.format != VK_FORMAT_D16_UNORM) aspect |= VK_IMAGE_ASPECT_STENCIL_BIT; + insert_texture_barrier(cmd, image->value, image->current_layout, { aspect, 0, 1, 0, 1 }); } else { diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.h b/rpcs3/Emu/RSX/VK/VKHelpers.h index fd509133e4..f40488c305 100644 --- a/rpcs3/Emu/RSX/VK/VKHelpers.h +++ b/rpcs3/Emu/RSX/VK/VKHelpers.h @@ -82,6 +82,7 @@ namespace vk void change_image_layout(VkCommandBuffer cmd, VkImage image, VkImageLayout current_layout, VkImageLayout new_layout, VkImageSubresourceRange range); void change_image_layout(VkCommandBuffer cmd, vk::image *image, VkImageLayout new_layout, VkImageSubresourceRange range); + void change_image_layout(VkCommandBuffer cmd, vk::image *image, VkImageLayout new_layout); void copy_image(VkCommandBuffer cmd, VkImage &src, VkImage &dst, VkImageLayout srcLayout, VkImageLayout dstLayout, u32 width, u32 height, u32 mipmaps, VkImageAspectFlagBits aspect); void copy_scaled_image(VkCommandBuffer cmd, VkImage &src, VkImage &dst, VkImageLayout srcLayout, VkImageLayout dstLayout, u32 src_x_offset, u32 src_y_offset, u32 src_width, u32 src_height, u32 dst_x_offset, u32 dst_y_offset, u32 dst_width, u32 dst_height, u32 mipmaps, VkImageAspectFlagBits aspect, bool compatible_formats); diff --git a/rpcs3/Emu/RSX/VK/VKOverlays.h b/rpcs3/Emu/RSX/VK/VKOverlays.h index 6212a2f143..411c7ac9e6 100644 --- a/rpcs3/Emu/RSX/VK/VKOverlays.h +++ b/rpcs3/Emu/RSX/VK/VKOverlays.h @@ -716,4 +716,42 @@ namespace vk ui.update(); } }; + + struct depth_scaling_pass : public overlay_pass + { + depth_scaling_pass() + { + vs_src = + { + "#version 450\n" + "#extension GL_ARB_separate_shader_objects : enable\n" + "layout(location=0) out vec2 tc0;\n" + "\n" + "void main()\n" + "{\n" + " vec2 positions[] = {vec2(-1., -1.), vec2(1., -1.), vec2(-1., 1.), vec2(1., 1.)};\n" + " vec2 coords[] = {vec2(0., 0.), vec2(1., 0.), vec2(0., 1.), vec2(1., 1.)};\n" + " gl_Position = vec4(positions[gl_VertexIndex % 4], 0., 1.);\n" + " tc0 = coords[gl_VertexIndex % 4];\n" + "}\n" + }; + + fs_src = + { + "#version 420\n" + "#extension GL_ARB_separate_shader_objects : enable\n" + "layout(set=0, binding=0) uniform sampler2D fs0;\n" + "layout(location=0) in vec2 tc0;\n" + "\n" + "void main()\n" + "{\n" + " gl_FragDepth = texture(fs0, tc0).x;\n" + "}\n" + }; + + renderpass_config.write_color = false; + m_vertex_shader.id = 100006; + m_fragment_shader.id = 100007; + } + }; } diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.h b/rpcs3/Emu/RSX/VK/VKTextureCache.h index 8826906a61..6a58038d72 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.h +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.h @@ -183,15 +183,16 @@ namespace vk break; } + //TODO: Read back stencil values (is this really necessary?) VkBufferImageCopy copyRegion = {}; copyRegion.bufferOffset = 0; copyRegion.bufferRowLength = internal_width; copyRegion.bufferImageHeight = internal_height; - copyRegion.imageSubresource = {aspect_flag, 0, 0, 1}; + copyRegion.imageSubresource = {aspect_flag & ~(VK_IMAGE_ASPECT_STENCIL_BIT), 0, 0, 1}; copyRegion.imageOffset = {}; copyRegion.imageExtent = {internal_width, internal_height, 1}; - VkImageSubresourceRange subresource_range = { aspect_flag & ~(VK_IMAGE_ASPECT_STENCIL_BIT), 0, 1, 0, 1 }; + VkImageSubresourceRange subresource_range = { aspect_flag, 0, 1, 0, 1 }; VkImageLayout layout = vram_texture->current_layout; change_image_layout(cmd, vram_texture, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, subresource_range); @@ -235,11 +236,7 @@ namespace vk } else { - auto typed_dst = (T *)pixels_dst; - auto typed_src = (T *)pixels_src; - - for (u32 px = 0; px < block_size; ++px) - typed_dst[px] = typed_src[px]; + memcpy(pixels_dst, pixels_src, block_size * sizeof(T)); } } } @@ -273,38 +270,55 @@ namespace vk //We have to do our own byte swapping since the driver doesnt do it for us if (real_pitch == rsx_pitch) { - switch (bpp) + bool is_depth_format = true; + switch (vram_texture->info.format) { + case VK_FORMAT_D32_SFLOAT_S8_UINT: + rsx::convert_le_f32_to_be_d24(pixels_dst, pixels_src, cpu_address_range >> 2, 1); + break; + case VK_FORMAT_D24_UNORM_S8_UINT: + rsx::convert_le_d24x8_to_be_d24x8(pixels_dst, pixels_src, cpu_address_range >> 2, 1); + break; default: - LOG_ERROR(RSX, "Invalid bpp %d", bpp); - case 1: - do_memory_transfer(pixels_dst, pixels_src); - break; - case 2: - if (pack_unpack_swap_bytes) - do_memory_transfer(pixels_dst, pixels_src); - else - do_memory_transfer(pixels_dst, pixels_src); - break; - case 4: - if (pack_unpack_swap_bytes) - do_memory_transfer(pixels_dst, pixels_src); - else - do_memory_transfer(pixels_dst, pixels_src); - break; - case 8: - if (pack_unpack_swap_bytes) - do_memory_transfer(pixels_dst, pixels_src); - else - do_memory_transfer(pixels_dst, pixels_src); - break; - case 16: - if (pack_unpack_swap_bytes) - do_memory_transfer(pixels_dst, pixels_src); - else - do_memory_transfer(pixels_dst, pixels_src); + is_depth_format = false; break; } + + if (!is_depth_format) + { + switch (bpp) + { + default: + LOG_ERROR(RSX, "Invalid bpp %d", bpp); + case 1: + do_memory_transfer(pixels_dst, pixels_src); + break; + case 2: + if (pack_unpack_swap_bytes) + do_memory_transfer(pixels_dst, pixels_src); + else + do_memory_transfer(pixels_dst, pixels_src); + break; + case 4: + if (pack_unpack_swap_bytes) + do_memory_transfer(pixels_dst, pixels_src); + else + do_memory_transfer(pixels_dst, pixels_src); + break; + case 8: + if (pack_unpack_swap_bytes) + do_memory_transfer(pixels_dst, pixels_src); + else + do_memory_transfer(pixels_dst, pixels_src); + break; + case 16: + if (pack_unpack_swap_bytes) + do_memory_transfer(pixels_dst, pixels_src); + else + do_memory_transfer(pixels_dst, pixels_src); + break; + } + } } else { @@ -324,6 +338,16 @@ namespace vk } rsx::scale_image_nearest(pixels_dst, pixels_src, width, height, rsx_pitch, real_pitch, bpp, samples_u, samples_v, pack_unpack_swap_bytes); + + switch (vram_texture->info.format) + { + case VK_FORMAT_D32_SFLOAT_S8_UINT: + rsx::convert_le_f32_to_be_d24(pixels_dst, pixels_dst, cpu_address_range >> 2, 1); + break; + case VK_FORMAT_D24_UNORM_S8_UINT: + rsx::convert_le_d24x8_to_be_d24x8(pixels_dst, pixels_dst, cpu_address_range >> 2, 1); + break; + } } dma_buffer->unmap(); @@ -690,6 +714,7 @@ namespace vk VkImageAspectFlags aspect_flags; VkImageType image_type; VkImageViewType image_view_type; + VkImageUsageFlags usage_flags = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_SAMPLED_BIT; u8 layer = 0; switch (type) @@ -724,10 +749,12 @@ namespace vk { case CELL_GCM_TEXTURE_DEPTH24_D8: aspect_flags = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + usage_flags |= VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT; vk_format = m_formats_support.d24_unorm_s8? VK_FORMAT_D24_UNORM_S8_UINT : VK_FORMAT_D32_SFLOAT_S8_UINT; break; case CELL_GCM_TEXTURE_DEPTH16: aspect_flags = VK_IMAGE_ASPECT_DEPTH_BIT; + usage_flags |= VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT; vk_format = VK_FORMAT_D16_UNORM; break; default: @@ -740,8 +767,7 @@ namespace vk image_type, vk_format, width, height, depth, mipmaps, layer, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED, - VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, - is_cubemap ? VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT : 0); + VK_IMAGE_TILING_OPTIMAL, usage_flags, is_cubemap ? VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT : 0); mapping = apply_component_mapping_flags(gcm_format, flags, remap_vector); @@ -768,7 +794,7 @@ namespace vk { //TODO: Confirm byte swap patterns region.protect(utils::protection::no); - region.set_unpack_swap_bytes(true); + region.set_unpack_swap_bytes((aspect_flags & VK_IMAGE_ASPECT_COLOR_BIT) == VK_IMAGE_ASPECT_COLOR_BIT); no_access_range = region.get_min_max(no_access_range); } @@ -954,12 +980,16 @@ namespace vk return upload_texture(cmd, tex, m_rtts, cmd, m_memory_types, const_cast(m_submit_queue)); } - bool blit(rsx::blit_src_info& src, rsx::blit_dst_info& dst, bool interpolate, rsx::vk_render_targets& m_rtts, vk::command_buffer& cmd) + std::tuple blit(rsx::blit_src_info& src, rsx::blit_dst_info& dst, bool interpolate, rsx::vk_render_targets& m_rtts, vk::command_buffer& cmd) { struct blit_helper { vk::command_buffer* commands; blit_helper(vk::command_buffer *c) : commands(c) {} + + vk::image* deferred_op_src = nullptr; + vk::image* deferred_op_dst = nullptr; + void scale_image(vk::image* src, vk::image* dst, areai src_area, areai dst_area, bool /*interpolate*/, bool is_depth) { VkImageAspectFlagBits aspect = VK_IMAGE_ASPECT_COLOR_BIT; @@ -984,15 +1014,44 @@ namespace vk return; } - copy_scaled_image(*commands, src->value, dst->value, src->current_layout, dst->current_layout, src_area.x1, src_area.y1, src_area.x2 - src_area.x1, src_area.y2 - src_area.y1, - dst_area.x1, dst_area.y1, dst_area.x2 - dst_area.x1, dst_area.y2 - dst_area.y1, 1, aspect, src->info.format == dst->info.format); + const auto src_width = src_area.x2 - src_area.x1; + const auto src_height = src_area.y2 - src_area.y1; + const auto dst_width = dst_area.x2 - dst_area.x1; + const auto dst_height = dst_area.y2 - dst_area.y1; + + if (aspect & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) + { + if (src_width != dst_width || src_height != dst_height || src->info.format != dst->info.format) + { + //Scaled depth scaling + deferred_op_src = src; + deferred_op_dst = dst; + } + } + + if (!deferred_op_src) + { + copy_scaled_image(*commands, src->value, dst->value, src->current_layout, dst->current_layout, src_area.x1, src_area.y1, src_width, src_height, + dst_area.x1, dst_area.y1, dst_width, dst_height, 1, aspect, src->info.format == dst->info.format); + } change_image_layout(*commands, dst, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, {(VkImageAspectFlags)aspect, 0, dst->info.mipLevels, 0, dst->info.arrayLayers}); } } helper(&cmd); - return upload_scaled_image(src, dst, interpolate, cmd, m_rtts, helper, cmd, m_memory_types, const_cast(m_submit_queue)); + bool reply = upload_scaled_image(src, dst, interpolate, cmd, m_rtts, helper, cmd, m_memory_types, const_cast(m_submit_queue)); + + if (helper.deferred_op_src == nullptr) + return std::make_tuple(reply, nullptr, nullptr, nullptr); + + VkImageSubresourceRange view_range = { VK_IMAGE_ASPECT_DEPTH_BIT, 0, 1, 0, 1 }; + auto tmp_view = std::make_unique(*vk::get_current_renderer(), helper.deferred_op_src->value, VK_IMAGE_VIEW_TYPE_2D, + helper.deferred_op_src->info.format, helper.deferred_op_src->native_component_map, view_range); + + auto src_view = tmp_view.get(); + m_discardable_storage.push_back(tmp_view); + return std::make_tuple(reply, helper.deferred_op_dst, helper.deferred_op_src, src_view); } const u32 get_unreleased_textures_count() const override diff --git a/rpcs3/Emu/RSX/rsx_utils.cpp b/rpcs3/Emu/RSX/rsx_utils.cpp index cd73e3c9bc..c6a6035db7 100644 --- a/rpcs3/Emu/RSX/rsx_utils.cpp +++ b/rpcs3/Emu/RSX/rsx_utils.cpp @@ -4,6 +4,7 @@ #include "Emu/RSX/GCM.h" #include "Common/BufferUtils.h" #include "overlays.h" +#include "Utilities/sysinfo.h" extern "C" { @@ -363,4 +364,113 @@ namespace rsx } } } + + void convert_le_f32_to_be_d24(void *dst, void *src, u32 row_length_in_texels, u32 num_rows) + { + const u32 num_pixels = row_length_in_texels * num_rows; + verify(HERE), (num_pixels & 3) == 0; + + const auto num_iterations = (num_pixels >> 2); + + __m128i* dst_ptr = (__m128i*)dst; + __m128i* src_ptr = (__m128i*)src; + + const __m128 scale_vector = _mm_set1_ps(16777214.f); + +#if defined (_MSC_VER) || defined (__SSSE3__) + if (LIKELY(utils::has_ssse3())) + { + const __m128i swap_mask = _mm_set_epi8 + ( + 0xF, 0xC, 0xD, 0xE, + 0xB, 0x8, 0x9, 0xA, + 0x7, 0x4, 0x5, 0x6, + 0x3, 0x0, 0x1, 0x2 + ); + + for (u32 n = 0; n < num_iterations; ++n) + { + const __m128i src_vector = _mm_loadu_si128(src_ptr); + const __m128i result = _mm_cvtps_epi32(_mm_mul_ps((__m128&)src_vector, scale_vector)); + const __m128i shuffled_vector = _mm_shuffle_epi8(result, swap_mask); + _mm_stream_si128(dst_ptr, shuffled_vector); + ++dst_ptr; + ++src_ptr; + } + + return; + } +#endif + + const __m128i mask1 = _mm_set1_epi32(0xFF00FF00); + const __m128i mask2 = _mm_set1_epi32(0x00FF0000); + const __m128i mask3 = _mm_set1_epi32(0x000000FF); + + for (u32 n = 0; n < num_iterations; ++n) + { + const __m128i src_vector = _mm_loadu_si128(src_ptr); + const __m128i result = _mm_cvtps_epi32(_mm_mul_ps((__m128&)src_vector, scale_vector)); + + const __m128i v1 = _mm_and_si128(result, mask1); + const __m128i v2 = _mm_and_si128(_mm_slli_epi32(result, 16), mask2); + const __m128i v3 = _mm_and_si128(_mm_srli_epi32(result, 16), mask3); + const __m128i shuffled_vector = _mm_or_si128(_mm_or_si128(v1, v2), v3); + + _mm_stream_si128(dst_ptr, shuffled_vector); + ++dst_ptr; + ++src_ptr; + } + } + + void convert_le_d24x8_to_be_d24x8(void *dst, void *src, u32 row_length_in_texels, u32 num_rows) + { + const u32 num_pixels = row_length_in_texels * num_rows; + verify(HERE), (num_pixels & 3) == 0; + + const auto num_iterations = (num_pixels >> 2); + + __m128i* dst_ptr = (__m128i*)dst; + __m128i* src_ptr = (__m128i*)src; + +#if defined (_MSC_VER) || defined (__SSSE3__) + if (LIKELY(utils::has_ssse3())) + { + const __m128i swap_mask = _mm_set_epi8 + ( + 0xF, 0xC, 0xD, 0xE, + 0xB, 0x8, 0x9, 0xA, + 0x7, 0x4, 0x5, 0x6, + 0x3, 0x0, 0x1, 0x2 + ); + + for (u32 n = 0; n < num_iterations; ++n) + { + const __m128i src_vector = _mm_loadu_si128(src_ptr); + const __m128i shuffled_vector = _mm_shuffle_epi8(src_vector, swap_mask); + _mm_stream_si128(dst_ptr, shuffled_vector); + ++dst_ptr; + ++src_ptr; + } + + return; + } +#endif + + const __m128i mask1 = _mm_set1_epi32(0xFF00FF00); + const __m128i mask2 = _mm_set1_epi32(0x00FF0000); + const __m128i mask3 = _mm_set1_epi32(0x000000FF); + + for (u32 n = 0; n < num_iterations; ++n) + { + const __m128i src_vector = _mm_loadu_si128(src_ptr); + const __m128i v1 = _mm_and_si128(src_vector, mask1); + const __m128i v2 = _mm_and_si128(_mm_slli_epi32(src_vector, 16), mask2); + const __m128i v3 = _mm_and_si128(_mm_srli_epi32(src_vector, 16), mask3); + const __m128i shuffled_vector = _mm_or_si128(_mm_or_si128(v1, v2), v3); + + _mm_stream_si128(dst_ptr, shuffled_vector); + ++dst_ptr; + ++src_ptr; + } + } } diff --git a/rpcs3/Emu/RSX/rsx_utils.h b/rpcs3/Emu/RSX/rsx_utils.h index 929d0a85c2..300384c3a9 100644 --- a/rpcs3/Emu/RSX/rsx_utils.h +++ b/rpcs3/Emu/RSX/rsx_utils.h @@ -169,6 +169,9 @@ namespace rsx void clip_image(u8 *dst, const u8 *src, int clip_x, int clip_y, int clip_w, int clip_h, int bpp, int src_pitch, int dst_pitch); void clip_image(std::unique_ptr& dst, const u8 *src, int clip_x, int clip_y, int clip_w, int clip_h, int bpp, int src_pitch, int dst_pitch); + void convert_le_f32_to_be_d24(void *dst, void *src, u32 row_length_in_texels, u32 num_rows); + void convert_le_d24x8_to_be_d24x8(void *dst, void *src, u32 row_length_in_texels, u32 num_rows); + void fill_scale_offset_matrix(void *dest_, bool transpose, float offset_x, float offset_y, float offset_z, float scale_x, float scale_y, float scale_z);