diff --git a/rpcs3/Emu/RSX/Common/TextureUtils.cpp b/rpcs3/Emu/RSX/Common/TextureUtils.cpp index c24afaf5af..8b3122cb82 100644 --- a/rpcs3/Emu/RSX/Common/TextureUtils.cpp +++ b/rpcs3/Emu/RSX/Common/TextureUtils.cpp @@ -668,13 +668,19 @@ texture_memory_info upload_texture_subresource(gsl::span dst_buffer, { result.require_swap = true; result.element_size = word_size; + result.block_length = words_per_block; if (word_size == 2) { - const bool skip_swizzle = ((word_size * words_per_block) & 3) == 0 && caps.supports_hw_deswizzle; - if (is_swizzled && skip_swizzle) result.require_deswizzle = true; + if (is_swizzled) + { + if (((word_size * words_per_block) & 3) == 0 && caps.supports_hw_deswizzle) + { + result.require_deswizzle = true; + } + } - if (is_swizzled && !skip_swizzle) + if (is_swizzled && !result.require_deswizzle) copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround(dst_buffer), as_const_span(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block); else copy_unmodified_block::copy_mipmap_level(as_span_workaround(dst_buffer), as_const_span(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block); diff --git a/rpcs3/Emu/RSX/Common/TextureUtils.h b/rpcs3/Emu/RSX/Common/TextureUtils.h index d5e0808de4..e15858e4e2 100644 --- a/rpcs3/Emu/RSX/Common/TextureUtils.h +++ b/rpcs3/Emu/RSX/Common/TextureUtils.h @@ -104,6 +104,7 @@ struct rsx_subresource_layout struct texture_memory_info { int element_size; + int block_length; bool require_swap; bool require_deswizzle; }; diff --git a/rpcs3/Emu/RSX/VK/VKCompute.h b/rpcs3/Emu/RSX/VK/VKCompute.h index eaecc7596a..72a7e2b60b 100644 --- a/rpcs3/Emu/RSX/VK/VKCompute.h +++ b/rpcs3/Emu/RSX/VK/VKCompute.h @@ -2,7 +2,7 @@ #include "VKHelpers.h" #include "Utilities/StrUtil.h" -#define VK_MAX_COMPUTE_TASKS 1024 // Max number of jobs per frame +#define VK_MAX_COMPUTE_TASKS 32768 // Max number of jobs per frame namespace vk { @@ 
-22,7 +22,9 @@ namespace vk bool initialized = false; bool unroll_loops = true; bool uniform_inputs = false; + bool use_push_constants = false; u32 ssbo_count = 1; + u32 push_constants_size = 0; u32 optimal_group_size = 1; u32 optimal_kernel_size = 1; @@ -77,6 +79,16 @@ namespace vk layout_info.setLayoutCount = 1; layout_info.pSetLayouts = &m_descriptor_layout; + VkPushConstantRange push_constants{}; + if (use_push_constants) + { + push_constants.size = push_constants_size; + push_constants.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + + layout_info.pushConstantRangeCount = 1; + layout_info.pPushConstantRanges = &push_constants; + } + CHECK_RESULT(vkCreatePipelineLayout(*get_current_renderer(), &layout_info, nullptr, &m_pipeline_layout)); } @@ -258,7 +270,7 @@ namespace vk "\n" "void main()\n" "{\n" - " uint index = %idx;\n" + " uint index = gl_GlobalInvocationID.x;\n" " uint value;\n" " %vars" "\n"; @@ -550,19 +562,26 @@ namespace vk }; // Reverse morton-order block arrangement + struct cs_deswizzle_base : compute_task + { + virtual void run(VkCommandBuffer cmd, const vk::buffer* dst, u32 out_offset, const vk::buffer* src, u32 in_offset, u32 width, u32 height, u32 depth) = 0; + }; + template - struct cs_deswizzle_3d : compute_task + struct cs_deswizzle_3d : cs_deswizzle_base { union params_t { - u32 data[4]; + u32 data[6]; struct { u32 width; u32 height; + u32 depth; u32 logw; u32 logh; + u32 logd; }; } params; @@ -578,25 +597,29 @@ namespace vk verify("Unsupported block type" HERE), (sizeof(_BlockType) & 3) == 0; ssbo_count = 2; - uniform_inputs = true; + use_push_constants = true; + push_constants_size = 24; + create(); m_src = "#version 450\n" - "layout(local_size_x = 8, local_size_y = 8, local_size_z = 1)\n\n" + "layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;\n\n" - "layout(set=0, binding=0, std430) buffer ssbo{ uint data_in[]; }\n" - "layout(set=0, binding=1, std430) buffer ssbo{ uint data_out[]; }\n" - "layout(set=0, binding=2, std140) 
uniform buffer parameters\n" + "layout(set=0, binding=0, std430) buffer ssbo0{ uint data_in[]; };\n" + "layout(set=0, binding=1, std430) buffer ssbo1{ uint data_out[]; };\n" + "layout(push_constant) uniform parameters\n" "{\n" " uint image_width;\n" " uint image_height;\n" + " uint image_depth;\n" " uint image_logw;\n" " uint image_logh;\n" + " uint image_logd;\n" "};\n\n" - "#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n" - "#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n" + "#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n" + "#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n" "uint get_z_index(uint x, uint y, uint z, uint log2w, uint log2h, uint log2d)\n" "{\n" @@ -629,26 +652,29 @@ namespace vk " log2d--;\n" " }\n" " }\n" - " while(x > 0 || y > 0 || z > 0)\n" + " while(x > 0 || y > 0 || z > 0);\n" "\n" " return offset;\n" "}\n\n" "void main()\n" "{\n" - " if (gl_GlobalInvocationID.x >= image_width || gl_GlobalInvocationID.y >= image_height)\n" + " if (any(greaterThanEqual(gl_GlobalInvocationID, uvec3(image_width, image_height, image_depth))))\n" " return;\n\n" - " uint texel_id = (gl_GlobalInvocationID.y * image_width) + gl_GlobalInvocationID.x" + " uint texel_id = (gl_GlobalInvocationID.z * image_width * image_height) + (gl_GlobalInvocationID.y * image_width) + gl_GlobalInvocationID.x;\n" " uint word_count = %_wordcount;\n" - " uint dst_id = (index * word_count);\n\n" + " uint dst_id = (texel_id * word_count);\n\n" + + " uint src_id = get_z_index(gl_GlobalInvocationID.x, gl_GlobalInvocationID.y, gl_GlobalInvocationID.z, image_logw, image_logh, image_logd);\n" + " src_id *= word_count;\n\n" - " uint src_id = get_z_index(gl_GlobalInvocationID.x, gl_GlobalInvocation.y, 0, 
image_logw, image_logh, 0);\n" " for (uint i = 0; i < word_count; ++i)\n" " {\n" - " data_out[dst_id++] = %f(data_in[src_id++]);\n" + " uint value = data_in[src_id++];\n" + " data_out[dst_id++] = %f(value);\n" " }\n\n" - + "}\n"; std::string transform; @@ -681,24 +707,14 @@ namespace vk { m_program->bind_buffer({ src_buffer->value, in_offset, block_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set); m_program->bind_buffer({ dst_buffer->value, out_offset, block_length }, 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set); - m_program->bind_buffer({ m_param_buffer->value, 0, 16 }, 2, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, m_descriptor_set); } void set_parameters(VkCommandBuffer cmd) { - verify(HERE), uniform_inputs; - - if (!m_param_buffer) - { - auto pdev = vk::get_current_renderer(); - m_param_buffer = std::make_unique(*pdev, 256, pdev->get_memory_mapping().host_visible_coherent, - VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0); - } - - vkCmdUpdateBuffer(cmd, m_param_buffer->value, 0, 16, params.data); + vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, 24, params.data); } - void run(VkCommandBuffer cmd, const vk::buffer* dst, u32 out_offset, const vk::buffer* src, u32 in_offset, u32 width, u32 height, u32 depth) + void run(VkCommandBuffer cmd, const vk::buffer* dst, u32 out_offset, const vk::buffer* src, u32 in_offset, u32 width, u32 height, u32 depth) override { dst_buffer = dst; src_buffer = src; @@ -708,14 +724,16 @@ namespace vk this->block_length = sizeof(_BlockType) * width * height * depth; params.width = width; - params.height = height * depth; + params.height = height; + params.depth = depth; params.logw = rsx::ceil_log2(width); params.logh = rsx::ceil_log2(height); - set_parameters(); + params.logd = rsx::ceil_log2(depth); + set_parameters(cmd); const u32 invocations_x = align(params.width, 8) / 8; const u32 invocations_y = align(params.height, 8) / 
8; - compute_task::run(cmd, invocations_x, invocations_y, 1); + compute_task::run(cmd, invocations_x, invocations_y, depth); } }; diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.h b/rpcs3/Emu/RSX/VK/VKHelpers.h index dbaece5b80..bc84e91b51 100644 --- a/rpcs3/Emu/RSX/VK/VKHelpers.h +++ b/rpcs3/Emu/RSX/VK/VKHelpers.h @@ -3299,6 +3299,7 @@ public: std::string shader_type = type == ::glsl::program_domain::glsl_vertex_program ? "vertex" : type == ::glsl::program_domain::glsl_fragment_program ? "fragment" : "compute"; + LOG_NOTICE(RSX, "%s", m_source); fmt::throw_exception("Failed to compile %s shader" HERE, shader_type); } diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp index d0a60a3158..4fe7944db3 100644 --- a/rpcs3/Emu/RSX/VK/VKTexture.cpp +++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp @@ -538,6 +538,90 @@ namespace vk change_image_layout(cmd, dst, preferred_dst_format, dstLayout, vk::get_image_subresource_range(0, 0, 1, 1, aspect)); } + void gpu_deswizzle_sections_impl(VkCommandBuffer cmd, vk::buffer* scratch_buf, u32 dst_offset, int word_size, int word_count, bool swap_bytes, std::vector& sections) + { + // NOTE: This has to be done individually for every LOD + vk::cs_deswizzle_base* job = nullptr; + const auto block_size = (word_size * word_count); + + verify(HERE), word_size == 4 || word_size == 2; + + if (!swap_bytes) + { + if (word_size == 4) + { + switch (block_size) + { + case 4: + job = vk::get_compute_task>(); + break; + case 8: + job = vk::get_compute_task>(); + break; + case 16: + job = vk::get_compute_task>(); + break; + } + } + else + { + switch (block_size) + { + case 4: + job = vk::get_compute_task>(); + break; + case 8: + job = vk::get_compute_task>(); + break; + } + } + } + else + { + if (word_size == 4) + { + switch (block_size) + { + case 4: + job = vk::get_compute_task>(); + break; + case 8: + job = vk::get_compute_task>(); + break; + case 16: + job = vk::get_compute_task>(); + break; + } + } + else + { + switch (block_size) + { + case 
4: + job = vk::get_compute_task>(); + break; + case 8: + job = vk::get_compute_task>(); + break; + } + } + } + + verify(HERE), job; + + for (auto &section : sections) + { + job->run(cmd, scratch_buf, dst_offset, scratch_buf, section.bufferOffset, + section.imageExtent.width, section.imageExtent.height, section.imageExtent.depth); + + const u32 packed_size = section.imageExtent.width * section.imageExtent.height * section.imageExtent.depth * block_size; + section.bufferOffset = dst_offset; + dst_offset += packed_size; + } + + verify(HERE), dst_offset <= scratch_buf->size(); + } + void copy_mipmaped_image_using_buffer(VkCommandBuffer cmd, vk::image* dst_image, const std::vector& subresource_layout, int format, bool is_swizzled, u16 mipmap_count, VkImageAspectFlags flags, vk::data_heap &upload_heap, u32 heap_align) @@ -600,7 +684,7 @@ namespace vk copy_info.imageSubresource.mipLevel = layout.level; copy_info.bufferRowLength = block_in_pixel * row_pitch / block_size_in_bytes; - if (opt.require_swap || dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) + if (opt.require_swap || opt.require_deswizzle || dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) { if (!scratch_buf) { @@ -623,7 +707,7 @@ namespace vk } } - if (opt.require_swap || dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) + if (opt.require_swap || opt.require_deswizzle || dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) { verify(HERE), scratch_buf; vkCmdCopyBuffer(cmd, upload_heap.heap->value, scratch_buf->value, (u32)buffer_copies.size(), buffer_copies.data()); @@ -632,8 +716,12 @@ namespace vk VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT); } - // Swap if requested - if (opt.require_swap) + // Swap and swizzle if requested + if (opt.require_deswizzle) + { + gpu_deswizzle_sections_impl(cmd, scratch_buf, scratch_offset, opt.element_size, opt.block_length, opt.require_swap, copy_regions); + } + else if (opt.require_swap) { if (opt.element_size == 4) { @@ -658,9 +746,12
@@ namespace vk vk::copy_buffer_to_image(cmd, scratch_buf, dst_image, *rIt); } } - else if (opt.require_swap) + else if (scratch_buf) { - insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, scratch_offset, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + verify(HERE), opt.require_deswizzle || opt.require_swap; + + const auto block_start = copy_regions.front().bufferOffset; + insert_buffer_memory_barrier(cmd, scratch_buf->value, block_start, scratch_offset, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT); vkCmdCopyBufferToImage(cmd, scratch_buf->value, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, (u32)copy_regions.size(), copy_regions.data());