diff --git a/rpcs3/Emu/RSX/Common/TextureUtils.cpp b/rpcs3/Emu/RSX/Common/TextureUtils.cpp
index 2614446774..f7666d8b1c 100644
--- a/rpcs3/Emu/RSX/Common/TextureUtils.cpp
+++ b/rpcs3/Emu/RSX/Common/TextureUtils.cpp
@@ -469,48 +469,51 @@ std::vector<rsx_subresource_layout> get_subresources_layout(const rsx::vertex_te
 	return get_subresources_layout_impl(texture);
 }
 
-void upload_texture_subresource(gsl::span<gsl::byte> dst_buffer, const rsx_subresource_layout &src_layout, int format, bool is_swizzled, bool vtc_support, size_t dst_row_pitch_multiple_of)
+texture_memory_info upload_texture_subresource(gsl::span<gsl::byte> dst_buffer, const rsx_subresource_layout &src_layout, int format, bool is_swizzled, const texture_uploader_capabilities& caps)
 {
 	u16 w = src_layout.width_in_block;
 	u16 h = src_layout.height_in_block;
 	u16 depth = src_layout.depth;
 	u32 pitch = src_layout.pitch_in_block;
 
+	texture_memory_info result{};
+
 	// Ignore when texture width > pitch
 	if (w > pitch)
-		return;
+		return result;
+
+	// Check if we can use a fast path
+	int word_size = 0;
+	int words_per_block;
+	u32 dst_pitch_in_block;
 
-	// NOTE: Avoid block optimization for formats that can be modified internally by the GPU itself
-	// Since the gpu code does not attempt to do wide translations (e.g WZYX32->XYZW32), only perform, per-channel transform and use proper swizzles to get the proper output
 	switch (format)
 	{
 	case CELL_GCM_TEXTURE_B8:
 	{
-		if (is_swizzled)
-			copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u8>(dst_buffer), as_const_span<const u8>(src_layout.data), 1, w, h, depth, src_layout.border, get_row_pitch_in_block<u8>(w, dst_row_pitch_multiple_of));
-		else
-			copy_unmodified_block::copy_mipmap_level(as_span_workaround<u8>(dst_buffer), as_const_span<const u8>(src_layout.data), 1, w, h, depth, src_layout.border, get_row_pitch_in_block<u8>(w, dst_row_pitch_multiple_of), src_layout.pitch_in_block);
+		word_size = words_per_block = 1;
+		dst_pitch_in_block = get_row_pitch_in_block<u8>(w, caps.alignment);
 		break;
 	}
 
 	case CELL_GCM_TEXTURE_COMPRESSED_B8R8_G8R8:
 	{
-		copy_decoded_rb_rg_block::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const be_t<u32>>(src_layout.data), w, h, depth, get_row_pitch_in_block<u32>(w, dst_row_pitch_multiple_of), src_layout.pitch_in_block);
+		copy_decoded_rb_rg_block::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const be_t<u32>>(src_layout.data), w, h, depth, get_row_pitch_in_block<u32>(w, caps.alignment), src_layout.pitch_in_block);
 		break;
 	}
 
 	case CELL_GCM_TEXTURE_COMPRESSED_R8B8_R8G8:
 	{
-		copy_decoded_rb_rg_block::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const u32>(src_layout.data), w, h, depth, get_row_pitch_in_block<u32>(w, dst_row_pitch_multiple_of), src_layout.pitch_in_block);
+		copy_decoded_rb_rg_block::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const u32>(src_layout.data), w, h, depth, get_row_pitch_in_block<u32>(w, caps.alignment), src_layout.pitch_in_block);
 		break;
 	}
 
 	case CELL_GCM_TEXTURE_R6G5B5:
 	{
 		if (is_swizzled)
-			copy_rgb655_block_swizzled::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), w, h, depth, src_layout.border, get_row_pitch_in_block<u16>(w, dst_row_pitch_multiple_of));
+			copy_rgb655_block_swizzled::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), w, h, depth, src_layout.border, get_row_pitch_in_block<u16>(w, caps.alignment));
 		else
-			copy_rgb655_block::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), w, h, depth, src_layout.border, get_row_pitch_in_block<u16>(w, dst_row_pitch_multiple_of), src_layout.pitch_in_block);
+			copy_rgb655_block::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), w, h, depth, src_layout.border, get_row_pitch_in_block<u16>(w, caps.alignment), src_layout.pitch_in_block);
 		break;
 	}
 
@@ -526,10 +529,9 @@ void upload_texture_subresource(gsl::span<gsl::byte> dst_buffer, const rsx_subre
 	case CELL_GCM_TEXTURE_R5G6B5:
 	case CELL_GCM_TEXTURE_G8B8:
 	{
-		if (is_swizzled)
-			copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), 1, w, h, depth, src_layout.border, get_row_pitch_in_block<u16>(w, dst_row_pitch_multiple_of));
-		else
-			copy_unmodified_block::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), 1, w, h, depth, src_layout.border, get_row_pitch_in_block<u16>(w, dst_row_pitch_multiple_of), src_layout.pitch_in_block);
+		word_size = 2;
+		words_per_block = 1;
+		dst_pitch_in_block = get_row_pitch_in_block<u16>(w, caps.alignment);
 		break;
 	}
 
@@ -538,10 +540,9 @@ void upload_texture_subresource(gsl::span<gsl::byte> dst_buffer, const rsx_subre
 	case CELL_GCM_TEXTURE_DEPTH24_D8:
 	case CELL_GCM_TEXTURE_DEPTH24_D8_FLOAT: // Untested
 	{
-		if (is_swizzled)
-			copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const be_t<u32>>(src_layout.data), 1, w, h, depth, src_layout.border, get_row_pitch_in_block<u32>(w, dst_row_pitch_multiple_of));
-		else
-			copy_unmodified_block::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const be_t<u32>>(src_layout.data), 1, w, h, depth, src_layout.border, get_row_pitch_in_block<u32>(w, dst_row_pitch_multiple_of), src_layout.pitch_in_block);
+		word_size = 4;
+		words_per_block = 1;
+		dst_pitch_in_block = get_row_pitch_in_block<u32>(w, caps.alignment);
 		break;
 	}
 
@@ -554,13 +555,9 @@ void upload_texture_subresource(gsl::span<gsl::byte> dst_buffer, const rsx_subre
 	case CELL_GCM_TEXTURE_W16_Z16_Y16_X16_FLOAT:
 	{
 		const u16 block_size = get_format_block_size_in_bytes(format);
-		const u16 words_per_block = block_size / 2;
-		const auto dst_pitch_in_block = get_row_pitch_in_block(block_size, w, dst_row_pitch_multiple_of);
-
-		if (is_swizzled)
-			copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
-		else
-			copy_unmodified_block::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
+		word_size = 2;
+		words_per_block = block_size / 2;
+		dst_pitch_in_block = get_row_pitch_in_block(block_size, w, caps.alignment);
 		break;
 	}
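Note: the hunks above all follow one pattern. The per-format copy calls are replaced by bookkeeping (`word_size`, `words_per_block`, `dst_pitch_in_block`) that a shared dispatcher at the end of the function consumes (see the later hunks). As an illustration of what gets recorded, here is a hypothetical summary helper; it is not part of the patch and assumes the CELL_GCM_* constants from the RSX headers:

```cpp
struct block_info { int word_size; int words_per_block; };

// Hypothetical condensation of the assignments made in the hunks above.
inline block_info classify(int format)
{
	switch (format)
	{
	case CELL_GCM_TEXTURE_B8:                    return {1, 1}; // one 8-bit word per block
	case CELL_GCM_TEXTURE_R5G6B5:                return {2, 1}; // one 16-bit word per block
	case CELL_GCM_TEXTURE_DEPTH24_D8:            return {4, 1}; // one 32-bit word per block
	case CELL_GCM_TEXTURE_W16_Z16_Y16_X16_FLOAT: return {2, 4}; // 8-byte block holds four 16-bit words
	default:                                     return {0, 0}; // word_size == 0: no fast path
	}
}
```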
@@ -568,28 +565,24 @@ void upload_texture_subresource(gsl::span<gsl::byte> dst_buffer, const rsx_subre
 	case CELL_GCM_TEXTURE_W32_Z32_Y32_X32_FLOAT:
 	{
 		const u16 block_size = get_format_block_size_in_bytes(format);
-		const u16 words_per_block = block_size / 4;
-		const auto dst_pitch_in_block = get_row_pitch_in_block(block_size, w, dst_row_pitch_multiple_of);
-
-		if (is_swizzled)
-			copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const be_t<u32>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
-		else
-			copy_unmodified_block::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const be_t<u32>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
+		word_size = 4;
+		words_per_block = block_size / 4;
+		dst_pitch_in_block = get_row_pitch_in_block(block_size, w, caps.alignment);
 		break;
 	}
 
 	case CELL_GCM_TEXTURE_COMPRESSED_DXT1:
 	{
-		if (depth > 1 && !vtc_support)
+		if (depth > 1 && !caps.supports_vtc_decoding)
 		{
 			// PS3 uses the Nvidia VTC memory layout for compressed 3D textures.
 			// This is only supported using Nvidia OpenGL.
 			// Remove the VTC tiling to support ATI and Vulkan.
-			copy_unmodified_block_vtc::copy_mipmap_level(as_span_workaround<u64>(dst_buffer), as_const_span<const u64>(src_layout.data), w, h, depth, get_row_pitch_in_block<u64>(w, dst_row_pitch_multiple_of), src_layout.pitch_in_block);
+			copy_unmodified_block_vtc::copy_mipmap_level(as_span_workaround<u64>(dst_buffer), as_const_span<const u64>(src_layout.data), w, h, depth, get_row_pitch_in_block<u64>(w, caps.alignment), src_layout.pitch_in_block);
 		}
 		else
 		{
-			copy_unmodified_block::copy_mipmap_level(as_span_workaround<u64>(dst_buffer), as_const_span<const u64>(src_layout.data), 1, w, h, depth, 0, get_row_pitch_in_block<u64>(w, dst_row_pitch_multiple_of), src_layout.pitch_in_block);
+			copy_unmodified_block::copy_mipmap_level(as_span_workaround<u64>(dst_buffer), as_const_span<const u64>(src_layout.data), 1, w, h, depth, 0, get_row_pitch_in_block<u64>(w, caps.alignment), src_layout.pitch_in_block);
 		}
 		break;
 	}
 
@@ -597,16 +590,16 @@ void upload_texture_subresource(gsl::span<gsl::byte> dst_buffer, const rsx_subre
 	case CELL_GCM_TEXTURE_COMPRESSED_DXT23:
 	case CELL_GCM_TEXTURE_COMPRESSED_DXT45:
 	{
-		if (depth > 1 && !vtc_support)
+		if (depth > 1 && !caps.supports_vtc_decoding)
 		{
 			// PS3 uses the Nvidia VTC memory layout for compressed 3D textures.
 			// This is only supported using Nvidia OpenGL.
 			// Remove the VTC tiling to support ATI and Vulkan.
-			copy_unmodified_block_vtc::copy_mipmap_level(as_span_workaround<u128>(dst_buffer), as_const_span<const u128>(src_layout.data), w, h, depth, get_row_pitch_in_block<u128>(w, dst_row_pitch_multiple_of), src_layout.pitch_in_block);
+			copy_unmodified_block_vtc::copy_mipmap_level(as_span_workaround<u128>(dst_buffer), as_const_span<const u128>(src_layout.data), w, h, depth, get_row_pitch_in_block<u128>(w, caps.alignment), src_layout.pitch_in_block);
 		}
 		else
 		{
-			copy_unmodified_block::copy_mipmap_level(as_span_workaround<u128>(dst_buffer), as_const_span<const u128>(src_layout.data), 1, w, h, depth, 0, get_row_pitch_in_block<u128>(w, dst_row_pitch_multiple_of), src_layout.pitch_in_block);
+			copy_unmodified_block::copy_mipmap_level(as_span_workaround<u128>(dst_buffer), as_const_span<const u128>(src_layout.data), 1, w, h, depth, 0, get_row_pitch_in_block<u128>(w, caps.alignment), src_layout.pitch_in_block);
 		}
 		break;
 	}
 
@@ -614,6 +607,56 @@ void upload_texture_subresource(gsl::span<gsl::byte> dst_buffer, const rsx_subre
 	default:
 		fmt::throw_exception("Wrong format 0x%x" HERE, format);
 	}
+
+	if (word_size)
+	{
+		if (word_size == 1)
+		{
+			if (is_swizzled)
+				copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u8>(dst_buffer), as_const_span<const u8>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
+			else
+				copy_unmodified_block::copy_mipmap_level(as_span_workaround<u8>(dst_buffer), as_const_span<const u8>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
+		}
+		else if (caps.supports_byteswap)
+		{
+			result.require_swap = true;
+			result.element_size = word_size;
+
+			if (word_size == 2)
+			{
+				if (is_swizzled)
+					copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const u16>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
+				else
+					copy_unmodified_block::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const u16>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
+			}
+			else if (word_size == 4)
+			{
+				if (is_swizzled)
+					copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const u32>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
+				else
+					copy_unmodified_block::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const u32>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
+			}
+		}
+		else
+		{
+			if (word_size == 2)
+			{
+				if (is_swizzled)
+					copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
+				else
+					copy_unmodified_block::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
+			}
+			else if (word_size == 4)
+			{
+				if (is_swizzled)
+					copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const be_t<u32>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
+				else
+					copy_unmodified_block::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const be_t<u32>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
+			}
+		}
+	}
+
+	return result;
 }
 
 /**
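Note: `require_swap` defers a plain word-granular endian swap to the backend. The data is staged in guest (big-endian) order and must be swapped in `element_size`-sized words before it is consumed. A CPU reference for the same transform the GPU paths perform (a sketch; the helper names are mine, not from the patch):

```cpp
#include <cstddef>
#include <cstdint>

// Swap every 16-bit word in place (element_size == 2).
static void swap_words_u16(std::uint16_t* data, std::size_t count)
{
	for (std::size_t i = 0; i < count; ++i)
		data[i] = static_cast<std::uint16_t>((data[i] >> 8) | (data[i] << 8));
}

// Swap every 32-bit word in place (element_size == 4).
static void swap_words_u32(std::uint32_t* data, std::size_t count)
{
	for (std::size_t i = 0; i < count; ++i)
	{
		const std::uint32_t v = data[i];
		data[i] = (v >> 24) | ((v >> 8) & 0x0000ff00u) | ((v << 8) & 0x00ff0000u) | (v << 24);
	}
}
```

When `caps.supports_byteswap` is false, the function keeps swapping on the CPU through the `be_t<>` spans instead, so those callers never observe `require_swap == true`.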
diff --git a/rpcs3/Emu/RSX/Common/TextureUtils.h b/rpcs3/Emu/RSX/Common/TextureUtils.h
index 531ffa3165..99126d6aee 100644
--- a/rpcs3/Emu/RSX/Common/TextureUtils.h
+++ b/rpcs3/Emu/RSX/Common/TextureUtils.h
@@ -110,6 +110,19 @@ struct rsx_subresource_layout
 	u32 pitch_in_block;
 };
 
+struct texture_memory_info
+{
+	int element_size;
+	bool require_swap;
+};
+
+struct texture_uploader_capabilities
+{
+	bool supports_byteswap;
+	bool supports_vtc_decoding;
+	size_t alignment;
+};
+
 /**
  * Get size to store texture in a linear fashion.
  * Storage is assumed to use a rowPitchAlignment boundary for every row of texture.
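Note: these two structs form the round-trip contract between the common uploader and the backends. A minimal caller sketch (hypothetical backend code, not from the patch; `dst` and `layout` stand in for the backend's staging buffer and subresource descriptor):

```cpp
texture_uploader_capabilities caps{};
caps.supports_byteswap     = true;  // backend can swap 2/4-byte words after staging
caps.supports_vtc_decoding = false; // backend cannot sample Nvidia VTC-tiled 3D data
caps.alignment             = 256;   // destination row-pitch alignment in bytes

const texture_memory_info info = upload_texture_subresource(dst, layout, format, is_swizzled, caps);
if (info.require_swap)
{
	// Staged words are still big-endian; swap info.element_size-sized
	// words (2 or 4 bytes) before the texture data is consumed.
}
```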
@@ -125,7 +138,7 @@ size_t get_placed_texture_storage_size(const rsx::vertex_texture &texture, size_
 std::vector<rsx_subresource_layout> get_subresources_layout(const rsx::fragment_texture &texture);
 std::vector<rsx_subresource_layout> get_subresources_layout(const rsx::vertex_texture &texture);
 
-void upload_texture_subresource(gsl::span<gsl::byte> dst_buffer, const rsx_subresource_layout &src_layout, int format, bool is_swizzled, bool vtc_support, size_t dst_row_pitch_multiple_of);
+texture_memory_info upload_texture_subresource(gsl::span<gsl::byte> dst_buffer, const rsx_subresource_layout &src_layout, int format, bool is_swizzled, const texture_uploader_capabilities& caps);
 
 u8 get_format_block_size_in_bytes(int format);
 u8 get_format_block_size_in_texel(int format);
diff --git a/rpcs3/Emu/RSX/D3D12/D3D12Texture.cpp b/rpcs3/Emu/RSX/D3D12/D3D12Texture.cpp
index da83a6dc0b..f19a7b6df1 100644
--- a/rpcs3/Emu/RSX/D3D12/D3D12Texture.cpp
+++ b/rpcs3/Emu/RSX/D3D12/D3D12Texture.cpp
@@ -115,7 +115,8 @@ namespace
 	size_t offset_in_buffer = 0;
 	for (const rsx_subresource_layout &layout : input_layouts)
 	{
-		upload_texture_subresource(mapped_buffer.subspan(offset_in_buffer), layout, format, is_swizzled, false, 256);
+		texture_uploader_capabilities caps{ false, false, 256 };
+		upload_texture_subresource(mapped_buffer.subspan(offset_in_buffer), layout, format, is_swizzled, caps);
 		UINT row_pitch = align(layout.width_in_block * block_size_in_bytes, 256);
 		command_list->CopyTextureRegion(&CD3DX12_TEXTURE_COPY_LOCATION(existing_texture, (UINT)mip_level), 0, 0, 0,
 			&CD3DX12_TEXTURE_COPY_LOCATION(texture_buffer_heap.get_heap(),
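Note: the D3D12 backend opts out of deferred swapping (`supports_byteswap = false`), so its writes keep going through the CPU `be_t<>` path. The literal 256 is D3D12's mandatory row-pitch alignment for texture copies; the same `caps` line rewritten against the real constant:

```cpp
// D3D12 requires copy row pitches aligned to 256 bytes.
static_assert(D3D12_TEXTURE_DATA_PITCH_ALIGNMENT == 256, "D3D12 copy row pitch alignment");
texture_uploader_capabilities caps{ false, false, D3D12_TEXTURE_DATA_PITCH_ALIGNMENT };
```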
diff --git a/rpcs3/Emu/RSX/GL/GLTexture.cpp b/rpcs3/Emu/RSX/GL/GLTexture.cpp
index e060ce0660..ac5e1f5932 100644
--- a/rpcs3/Emu/RSX/GL/GLTexture.cpp
+++ b/rpcs3/Emu/RSX/GL/GLTexture.cpp
@@ -63,7 +63,7 @@ namespace gl
 	case CELL_GCM_TEXTURE_A1R5G5B5: return std::make_tuple(GL_BGRA, GL_UNSIGNED_SHORT_1_5_5_5_REV);
 	case CELL_GCM_TEXTURE_A4R4G4B4: return std::make_tuple(GL_BGRA, GL_UNSIGNED_SHORT_4_4_4_4);
 	case CELL_GCM_TEXTURE_R5G6B5: return std::make_tuple(GL_RGB, GL_UNSIGNED_SHORT_5_6_5);
-	case CELL_GCM_TEXTURE_A8R8G8B8: return std::make_tuple(GL_BGRA, GL_UNSIGNED_BYTE);
+	case CELL_GCM_TEXTURE_A8R8G8B8: return std::make_tuple(GL_BGRA, GL_UNSIGNED_INT_8_8_8_8);
 	case CELL_GCM_TEXTURE_G8B8: return std::make_tuple(GL_RG, GL_UNSIGNED_BYTE);
 	case CELL_GCM_TEXTURE_R6G5B5: return std::make_tuple(GL_RGB, GL_UNSIGNED_SHORT_5_6_5);
 	case CELL_GCM_TEXTURE_DEPTH24_D8: return std::make_tuple(GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8);
@@ -370,7 +370,7 @@ namespace gl
 	case CELL_GCM_TEXTURE_R5G5B5A1:
 	case CELL_GCM_TEXTURE_R6G5B5:
 	case CELL_GCM_TEXTURE_R5G6B5:
-	case CELL_GCM_TEXTURE_A8R8G8B8: // TODO
+	case CELL_GCM_TEXTURE_A8R8G8B8:
 	case CELL_GCM_TEXTURE_COMPRESSED_DXT1:
 	case CELL_GCM_TEXTURE_COMPRESSED_DXT23:
 	case CELL_GCM_TEXTURE_COMPRESSED_DXT45:
@@ -458,107 +458,122 @@ namespace gl
 		const std::vector<rsx_subresource_layout> &input_layouts, bool is_swizzled, GLenum gl_format, GLenum gl_type, std::vector<gsl::byte>& staging_buffer)
 	{
 		int mip_level = 0;
-		bool vtc_support = gl::get_driver_caps().vendor_NVIDIA;
+		texture_uploader_capabilities caps{ true, false, 4 };
 
-		if (is_compressed_format(format))
+		pixel_unpack_settings unpack_settings;
+		unpack_settings.row_length(0).alignment(4);
+
+		if (LIKELY(is_compressed_format(format)))
 		{
 			//Compressed formats have a 4-byte alignment
 			//TODO: Verify that samplers are not affected by the padding
 			width = align(width, 4);
 			height = align(height, 4);
-		}
 
-		if (dim == rsx::texture_dimension_extended::texture_dimension_1d)
-		{
-			if (!is_compressed_format(format))
+			caps.supports_vtc_decoding = gl::get_driver_caps().vendor_NVIDIA;
+			unpack_settings.apply();
+
+			for (const rsx_subresource_layout& layout : input_layouts)
 			{
-				for (const rsx_subresource_layout &layout : input_layouts)
+				upload_texture_subresource(staging_buffer, layout, format, is_swizzled, caps);
+
+				switch (dim)
 				{
-					upload_texture_subresource(staging_buffer, layout, format, is_swizzled, vtc_support, 4);
-					glTexSubImage1D(GL_TEXTURE_1D, mip_level++, 0, layout.width_in_block, gl_format, gl_type, staging_buffer.data());
-				}
-			}
-			else
-			{
-				for (const rsx_subresource_layout &layout : input_layouts)
+				case rsx::texture_dimension_extended::texture_dimension_1d:
 				{
 					u32 size = layout.width_in_block * ((format == CELL_GCM_TEXTURE_COMPRESSED_DXT1) ? 8 : 16);
-					upload_texture_subresource(staging_buffer, layout, format, is_swizzled, vtc_support, 4);
 					glCompressedTexSubImage1D(GL_TEXTURE_1D, mip_level++, 0, layout.width_in_block * 4, gl_format, size, staging_buffer.data());
+					break;
 				}
-			}
-			return;
-		}
-
-		if (dim == rsx::texture_dimension_extended::texture_dimension_2d)
-		{
-			if (!is_compressed_format(format))
-			{
-				for (const rsx_subresource_layout &layout : input_layouts)
-				{
-					upload_texture_subresource(staging_buffer, layout, format, is_swizzled, vtc_support, 4);
-					glTexSubImage2D(GL_TEXTURE_2D, mip_level++, 0, 0, layout.width_in_block, layout.height_in_block, gl_format, gl_type, staging_buffer.data());
-				}
-			}
-			else
-			{
-				for (const rsx_subresource_layout &layout : input_layouts)
+				case rsx::texture_dimension_extended::texture_dimension_2d:
 				{
 					u32 size = layout.width_in_block * layout.height_in_block * ((format == CELL_GCM_TEXTURE_COMPRESSED_DXT1) ? 8 : 16);
-					upload_texture_subresource(staging_buffer, layout, format, is_swizzled, vtc_support, 4);
 					glCompressedTexSubImage2D(GL_TEXTURE_2D, mip_level++, 0, 0, layout.width_in_block * 4, layout.height_in_block * 4, gl_format, size, staging_buffer.data());
+					break;
 				}
-			}
-			return;
-		}
-
-		if (dim == rsx::texture_dimension_extended::texture_dimension_cubemap)
-		{
-			// Note : input_layouts size is get_exact_mipmap_count() for non cubemap texture, and 6 * get_exact_mipmap_count() for cubemap
-			// Thus for non cubemap texture, mip_level / mipmap_per_layer will always be rounded to 0.
-			// mip_level % mipmap_per_layer will always be equal to mip_level
-			if (!is_compressed_format(format))
-			{
-				for (const rsx_subresource_layout &layout : input_layouts)
-				{
-					upload_texture_subresource(staging_buffer, layout, format, is_swizzled, vtc_support, 4);
-					glTexSubImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X + mip_level / mipmap_count, mip_level % mipmap_count, 0, 0, layout.width_in_block, layout.height_in_block, gl_format, gl_type, staging_buffer.data());
-					mip_level++;
-				}
-			}
-			else
-			{
-				for (const rsx_subresource_layout &layout : input_layouts)
+				case rsx::texture_dimension_extended::texture_dimension_cubemap:
 				{
+					// Note : input_layouts size is get_exact_mipmap_count() for non cubemap texture, and 6 * get_exact_mipmap_count() for cubemap
+					// Thus for non cubemap texture, mip_level / mipmap_per_layer will always be rounded to 0.
+					// mip_level % mipmap_per_layer will always be equal to mip_level
 					u32 size = layout.width_in_block * layout.height_in_block * ((format == CELL_GCM_TEXTURE_COMPRESSED_DXT1) ? 8 : 16);
-					upload_texture_subresource(staging_buffer, layout, format, is_swizzled, vtc_support, 4);
 					glCompressedTexSubImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X + mip_level / mipmap_count, mip_level % mipmap_count, 0, 0, layout.width_in_block * 4, layout.height_in_block * 4, gl_format, size, staging_buffer.data());
 					mip_level++;
+					break;
 				}
-			}
-			return;
-		}
-
-		if (dim == rsx::texture_dimension_extended::texture_dimension_3d)
-		{
-			if (!is_compressed_format(format))
-			{
-				for (const rsx_subresource_layout &layout : input_layouts)
-				{
-					upload_texture_subresource(staging_buffer, layout, format, is_swizzled, vtc_support, 4);
-					glTexSubImage3D(GL_TEXTURE_3D, mip_level++, 0, 0, 0, layout.width_in_block, layout.height_in_block, depth, gl_format, gl_type, staging_buffer.data());
-				}
-			}
-			else
-			{
-				for (const rsx_subresource_layout &layout : input_layouts)
+				case rsx::texture_dimension_extended::texture_dimension_3d:
 				{
 					u32 size = layout.width_in_block * layout.height_in_block * layout.depth * ((format == CELL_GCM_TEXTURE_COMPRESSED_DXT1) ? 8 : 16);
-					upload_texture_subresource(staging_buffer, layout, format, is_swizzled, vtc_support, 4);
 					glCompressedTexSubImage3D(GL_TEXTURE_3D, mip_level++, 0, 0, 0, layout.width_in_block * 4, layout.height_in_block * 4, layout.depth, gl_format, size, staging_buffer.data());
+					break;
+				}
+				default:
+				{
+					ASSUME(0);
+					fmt::throw_exception("Unreachable" HERE);
+				}
+				}
+			}
+		}
+		else
+		{
+			bool apply_settings = true;
+			switch (gl_type)
+			{
+			case GL_UNSIGNED_INT_8_8_8_8:
+				// NOTE: GL_UNSIGNED_INT_8_8_8_8 is already a swapped type
+				// TODO: Remove reliance on format and type checks when compute acceleration is implemented
+				apply_settings = false;
+				break;
+			case GL_BYTE:
+			case GL_UNSIGNED_BYTE:
+				// Multi-channel format uploaded one byte at a time. This is due to poor driver support for formats like GL_UNSIGNED_SHORT_8_8
+				// Do byteswapping in software for now until compute acceleration is available
+				apply_settings = (gl_format == GL_RED);
+				caps.supports_byteswap = apply_settings;
+				break;
+			default:
+				break;
+			}
+
+			if (!apply_settings)
+			{
+				unpack_settings.apply();
+			}
+
+			for (const rsx_subresource_layout& layout : input_layouts)
+			{
+				auto op = upload_texture_subresource(staging_buffer, layout, format, is_swizzled, caps);
+				if (apply_settings)
+				{
+					unpack_settings.swap_bytes(op.require_swap);
+					unpack_settings.apply();
+					apply_settings = false;
+				}
+
+				switch (dim)
+				{
+				case rsx::texture_dimension_extended::texture_dimension_1d:
+					glTexSubImage1D(GL_TEXTURE_1D, mip_level++, 0, layout.width_in_block, gl_format, gl_type, staging_buffer.data());
+					break;
+				case rsx::texture_dimension_extended::texture_dimension_2d:
+					glTexSubImage2D(GL_TEXTURE_2D, mip_level++, 0, 0, layout.width_in_block, layout.height_in_block, gl_format, gl_type, staging_buffer.data());
+					break;
+				case rsx::texture_dimension_extended::texture_dimension_cubemap:
+					// Note : input_layouts size is get_exact_mipmap_count() for non cubemap texture, and 6 * get_exact_mipmap_count() for cubemap
+					// Thus for non cubemap texture, mip_level / mipmap_per_layer will always be rounded to 0.
+					// mip_level % mipmap_per_layer will always be equal to mip_level
+					glTexSubImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X + mip_level / mipmap_count, mip_level % mipmap_count, 0, 0, layout.width_in_block, layout.height_in_block, gl_format, gl_type, staging_buffer.data());
+					mip_level++;
+					break;
+				case rsx::texture_dimension_extended::texture_dimension_3d:
+					glTexSubImage3D(GL_TEXTURE_3D, mip_level++, 0, 0, 0, layout.width_in_block, layout.height_in_block, depth, gl_format, gl_type, staging_buffer.data());
+					break;
+				default:
+					ASSUME(0);
+					fmt::throw_exception("Unreachable" HERE);
+				}
+			}
-			return;
 		}
 	}
 
@@ -615,9 +630,6 @@ namespace gl
 		}
 
 		glBindTexture(target, id);
-		glPixelStorei(GL_UNPACK_ALIGNMENT, 4);
-		glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
-		glPixelStorei(GL_UNPACK_SWAP_BYTES, GL_FALSE);
 		glTexParameteri(target, GL_TEXTURE_BASE_LEVEL, 0);
 		glTexParameteri(target, GL_TEXTURE_MAX_LEVEL, mipmaps - 1);
 
 		// The rest of sampler state is now handled by sampler state objects
@@ -627,6 +639,7 @@ namespace gl
 		size_t texture_data_sz = depth * height * aligned_pitch;
 		std::vector<gsl::byte> data_upload_buf(texture_data_sz);
 
+		// TODO: GL drivers support byteswapping and this should be used instead of doing so manually
 		const auto format_type = get_format_type(gcm_format);
 		const GLenum gl_format = std::get<0>(format_type);
 		const GLenum gl_type = std::get<1>(format_type);
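Note: on the GL path the swap never touches a shader; it rides on pixel-store state, which is also why the unconditional `glPixelStorei` calls could be dropped from the creation path above. What `unpack_settings.swap_bytes(op.require_swap).apply()` amounts to in raw GL (a sketch; `pixel_unpack_settings` is the emulator's wrapper around this state):

```cpp
// Equivalent raw GL unpack state for the settings applied per upload.
glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
glPixelStorei(GL_UNPACK_ALIGNMENT, 4);
glPixelStorei(GL_UNPACK_SWAP_BYTES, op.require_swap ? GL_TRUE : GL_FALSE);
```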
diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp
index 82307a107f..9e5b9f82e4 100644
--- a/rpcs3/Emu/RSX/VK/VKTexture.cpp
+++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp
@@ -516,19 +516,26 @@ namespace vk
 		u32 block_in_pixel = get_format_block_size_in_texel(format);
 		u8 block_size_in_bytes = get_format_block_size_in_bytes(format);
 
+		texture_uploader_capabilities caps{ true, false, heap_align };
+		vk::buffer* scratch_buf = nullptr;
+		u32 scratch_offset = 0;
+
 		for (const rsx_subresource_layout &layout : subresource_layout)
 		{
 			u32 row_pitch = (((layout.width_in_block * block_size_in_bytes) + heap_align - 1) / heap_align) * heap_align;
 			if (heap_align != 256) verify(HERE), row_pitch == heap_align;
 			u32 image_linear_size = row_pitch * layout.height_in_block * layout.depth;
 
-			//Map with extra padding bytes in case of realignment
+			// Map with extra padding bytes in case of realignment
 			size_t offset_in_buffer = upload_heap.alloc<512>(image_linear_size + 8);
 			void *mapped_buffer = upload_heap.map(offset_in_buffer, image_linear_size + 8);
 			VkBuffer buffer_handle = upload_heap.heap->value;
 
+			// Only do GPU-side conversion if occupancy is good
+			caps.supports_byteswap = (image_linear_size >= 1024);
+
 			gsl::span<gsl::byte> mapped{ (gsl::byte*)mapped_buffer, ::narrow<int>(image_linear_size) };
-			upload_texture_subresource(mapped, layout, format, is_swizzled, false, heap_align);
+			auto opt = upload_texture_subresource(mapped, layout, format, is_swizzled, caps);
 			upload_heap.unmap();
 
 			VkBufferImageCopy copy_info = {};
@@ -542,25 +549,61 @@ namespace vk
 			copy_info.imageSubresource.mipLevel = mipmap_level % mipmap_count;
 			copy_info.bufferRowLength = block_in_pixel * row_pitch / block_size_in_bytes;
 
-			if (dst_image->info.format == VK_FORMAT_D24_UNORM_S8_UINT ||
-				dst_image->info.format == VK_FORMAT_D32_SFLOAT_S8_UINT)
+			if (opt.require_swap || dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
 			{
-				// Executing GPU tasks on host_visible RAM is awful, copy to device-local buffer instead
-				auto scratch_buf = vk::get_scratch_buffer();
+				if (!scratch_buf)
+				{
+					scratch_buf = vk::get_scratch_buffer();
+				}
+				else if ((scratch_offset + image_linear_size) > scratch_buf->size())
+				{
+					scratch_offset = 0;
+					insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, scratch_buf->size(), VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
+						VK_ACCESS_TRANSFER_READ_BIT, VK_ACCESS_TRANSFER_WRITE_BIT);
+				}
 
 				VkBufferCopy copy = {};
 				copy.srcOffset = offset_in_buffer;
-				copy.dstOffset = 0;
+				copy.dstOffset = scratch_offset;
 				copy.size = image_linear_size;
 				vkCmdCopyBuffer(cmd, buffer_handle, scratch_buf->value, 1, &copy);
 
-				insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, image_linear_size, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
-					VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
+				insert_buffer_memory_barrier(cmd, scratch_buf->value, scratch_offset, image_linear_size, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+					VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT);
+			}
 
-				copy_info.bufferOffset = 0;
+			if (opt.require_swap)
+			{
+				if (opt.element_size == 4)
+				{
+					vk::get_compute_task<vk::cs_shuffle_32>()->run(cmd, scratch_buf, image_linear_size, scratch_offset);
+				}
+				else if (opt.element_size == 2)
+				{
+					vk::get_compute_task<vk::cs_shuffle_16>()->run(cmd, scratch_buf, image_linear_size, scratch_offset);
+				}
+				else
+				{
+					fmt::throw_exception("Unreachable" HERE);
+				}
+			}
+
+			if (dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
+			{
+				copy_info.bufferOffset = scratch_offset;
+				scratch_offset = align(scratch_offset + image_linear_size, 512);
 				vk::copy_buffer_to_image(cmd, scratch_buf, dst_image, copy_info);
 			}
+			else if (opt.require_swap)
+			{
+				insert_buffer_memory_barrier(cmd, scratch_buf->value, scratch_offset, image_linear_size, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
+					VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
+
+				copy_info.bufferOffset = scratch_offset;
+				scratch_offset = align(scratch_offset + image_linear_size, 512);
+				vkCmdCopyBufferToImage(cmd, scratch_buf->value, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy_info);
+			}
 			else
 			{
 				vkCmdCopyBufferToImage(cmd, buffer_handle, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy_info);
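Note: per subresource the Vulkan path now chains copy to scratch, barrier, cs_shuffle_16/32 dispatch, barrier, image copy, sub-allocating from one device-local scratch buffer instead of resetting to offset 0 each time. The allocation policy, condensed into a hypothetical helper for clarity (not part of the patch):

```cpp
#include <cstdint>

// Linear sub-allocation with wrap-to-zero; reuse of the old region is
// serialized by the TRANSFER->TRANSFER barrier emitted in the hunk above.
std::uint32_t alloc_scratch_slot(std::uint32_t& scratch_offset, std::uint32_t size, std::uint32_t scratch_size)
{
	if (scratch_offset + size > scratch_size)
		scratch_offset = 0; // wrap; caller must barrier before overwriting

	const std::uint32_t slot = scratch_offset;
	scratch_offset = (slot + size + 511) & ~std::uint32_t{511}; // same 512-byte step the patch uses
	return slot;
}
```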