diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp index f5a95095ab..5d799b3421 100644 --- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp +++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp @@ -139,17 +139,105 @@ namespace if (remaining) { - auto src_ptr2 = reinterpret_cast(src_ptr); - auto dst_ptr2 = reinterpret_cast(dst_ptr); + const auto src_ptr2 = reinterpret_cast*>(src_ptr); + const auto dst_ptr2 = reinterpret_cast*>(dst_ptr); for (u32 i = 0; i < remaining; ++i) - dst_ptr2[i] = se_storage::swap(src_ptr2[i]); + dst_ptr2[i] = src_ptr2[i]; } } template void stream_data_to_memory_swapped_u32(void *, const void *, u32, u8); template void stream_data_to_memory_swapped_u32(void*, const void*, u32, u8); + template + bool stream_data_to_memory_swapped_and_compare_u32(void *dst, const void *src, u32 size) + { + const __m128i mask = _mm_set_epi8( + 0xC, 0xD, 0xE, 0xF, + 0x8, 0x9, 0xA, 0xB, + 0x4, 0x5, 0x6, 0x7, + 0x0, 0x1, 0x2, 0x3); + + auto dst_ptr = static_cast<__m128i*>(dst); + auto src_ptr = static_cast(src); + + const u32 dword_count = size >> 2; + const u32 iterations = dword_count >> 2; + + v128 bits_diff{}; + + if (s_use_ssse3) [[likely]] + { + for (u32 i = 0; i < iterations; ++i) + { + const __m128i vector = _mm_loadu_si128(src_ptr); + const __m128i shuffled_vector = ssse3_shuffle_epi8(vector, mask); + + if constexpr (!unaligned) + { + bits_diff = bits_diff | v128::fromV(_mm_xor_si128(_mm_load_si128(dst_ptr), shuffled_vector)); + _mm_stream_si128(dst_ptr, shuffled_vector); + } + else + { + bits_diff = bits_diff | v128::fromV(_mm_xor_si128(_mm_loadu_si128(dst_ptr), shuffled_vector)); + _mm_storeu_si128(dst_ptr, shuffled_vector); + } + + src_ptr++; + dst_ptr++; + } + } + else + { + for (u32 i = 0; i < iterations; ++i) + { + const __m128i vec0 = _mm_loadu_si128(src_ptr); + const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8)); + const __m128i vec2 = _mm_or_si128(_mm_slli_epi32(vec1, 16), _mm_srli_epi32(vec1, 16)); + + if constexpr (!unaligned) + { + bits_diff = bits_diff | v128::fromV(_mm_xor_si128(_mm_load_si128(dst_ptr), vec2)); + _mm_stream_si128(dst_ptr, vec2); + } + else + { + bits_diff = bits_diff | v128::fromV(_mm_xor_si128(_mm_loadu_si128(dst_ptr), vec2)); + _mm_storeu_si128(dst_ptr, vec2); + } + + src_ptr++; + dst_ptr++; + } + } + + const u32 remaining = dword_count % 4; + + if (remaining) + { + const auto src_ptr2 = reinterpret_cast*>(src_ptr); + const auto dst_ptr2 = reinterpret_cast*>(dst_ptr); + + for (u32 i = 0; i < remaining; ++i) + { + const u32 data = src_ptr2[i]; + + if (dst_ptr2[i] != data) + { + dst_ptr2[i] = data; + bits_diff._u32[0] = UINT32_MAX; + } + } + } + + return bits_diff != v128{}; + } + + template bool stream_data_to_memory_swapped_and_compare_u32(void *dst, const void *src, u32 size); + template bool stream_data_to_memory_swapped_and_compare_u32(void *dst, const void *src, u32 size); + namespace { inline void stream_data_to_memory_swapped_u16(void *dst, const void *src, u32 vertex_count, u8 stride) @@ -194,11 +282,11 @@ namespace if (remaining) { - auto src_ptr2 = reinterpret_cast(src_ptr); - auto dst_ptr2 = reinterpret_cast(dst_ptr); + auto src_ptr2 = reinterpret_cast*>(src_ptr); + auto dst_ptr2 = reinterpret_cast*>(dst_ptr); for (u32 i = 0; i < remaining; ++i) - dst_ptr2[i] = se_storage::swap(src_ptr2[i]); + dst_ptr2[i] = src_ptr2[i]; } } diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.h b/rpcs3/Emu/RSX/Common/BufferUtils.h index a6b56f711e..175a36f84d 100644 --- a/rpcs3/Emu/RSX/Common/BufferUtils.h +++ b/rpcs3/Emu/RSX/Common/BufferUtils.h @@ -61,5 +61,7 @@ void stream_vector_from_memory(void *dst, void *src); */ template void stream_data_to_memory_swapped_u32(void *dst, const void *src, u32 vertex_count, u8 stride); +template +bool stream_data_to_memory_swapped_and_compare_u32(void *dst, const void *src, u32 size); diff --git a/rpcs3/Emu/RSX/rsx_methods.cpp b/rpcs3/Emu/RSX/rsx_methods.cpp index 4eae1603bb..8bfdbe2d23 100644 --- a/rpcs3/Emu/RSX/rsx_methods.cpp +++ b/rpcs3/Emu/RSX/rsx_methods.cpp @@ -422,8 +422,6 @@ namespace rsx rcount -= max - (468 * 4); } - alignas(64) u8 buffer[128]; - const auto values = &rsx::method_registers.transform_constants[load + reg][subreg]; if (rsx->m_graphics_state & rsx::pipeline_state::transform_constants_dirty) @@ -433,12 +431,9 @@ namespace rsx } else { - stream_data_to_memory_swapped_u32(buffer, vm::base(rsx->fifo_ctrl->get_current_arg_ptr()), rcount, 4); - - if (std::memcmp(values, buffer, rcount * 4) != 0) + if (stream_data_to_memory_swapped_and_compare_u32(values, vm::base(rsx->fifo_ctrl->get_current_arg_ptr()), rcount * 4)) { // Transform constants invalidation is expensive (~8k bytes per update) - std::memcpy(values, buffer, rcount * 4); rsx->m_graphics_state |= rsx::pipeline_state::transform_constants_dirty; } }