diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp index 5d391a7b46..df77c36aa6 100644 --- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp +++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp @@ -276,4 +276,16 @@ void write_index_array_data_to_buffer(char* dst, unsigned m_draw_mode, unsigned return; } } +} + +void stream_vector(void *dst, u32 x, u32 y, u32 z, u32 w) noexcept +{ + __m128i vector = _mm_set_epi32(w, z, y, x); /* _mm_set_epi32 takes its lanes from highest to lowest, so x ends up in the lowest 32 bits */ + _mm_stream_si128((__m128i*)dst, vector); /* non-temporal store: dst must be 16-byte aligned */ +} + +void stream_vector_from_memory(void *dst, void *src) noexcept +{ + const __m128i &vector = _mm_loadu_si128((__m128i*)src); /* unaligned load: src has no alignment requirement */ + _mm_stream_si128((__m128i*)dst, vector); } \ No newline at end of file diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.h b/rpcs3/Emu/RSX/Common/BufferUtils.h index 106825b3ad..05186ab7b2 100644 --- a/rpcs3/Emu/RSX/Common/BufferUtils.h +++ b/rpcs3/Emu/RSX/Common/BufferUtils.h @@ -42,4 +42,14 @@ void write_index_array_data_to_buffer(char* dst, unsigned m_draw_mode, unsigned /** * Write index data needed to emulate non indexed non native primitive mode. */ -void write_index_array_for_non_indexed_non_native_primitive_to_buffer(char* dst, unsigned m_draw_mode, unsigned first, unsigned count) noexcept; \ No newline at end of file +void write_index_array_for_non_indexed_non_native_primitive_to_buffer(char* dst, unsigned m_draw_mode, unsigned first, unsigned count) noexcept; + +/** + * Stream the 128-bit vector (x, y, z, w) to dst with a non-temporal store; x is written to the lowest 32 bits and w to the highest. dst must be 16-byte aligned. + */ +void stream_vector(void *dst, u32 x, u32 y, u32 z, u32 w) noexcept; + +/** + * Stream a 128 bits vector from src to dst. 
+ */ +void stream_vector_from_memory(void *dst, void *src) noexcept; diff --git a/rpcs3/Emu/RSX/Common/ProgramStateCache.h b/rpcs3/Emu/RSX/Common/ProgramStateCache.h index 7182796b5c..cfb07ff44d 100644 --- a/rpcs3/Emu/RSX/Common/ProgramStateCache.h +++ b/rpcs3/Emu/RSX/Common/ProgramStateCache.h @@ -337,6 +337,36 @@ public: return result; } + size_t get_fragment_constants_buffer_size(const RSXFragmentProgram *fragmentShader) const noexcept + { /* Returns the byte size needed to hold the program's fragment constants; logs an error and returns 0 when the program is not in the cache. */ + typename binary2FS::const_iterator It = m_cacheFS.find(vm::base(fragmentShader->addr)); + if (It != m_cacheFS.end()) + return It->second.FragmentConstantOffsetCache.size() * 4 * sizeof(float); /* 16 bytes (one float4) per cached constant offset */ + LOG_ERROR(RSX, "Can't retrieve constant offset cache"); + return 0; + } + + void fill_fragment_constans_buffer(void *buffer, const RSXFragmentProgram *fragment_program) const noexcept + { /* Copies every cached fragment constant (16 bytes each) into buffer back to back; silently does nothing when the program is not in the cache. */ + typename binary2FS::const_iterator It = m_cacheFS.find(vm::base(fragment_program->addr)); + if (It == m_cacheFS.end()) + return; + __m128i mask = _mm_set_epi8(0xE, 0xF, 0xC, 0xD, + 0xA, 0xB, 0x8, 0x9, + 0x6, 0x7, 0x4, 0x5, + 0x2, 0x3, 0x0, 0x1); /* shuffle control that swaps the two bytes of every 16-bit word (output byte 0 takes input byte 1, output byte 1 takes input byte 0, and so on) */ + + size_t offset = 0; + for (size_t offset_in_fragment_program : It->second.FragmentConstantOffsetCache) + { + void *data = vm::base(fragment_program->addr + (u32)offset_in_fragment_program); + const __m128i &vector = _mm_loadu_si128((__m128i*)data); + const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask); + _mm_stream_si128((__m128i*)((char*)buffer + offset), shuffled_vector); /* non-temporal store: buffer + offset must be 16-byte aligned */ + offset += 4 * sizeof(u32); + } + } + const std::vector &getFragmentConstantOffsetsCache(const RSXFragmentProgram *fragmentShader) const { typename binary2FS::const_iterator It = m_cacheFS.find(vm::base(fragmentShader->addr)); diff --git a/rpcs3/Emu/RSX/D3D12/D3D12Buffer.cpp b/rpcs3/Emu/RSX/D3D12/D3D12Buffer.cpp index 1313df4f1f..09a0a685d6 100644 --- a/rpcs3/Emu/RSX/D3D12/D3D12Buffer.cpp +++ b/rpcs3/Emu/RSX/D3D12/D3D12Buffer.cpp @@ -142,30 +142,6 @@ void D3D12GSRender::load_vertex_index_data(u32 first, u32 count) void 
D3D12GSRender::upload_and_bind_scale_offset_matrix(size_t descriptorIndex) { - float scale_offset_matrix[16] = - { - 1.0f, 0.0f, 0.0f, 0.0f, - 0.0f, -1.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 1.0f, 0.0f, - 0.0f, 0.0f, 0.0f, 1.0f - }; - - int clip_w = rsx::method_registers[NV4097_SET_SURFACE_CLIP_HORIZONTAL] >> 16; - int clip_h = rsx::method_registers[NV4097_SET_SURFACE_CLIP_VERTICAL] >> 16; - - // Scale - scale_offset_matrix[0] *= (float&)rsx::method_registers[NV4097_SET_VIEWPORT_SCALE] / (clip_w / 2.f); - scale_offset_matrix[5] *= (float&)rsx::method_registers[NV4097_SET_VIEWPORT_SCALE + 1] / (clip_h / 2.f); - scale_offset_matrix[10] = (float&)rsx::method_registers[NV4097_SET_VIEWPORT_SCALE + 2]; - - // Offset - scale_offset_matrix[3] = (float&)rsx::method_registers[NV4097_SET_VIEWPORT_OFFSET] - (clip_w / 2.f); - scale_offset_matrix[7] = -((float&)rsx::method_registers[NV4097_SET_VIEWPORT_OFFSET + 1] - (clip_h / 2.f)); - scale_offset_matrix[11] = (float&)rsx::method_registers[NV4097_SET_VIEWPORT_OFFSET + 2]; - - scale_offset_matrix[3] /= clip_w / 2.f; - scale_offset_matrix[7] /= clip_h / 2.f; - assert(m_constantsData.can_alloc(256)); size_t heap_offset = m_constantsData.alloc(256); @@ -173,7 +149,7 @@ void D3D12GSRender::upload_and_bind_scale_offset_matrix(size_t descriptorIndex) // Separate constant buffer void *mapped_buffer; ThrowIfFailed(m_constantsData.m_heap->Map(0, &CD3DX12_RANGE(heap_offset, heap_offset + 256), &mapped_buffer)); - streamToBuffer((char*)mapped_buffer + heap_offset, scale_offset_matrix, 16 * sizeof(float)); + fill_scale_offset_data((char*)mapped_buffer + heap_offset); int is_alpha_tested = !!(rsx::method_registers[NV4097_SET_ALPHA_TEST_ENABLE]); float alpha_ref = (float&)rsx::method_registers[NV4097_SET_ALPHA_REF]; memcpy((char*)mapped_buffer + heap_offset + 16 * sizeof(float), &is_alpha_tested, sizeof(int)); @@ -191,9 +167,6 @@ void D3D12GSRender::upload_and_bind_scale_offset_matrix(size_t descriptorIndex) void 
D3D12GSRender::upload_and_bind_vertex_shader_constants(size_t descriptor_index) { - for (const auto &entry : transform_constants) - local_transform_constants[entry.first] = entry.second; - size_t buffer_size = 512 * 4 * sizeof(float); assert(m_constantsData.can_alloc(buffer_size)); @@ -201,16 +174,7 @@ void D3D12GSRender::upload_and_bind_vertex_shader_constants(size_t descriptor_in void *mapped_buffer; ThrowIfFailed(m_constantsData.m_heap->Map(0, &CD3DX12_RANGE(heap_offset, heap_offset + buffer_size), &mapped_buffer)); - for (const auto &entry : local_transform_constants) - { - float data[4] = { - entry.second.x, - entry.second.y, - entry.second.z, - entry.second.w - }; - streamToBuffer((char*)mapped_buffer + heap_offset + entry.first * 4 * sizeof(float), data, 4 * sizeof(float)); - } + fill_vertex_program_constants_data((char*)mapped_buffer + heap_offset); m_constantsData.m_heap->Unmap(0, &CD3DX12_RANGE(heap_offset, heap_offset + buffer_size)); D3D12_CONSTANT_BUFFER_VIEW_DESC constant_buffer_view_desc = { @@ -225,8 +189,7 @@ void D3D12GSRender::upload_and_bind_vertex_shader_constants(size_t descriptor_in void D3D12GSRender::upload_and_bind_fragment_shader_constants(size_t descriptor_index) { // Get constant from fragment program - const std::vector &fragment_constant_offsets = m_cachePSO.getFragmentConstantOffsetsCache(&fragment_program); - size_t buffer_size = fragment_constant_offsets.size() * 4 * sizeof(float) + 1; + size_t buffer_size = m_cachePSO.get_fragment_constants_buffer_size(&fragment_program) + 1; /* + 1 as in the removed line: the getter returns 0 for a program with no constants or a cache miss, and the rounding below must never produce 0 */ // Multiple of 256 never 0 buffer_size = (buffer_size + 255) & ~255; @@ -236,24 +199,7 @@ void D3D12GSRender::upload_and_bind_fragment_shader_constants(size_t descriptor_ size_t offset = 0; void *mapped_buffer; ThrowIfFailed(m_constantsData.m_heap->Map(0, &CD3DX12_RANGE(heap_offset, heap_offset + buffer_size), &mapped_buffer)); - for (size_t offset_in_fragment_program : fragment_constant_offsets) - { - u32 vector[4]; - auto data = 
vm::ps3::ptr::make(fragment_program.addr + (u32)offset_in_fragment_program); - - u32 c0 = (data[0] >> 16 | data[0] << 16); - u32 c1 = (data[1] >> 16 | data[1] << 16); - u32 c2 = (data[2] >> 16 | data[2] << 16); - u32 c3 = (data[3] >> 16 | data[3] << 16); - - vector[0] = c0; - vector[1] = c1; - vector[2] = c2; - vector[3] = c3; - - streamToBuffer((char*)mapped_buffer + heap_offset + offset, vector, 4 * sizeof(u32)); - offset += 4 * sizeof(u32); - } + m_cachePSO.fill_fragment_constans_buffer((char*)mapped_buffer + heap_offset, &fragment_program); m_constantsData.m_heap->Unmap(0, &CD3DX12_RANGE(heap_offset, heap_offset + buffer_size)); D3D12_CONSTANT_BUFFER_VIEW_DESC constant_buffer_view_desc = { diff --git a/rpcs3/Emu/RSX/D3D12/D3D12GSRender.cpp b/rpcs3/Emu/RSX/D3D12/D3D12GSRender.cpp index a8bfbaf469..62d60fe461 100644 --- a/rpcs3/Emu/RSX/D3D12/D3D12GSRender.cpp +++ b/rpcs3/Emu/RSX/D3D12/D3D12GSRender.cpp @@ -672,7 +672,6 @@ void D3D12GSRender::flip(int buffer) storage.uav_heap_get_pos = m_UAVHeap.get_current_put_pos_minus_one(); // Flush - local_transform_constants.clear(); m_texturesRTTs.clear(); // Now get ready for next frame diff --git a/rpcs3/Emu/RSX/D3D12/D3D12GSRender.h b/rpcs3/Emu/RSX/D3D12/D3D12GSRender.h index a8cf6a0ec8..d316a98ff3 100644 --- a/rpcs3/Emu/RSX/D3D12/D3D12GSRender.h +++ b/rpcs3/Emu/RSX/D3D12/D3D12GSRender.h @@ -72,7 +72,6 @@ private: RSXFragmentProgram fragment_program; PipelineStateObjectCache m_cachePSO; std::tuple, size_t> *m_PSO; - std::unordered_map local_transform_constants; struct { diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index e6f765d9d5..906e5e09eb 100644 --- a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -586,6 +586,7 @@ namespace rsx std::this_thread::sleep_for(std::chrono::milliseconds((s64)(1000.0 / limit - rsx->timer_sync.GetElapsedTimeInMilliSec()))); rsx->timer_sync.Start(); + rsx->local_transform_constants.clear(); } void user_command(thread* rsx, u32 arg) @@ -1024,6 
+1025,42 @@ namespace rsx onexit_thread(); } + void thread::fill_scale_offset_data(void *buffer) const noexcept + { /* Writes four float4 rows (64 bytes) to buffer: the X, Y and Z scales sit on the diagonal, the matching offsets in the 4th component of rows 0-2, and the last row is (0, 0, 0, 1). */ + int clip_w = rsx::method_registers[NV4097_SET_SURFACE_CLIP_HORIZONTAL] >> 16; + int clip_h = rsx::method_registers[NV4097_SET_SURFACE_CLIP_VERTICAL] >> 16; + + float scale_x = (float&)rsx::method_registers[NV4097_SET_VIEWPORT_SCALE] / (clip_w / 2.f); + float offset_x = (float&)rsx::method_registers[NV4097_SET_VIEWPORT_OFFSET] - (clip_w / 2.f); + offset_x /= clip_w / 2.f; /* scale and offset are both expressed relative to half the clip width */ + + float scale_y = -(float&)rsx::method_registers[NV4097_SET_VIEWPORT_SCALE + 1] / (clip_h / 2.f); /* the Y scale and offset are negated, unlike their X counterparts */ + float offset_y = -((float&)rsx::method_registers[NV4097_SET_VIEWPORT_OFFSET + 1] - (clip_h / 2.f)); + offset_y /= clip_h / 2.f; + + float scale_z = (float&)rsx::method_registers[NV4097_SET_VIEWPORT_SCALE + 2]; /* Z scale and offset are used as stored, without clip normalisation */ + float offset_z = (float&)rsx::method_registers[NV4097_SET_VIEWPORT_OFFSET + 2]; + + float one = 1.f; /* its bit pattern is passed as a u32 below */ + + stream_vector(buffer, (u32&)scale_x, 0, 0, (u32&)offset_x); + stream_vector((char*)buffer + 16, 0, (u32&)scale_y, 0, (u32&)offset_y); + stream_vector((char*)buffer + 32, 0, 0, (u32&)scale_z, (u32&)offset_z); + stream_vector((char*)buffer + 48, 0, 0, 0, (u32&)one); + } + + /** + * Fill buffer with vertex program constants. + * Buffer must be at least 512 float4 wide. 
+ */ + void thread::fill_vertex_program_constants_data(void *buffer) noexcept + { + for (const auto &entry : transform_constants) + local_transform_constants[entry.first] = entry.second; /* merge the current constants into the persistent map, overwriting entries with the same index */ + for (const auto &entry : local_transform_constants) + stream_vector_from_memory((char*)buffer + entry.first * 4 * sizeof(float), (void*)entry.second.rgba); /* each constant is written at byte offset index * 16 */ + } + u64 thread::timestamp() const { // Get timestamp, and convert it from microseconds to nanoseconds diff --git a/rpcs3/Emu/RSX/RSXThread.h b/rpcs3/Emu/RSX/RSXThread.h index 2131b26d82..4e1d8aa824 100644 --- a/rpcs3/Emu/RSX/RSXThread.h +++ b/rpcs3/Emu/RSX/RSXThread.h @@ -161,6 +161,9 @@ namespace rsx std::unordered_map> transform_constants; + // Constants stored for the whole frame; fill_vertex_program_constants_data merges transform_constants into this map + std::unordered_map local_transform_constants; + u32 transform_program[512 * 4] = {}; virtual void load_vertex_data(u32 first, u32 count); @@ -218,6 +221,18 @@ namespace rsx void task(); + /** + * Fill buffer with a 4x4 scale offset matrix (16 floats, 64 bytes). + * The vertex shader's position is to be multiplied by this matrix. + */ + void fill_scale_offset_data(void *buffer) const noexcept; + + /** + * Fill buffer with vertex program constants; each constant is written at byte offset index * 16. + * Buffer must have room for at least 512 float4 (8192 bytes). + */ + void fill_vertex_program_constants_data(void *buffer) noexcept; + public: void reset(); void init(const u32 ioAddress, const u32 ioSize, const u32 ctrlAddress, const u32 localAddress);