mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-04-19 19:15:26 +00:00
RSX: remove SSSE3 dependency
This commit is contained in:
parent
de5dab35e0
commit
61de20a633
3 changed files with 103 additions and 38 deletions
|
@ -470,7 +470,7 @@ struct se_storage<T, 16, 16>
|
|||
|
||||
// Byte-order swap for a 16-byte value held in a v128.
// NOTE(review): this listing is a rendered commit diff in which old and new
// lines are interleaved. The two return statements below appear to be the
// removed SSSE3 version followed by its replacement, not sequential code;
// confirm against the repository before relying on either one.
static inline v128 swap(const v128& src)
|
||||
{
|
||||
// SSSE3 form: one byte shuffle whose control mask lists indices 0..15 from
// the highest byte down, i.e. result byte k is taken from source byte 15 - k.
return v128::fromV(_mm_shuffle_epi8(src.vi, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)));
|
||||
// Form without _mm_shuffle_epi8: swaps each 64-bit half through
// se_storage<u64>::swap and passes the halves to from64 in exchanged order
// (presumably from64 takes the low half first; verify against v128).
return v128::from64(se_storage<u64>::swap(src._u64[1]), se_storage<u64>::swap(src._u64[0]));
|
||||
}
|
||||
|
||||
static inline v128 to(const T& src)
|
||||
|
|
|
@ -1,9 +1,20 @@
|
|||
#include "stdafx.h"
|
||||
#include "BufferUtils.h"
|
||||
#include "../rsx_methods.h"
|
||||
#include "Utilities/sysinfo.h"
|
||||
|
||||
#define DEBUG_VERTEX_STREAMING 0
|
||||
|
||||
// Flag tested by the copy loops in this file to choose between the
// _mm_shuffle_epi8 path and the shift/or fallback path.
//  - _MSC_VER defined: value comes from utils::has_ssse3() (presumably a
//    runtime CPU feature query; verify in Utilities/sysinfo.h).
//  - otherwise: fixed at compile time by whether __SSSE3__ is non-zero.
const bool s_use_ssse3 =
|
||||
#ifdef _MSC_VER
|
||||
utils::has_ssse3();
|
||||
#elif __SSSE3__
|
||||
true;
|
||||
#else
|
||||
false;
|
||||
// Without SSSE3 enabled the intrinsic is not usable, so it is defined away as
// an empty object-like macro: `_mm_shuffle_epi8(a, b)` then expands to the
// comma expression `(a, b)`, which lets the guarded SSSE3 branches still
// compile. Those branches sit behind `if (s_use_ssse3)`, which is false here.
#define _mm_shuffle_epi8
|
||||
#endif
|
||||
|
||||
namespace
|
||||
{
|
||||
// FIXME: GSL as_span break build if template parameter is non const with current revision.
|
||||
|
@ -49,16 +60,30 @@ namespace
|
|||
const u32 iterations = dword_count >> 2;
|
||||
const u32 remaining = dword_count % 4;
|
||||
|
||||
for (u32 i = 0; i < iterations; ++i)
|
||||
if (LIKELY(s_use_ssse3))
|
||||
{
|
||||
u32 *src_words = (u32*)src_ptr;
|
||||
u32 *dst_words = (u32*)dst_ptr;
|
||||
const __m128i &vector = _mm_loadu_si128(src_ptr);
|
||||
const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask);
|
||||
_mm_stream_si128(dst_ptr, shuffled_vector);
|
||||
for (u32 i = 0; i < iterations; ++i)
|
||||
{
|
||||
const __m128i vector = _mm_loadu_si128(src_ptr);
|
||||
const __m128i shuffled_vector = _mm_shuffle_epi8(vector, mask);
|
||||
_mm_stream_si128(dst_ptr, shuffled_vector);
|
||||
|
||||
src_ptr++;
|
||||
dst_ptr++;
|
||||
src_ptr++;
|
||||
dst_ptr++;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (u32 i = 0; i < iterations; ++i)
|
||||
{
|
||||
const __m128i vec0 = _mm_loadu_si128(src_ptr);
|
||||
const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
|
||||
const __m128i vec2 = _mm_or_si128(_mm_slli_epi32(vec1, 16), _mm_srli_epi32(vec1, 16));
|
||||
_mm_stream_si128(dst_ptr, vec2);
|
||||
|
||||
src_ptr++;
|
||||
dst_ptr++;
|
||||
}
|
||||
}
|
||||
|
||||
if (remaining)
|
||||
|
@ -86,16 +111,29 @@ namespace
|
|||
const u32 iterations = word_count >> 3;
|
||||
const u32 remaining = word_count % 8;
|
||||
|
||||
for (u32 i = 0; i < iterations; ++i)
|
||||
if (LIKELY(s_use_ssse3))
|
||||
{
|
||||
u32 *src_words = (u32*)src_ptr;
|
||||
u32 *dst_words = (u32*)dst_ptr;
|
||||
const __m128i &vector = _mm_loadu_si128(src_ptr);
|
||||
const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask);
|
||||
_mm_stream_si128(dst_ptr, shuffled_vector);
|
||||
for (u32 i = 0; i < iterations; ++i)
|
||||
{
|
||||
const __m128i vector = _mm_loadu_si128(src_ptr);
|
||||
const __m128i shuffled_vector = _mm_shuffle_epi8(vector, mask);
|
||||
_mm_stream_si128(dst_ptr, shuffled_vector);
|
||||
|
||||
src_ptr++;
|
||||
dst_ptr++;
|
||||
src_ptr++;
|
||||
dst_ptr++;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (u32 i = 0; i < iterations; ++i)
|
||||
{
|
||||
const __m128i vec0 = _mm_loadu_si128(src_ptr);
|
||||
const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
|
||||
_mm_stream_si128(dst_ptr, vec1);
|
||||
|
||||
src_ptr++;
|
||||
dst_ptr++;
|
||||
}
|
||||
}
|
||||
|
||||
if (remaining)
|
||||
|
@ -133,14 +171,30 @@ namespace
|
|||
else
|
||||
remainder = vertex_count;
|
||||
|
||||
for (u32 i = 0; i < iterations; ++i)
|
||||
if (LIKELY(s_use_ssse3))
|
||||
{
|
||||
const __m128i &vector = _mm_loadu_si128((__m128i*)src_ptr);
|
||||
const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask);
|
||||
_mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector);
|
||||
for (u32 i = 0; i < iterations; ++i)
|
||||
{
|
||||
const __m128i vector = _mm_loadu_si128((__m128i*)src_ptr);
|
||||
const __m128i shuffled_vector = _mm_shuffle_epi8(vector, mask);
|
||||
_mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector);
|
||||
|
||||
src_ptr += src_stride;
|
||||
dst_ptr += dst_stride;
|
||||
src_ptr += src_stride;
|
||||
dst_ptr += dst_stride;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (u32 i = 0; i < iterations; ++i)
|
||||
{
|
||||
const __m128i vec0 = _mm_loadu_si128((__m128i*)src_ptr);
|
||||
const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
|
||||
const __m128i vec2 = _mm_or_si128(_mm_slli_epi32(vec1, 16), _mm_srli_epi32(vec1, 16));
|
||||
_mm_storeu_si128((__m128i*)dst_ptr, vec2);
|
||||
|
||||
src_ptr += src_stride;
|
||||
dst_ptr += dst_stride;
|
||||
}
|
||||
}
|
||||
|
||||
if (remainder)
|
||||
|
@ -181,14 +235,29 @@ namespace
|
|||
else
|
||||
remainder = vertex_count;
|
||||
|
||||
for (u32 i = 0; i < iterations; ++i)
|
||||
if (LIKELY(s_use_ssse3))
|
||||
{
|
||||
const __m128i &vector = _mm_loadu_si128((__m128i*)src_ptr);
|
||||
const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask);
|
||||
_mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector);
|
||||
for (u32 i = 0; i < iterations; ++i)
|
||||
{
|
||||
const __m128i vector = _mm_loadu_si128((__m128i*)src_ptr);
|
||||
const __m128i shuffled_vector = _mm_shuffle_epi8(vector, mask);
|
||||
_mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector);
|
||||
|
||||
src_ptr += src_stride;
|
||||
dst_ptr += dst_stride;
|
||||
src_ptr += src_stride;
|
||||
dst_ptr += dst_stride;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (u32 i = 0; i < iterations; ++i)
|
||||
{
|
||||
const __m128i vec0 = _mm_loadu_si128((__m128i*)src_ptr);
|
||||
const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
|
||||
_mm_storeu_si128((__m128i*)dst_ptr, vec1);
|
||||
|
||||
src_ptr += src_stride;
|
||||
dst_ptr += dst_stride;
|
||||
}
|
||||
}
|
||||
|
||||
if (remainder)
|
||||
|
|
|
@ -310,24 +310,20 @@ public:
|
|||
const auto I = m_fragment_shader_cache.find(fragment_program);
|
||||
if (I == m_fragment_shader_cache.end())
|
||||
return;
|
||||
__m128i mask = _mm_set_epi8(0xE, 0xF, 0xC, 0xD,
|
||||
0xA, 0xB, 0x8, 0x9,
|
||||
0x6, 0x7, 0x4, 0x5,
|
||||
0x2, 0x3, 0x0, 0x1);
|
||||
|
||||
verify(HERE), (dst_buffer.size_bytes() >= ::narrow<int>(I->second.FragmentConstantOffsetCache.size()) * 16);
|
||||
|
||||
f32* dst = dst_buffer.data();
|
||||
f32 tmp[4];
|
||||
alignas(16) f32 tmp[4];
|
||||
for (size_t offset_in_fragment_program : I->second.FragmentConstantOffsetCache)
|
||||
{
|
||||
void *data = (char*)fragment_program.addr + (u32)offset_in_fragment_program;
|
||||
const __m128i &vector = _mm_loadu_si128((__m128i*)data);
|
||||
const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask);
|
||||
char* data = (char*)fragment_program.addr + (u32)offset_in_fragment_program;
|
||||
const __m128i vector = _mm_loadu_si128((__m128i*)data);
|
||||
const __m128i shuffled_vector = _mm_or_si128(_mm_slli_epi16(vector, 8), _mm_srli_epi16(vector, 8));
|
||||
|
||||
if (!patch_table.is_empty())
|
||||
{
|
||||
_mm_storeu_ps(tmp, (__m128&)shuffled_vector);
|
||||
_mm_store_ps(tmp, _mm_castsi128_ps(shuffled_vector));
|
||||
bool patched;
|
||||
|
||||
for (int i = 0; i < 4; ++i)
|
||||
|
|
Loading…
Add table
Reference in a new issue