RSX: remove SSSE3 dependency

This commit is contained in:
Nekotekina 2017-12-05 23:48:01 +03:00
parent de5dab35e0
commit 61de20a633
3 changed files with 103 additions and 38 deletions

View file

@ -470,7 +470,7 @@ struct se_storage<T, 16, 16>
// Byte-swaps a full 128-bit value without requiring SSSE3.
// Each 64-bit half is swapped individually via se_storage<u64>::swap and the two halves
// are exchanged, which is equivalent to reversing all 16 bytes (what the former
// _mm_shuffle_epi8 with a 0..15 index mask produced).
// NOTE(review): relies on v128::from64 taking its arguments as (_u64[0], _u64[1]) - confirm.
static inline v128 swap(const v128& src)
{
	return v128::from64(se_storage<u64>::swap(src._u64[1]), se_storage<u64>::swap(src._u64[0]));
}
static inline v128 to(const T& src)

View file

@ -1,9 +1,20 @@
#include "stdafx.h"
#include "BufferUtils.h"
#include "../rsx_methods.h"
#include "Utilities/sysinfo.h"
#define DEBUG_VERTEX_STREAMING 0
// Selects between the SSSE3 byte-shuffle path and the SSE2 shift/or fallback.
// - MSVC: evaluated when this global is initialized, via utils::has_ssse3()
//   (presumably a CPU feature query - see Utilities/sysinfo.h).
// - Other compilers: fixed at compile time from the __SSSE3__ predefined macro.
const bool s_use_ssse3 =
#ifdef _MSC_VER
utils::has_ssse3();
#elif defined(__SSSE3__)
true;
#else
false;
// The SSSE3 branches are never taken in this configuration but still have to compile.
// Stub the intrinsic so that a call simply evaluates to its second operand instead of
// relying on an empty macro that degrades the call into a comma expression.
#define _mm_shuffle_epi8(opa, opb) (opb)
#endif
namespace
{
// FIXME: GSL as_span break build if template parameter is non const with current revision.
@ -49,16 +60,30 @@ namespace
const u32 iterations = dword_count >> 2;
const u32 remaining = dword_count % 4;
for (u32 i = 0; i < iterations; ++i)
if (LIKELY(s_use_ssse3))
{
u32 *src_words = (u32*)src_ptr;
u32 *dst_words = (u32*)dst_ptr;
const __m128i &vector = _mm_loadu_si128(src_ptr);
const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask);
_mm_stream_si128(dst_ptr, shuffled_vector);
for (u32 i = 0; i < iterations; ++i)
{
const __m128i vector = _mm_loadu_si128(src_ptr);
const __m128i shuffled_vector = _mm_shuffle_epi8(vector, mask);
_mm_stream_si128(dst_ptr, shuffled_vector);
src_ptr++;
dst_ptr++;
src_ptr++;
dst_ptr++;
}
}
else
{
for (u32 i = 0; i < iterations; ++i)
{
const __m128i vec0 = _mm_loadu_si128(src_ptr);
const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
const __m128i vec2 = _mm_or_si128(_mm_slli_epi32(vec1, 16), _mm_srli_epi32(vec1, 16));
_mm_stream_si128(dst_ptr, vec2);
src_ptr++;
dst_ptr++;
}
}
if (remaining)
@ -86,16 +111,29 @@ namespace
const u32 iterations = word_count >> 3;
const u32 remaining = word_count % 8;
for (u32 i = 0; i < iterations; ++i)
if (LIKELY(s_use_ssse3))
{
u32 *src_words = (u32*)src_ptr;
u32 *dst_words = (u32*)dst_ptr;
const __m128i &vector = _mm_loadu_si128(src_ptr);
const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask);
_mm_stream_si128(dst_ptr, shuffled_vector);
for (u32 i = 0; i < iterations; ++i)
{
const __m128i vector = _mm_loadu_si128(src_ptr);
const __m128i shuffled_vector = _mm_shuffle_epi8(vector, mask);
_mm_stream_si128(dst_ptr, shuffled_vector);
src_ptr++;
dst_ptr++;
src_ptr++;
dst_ptr++;
}
}
else
{
for (u32 i = 0; i < iterations; ++i)
{
const __m128i vec0 = _mm_loadu_si128(src_ptr);
const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
_mm_stream_si128(dst_ptr, vec1);
src_ptr++;
dst_ptr++;
}
}
if (remaining)
@ -133,14 +171,30 @@ namespace
else
remainder = vertex_count;
for (u32 i = 0; i < iterations; ++i)
if (LIKELY(s_use_ssse3))
{
const __m128i &vector = _mm_loadu_si128((__m128i*)src_ptr);
const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask);
_mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector);
for (u32 i = 0; i < iterations; ++i)
{
const __m128i vector = _mm_loadu_si128((__m128i*)src_ptr);
const __m128i shuffled_vector = _mm_shuffle_epi8(vector, mask);
_mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector);
src_ptr += src_stride;
dst_ptr += dst_stride;
src_ptr += src_stride;
dst_ptr += dst_stride;
}
}
else
{
for (u32 i = 0; i < iterations; ++i)
{
const __m128i vec0 = _mm_loadu_si128((__m128i*)src_ptr);
const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
const __m128i vec2 = _mm_or_si128(_mm_slli_epi32(vec1, 16), _mm_srli_epi32(vec1, 16));
_mm_storeu_si128((__m128i*)dst_ptr, vec2);
src_ptr += src_stride;
dst_ptr += dst_stride;
}
}
if (remainder)
@ -181,14 +235,29 @@ namespace
else
remainder = vertex_count;
for (u32 i = 0; i < iterations; ++i)
if (LIKELY(s_use_ssse3))
{
const __m128i &vector = _mm_loadu_si128((__m128i*)src_ptr);
const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask);
_mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector);
for (u32 i = 0; i < iterations; ++i)
{
const __m128i vector = _mm_loadu_si128((__m128i*)src_ptr);
const __m128i shuffled_vector = _mm_shuffle_epi8(vector, mask);
_mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector);
src_ptr += src_stride;
dst_ptr += dst_stride;
src_ptr += src_stride;
dst_ptr += dst_stride;
}
}
else
{
for (u32 i = 0; i < iterations; ++i)
{
const __m128i vec0 = _mm_loadu_si128((__m128i*)src_ptr);
const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
_mm_storeu_si128((__m128i*)dst_ptr, vec1);
src_ptr += src_stride;
dst_ptr += dst_stride;
}
}
if (remainder)

View file

@ -310,24 +310,20 @@ public:
const auto I = m_fragment_shader_cache.find(fragment_program);
if (I == m_fragment_shader_cache.end())
return;
__m128i mask = _mm_set_epi8(0xE, 0xF, 0xC, 0xD,
0xA, 0xB, 0x8, 0x9,
0x6, 0x7, 0x4, 0x5,
0x2, 0x3, 0x0, 0x1);
verify(HERE), (dst_buffer.size_bytes() >= ::narrow<int>(I->second.FragmentConstantOffsetCache.size()) * 16);
f32* dst = dst_buffer.data();
f32 tmp[4];
alignas(16) f32 tmp[4];
for (size_t offset_in_fragment_program : I->second.FragmentConstantOffsetCache)
{
void *data = (char*)fragment_program.addr + (u32)offset_in_fragment_program;
const __m128i &vector = _mm_loadu_si128((__m128i*)data);
const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask);
char* data = (char*)fragment_program.addr + (u32)offset_in_fragment_program;
const __m128i vector = _mm_loadu_si128((__m128i*)data);
const __m128i shuffled_vector = _mm_or_si128(_mm_slli_epi16(vector, 8), _mm_srli_epi16(vector, 8));
if (!patch_table.is_empty())
{
_mm_storeu_ps(tmp, (__m128&)shuffled_vector);
_mm_store_ps(tmp, _mm_castsi128_ps(shuffled_vector));
bool patched;
for (int i = 0; i < 4; ++i)