mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-04-20 11:36:13 +00:00
Implement v128::fma32f
This commit is contained in:
parent
ebbf329b6a
commit
3b8e7d0967
3 changed files with 37 additions and 12 deletions
|
@ -3,6 +3,7 @@
|
|||
#include "types.h"
|
||||
#include "util/endian.hpp"
|
||||
#include <cstring>
|
||||
#include <cmath>
|
||||
|
||||
#if __has_include(<bit>)
|
||||
#include <bit>
|
||||
|
@ -322,6 +323,36 @@ union alignas(16) v128
|
|||
return fromD(_mm_cmpeq_pd(left.vd, right.vd));
|
||||
}
|
||||
|
||||
// Runtime switch read by fma32f when the build lacks __FMA__: true selects the
// hardware FMA path, false (the default) selects the per-lane std::fmaf loop.
static inline bool use_fma = false;
|
||||
|
||||
// Fused multiply-add over the four f32 lanes: computes a * b + c per lane.
// `a` is taken by value, overwritten with the result and returned.
static inline v128 fma32f(v128 a, const v128& b, const v128& c)
{
#ifdef __FMA__
	// FMA is enabled at compile time: use the intrinsic unconditionally.
	a.vf = _mm_fmadd_ps(a.vf, b.vf, c.vf);
	return a;
#else
	// No compile-time FMA: the choice is made at run time through use_fma.
	if (!use_fma) [[unlikely]]
	{
		// Scalar path: one std::fmaf call per lane.
		for (int lane = 0; lane < 4; lane++)
		{
			a._f[lane] = std::fmaf(a._f[lane], b._f[lane], c._f[lane]);
		}

		return a;
	}

#ifdef _MSC_VER
	a.vf = _mm_fmadd_ps(a.vf, b.vf, c.vf);
#else
	// NOTE(review): presumably inline asm is used because the intrinsic is not
	// usable on these compilers without compile-time FMA support - confirm.
	// The "213" form leaves a * b + c in the read-write operand `a`.
	__asm__("vfmadd213ps %[c], %[b], %[a]"
		: [a] "+x" (a.vf)
		: [b] "x" (b.vf)
		, [c] "x" (c.vf));
#endif
	return a;
#endif
}
|
||||
|
||||
bool operator==(const v128& right) const
|
||||
{
|
||||
return _u64[0] == right._u64[0] && _u64[1] == right._u64[1];
|
||||
|
|
|
@ -959,9 +959,9 @@ bool ppu_interpreter::VLOGEFP(ppu_thread& ppu, ppu_opcode_t op)
|
|||
// Fast-interpreter VMADDFP: stores va * vc + vb into vd, computed with separate
// multiply and add intrinsics, after passing the result through vec_handle_nan.
// Always returns true.
bool ppu_interpreter_fast::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
{
	// Locals are named after the register fields they are read from.
	const auto a = ppu.vr[op.va].vf;
	const auto b = ppu.vr[op.vb].vf;
	const auto c = ppu.vr[op.vc].vf;

	const auto result = _mm_add_ps(_mm_mul_ps(a, c), b);
	ppu.vr[op.vd] = vec_handle_nan(result);
	return true;
}
|
||||
|
@ -971,15 +971,7 @@ bool ppu_interpreter_precise::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
|
|||
const auto a = ppu.vr[op.va];
|
||||
const auto b = ppu.vr[op.vb];
|
||||
const auto c = ppu.vr[op.vc];
|
||||
v128 d;
|
||||
|
||||
// TODO: Optimize
|
||||
for (u32 i = 0; i < 4; i++)
|
||||
{
|
||||
d._f[i] = f32(f64{a._f[i]} * f64{c._f[i]} + f64{b._f[i]});
|
||||
}
|
||||
|
||||
ppu.vr[op.rd] = vec_handle_nan(d, a, b, c);
|
||||
ppu.vr[op.rd] = vec_handle_nan(v128::fma32f(a, c, b), a, b, c);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -268,6 +268,8 @@ int main(int argc, char** argv)
|
|||
const u64 intro_time = (intro_stats.ru_utime.tv_sec + intro_stats.ru_stime.tv_sec) * 1000000000ull + (intro_stats.ru_utime.tv_usec + intro_stats.ru_stime.tv_usec) * 1000ull;
|
||||
#endif
|
||||
|
||||
v128::use_fma = utils::has_fma3();
|
||||
|
||||
s_argv0 = argv[0]; // Save for report_fatal_error
|
||||
|
||||
// Only run RPCS3 to display an error
|
||||
|
|
Loading…
Add table
Reference in a new issue