Implement v128::fma32f

This commit is contained in:
Nekotekina 2020-06-05 17:51:34 +03:00
parent ebbf329b6a
commit 3b8e7d0967
3 changed files with 37 additions and 12 deletions

View file

@ -3,6 +3,7 @@
#include "types.h"
#include "util/endian.hpp"
#include <cstring>
#include <cmath>
#if __has_include(<bit>)
#include <bit>
@ -322,6 +323,36 @@ union alignas(16) v128
return fromD(_mm_cmpeq_pd(left.vd, right.vd));
}
// Runtime switch selecting the hardware FMA path in fma32f() for builds that
// were not compiled with __FMA__. Defaults to false; main() assigns it from
// utils::has_fma3().
static inline bool use_fma = false;

// Fused multiply-add on four packed floats: returns a * b + c per lane.
//
// a is taken by value because it doubles as the destination; b and c are
// only read.
//
// Three implementations are selected between:
//  - __FMA__ defined: the _mm_fmadd_ps intrinsic, unconditionally.
//  - __FMA__ not defined, use_fma true: _mm_fmadd_ps under _MSC_VER,
//    otherwise a hand-written vfmadd213ps instruction.
//  - __FMA__ not defined, use_fma false: std::fmaf applied lane by lane.
static inline v128 fma32f(v128 a, const v128& b, const v128& c)
{
#ifdef __FMA__
	a.vf = _mm_fmadd_ps(a.vf, b.vf, c.vf);
	return a;
#else
	if (!use_fma) [[unlikely]]
	{
		// Software fallback, one float lane at a time.
		for (int lane = 0; lane != 4; ++lane)
		{
			a._f[lane] = std::fmaf(a._f[lane], b._f[lane], c._f[lane]);
		}

		return a;
	}

#ifdef _MSC_VER
	a.vf = _mm_fmadd_ps(a.vf, b.vf, c.vf);
#else
	// Emitted as inline asm rather than the intrinsic; presumably because the
	// intrinsic is unavailable here when the build lacks FMA codegen flags.
	// With this operand order the instruction computes a = b * a + c.
	__asm__("vfmadd213ps %[c], %[b], %[a]"
		: [a] "+x" (a.vf)
		: [b] "x" (b.vf)
		, [c] "x" (c.vf));
#endif
	return a;
#endif
}
bool operator==(const v128& right) const
{
return _u64[0] == right._u64[0] && _u64[1] == right._u64[1];

View file

@ -959,9 +959,9 @@ bool ppu_interpreter::VLOGEFP(ppu_thread& ppu, ppu_opcode_t op)
// Fast-interpreter handler for VMADDFP: computes va * vc + vb on the packed
// floats of the selected vector registers and stores the result in vd.
//
// ppu - thread state whose vr[] register file is read and written.
// op  - decoded opcode supplying the va/vb/vc source and vd destination indices.
//
// Always returns true.
bool ppu_interpreter_fast::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
{
	// Each local is named after the opcode field it is loaded from (a = va,
	// b = vb, c = vc), matching ppu_interpreter_precise::VMADDFP.
	const auto a = ppu.vr[op.va].vf;
	const auto b = ppu.vr[op.vb].vf;
	const auto c = ppu.vr[op.vc].vf;

	// Separate multiply and add (not fused), so the product is rounded before
	// the addition.
	const auto result = _mm_add_ps(_mm_mul_ps(a, c), b);

	// The raw result is passed through vec_handle_nan before being stored.
	ppu.vr[op.vd] = vec_handle_nan(result);
	return true;
}
@ -971,15 +971,7 @@ bool ppu_interpreter_precise::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
const auto a = ppu.vr[op.va];
const auto b = ppu.vr[op.vb];
const auto c = ppu.vr[op.vc];
v128 d;
// TODO: Optimize
for (u32 i = 0; i < 4; i++)
{
d._f[i] = f32(f64{a._f[i]} * f64{c._f[i]} + f64{b._f[i]});
}
ppu.vr[op.rd] = vec_handle_nan(d, a, b, c);
ppu.vr[op.rd] = vec_handle_nan(v128::fma32f(a, c, b), a, b, c);
return true;
}

View file

@ -268,6 +268,8 @@ int main(int argc, char** argv)
const u64 intro_time = (intro_stats.ru_utime.tv_sec + intro_stats.ru_stime.tv_sec) * 1000000000ull + (intro_stats.ru_utime.tv_usec + intro_stats.ru_stime.tv_usec) * 1000ull;
#endif
v128::use_fma = utils::has_fma3();
s_argv0 = argv[0]; // Save for report_fatal_error
// Only run RPCS3 to display an error