diff --git a/include/PICA/dynapica/shader_rec_emitter_x64.hpp b/include/PICA/dynapica/shader_rec_emitter_x64.hpp index d22ed371..51e6a43d 100644 --- a/include/PICA/dynapica/shader_rec_emitter_x64.hpp +++ b/include/PICA/dynapica/shader_rec_emitter_x64.hpp @@ -91,6 +91,7 @@ class ShaderEmitter : public Xbyak::CodeGenerator { void recCMP(const PICAShader& shader, u32 instruction); void recDP3(const PICAShader& shader, u32 instruction); void recDP4(const PICAShader& shader, u32 instruction); + void recDPH(const PICAShader& shader, u32 instruction); void recEMIT(const PICAShader& shader, u32 instruction); void recEND(const PICAShader& shader, u32 instruction); void recEX2(const PICAShader& shader, u32 instruction); diff --git a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp index 13eb630e..523c41db 100644 --- a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp +++ b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp @@ -143,6 +143,7 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) { break; case ShaderOpcodes::DP3: recDP3(shaderUnit, instruction); break; case ShaderOpcodes::DP4: recDP4(shaderUnit, instruction); break; + case ShaderOpcodes::DPH: recDPH(shaderUnit, instruction); break; case ShaderOpcodes::END: recEND(shaderUnit, instruction); break; case ShaderOpcodes::EX2: recEX2(shaderUnit, instruction); break; case ShaderOpcodes::FLR: recFLR(shaderUnit, instruction); break; @@ -525,6 +526,30 @@ void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) { storeRegister(src1_xmm, shader, dest, operandDescriptor); } +void ShaderEmitter::recDPH(const PICAShader& shader, u32 instruction) { + const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; + const u32 src1 = getBits<12, 7>(instruction); + const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment + const u32 idx = getBits<19, 2>(instruction); + const u32 dest = getBits<21, 5>(instruction); + + // TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA) + loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor); + loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor); + + // Attach 1.0 to the w component of src1 + if (haveSSE4_1) { + blendps(src1_xmm, xword[rip + onesVector], 0b1000); + } else { + movaps(scratch1, src1_xmm); + unpckhps(scratch1, xword[rip + onesVector]); + unpcklpd(src1_xmm, scratch1); + } + + dpps(src1_xmm, src2_xmm, 0b11111111); // 4-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA + storeRegister(src1_xmm, shader, dest, operandDescriptor); +} + void ShaderEmitter::recMAX(const PICAShader& shader, u32 instruction) { const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; const u32 src1 = getBits<12, 7>(instruction);