diff --git a/rpcs3/Emu/RSX/Common/Interpreter/FragmentInterpreter.glsl b/rpcs3/Emu/RSX/Common/Interpreter/FragmentInterpreter.glsl new file mode 100644 index 0000000000..0603a21c8c --- /dev/null +++ b/rpcs3/Emu/RSX/Common/Interpreter/FragmentInterpreter.glsl @@ -0,0 +1,553 @@ +R"( +layout(location=0) out vec4 ocol0; +layout(location=1) out vec4 ocol1; +layout(location=2) out vec4 ocol2; +layout(location=3) out vec4 ocol3; + +#define RSX_FP_OPCODE_NOP 0x00 // No-Operation +#define RSX_FP_OPCODE_MOV 0x01 // Move +#define RSX_FP_OPCODE_MUL 0x02 // Multiply +#define RSX_FP_OPCODE_ADD 0x03 // Add +#define RSX_FP_OPCODE_MAD 0x04 // Multiply-Add +#define RSX_FP_OPCODE_DP3 0x05 // 3-component Dot Product +#define RSX_FP_OPCODE_DP4 0x06 // 4-component Dot Product +#define RSX_FP_OPCODE_DST 0x07 // Distance +#define RSX_FP_OPCODE_MIN 0x08 // Minimum +#define RSX_FP_OPCODE_MAX 0x09 // Maximum +#define RSX_FP_OPCODE_SLT 0x0A // Set-If-LessThan +#define RSX_FP_OPCODE_SGE 0x0B // Set-If-GreaterEqual +#define RSX_FP_OPCODE_SLE 0x0C // Set-If-LessEqual +#define RSX_FP_OPCODE_SGT 0x0D // Set-If-GreaterThan +#define RSX_FP_OPCODE_SNE 0x0E // Set-If-NotEqual +#define RSX_FP_OPCODE_SEQ 0x0F // Set-If-Equal +#define RSX_FP_OPCODE_FRC 0x10 // Fraction (fract) +#define RSX_FP_OPCODE_FLR 0x11 // Floor +#define RSX_FP_OPCODE_KIL 0x12 // Kill fragment +#define RSX_FP_OPCODE_PK4 0x13 // Pack four signed 8-bit values +#define RSX_FP_OPCODE_UP4 0x14 // Unpack four signed 8-bit values +#define RSX_FP_OPCODE_DDX 0x15 // Partial-derivative in x (Screen space derivative w.r.t. x) +#define RSX_FP_OPCODE_DDY 0x16 // Partial-derivative in y (Screen space derivative w.r.t. 
y) +#define RSX_FP_OPCODE_TEX 0x17 // Texture lookup +#define RSX_FP_OPCODE_TXP 0x18 // Texture sample with projection (Projective texture lookup) +#define RSX_FP_OPCODE_TXD 0x19 // Texture sample with partial differentiation (Texture lookup with derivatives) +#define RSX_FP_OPCODE_RCP 0x1A // Reciprocal +#define RSX_FP_OPCODE_RSQ 0x1B // Reciprocal Square Root +#define RSX_FP_OPCODE_EX2 0x1C // Exponentiation base 2 +#define RSX_FP_OPCODE_LG2 0x1D // Log base 2 +#define RSX_FP_OPCODE_LIT 0x1E // Lighting coefficients +#define RSX_FP_OPCODE_LRP 0x1F // Linear Interpolation +#define RSX_FP_OPCODE_STR 0x20 // Set-If-True +#define RSX_FP_OPCODE_SFL 0x21 // Set-If-False +#define RSX_FP_OPCODE_COS 0x22 // Cosine +#define RSX_FP_OPCODE_SIN 0x23 // Sine +#define RSX_FP_OPCODE_PK2 0x24 // Pack two 16-bit floats +#define RSX_FP_OPCODE_UP2 0x25 // Unpack two 16-bit floats +#define RSX_FP_OPCODE_POW 0x26 // Power +#define RSX_FP_OPCODE_PKB 0x27 // Pack bytes +#define RSX_FP_OPCODE_UPB 0x28 // Unpack bytes +#define RSX_FP_OPCODE_PK16 0x29 // Pack 16 bits +#define RSX_FP_OPCODE_UP16 0x2A // Unpack 16 +#define RSX_FP_OPCODE_BEM 0x2B // Bump-environment map (a.k.a. 
2D coordinate transform) +#define RSX_FP_OPCODE_PKG 0x2C // Pack with sRGB transformation +#define RSX_FP_OPCODE_UPG 0x2D // Unpack gamma +#define RSX_FP_OPCODE_DP2A 0x2E // 2-component dot product with scalar addition +#define RSX_FP_OPCODE_TXL 0x2F // Texture sample with explicit LOD +#define RSX_FP_OPCODE_TXB 0x31 // Texture sample with bias +#define RSX_FP_OPCODE_TEXBEM 0x33 +#define RSX_FP_OPCODE_TXPBEM 0x34 +#define RSX_FP_OPCODE_BEMLUM 0x35 +#define RSX_FP_OPCODE_REFL 0x36 // Reflection vector +#define RSX_FP_OPCODE_TIMESWTEX 0x37 +#define RSX_FP_OPCODE_DP2 0x38 // 2-component dot product +#define RSX_FP_OPCODE_NRM 0x39 // Normalize +#define RSX_FP_OPCODE_DIV 0x3A // Division +#define RSX_FP_OPCODE_DIVSQ 0x3B // Divide by Square Root +#define RSX_FP_OPCODE_LIF 0x3C // Final part of LIT +#define RSX_FP_OPCODE_FENCT 0x3D // Fence T? +#define RSX_FP_OPCODE_FENCB 0x3E // Fence B? +#define RSX_FP_OPCODE_BRK 0x40 // Break +#define RSX_FP_OPCODE_CAL 0x41 // Subroutine call +#define RSX_FP_OPCODE_IFE 0x42 // If +#define RSX_FP_OPCODE_LOOP 0x43 // Loop +#define RSX_FP_OPCODE_REP 0x44 // Repeat +#define RSX_FP_OPCODE_RET 0x45 // Return + +#define EXEC_LT 1 +#define EXEC_EQ 2 +#define EXEC_GT 4 + +#define RSX_FP_REGISTER_TYPE_TEMP 0 +#define RSX_FP_REGISTER_TYPE_INPUT 1 +#define RSX_FP_REGISTER_TYPE_CONSTANT 2 +#define RSX_FP_REGISTER_TYPE_UNKNOWN 3 + +#define CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT 0xe +#define CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS 0x40 + +#define GET_BITS(word, offset, count) bitfieldExtract(inst.words[word], offset, count) +#define TEST_BIT(word, offset) (GET_BITS(word, offset, 1) > 0) + +#define reg_mov(d, s, m) d = mix(d, s, m) + +bool shader_attribute(const in uint mask) +{ + return (shader_control & mask) != 0; +} + +vec4 _distance(const in vec4 a, const in vec4 b) +{ + // Old-school distance vector + return vec4(1., a.y * b.y, a.z, b.w); +} + +vec4 shuffle(const in vec4 value, const in uint code) +{ + switch (code) + { + case 0xE4: + return 
value; + case 0x24: + return value.xyzx; + case 0xA4: + return value.xyzz; + case 0x00: + return value.xxxx; + case 0x55: + return value.yyyy; + case 0xAA: + return value.zzzz; + case 0xFF: + return value.wwww; + case 0x04: + return value.xyxx; + default: + uint x = bitfieldExtract(code, 0, 2); + uint y = bitfieldExtract(code, 2, 2); + uint z = bitfieldExtract(code, 4, 2); + uint w = bitfieldExtract(code, 6, 2); + return vec4(value[x], value[y], value[z], value[w]); + } +} + +struct instruction_t +{ + uvec4 words; + uint opcode; + bool end; +}; + +const float modifier_scale[] = {1.f, 2.f, 4.f, 8.f, 1.f, 0.5f, 0.25f, 0.125f}; + +vec4 regs16[48]; +vec4 regs32[48]; +vec4 cc[2]; +int inst_length = 1; +int ip = -1; +instruction_t inst; + +vec4 read_src(const in int index) +{ + const uint type = GET_BITS(index + 1, 0, 2); + vec4 value; + + switch (type) + { + case RSX_FP_REGISTER_TYPE_TEMP: + { + const uint i = GET_BITS(index + 1, 2, 6); + if (TEST_BIT(index + 1, 8)) + { + value = regs16[i]; + } + else + { + value = regs32[i]; + } + break; + } + case RSX_FP_REGISTER_TYPE_INPUT: + { + const uint i = GET_BITS(0, 13, 4); + switch (i) + { + case 0: + // TODO: wpos + value = vec4(0.); break; + case 1: + value = gl_FrontFacing? in_regs[0] : in_regs[2]; break; + case 2: + value = gl_FrontFacing? in_regs[1] : in_regs[3]; break; + case 3: + value = fetch_fog_value(fog_mode, in_regs[4]); break; + case 14: + value = gl_FrontFacing? vec4(1.) : vec4(-1.); break; + default: + value = in_regs[i + 1]; break; + } + + break; + } + case RSX_FP_REGISTER_TYPE_CONSTANT: + { + inst_length = 2; + uvec4 result = + ((fp_instructions[ip + 1] << 8) & uvec4(0xFF00FF00)) | + ((fp_instructions[ip + 1] >> 8) & uvec4(0x00FF00FF)); + value = uintBitsToFloat(result); + break; + } + } + + value = shuffle(value, GET_BITS(index + 1, 9, 8)); + + // abs + if (index == 0) + { + value = (TEST_BIT(1, 29))? abs(value) : value; + } + else + { + value = (TEST_BIT(index + 1, 18))? 
abs(value) : value; + } + + // neg + return (TEST_BIT(index + 1, 17))? -value : value; +} + +vec4 read_cond() +{ + return shuffle(cc[GET_BITS(1, 31, 1)], GET_BITS(1, 21, 8)); +} + +vec4 _texture(in vec4 coord, float bias) +{ + const uint tex_num = GET_BITS(0, 17, 4); + if (!IS_TEXTURE_RESIDENT(tex_num)) + { + return vec4(0., 0., 0., 1.); + } + + const uint type = bitfieldExtract(texture_control, int(tex_num + tex_num), 2); + coord.xy *= texture_parameters[tex_num].scale; + + switch (type) + { + case 0: + return texture(SAMPLER1D(tex_num), coord.x, bias); + case 1: + return texture(SAMPLER2D(tex_num), coord.xy, bias); + case 2: + return texture(SAMPLER3D(tex_num), coord.xyz, bias); + case 3: + return texture(SAMPLERCUBE(tex_num), coord.xyz, bias); + } + + return vec4(0.); +} + +vec4 _textureLod(in vec4 coord, float lod) +{ + const uint tex_num = GET_BITS(0, 17, 4); + if (!IS_TEXTURE_RESIDENT(tex_num)) + { + return vec4(0., 0., 0., 1.); + } + + const uint type = bitfieldExtract(texture_control, int(tex_num + tex_num), 2); + coord.xy *= texture_parameters[tex_num].scale; + + switch (type) + { + case 0: + return textureLod(SAMPLER1D(tex_num), coord.x, lod); + case 1: + return textureLod(SAMPLER2D(tex_num), coord.xy, lod); + case 2: + return textureLod(SAMPLER3D(tex_num), coord.xyz, lod); + case 3: + return textureLod(SAMPLERCUBE(tex_num), coord.xyz, lod); + } + + return vec4(0.); +} + +void write_dst(in vec4 value) +{ + bvec4 inst_mask = bvec4( + TEST_BIT(0, 9), + TEST_BIT(0, 10), + TEST_BIT(0, 11), + TEST_BIT(0, 12)); + + if (TEST_BIT(0, 8)) // SET COND + { + uint index = GET_BITS(1, 30, 1); + reg_mov(cc[index], value, inst_mask); + } + + if (TEST_BIT(0, 30)) // NO DEST + { + return; + } + + if (TEST_BIT(0, 31)) // SAT + { + value = clamp(value, 0, 1); + } + + const uint exec_mask = GET_BITS(1, 18, 3); + if (exec_mask != 0x7) + { + bvec4 write_mask; + const vec4 cond = read_cond(); + + switch (exec_mask) + { + case 0: + return; + case EXEC_GT | EXEC_EQ: + write_mask = 
greaterThanEqual(cond, vec4(0.)); break; + case EXEC_LT | EXEC_EQ: + write_mask = lessThanEqual(cond, vec4(0.)); break; + case EXEC_LT | EXEC_GT: + write_mask = notEqual(cond, vec4(0.)); break; + case EXEC_GT: + write_mask = greaterThan(cond, vec4(0.)); break; + case EXEC_LT: + write_mask = lessThan(cond, vec4(0.)); break; + case EXEC_EQ: + write_mask = equal(cond, vec4(0.)); break; + } + + inst_mask = bvec4(uvec4(inst_mask) & uvec4(write_mask)); + } + + const uint scale = GET_BITS(2, 28, 3); + value *= modifier_scale[scale]; + + const uint index = GET_BITS(0, 1, 6); + if (TEST_BIT(0, 7)) + { + reg_mov(regs16[index], value, inst_mask); + } + else + { + reg_mov(regs32[index], value, inst_mask); + } +} + +void initialize() +{ + // Initialize registers + // NOTE: Register count is the number of 'full' registers that will be consumed. Hardware seems to do some renaming. + // NOTE: Attempting to zero-initialize all the registers will slow things to a crawl! + + uint register_count = bitfieldExtract(shader_control, 24, 6); + uint i = 0, j = 0; + while (register_count > 0) + { + regs32[i++] = vec4(0.); + regs16[j++] = vec4(0.); + regs16[j++] = vec4(0.); + register_count--; + } +} + +void main() +{ + initialize(); + + vec4 value, s0, s1, s2; + inst.end = false; + bool handled; + + while (!inst.end) + { + ip += inst_length; + inst_length = 1; + + // Decode instruction + // endian swap + word swap + inst.words = + ((fp_instructions[ip] << 8) & uvec4(0xFF00FF00)) | + ((fp_instructions[ip] >> 8) & uvec4(0x00FF00FF)); + + inst.opcode = GET_BITS(0, 24, 6); + inst.end = TEST_BIT(0, 0); + + // Class 1, no input/output + switch (inst.opcode) + { + case RSX_FP_OPCODE_NOP: + case RSX_FP_OPCODE_FENCT: + case RSX_FP_OPCODE_FENCB: + continue; + case RSX_FP_OPCODE_KIL: + discard; return; + } + + // Class 2, 1 input + s0 = read_src(0); + handled = true; + switch (inst.opcode) + { + case RSX_FP_OPCODE_MOV: + value = s0; break; + case RSX_FP_OPCODE_FRC: + value = fract(s0); break; + case 
RSX_FP_OPCODE_FLR:
+			value = floor(s0); break;
+		case RSX_FP_OPCODE_DDX:
+			value = dFdx(s0); break;
+		case RSX_FP_OPCODE_DDY:
+			value = dFdy(s0); break;
+		case RSX_FP_OPCODE_RCP:
+			value = (1.f / s0.xxxx); break; // scalar op: s0.x is broadcast to all four lanes
+		case RSX_FP_OPCODE_RSQ:
+			value = inversesqrt(s0.xxxx); break;
+		case RSX_FP_OPCODE_EX2:
+			value = exp2(s0.xxxx); break;
+		case RSX_FP_OPCODE_LG2:
+			value = log2(s0.xxxx); break;
+		case RSX_FP_OPCODE_STR:
+			value = vec4(1.); break;
+		case RSX_FP_OPCODE_SFL:
+			value = vec4(0.); break;
+		case RSX_FP_OPCODE_COS:
+			value = cos(s0.xxxx); break;
+		case RSX_FP_OPCODE_SIN:
+			value = sin(s0.xxxx); break;
+		case RSX_FP_OPCODE_NRM:
+			value.xyz = normalize(s0.xyz); break; // only xyz is written; value.w keeps whatever it held before
+		case RSX_FP_OPCODE_TEX:
+			value = _texture(s0, 0.f); break;
+		default:
+			handled = false; // not a 1-input opcode, fall back to the 2-input class below
+		}
+
+		if (!handled)
+		{
+			// Class 3, 2 inputs
+			s1 = read_src(1);
+			handled = true;
+			switch (inst.opcode)
+			{
+			case RSX_FP_OPCODE_MUL:
+				value = s0 * s1; break;
+			case RSX_FP_OPCODE_ADD:
+				value = s0 + s1; break;
+			case RSX_FP_OPCODE_DP2:
+				value = dot(s0.xy, s1.xy).xxxx; break;
+			case RSX_FP_OPCODE_DP3:
+				value = dot(s0.xyz, s1.xyz).xxxx; break;
+			case RSX_FP_OPCODE_DP4:
+				value = dot(s0, s1).xxxx; break;
+			case RSX_FP_OPCODE_DST:
+				value = _distance(s0, s1); break;
+			case RSX_FP_OPCODE_MIN:
+				value = min(s0, s1); break;
+			case RSX_FP_OPCODE_MAX:
+				value = max(s0, s1); break;
+			case RSX_FP_OPCODE_SLT:
+				value = vec4(lessThan(s0, s1)); break; // bvec4 -> vec4 yields 1.0 / 0.0 per lane
+			case RSX_FP_OPCODE_SGE:
+				value = vec4(greaterThanEqual(s0, s1)); break;
+			case RSX_FP_OPCODE_SLE:
+				value = vec4(lessThanEqual(s0, s1)); break;
+			case RSX_FP_OPCODE_SGT:
+				value = vec4(greaterThan(s0, s1)); break;
+			case RSX_FP_OPCODE_SNE:
+				value = vec4(notEqual(s0, s1)); break;
+			case RSX_FP_OPCODE_SEQ:
+				value = vec4(equal(s0, s1)); break;
+			case RSX_FP_OPCODE_POW:
+				value = pow(s0, s1).xxxx; break;
+			case RSX_FP_OPCODE_DIV:
+				value = s0 / s1.xxxx; break; // break was missing: DIV fell through and was overwritten by the DIVSQ result
+			case RSX_FP_OPCODE_DIVSQ:
+				value = s0 * inversesqrt(s1.xxxx); break;
+			//case RSX_FP_OPCODE_TXP:
+			//case
RSX_FP_OPCODE_TXD:
+			case RSX_FP_OPCODE_TXL:
+				value = _textureLod(s0, s1.x); break; // s1.x is passed as the explicit LOD
+			case RSX_FP_OPCODE_TXB:
+				value = _texture(s0, s1.x); break; // s1.x is passed as the LOD bias
+			//case RSX_FP_OPCODE_TEXBEM:
+			//case RSX_FP_OPCODE_TXPBEM:
+			default:
+				handled = false; // not a 2-input opcode, fall back to the 3-input class below
+			}
+		}
+
+		if (!handled)
+		{
+			// Class 4, 3 inputs
+			s2 = read_src(2);
+			switch (inst.opcode)
+			{
+			case RSX_FP_OPCODE_MAD:
+				value = fma(s0, s1, s2); break;
+			case RSX_FP_OPCODE_LRP:
+				value = mix(s1, s2, s0); break; // s0 is the interpolation weight
+			case RSX_FP_OPCODE_DP2A:
+				value = dot(s0.xy, s1.xy).xxxx + s2.xxxx; break;
+			}
+		}
+
+		// Flow control
+/*		case RSX_FP_OPCODE_BRK:
+		case RSX_FP_OPCODE_CAL:
+		case RSX_FP_OPCODE_IFE:
+		case RSX_FP_OPCODE_LOOP:
+		case RSX_FP_OPCODE_REP:
+		case RSX_FP_OPCODE_RET:
+
+		// Other
+		case RSX_FP_OPCODE_PK4:
+		case RSX_FP_OPCODE_UP4:
+		case RSX_FP_OPCODE_LIT:
+		case RSX_FP_OPCODE_LIF:
+		case RSX_FP_OPCODE_PK2:
+		case RSX_FP_OPCODE_FENCT:
+		case RSX_FP_OPCODE_FENCB:
+		case RSX_FP_OPCODE_UP2:
+		case RSX_FP_OPCODE_PKB:
+		case RSX_FP_OPCODE_UPB:
+		case RSX_FP_OPCODE_PK16:
+		case RSX_FP_OPCODE_UP16:
+		case RSX_FP_OPCODE_BEM:
+		case RSX_FP_OPCODE_PKG:
+		case RSX_FP_OPCODE_UPG:
+		case RSX_FP_OPCODE_BEMLUM:
+		case RSX_FP_OPCODE_REFL:
+		case RSX_FP_OPCODE_TIMESWTEX:*/
+
+		write_dst(value);
+	}
+
+	if (!shader_attribute(CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS))
+	{
+		ocol0 = regs16[0];
+		ocol1 = regs16[4];
+		ocol2 = regs16[6]; // was assigned to ocol1 again, which left ocol2 unwritten
+		ocol3 = regs16[8]; // was assigned to ocol1 again, which left ocol3 unwritten
+	}
+	else
+	{
+		ocol0 = regs32[0];
+		ocol1 = regs32[2];
+		ocol2 = regs32[3]; // was assigned to ocol1 again, which left ocol2 unwritten
+		ocol3 = regs32[4]; // was assigned to ocol1 again, which left ocol3 unwritten
+	}
+
+	if (shader_attribute(CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT))
+	{
+		gl_FragDepth = regs32[1].z;
+	}
+	else
+	{
+		gl_FragDepth = gl_FragCoord.z; // no depth export requested: pass the rasterized depth through
+	}
+}
+
+)"
diff --git a/rpcs3/Emu/RSX/Common/Interpreter/VertexInterpreter.glsl b/rpcs3/Emu/RSX/Common/Interpreter/VertexInterpreter.glsl
new file mode 100644
index 0000000000..fd57d630f0
--- /dev/null
+++ b/rpcs3/Emu/RSX/Common/Interpreter/VertexInterpreter.glsl
@@ -0,0 +1,586 @@
+R"(
+
+#define RSX_SCA_OPCODE_NOP 0x00 // No-Operation
+#define RSX_SCA_OPCODE_MOV 0x01 // Move (copy) +#define RSX_SCA_OPCODE_RCP 0x02 // Reciprocal +#define RSX_SCA_OPCODE_RCC 0x03 // Reciprocal clamped +#define RSX_SCA_OPCODE_RSQ 0x04 // Reciprocal square root +#define RSX_SCA_OPCODE_EXP 0x05 // Exponential base 2 (low-precision) +#define RSX_SCA_OPCODE_LOG 0x06 // Logarithm base 2 (low-precision) +#define RSX_SCA_OPCODE_LIT 0x07 // Lighting calculation +#define RSX_SCA_OPCODE_BRA 0x08 // Branch +#define RSX_SCA_OPCODE_BRI 0x09 // Branch by CC register +#define RSX_SCA_OPCODE_CAL 0x0a // Subroutine call +#define RSX_SCA_OPCODE_CLI 0x0b // Subroutine call by CC register +#define RSX_SCA_OPCODE_RET 0x0c // Return from subroutine +#define RSX_SCA_OPCODE_LG2 0x0d // Logarithm base 2 +#define RSX_SCA_OPCODE_EX2 0x0e // Exponential base 2 +#define RSX_SCA_OPCODE_SIN 0x0f // Sine function +#define RSX_SCA_OPCODE_COS 0x10 // Cosine function +#define RSX_SCA_OPCODE_BRB 0x11 // Branch by Boolean constant +#define RSX_SCA_OPCODE_CLB 0x12 // Subroutine call by Boolean constant +#define RSX_SCA_OPCODE_PSH 0x13 // Push onto stack +#define RSX_SCA_OPCODE_POP 0x14 // Pop from stack +#define RSX_VEC_OPCODE_NOP 0x00 // No-Operation +#define RSX_VEC_OPCODE_MOV 0x01 // Move +#define RSX_VEC_OPCODE_MUL 0x02 // Multiply +#define RSX_VEC_OPCODE_ADD 0x03 // Addition +#define RSX_VEC_OPCODE_MAD 0x04 // Multiply-Add +#define RSX_VEC_OPCODE_DP3 0x05 // 3-component Dot Product +#define RSX_VEC_OPCODE_DPH 0x06 // Homogeneous Dot Product +#define RSX_VEC_OPCODE_DP4 0x07 // 4-component Dot Product +#define RSX_VEC_OPCODE_DST 0x08 // Calculate distance vector +#define RSX_VEC_OPCODE_MIN 0x09 // Minimum +#define RSX_VEC_OPCODE_MAX 0x0a // Maximum +#define RSX_VEC_OPCODE_SLT 0x0b // Set-If-LessThan +#define RSX_VEC_OPCODE_SGE 0x0c // Set-If-GreaterEqual +#define RSX_VEC_OPCODE_ARL 0x0d // Load to address register (round down) +#define RSX_VEC_OPCODE_FRC 0x0e // Extract fractional part (fraction) +#define RSX_VEC_OPCODE_FLR 0x0f // Round down (floor) 
+#define RSX_VEC_OPCODE_SEQ 0x10 // Set-If-Equal +#define RSX_VEC_OPCODE_SFL 0x11 // Set-If-False +#define RSX_VEC_OPCODE_SGT 0x12 // Set-If-GreaterThan +#define RSX_VEC_OPCODE_SLE 0x13 // Set-If-LessEqual +#define RSX_VEC_OPCODE_SNE 0x14 // Set-If-NotEqual +#define RSX_VEC_OPCODE_STR 0x15 // Set-If-True +#define RSX_VEC_OPCODE_SSG 0x16 // Convert positive values to 1 and negative values to -1 +#define RSX_VEC_OPCODE_TXL 0x19 // Texture fetch + +#define RSX_VP_REGISTER_TYPE_TEMP 1 +#define RSX_VP_REGISTER_TYPE_INPUT 2 +#define RSX_VP_REGISTER_TYPE_CONSTANT 3 + +#define EXEC_LT 1 +#define EXEC_EQ 2 +#define EXEC_GT 4 + +#define GET_BITS bitfieldExtract +#define TEST_BIT(word, bit) (GET_BITS(word, bit, 1) != 0) + +#define reg_mov(d, s, m) d = mix(d, s, m) + +struct D0 +{ + uint addr_swz; + uvec4 swizzle; + uint cond; + bool cond_test_enable; + bool cond_update_enable_0; + uint dst_tmp; + uint addr_reg_sel_1; + uint cond_reg_sel_1; + bool saturate; + bool index_input; + bool cond_update_enable_1; + bool vec_result; +}; + +struct D1 +{ + uint input_src; + uint const_src; + uint vec_opcode; + uint sca_opcode; +}; + +struct D2 +{ + uint tex_num; +}; + +struct D3 +{ + bool end; + bool index_const; + uint dst; + uint sca_dst_tmp; + bvec4 vec_mask; + bvec4 sca_mask; +}; + +struct SRC +{ + uint reg_type; + uint tmp_src; + uvec4 swizzle; + bool neg; + bool abs; +}; + +D0 unpack_D0(const in uint packed_value) +{ + D0 result; + + result.addr_swz = GET_BITS(packed_value, 0, 2); + result.swizzle.w = GET_BITS(packed_value, 2, 2); + result.swizzle.z = GET_BITS(packed_value, 4, 2); + result.swizzle.y = GET_BITS(packed_value, 6, 2); + result.swizzle.x = GET_BITS(packed_value, 8, 2); + result.cond = GET_BITS(packed_value, 10, 3); + result.cond_test_enable = TEST_BIT(packed_value, 13); + result.cond_update_enable_0 = TEST_BIT(packed_value, 14); + result.dst_tmp = GET_BITS(packed_value, 15, 6); + result.addr_reg_sel_1 = GET_BITS(packed_value, 24, 1); + result.cond_reg_sel_1 = 
GET_BITS(packed_value, 25, 1); + result.saturate = TEST_BIT(packed_value, 26); + result.index_input = TEST_BIT(packed_value, 27); + result.cond_update_enable_1 = TEST_BIT(packed_value, 29); + result.vec_result = TEST_BIT(packed_value, 30); + return result; +} + +D1 unpack_D1(const in uint packed_value) +{ + D1 result; + + result.input_src = GET_BITS(packed_value, 8, 4); + result.const_src = GET_BITS(packed_value, 12, 10); + result.vec_opcode = GET_BITS(packed_value, 22, 5); + result.sca_opcode = GET_BITS(packed_value, 27, 5); + return result; +} + +D2 unpack_D2(const in uint packed_value) +{ + D2 result; + + result.tex_num = GET_BITS(packed_value, 8, 2); + return result; +} + +D3 unpack_D3(const in uint packed_value) +{ + D3 result; + + result.end = TEST_BIT(packed_value, 0); + result.index_const = TEST_BIT(packed_value, 1); + result.dst = GET_BITS(packed_value, 2, 5); + result.sca_dst_tmp = GET_BITS(packed_value, 7, 6); + result.vec_mask.w = TEST_BIT(packed_value, 13); + result.vec_mask.z = TEST_BIT(packed_value, 14); + result.vec_mask.y = TEST_BIT(packed_value, 15); + result.vec_mask.x = TEST_BIT(packed_value, 16); + result.sca_mask.w = TEST_BIT(packed_value, 17); + result.sca_mask.z = TEST_BIT(packed_value, 18); + result.sca_mask.y = TEST_BIT(packed_value, 19); + result.sca_mask.x = TEST_BIT(packed_value, 20); + return result; +} + +bool attribute_enabled(const in uint mask) +{ + return (output_mask & mask) != 0; +} + +vec4 shuffle(const in vec4 value, const in uvec4 swz) +{ + vec4 result; + result.x = ref(value, swz.x); + result.y = ref(value, swz.y); + result.z = ref(value, swz.z); + result.w = ref(value, swz.w); + return result; +} + +vec4 _distance(const in vec4 a, const in vec4 b) +{ + // Old-school distance vector + return vec4(1., a.y * b.y, a.z, b.w); +} + +// Local registers +uvec4 instr; +vec4 temp[32]; +ivec4 a[2] = { ivec4(0), ivec4(0) }; +vec4 cc[2] = { vec4(0), vec4(0) }; +vec4 dest[16]; + +D0 d0; +D1 d1; +D2 d2; +D3 d3; + +void write_sca(in float 
value) +{ + if (d0.saturate) + { + value = clamp(value, 0, 1); + } + + if (d3.sca_dst_tmp == 0x3f) + { + if (d3.dst != 0x1f) + { + reg_mov(dest[d3.dst], vec4(value), d3.sca_mask); + } + } + else + { + reg_mov(temp[d3.sca_dst_tmp], vec4(value), d3.sca_mask); + } +} + +void write_vec(in vec4 value) +{ + if (d0.saturate) + { + value = clamp(value, 0, 1); + } + + if (d0.dst_tmp == 0x3f && !d0.vec_result) + { + if (d0.cond_update_enable_1) + { + reg_mov(cc[d0.cond_reg_sel_1], value, d3.vec_mask); + } + } + else + { + if (d0.vec_result && d3.dst < 16) + { + reg_mov(dest[d3.dst], value, d3.vec_mask); + } + + if (d0.dst_tmp != 0x3f) + { + reg_mov(temp[d0.dst_tmp], value, d3.vec_mask); + } + } +} + +vec4 write_output(const in int oid, const in int mask_bit) +{ + if (attribute_enabled(1 << mask_bit)) + { + return dest[oid]; + } + else + { + return vec4(0., 0., 0., 1.); + } +} + +ivec4 read_addr_reg() +{ + return a[d0.addr_reg_sel_1]; +} + +int branch_addr() +{ + uint addr_h = GET_BITS(instr.z, 0, 6); + uint addr_l = GET_BITS(instr.w, 29, 3); + return int((addr_h << 3) + addr_l); +} + +bool static_branch() +{ + uint mask = (1 << GET_BITS(instr.w, 23, 5)); + bool cond = TEST_BIT(instr.w, 28); + bool actual = (transform_branch_bits & mask) != 0; + + return (cond == actual); +} + +bvec4 test_cond(vec4 cond, uint mode) +{ + switch (mode) + { + case EXEC_GT | EXEC_EQ: + return greaterThanEqual(cond, vec4(0.)); + case EXEC_LT | EXEC_EQ: + return lessThanEqual(cond, vec4(0.)); + case EXEC_LT | EXEC_GT: + return notEqual(cond, vec4(0.)); + case EXEC_GT: + return greaterThan(cond, vec4(0.)); + case EXEC_LT: + return lessThan(cond, vec4(0.)); + case EXEC_EQ: + return equal(cond, vec4(0.)); + } + + return bvec4(false); +} + +bool dynamic_branch() +{ + if (d0.cond == (EXEC_LT | EXEC_GT | EXEC_EQ)) return true; + if (d0.cond == 0) return false; + + vec4 cond = shuffle(cc[d0.cond_reg_sel_1], d0.swizzle); + return any(test_cond(cond, d0.cond)); +} + +vec4 read_src(const in int index) +{ + 
uint src; + vec4 value; + bool do_abs = false; + + switch (index) + { + case 0: + src = (GET_BITS(instr.y, 0, 8) << 9) | GET_BITS(instr.z, 23, 9); + do_abs = TEST_BIT(instr.x, 21); + break; + case 1: + src = GET_BITS(instr.z, 6, 17); + do_abs = TEST_BIT(instr.x, 22); + break; + case 2: + src = (GET_BITS(instr.z, 0, 6) << 11) | GET_BITS(instr.w, 21, 11); + do_abs = TEST_BIT(instr.x, 23); + break; + } + + uint reg_type = GET_BITS(src, 0, 2); + uint tmp_src = GET_BITS(src, 2, 6); + + switch (reg_type) + { + case RSX_VP_REGISTER_TYPE_TEMP: + value = temp[tmp_src]; + break; + + case RSX_VP_REGISTER_TYPE_INPUT: + value = read_location(int(d1.input_src)); + break; + + case RSX_VP_REGISTER_TYPE_CONSTANT: + if (d3.index_const) + { + value = vc[d1.const_src + ref(a[d0.addr_reg_sel_1], d0.addr_swz)]; + } + else + { + value = vc[d1.const_src]; + } + break; + } + + if (GET_BITS(src, 8, 8) != 0x1B) + { + uvec4 swz = uvec4( + GET_BITS(src, 14, 2), + GET_BITS(src, 12, 2), + GET_BITS(src, 10, 2), + GET_BITS(src, 8, 2) + ); + + value = shuffle(value, swz); + } + + if (do_abs) + { + value = abs(value); + } + + if (TEST_BIT(src, 16)) + { + value = -value; + } + + return value; +} + +void main() +{ + // Initialize output registers + for (int i = 0; i < 16; ++i) + { + dest[i] = vec4(0., 0., 0., 1.); + } + + int callstack[8]; + int stack_ptr = 0; + int current_instruction = 0; + + d3.end = false; + + while (current_instruction < 512) + { + if (d3.end) + { + break; + } + + instr = vp_instructions[current_instruction]; + current_instruction++; + + d0 = unpack_D0(instr.x); + d1 = unpack_D1(instr.y); + d2 = unpack_D2(instr.z); + d3 = unpack_D3(instr.w); + + uint vec_opcode = d1.vec_opcode; + uint sca_opcode = d1.sca_opcode; + + if (d0.cond_test_enable && d0.cond == 0) + { + vec_opcode = RSX_VEC_OPCODE_NOP; + sca_opcode = RSX_SCA_OPCODE_NOP; + } + + if (vec_opcode == RSX_VEC_OPCODE_ARL) + { + a[d0.dst_tmp] = ivec4(read_src(0)); + } + else if (vec_opcode != RSX_VEC_OPCODE_NOP) + { + vec4 value 
= read_src(0); + switch (vec_opcode) + { + case RSX_VEC_OPCODE_MOV: break; + case RSX_VEC_OPCODE_MUL: value *= read_src(1); break; + case RSX_VEC_OPCODE_ADD: value += read_src(2); break; + case RSX_VEC_OPCODE_MAD: value = fma(value, read_src(1), read_src(2)); break; + case RSX_VEC_OPCODE_DP3: value = vec4(dot(value.xyz, read_src(1).xyz)); break; + case RSX_VEC_OPCODE_DPH: value = vec4(dot(vec4(value.xyz, 1.0), read_src(1))); break; + case RSX_VEC_OPCODE_DP4: value = vec4(dot(value, read_src(1))); break; + case RSX_VEC_OPCODE_DST: value = _distance(value, read_src(1)); break; + case RSX_VEC_OPCODE_MIN: value = min(value, read_src(1)); break; + case RSX_VEC_OPCODE_MAX: value = max(value, read_src(1)); break; + case RSX_VEC_OPCODE_SLT: value = vec4(lessThan(value, read_src(1))); break; + case RSX_VEC_OPCODE_SGE: value = vec4(greaterThanEqual(value, read_src(1))); break; + case RSX_VEC_OPCODE_FRC: value = fract(value); break; + case RSX_VEC_OPCODE_FLR: value = floor(value); break; + case RSX_VEC_OPCODE_SEQ: value = vec4(equal(value, read_src(1))); break; + case RSX_VEC_OPCODE_SFL: value = vec4(0); break; + case RSX_VEC_OPCODE_SGT: value = vec4(greaterThan(value, read_src(1))); break; + case RSX_VEC_OPCODE_SLE: value = vec4(lessThanEqual(value, read_src(1))); break; + case RSX_VEC_OPCODE_SNE: value = vec4(notEqual(value, read_src(1))); break; + case RSX_VEC_OPCODE_STR: value = vec4(1); break; + case RSX_VEC_OPCODE_SSG: value = sign(value); break; + } + + write_vec(value); + } + + if (sca_opcode != RSX_SCA_OPCODE_NOP) + { + float value = read_src(2).x; + switch (sca_opcode) + { + case RSX_SCA_OPCODE_MOV: break; + case RSX_SCA_OPCODE_RCP: value = 1.0 / value; break; + case RSX_SCA_OPCODE_RCC: value = clamp(1.0 / value, 5.42101e-20, 1.884467e19); break; + case RSX_SCA_OPCODE_RSQ: value = 1.0 / sqrt(value); break; + case RSX_SCA_OPCODE_EXP: value = exp(value); break; + case RSX_SCA_OPCODE_LOG: value = log(value); break; + //case RSX_SCA_OPCODE_LIT: value = 
lit_legacy(value); break;
+			case RSX_SCA_OPCODE_LG2: value = log2(value); break;
+			case RSX_SCA_OPCODE_EX2: value = exp2(value); break;
+			case RSX_SCA_OPCODE_SIN: value = sin(value); break;
+			case RSX_SCA_OPCODE_COS: value = cos(value); break;
+
+			case RSX_SCA_OPCODE_BRA:
+				// Jump by address register
+				if (dynamic_branch()) current_instruction = int(read_addr_reg().x);
+				continue;
+			case RSX_SCA_OPCODE_BRI:
+				// Jump immediate
+				if (dynamic_branch()) current_instruction = branch_addr();
+				continue;
+			case RSX_SCA_OPCODE_CAL:
+				// Call immediate
+				if (dynamic_branch())
+				{
+					callstack[stack_ptr] = current_instruction; // current_instruction was already advanced, so this is the return address
+					stack_ptr++;
+					current_instruction = branch_addr();
+				}
+				continue;
+			case RSX_SCA_OPCODE_CLI:
+				// Unknown
+				continue;
+			case RSX_SCA_OPCODE_RET:
+				// Return
+				if (dynamic_branch())
+				{
+					if (stack_ptr == 0) return;
+					stack_ptr--; // pop before reading: CAL/CLB store at callstack[stack_ptr] and then increment
+					current_instruction = callstack[stack_ptr];
+				}
+				continue;
+			case RSX_SCA_OPCODE_BRB:
+				// Branch by boolean mask
+				if (static_branch())
+				{
+					current_instruction = branch_addr();
+				}
+				continue;
+			case RSX_SCA_OPCODE_CLB:
+				// Call by boolean mask
+				if (static_branch())
+				{
+					callstack[stack_ptr] = current_instruction;
+					stack_ptr++;
+					current_instruction = branch_addr();
+				}
+				continue;
+			//case RSX_SCA_OPCODE_PSH:
+			//case RSX_SCA_OPCODE_POP:
+			}
+
+			write_sca(value);
+		}
+	}
+
+	// TODO: 2-sided lighting
+	if (attribute_enabled(1 << 0 | 1 << 2))
+	{
+		diff_color = dest[1];
+		diff_color1 = dest[1];
+	}
+
+	if (attribute_enabled(1 << 1 | 1 << 3))
+	{
+		spec_color = dest[2];
+		spec_color1 = dest[2];
+	}
+
+	if (attribute_enabled(1 << 4))
+	{
+		fog_c = dest[5].xxxx;
+	}
+
+	if (attribute_enabled(1 << 5))
+	{
+		gl_PointSize = dest[6].x;
+	}
+	else
+	{
+		gl_PointSize = point_size;
+	}
+
+	if (attribute_enabled(1 << 6 | 1 << 7 | 1 << 8))
+	{
+		gl_ClipDistance[0] = (user_clip_enabled[0].x > 0)? dest[5].y * user_clip_factor[0].x : 0.5f;
+		gl_ClipDistance[1] = (user_clip_enabled[0].y > 0)?
dest[5].z * user_clip_factor[0].y : 0.5f;
+		gl_ClipDistance[2] = (user_clip_enabled[0].z > 0)? dest[5].w * user_clip_factor[0].z : 0.5f;
+	}
+
+	if (attribute_enabled(1 << 9 | 1 << 10 | 1 << 11))
+	{
+		gl_ClipDistance[3] = (user_clip_enabled[0].w > 0)? dest[6].y * user_clip_factor[0].w : 0.5f;
+		gl_ClipDistance[4] = (user_clip_enabled[1].x > 0)? dest[6].z * user_clip_factor[1].x : 0.5f;
+		gl_ClipDistance[5] = (user_clip_enabled[1].y > 0)? dest[6].w * user_clip_factor[1].y : 0.5f;
+	}
+
+	tc8 = write_output(15, 12);
+	tc9 = write_output(6, 13);
+	tc0 = write_output(7, 14);
+	tc1 = write_output(8, 15);
+	tc2 = write_output(9, 16);
+	tc3 = write_output(10, 17);
+	tc4 = write_output(11, 18);
+	tc5 = write_output(12, 19);
+	tc6 = write_output(13, 20);
+	tc7 = write_output(14, 21);
+
+	vec4 pos = dest[0] * scale_offset_mat;
+	pos.z = (pos.z + pos.z) - pos.w;
+	gl_Position = pos;
+}
+
+)"
\ No newline at end of file
diff --git a/rpcs3/Emu/RSX/Common/ShaderInterpreter.h b/rpcs3/Emu/RSX/Common/ShaderInterpreter.h
new file mode 100644
index 0000000000..e9f00dc770
--- /dev/null
+++ b/rpcs3/Emu/RSX/Common/ShaderInterpreter.h
@@ -0,0 +1,24 @@
+#pragma once
+#include "Utilities/StrFmt.h"
+
+namespace program_common
+{
+	namespace interpreter
+	{
+		inline std::string get_vertex_interpreter() // returns the vertex interpreter GLSL source; inline because it is defined in a header
+		{
+			const char* s = // the included file is a single R"( ... )" raw string literal, so it becomes the initializer
+			#include "Interpreter/VertexInterpreter.glsl"
+			;
+			return s;
+		}
+
+		inline std::string get_fragment_interpreter() // returns the fragment interpreter GLSL source; inline because it is defined in a header
+		{
+			const char* s = // the included file is a single R"( ... )" raw string literal, so it becomes the initializer
+			#include "Interpreter/FragmentInterpreter.glsl"
+			;
+			return s;
+		}
+	}
+}
diff --git a/rpcs3/Emu/RSX/GL/GLFragmentProgram.h b/rpcs3/Emu/RSX/GL/GLFragmentProgram.h
index 5e9f2acb2f..77b2612183 100644
--- a/rpcs3/Emu/RSX/GL/GLFragmentProgram.h
+++ b/rpcs3/Emu/RSX/GL/GLFragmentProgram.h
@@ -8,8 +8,15 @@ namespace glsl
 	struct shader_properties;
 }
 
+namespace gl
+{
+	class shader_interpreter;
+}
+
 struct GLFragmentDecompilerThread : public FragmentProgramDecompiler
 {
+	friend class gl::shader_interpreter;
+
 	std::string& m_shader;
 	ParamArray&
m_parrDummy; glsl::shader_properties m_shader_props{}; diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp index 9b21d9eb02..3cc565f8e3 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp +++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp @@ -196,6 +196,8 @@ void GLGSRender::on_init_thread() m_texture_parameters_buffer = std::make_unique(); m_vertex_layout_buffer = std::make_unique(); m_index_ring_buffer = std::make_unique(); + m_vertex_instructions_buffer = std::make_unique(); + m_fragment_instructions_buffer = std::make_unique(); } else { @@ -207,6 +209,8 @@ void GLGSRender::on_init_thread() m_texture_parameters_buffer = std::make_unique(); m_vertex_layout_buffer = std::make_unique(); m_index_ring_buffer = std::make_unique(); + m_vertex_instructions_buffer = std::make_unique(); + m_fragment_instructions_buffer = std::make_unique(); } m_attrib_ring_buffer->create(gl::buffer::target::texture, 256 * 0x100000); @@ -218,6 +222,14 @@ void GLGSRender::on_init_thread() m_texture_parameters_buffer->create(gl::buffer::target::uniform, 16 * 0x100000); m_vertex_layout_buffer->create(gl::buffer::target::uniform, 16 * 0x100000); + if (g_cfg.video.shader_interpreter_mode != shader_interpreter_mode::disabled) + { + m_vertex_instructions_buffer->create(gl::buffer::target::ssbo, 16 * 0x100000); + m_fragment_instructions_buffer->create(gl::buffer::target::ssbo, 16 * 0x100000); + + m_shader_interpreter.create(); + } + if (gl_caps.vendor_AMD) { m_identity_index_buffer = std::make_unique(); @@ -427,6 +439,16 @@ void GLGSRender::on_exit() m_identity_index_buffer->remove(); } + if (m_vertex_instructions_buffer) + { + m_vertex_instructions_buffer->remove(); + } + + if (m_fragment_instructions_buffer) + { + m_fragment_instructions_buffer->remove(); + } + m_null_textures.clear(); m_text_printer.close(); m_gl_texture_cache.destroy(); @@ -434,6 +456,8 @@ void GLGSRender::on_exit() m_ui_renderer.destroy(); m_video_output_pass.destroy(); + m_shader_interpreter.destroy(); + for (u32 i = 
0; i < occlusion_query_count; ++i) { auto &query = m_occlusion_query_data[i]; @@ -586,7 +610,8 @@ void GLGSRender::clear_surface(u32 arg) bool GLGSRender::load_program() { - if (m_graphics_state & rsx::pipeline_state::invalidate_pipeline_bits) + const auto interpreter_mode = g_cfg.video.shader_interpreter_mode.get(); + if (m_interpreter_state = (m_graphics_state & rsx::pipeline_state::invalidate_pipeline_bits)) { get_current_fragment_program(fs_sampler_state); verify(HERE), current_fragment_program.valid; @@ -596,40 +621,55 @@ bool GLGSRender::load_program() current_vertex_program.skip_vertex_input_check = true; //not needed for us since decoding is done server side current_fragment_program.unnormalized_coords = 0; //unused } - else if (m_program) + else if (m_program && + (m_program != m_shader_interpreter.get() || interpreter_mode == shader_interpreter_mode::forced)) { - // Program already loaded return true; } - void* pipeline_properties = nullptr; - m_program = m_prog_buffer.get_graphics_pipeline(current_vertex_program, current_fragment_program, pipeline_properties, + auto old_program = m_program; + if (interpreter_mode != shader_interpreter_mode::forced) [[likely]] + { + void* pipeline_properties = nullptr; + m_program = m_prog_buffer.get_graphics_pipeline(current_vertex_program, current_fragment_program, pipeline_properties, !g_cfg.video.disable_asynchronous_shader_compiler, true).get(); - if (m_prog_buffer.check_cache_missed()) - { - // Notify the user with HUD notification - if (g_cfg.misc.show_shader_compilation_hint) + if (m_prog_buffer.check_cache_missed()) { - if (m_overlay_manager) + // Notify the user with HUD notification + if (g_cfg.misc.show_shader_compilation_hint) { - if (auto dlg = m_overlay_manager->get()) + if (m_overlay_manager) { - // Extend duration - dlg->touch(); - } - else - { - // Create dialog but do not show immediately - m_overlay_manager->create(); + if (auto dlg = m_overlay_manager->get()) + { + // Extend duration + dlg->touch(); + 
} + else + { + // Create dialog but do not show immediately + m_overlay_manager->create(); + } } } } + else + { + verify(HERE), m_program; + m_program->sync(); + } } - else + + if (!m_program && interpreter_mode != shader_interpreter_mode::disabled) { - verify(HERE), m_program; - m_program->sync(); + // Fall back to interpreter + m_program = m_shader_interpreter.get(); + if (old_program != m_program) + { + // Program has changed, reupload + m_interpreter_state = rsx::invalidate_pipeline_bits; + } } return m_program != nullptr; @@ -649,6 +689,7 @@ void GLGSRender::load_program_env() const bool update_vertex_env = !!(m_graphics_state & rsx::pipeline_state::vertex_state_dirty); const bool update_fragment_env = !!(m_graphics_state & rsx::pipeline_state::fragment_state_dirty); const bool update_fragment_texture_env = !!(m_graphics_state & rsx::pipeline_state::fragment_texture_state_dirty); + const bool update_instruction_buffers = (!!m_interpreter_state && m_program == m_shader_interpreter.get()); m_program->use(); @@ -659,6 +700,12 @@ void GLGSRender::load_program_env() if (update_fragment_texture_env) m_texture_parameters_buffer->reserve_storage_on_heap(256); if (update_fragment_constants) m_fragment_constants_buffer->reserve_storage_on_heap(align(fragment_constants_size, 256)); if (update_transform_constants) m_transform_constants_buffer->reserve_storage_on_heap(8192); + + if (update_instruction_buffers) + { + m_vertex_instructions_buffer->reserve_storage_on_heap(513 * 16); + m_fragment_instructions_buffer->reserve_storage_on_heap(current_fp_metadata.program_ucode_length); + } } if (update_vertex_env) @@ -686,7 +733,7 @@ void GLGSRender::load_program_env() m_transform_constants_buffer->bind_range(GL_VERTEX_CONSTANT_BUFFERS_BIND_SLOT, mapping.second, 8192); } - if (update_fragment_constants) + if (update_fragment_constants && !update_instruction_buffers) { // Fragment constants auto mapping = m_fragment_constants_buffer->alloc_from_heap(fragment_constants_size, 
m_uniform_buffer_offset_align); @@ -718,6 +765,49 @@ void GLGSRender::load_program_env() m_texture_parameters_buffer->bind_range(GL_FRAGMENT_TEXTURE_PARAMS_BIND_SLOT, mapping.second, 256); } + if (update_instruction_buffers) + { + if (m_interpreter_state & rsx::vertex_program_dirty) + { + // Attach vertex buffer data + const auto vp_block_length = current_vp_metadata.ucode_length + 16; + auto vp_mapping = m_vertex_instructions_buffer->alloc_from_heap(vp_block_length, 16); + auto vp_buf = static_cast(vp_mapping.first); + + auto vp_config = reinterpret_cast(vp_buf); + vp_config[0] = current_vertex_program.base_address; + vp_config[1] = current_vertex_program.entry; + vp_config[2] = current_vertex_program.output_mask; + + std::memcpy(vp_buf + 16, current_vertex_program.data.data(), current_vp_metadata.ucode_length); + + m_vertex_instructions_buffer->bind_range(GL_INTERPRETER_VERTEX_BLOCK, vp_mapping.second, vp_block_length); + m_vertex_instructions_buffer->notify(); + } + + if (m_interpreter_state & rsx::fragment_program_dirty) + { + // Attach fragment buffer data + const auto fp_block_length = current_fp_metadata.program_ucode_length + 80; + auto fp_mapping = m_fragment_instructions_buffer->alloc_from_heap(fp_block_length, 16); + auto fp_buf = static_cast(fp_mapping.first); + + // Control mask + const auto control_masks = reinterpret_cast(fp_buf); + control_masks[0] = rsx::method_registers.shader_control(); + control_masks[1] = current_fragment_program.texture_dimensions; + + // Bind textures + m_shader_interpreter.update_fragment_textures(fs_sampler_state, current_fp_metadata.referenced_textures_mask, reinterpret_cast(fp_buf + 16)); + + const auto fp_data = static_cast(current_fragment_program.addr) + current_fp_metadata.program_start_offset; + std::memcpy(fp_buf + 80, fp_data, current_fp_metadata.program_ucode_length); + + m_fragment_instructions_buffer->bind_range(GL_INTERPRETER_FRAGMENT_BLOCK, fp_mapping.second, fp_block_length); + 
m_fragment_instructions_buffer->notify(); + } + } + if (manually_flush_ring_buffers) { if (update_fragment_env) m_fragment_env_buffer->unmap(); @@ -725,6 +815,12 @@ void GLGSRender::load_program_env() if (update_fragment_texture_env) m_texture_parameters_buffer->unmap(); if (update_fragment_constants) m_fragment_constants_buffer->unmap(); if (update_transform_constants) m_transform_constants_buffer->unmap(); + + if (update_instruction_buffers) + { + m_vertex_instructions_buffer->unmap(); + m_fragment_instructions_buffer->unmap(); + } } const u32 handled_flags = (rsx::pipeline_state::fragment_state_dirty | rsx::pipeline_state::vertex_state_dirty | rsx::pipeline_state::transform_constants_dirty | rsx::pipeline_state::fragment_constants_dirty | rsx::pipeline_state::fragment_texture_state_dirty); diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.h b/rpcs3/Emu/RSX/GL/GLGSRender.h index 5df5236919..9f09224b3c 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.h +++ b/rpcs3/Emu/RSX/GL/GLGSRender.h @@ -7,6 +7,7 @@ #include "GLProgramBuffer.h" #include "GLTextOut.h" #include "GLOverlays.h" +#include "GLShaderInterpreter.h" #include @@ -74,8 +75,10 @@ private: gl::sampler_state m_fs_sampler_mirror_states[rsx::limits::fragment_textures_count]; // Alternate views of fragment textures with different format (e.g Depth vs Stencil for D24S8) gl::sampler_state m_vs_sampler_states[rsx::limits::vertex_textures_count]; // Vertex textures - gl::glsl::program *m_program; - gl::glsl::program m_shader_interpreter; + gl::glsl::program *m_program = nullptr; + + u32 m_interpreter_state = 0; + gl::shader_interpreter m_shader_interpreter; gl_render_targets m_rtts; @@ -94,6 +97,8 @@ private: std::unique_ptr m_texture_parameters_buffer; std::unique_ptr m_vertex_layout_buffer; std::unique_ptr m_index_ring_buffer; + std::unique_ptr m_vertex_instructions_buffer; + std::unique_ptr m_fragment_instructions_buffer; // Identity buffer used to fix broken gl_VertexID on ATI stack std::unique_ptr m_identity_index_buffer; 
diff --git a/rpcs3/Emu/RSX/GL/GLHelpers.h b/rpcs3/Emu/RSX/GL/GLHelpers.h
index 8354e2f159..235c7009d1 100644
--- a/rpcs3/Emu/RSX/GL/GLHelpers.h
+++ b/rpcs3/Emu/RSX/GL/GLHelpers.h
@@ -27,7 +27,9 @@
 #define GL_FRAGMENT_CONSTANT_BUFFERS_BIND_SLOT 3
 #define GL_FRAGMENT_STATE_BIND_SLOT 4
 #define GL_FRAGMENT_TEXTURE_PARAMS_BIND_SLOT 5
-#define GL_COMPUTE_BUFFER_SLOT(index) (index + 6)
+#define GL_INTERPRETER_VERTEX_BLOCK 6
+#define GL_INTERPRETER_FRAGMENT_BLOCK 7
+#define GL_COMPUTE_BUFFER_SLOT(index) ((index) + 8)
 
 inline static void _SelectTexture(int unit) { glActiveTexture(GL_TEXTURE0 + unit); }
 
@@ -2576,6 +2578,7 @@ public:
 			void operator = (const color4f& rhs) const { glProgramUniform4f(m_program.id(), location(), rhs.r, rhs.g, rhs.b, rhs.a); }
 			void operator = (const areaf& rhs) const { glProgramUniform4f(m_program.id(), location(), rhs.x1, rhs.y1, rhs.x2, rhs.y2); }
 			void operator = (const areai& rhs) const { glProgramUniform4i(m_program.id(), location(), rhs.x1, rhs.y1, rhs.x2, rhs.y2); }
+			void operator = (const std::vector<int>& rhs) const { glProgramUniform1iv(m_program.id(), location(), ::size32(rhs), rhs.data()); }
 		};
 
 		class uniforms_t
diff --git a/rpcs3/Emu/RSX/GL/GLShaderInterpreter.cpp b/rpcs3/Emu/RSX/GL/GLShaderInterpreter.cpp
index 564412cbbc..b6eb5d9271 100644
--- a/rpcs3/Emu/RSX/GL/GLShaderInterpreter.cpp
+++ b/rpcs3/Emu/RSX/GL/GLShaderInterpreter.cpp
@@ -1,7 +1,343 @@
-#include "stdafx.h"
+#include "stdafx.h"
 #include "GLShaderInterpreter.h"
+#include "GLGSRender.h"
+#include "GLVertexProgram.h"
+#include "GLFragmentProgram.h"
+#include "../Common/ShaderInterpreter.h"
+#include "../Common/GLSLCommon.h"
 
 namespace gl
 {
+	using glsl::shader;
 
+	namespace interpreter
+	{
+		// Queries how many texture image units the driver exposes for the given shader stage
+		void texture_pool_allocator::create(shader::type domain)
+		{
+			GLenum pname;
+			switch (domain)
+			{
+			default:
+				rsx_log.fatal("Unexpected program domain %d", static_cast<int>(domain));
+				// Unknown domains are reported, then treated like the vertex stage
+				[[fallthrough]];
+			case shader::type::vertex:
+				pname = GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS; break;
+			case shader::type::fragment:
+				pname = GL_MAX_TEXTURE_IMAGE_UNITS; break;
+			}
+
+			glGetIntegerv(pname, &max_image_units);
+		}
+
+		// Appends a pool holding 'size' binding slots. Exceeding max_image_units is only logged here.
+		void texture_pool_allocator::allocate(int size)
+		{
+			if ((used + size) > max_image_units)
+			{
+				rsx_log.fatal("Out of image binding slots!");
+			}
+
+			used += size;
+			texture_pool pool;
+			pool.pool_size = size;
+			pools.push_back(pool);
+		}
+	}
+
+	// Compiles both interpreter stages, links them and hands every pooled sampler slot its own image unit index
+	void shader_interpreter::create()
+	{
+		texture_pools[0].create(shader::type::vertex);
+		texture_pools[1].create(shader::type::fragment);
+
+		build_vs();
+		build_fs();
+
+		program_handle.create().
+			attach(vs).
+			attach(fs).
+			link();
+
+		program_handle.uniforms[0] = GL_STREAM_BUFFER_START + 0;
+		program_handle.uniforms[1] = GL_STREAM_BUFFER_START + 1;
+
+		// Initialize texture bindings
+		int assigned = 0;
+		auto& allocator = texture_pools[1];
+		const char* type_names[] = { "sampler1D_array", "sampler2D_array", "samplerCube_array", "sampler3D_array" };
+
+		for (int i = 0; i < 4; ++i)
+		{
+			for (int j = 0; j < allocator.pools[i].pool_size; ++j)
+			{
+				allocator.pools[i].allocate(assigned++);
+			}
+
+			program_handle.uniforms[type_names[i]] = allocator.pools[i].allocated;
+		}
+	}
+
+	// Releases the linked program and both shader objects
+	void shader_interpreter::destroy()
+	{
+		program_handle.remove();
+		vs.remove();
+		fs.remove();
+	}
+
+	// Returns the interpreter program. The pointer refers to a member, so it is never null.
+	glsl::program* shader_interpreter::get()
+	{
+		return &program_handle;
+	}
+
+	// Assembles and compiles the vertex interpreter: decompiler declarations, the instruction block, then the interpreter body
+	void shader_interpreter::build_vs()
+	{
+		::glsl::shader_properties properties{};
+		properties.domain = ::glsl::program_domain::glsl_vertex_program;
+		properties.require_lit_emulation = true;
+
+		// TODO: Extend decompiler thread
+		// TODO: Rename decompiler thread, it no longer spawns a thread
+		RSXVertexProgram null_prog;
+		std::string shader_str;
+		ParamArray arr;
+		GLVertexDecompilerThread comp(null_prog, shader_str, arr);
+
+		std::stringstream builder;
+		comp.insertHeader(builder);
+		comp.insertConstants(builder, {});
+		comp.insertInputs(builder, {});
+		comp.insertOutputs(builder, {});
+
+		// Insert vp stream input
+		builder << "\n"
+		"layout(std140, binding = " << GL_INTERPRETER_VERTEX_BLOCK << ") readonly restrict buffer VertexInstructionBlock\n"
+		"{\n"
+		" uint base_address;\n"
+		" uint entry;\n"
+		" uint output_mask;\n"
+		" uint reserved;\n"
+		" uvec4 vp_instructions[];\n"
+		"};\n\n";
+
+		::glsl::insert_glsl_legacy_function(builder, properties);
+		::glsl::insert_vertex_input_fetch(builder, ::glsl::glsl_rules::glsl_rules_opengl4);
+
+		builder << program_common::interpreter::get_vertex_interpreter();
+		const std::string s = builder.str();
+
+		vs.create(glsl::shader::type::vertex);
+		vs.source(s);
+		vs.compile();
+	}
+
+	// Sizes the four sampler pools (1D, 2D, CUBE, 3D) from the fragment stage limit, then assembles and compiles the fragment interpreter
+	void shader_interpreter::build_fs()
+	{
+		// Allocate TIUs
+		auto& allocator = texture_pools[1];
+		if (allocator.max_image_units >= 32)
+		{
+			// 16 + 4 + 4 + 4
+			allocator.allocate(4);   // 1D
+			allocator.allocate(16);  // 2D
+			allocator.allocate(4);   // CUBE
+			allocator.allocate(4);   // 3D
+		}
+		else if (allocator.max_image_units >= 24)
+		{
+			// 16 + 4 + 2 + 2
+			allocator.allocate(2);   // 1D
+			allocator.allocate(16);  // 2D
+			allocator.allocate(2);   // CUBE
+			allocator.allocate(4);   // 3D
+		}
+		else if (allocator.max_image_units >= 16)
+		{
+			// 10 + 2 + 2 + 2
+			allocator.allocate(2);   // 1D
+			allocator.allocate(10);  // 2D
+			allocator.allocate(2);   // CUBE
+			allocator.allocate(2);   // 3D
+		}
+		else
+		{
+			// Unusable
+			// NOTE(review): no pools exist on this path, yet the loop below and the one in create() index pools[0..3].
+			// Confirm that fatal() never returns here, or bail out explicitly.
+			rsx_log.fatal("Failed to allocate enough TIUs for shader interpreter.");
+		}
+
+		::glsl::shader_properties properties{};
+		properties.domain = ::glsl::program_domain::glsl_fragment_program;
+		properties.require_depth_conversion = true;
+		properties.require_wpos = true;
+
+		u32 len = 0;
+		ParamArray arr;
+		std::string shader_str;
+		RSXFragmentProgram frag;
+		GLFragmentDecompilerThread comp(shader_str, arr, frag, len);
+
+		std::stringstream builder;
+		builder <<
+		"#version 450\n"
+		"#extension GL_ARB_bindless_texture : require\n\n";
+
+		::glsl::insert_subheader_block(builder);
+		comp.insertConstants(builder);
+
+		// Declare custom inputs
+		builder <<
+		"layout(location=1) in vec4 in_regs[15];\n\n";
+
+		const char* type_names[] = { "sampler1D", "sampler2D", "samplerCube", "sampler3D" };
+		for (int i = 0; i < 4; ++i)
+		{
+			builder << "uniform " << type_names[i] << " " << type_names[i] << "_array[" << allocator.pools[i].pool_size << "];\n";
+		}
+
+		builder << "\n"
+		"#define IS_TEXTURE_RESIDENT(index) (texture_handles[index] < 0xFF)\n"
+		"#define SAMPLER1D(index) sampler1D_array[texture_handles[index]]\n"
+		"#define SAMPLER2D(index) sampler2D_array[texture_handles[index]]\n"
+		"#define SAMPLER3D(index) sampler3D_array[texture_handles[index]]\n"
+		"#define SAMPLERCUBE(index) samplerCube_array[texture_handles[index]]\n\n";
+
+		builder <<
+		"layout(std430, binding =" << GL_INTERPRETER_FRAGMENT_BLOCK << ") readonly restrict buffer FragmentInstructionBlock\n"
+		"{\n"
+		" uint shader_control;\n"
+		" uint texture_control;\n"
+		" uint reserved1;\n"
+		" uint reserved2;\n"
+		" uint texture_handles[16];\n"
+		" uvec4 fp_instructions[];\n"
+		"};\n\n";
+
+		::program_common::insert_fog_declaration(builder, "vec4", "fogc", true);
+
+		builder << program_common::interpreter::get_fragment_interpreter();
+		const std::string s = builder.str();
+
+		fs.create(glsl::shader::type::fragment);
+		fs.source(s);
+		fs.compile();
+	}
+
+	// Assigns each texture flagged in reference_mask a slot in the pool matching its image type.
+	// out[i] receives the slot index inside that pool, or 0xFF when texture i is not resident.
+	// Sampler array uniforms are re-uploaded only for pools whose contents changed.
+	void shader_interpreter::update_fragment_textures(
+		const std::array<std::unique_ptr<rsx::sampled_image_descriptor_base>, 16>& descriptors,
+		u16 reference_mask, u32* out)
+	{
+		if (reference_mask == 0)
+		{
+			return;
+		}
+
+		// Reset allocation
+		auto& allocator = texture_pools[1];
+		for (unsigned i = 0; i < 4; ++i)
+		{
+			allocator.pools[i].num_used = 0;
+			allocator.pools[i].flags = 0;
+		}
+
+		rsx::simple_array<std::pair<int, int>> replacement_map;
+		for (int i = 0; i < rsx::limits::fragment_textures_count; ++i)
+		{
+			if (reference_mask & (1 << i))
+			{
+				auto sampler_state = static_cast<gl::texture_cache::sampled_image_descriptor*>(descriptors[i].get());
+				verify(HERE), sampler_state;
+
+				int pool_id = static_cast<int>(sampler_state->image_type);
+				auto& pool = allocator.pools[pool_id];
+
+				// Check capacity before touching allocated[num_used]; a full pool has no such element
+				if (pool.num_used >= pool.pool_size)
+				{
+					rsx_log.error("Could not allocate texture resource for shader interpreter.");
+
+					// Flag this and every remaining entry as non-resident so the shader never sees stale handles
+					for (int j = i; j < rsx::limits::fragment_textures_count; ++j)
+					{
+						out[j] = 0xFF;
+					}
+					break;
+				}
+
+				const int old = pool.allocated[pool.num_used];
+				pool.allocate(i);
+
+				out[i] = (pool.num_used - 1);
+				if (old != i)
+				{
+					// Check if the candidate target has also been replaced
+					bool found = false;
+					for (auto& e : replacement_map)
+					{
+						if (e.second == old)
+						{
+							// This replacement consumed this 'old' value
+							e.second = i;
+							found = true;
+							break;
+						}
+					}
+
+					if (!found)
+					{
+						replacement_map.push_back({ old, i });
+					}
+				}
+			}
+			else
+			{
+				out[i] = 0xFF;
+			}
+		}
+
+		// Bind TIU locations
+		if (replacement_map.empty()) [[likely]]
+		{
+			return;
+		}
+
+		if (get_driver_caps().vendor_AMD)
+		{
+			// AMD drivers don't like texture bindings overlapping which means workarounds are needed
+			// Technically this is accurate to spec, but makes efficient usage of shader resources difficult
+			for (unsigned i = 0; i < replacement_map.size(); ++i)
+			{
+				for (int j = 0; j < 4; ++j)
+				{
+					auto& pool = allocator.pools[j];
+					for (int k = pool.num_used; k < pool.pool_size; ++k)
+					{
+						if (pool.allocated[k] == replacement_map[i].second)
+						{
+							pool.allocated[k] = replacement_map[i].first;
+							pool.flags |= static_cast<u32>(interpreter::texture_pool_flags::dirty);
+
+							// Exit nested loop
+							j = 4;
+							break;
+						}
+					}
+				}
+			}
+		}
+
+		if (allocator.pools[0].flags) program_handle.uniforms["sampler1D_array"] = allocator.pools[0].allocated;
+		if (allocator.pools[1].flags) program_handle.uniforms["sampler2D_array"] = allocator.pools[1].allocated;
+		if (allocator.pools[2].flags) program_handle.uniforms["samplerCube_array"] = allocator.pools[2].allocated;
+		if (allocator.pools[3].flags) program_handle.uniforms["sampler3D_array"] = allocator.pools[3].allocated;
+	}
 }
diff --git a/rpcs3/Emu/RSX/GL/GLShaderInterpreter.h b/rpcs3/Emu/RSX/GL/GLShaderInterpreter.h
index a53dab1f07..c7e26542d6 100644
--- a/rpcs3/Emu/RSX/GL/GLShaderInterpreter.h
+++ b/rpcs3/Emu/RSX/GL/GLShaderInterpreter.h
@@ -1,15 +1,74 @@
-#pragma once
-#include "GLGSRender.h"
+#pragma once
+#include "GLHelpers.h"
 
 namespace gl
 {
-	class shader_interpreter : glsl::program
+	namespace interpreter
+	{
+		enum class texture_pool_flags
+		{
+			dirty = 1
+		};
+
+		// Fixed-capacity list of values; the first num_used entries of 'allocated' are the live ones
+		struct texture_pool
+		{
+			int pool_size = 0;
+			int num_used = 0;
+			u32 flags = 0;
+			std::vector<int> allocated;
+
+			// Stores 'value' in the next free slot and marks the pool dirty. Returns false when the pool is full.
+			bool allocate(int value)
+			{
+				if (num_used >= pool_size)
+				{
+					return false;
+				}
+
+				if (allocated.size() == static_cast<size_t>(num_used))
+				{
+					allocated.push_back(value);
+				}
+				else
+				{
+					allocated[num_used] = value;
+				}
+
+				num_used++;
+				flags |= static_cast<u32>(texture_pool_flags::dirty);
+				return true;
+			}
+		};
+
+		// Tracks the pools carved out of one shader stage's texture image units
+		struct texture_pool_allocator
+		{
+			int max_image_units = 0;
+			int used = 0;
+			std::vector<texture_pool> pools;
+
+			void create(::gl::glsl::shader::type domain);
+			void allocate(int size);
+		};
+	}
+
+	class shader_interpreter
 	{
 		glsl::shader vs;
 		glsl::shader fs;
+		glsl::program program_handle;
+		interpreter::texture_pool_allocator texture_pools[2];
+
+		void build_vs();
+		void build_fs();
 
 	public:
 		void create();
 		void destroy();
+
+		void update_fragment_textures(const std::array<std::unique_ptr<rsx::sampled_image_descriptor_base>, 16>& descriptors, u16 reference_mask, u32* out);
+
+		glsl::program* get();
 	};
 }
diff --git a/rpcs3/Emu/RSX/GL/GLVertexProgram.cpp b/rpcs3/Emu/RSX/GL/GLVertexProgram.cpp
index 09d15fd7b8..38c0ba5735 100644
--- a/rpcs3/Emu/RSX/GL/GLVertexProgram.cpp
+++ b/rpcs3/Emu/RSX/GL/GLVertexProgram.cpp
@@ -46,7 +46,7 @@ void GLVertexDecompilerThread::insertHeader(std::stringstream &OS)
 	OS << "layout(std140, binding = 1) uniform VertexLayoutBuffer\n";
 	OS << "{\n";
 	OS << " uint vertex_base_index;\n";
-	OS << " uint vertex_index_offset;\n";
+	OS << " uint vertex_index_offset;\n";
 	OS << " uvec4 input_attributes_blob[16 / 2];\n";
 	OS << "};\n\n";
 }
diff --git a/rpcs3/Emu/RSX/GL/GLVertexProgram.h b/rpcs3/Emu/RSX/GL/GLVertexProgram.h
index bad54b5fff..ac9817e9d1 100644
--- a/rpcs3/Emu/RSX/GL/GLVertexProgram.h
+++ b/rpcs3/Emu/RSX/GL/GLVertexProgram.h
@@ -1,4 +1,4 @@
-#pragma once
+#pragma once
 #include "../Common/VertexProgramDecompiler.h"
 #include "Emu/RSX/RSXVertexProgram.h"
 
@@ -11,8 +11,15 @@ enum
 	GL_VP_SINT_MASK = (GL_VP_ATTRIB_S16_INT|GL_VP_ATTRIB_S32_INT)
 };
 
+namespace gl
+{
+	class shader_interpreter;
+}
+
 struct GLVertexDecompilerThread : public VertexProgramDecompiler
 {
+	friend class gl::shader_interpreter;
+
 	std::string &m_shader;
 protected:
 	std::string getFloatTypeName(size_t elementCount) override;
diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h
index 2a2a4ff99b..30d8799b53 100644
--- a/rpcs3/Emu/system_config.h
+++ b/rpcs3/Emu/system_config.h
@@ -132,6 +132,7 @@ struct cfg_root : cfg::node
 		cfg::_bool enable_3d{ this, "Enable 3D", false };
 		cfg::_int<1, 8> consecutive_frames_to_draw{ this, "Consecutive Frames To Draw", 1, true};
 		cfg::_int<1, 8> consecutive_frames_to_skip{ this, "Consecutive Frames To Skip", 1, true};
+		cfg::_bool enable_shader_interpreter{ this, "Enable Shader Interpreter", true };
 		cfg::_int<50, 800> resolution_scale_percent{ this, "Resolution Scale", 100 };
 		cfg::_int<0, 16> anisotropic_level_override{ this, "Anisotropic Filter Override", 0, true };
 		cfg::_int<1, 1024> min_scalable_dimension{ this, "Minimum Scalable Dimension", 16 };
diff --git a/rpcs3/Emu/system_config_types.cpp b/rpcs3/Emu/system_config_types.cpp
index cbafec747a..bb9a7387ee 100644
--- a/rpcs3/Emu/system_config_types.cpp
+++ b/rpcs3/Emu/system_config_types.cpp
@@ -387,3 +387,19 @@ void fmt_class_string::format(std::string& out, u64 arg)
 		return unknown;
 	});
 }
+
+template <>
+void fmt_class_string<shader_interpreter_mode>::format(std::string& out, u64 arg)
+{
+	format_enum(out, arg, [](shader_interpreter_mode value)
+	{
+		switch (value)
+		{
+		case shader_interpreter_mode::disabled: return "Disabled";
+		case shader_interpreter_mode::enabled: return "Enabled";
+		case shader_interpreter_mode::forced: return "Forced";
+		}
+
+		return unknown;
+	});
+}
diff --git a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj
index 00f5f87308..a0c2e9e06c 100644
--- a/rpcs3/emucore.vcxproj
+++ b/rpcs3/emucore.vcxproj
@@ -436,6 +436,7 @@
+
@@ -720,6 +721,10 @@
 {fdc361c5-7734-493b-8cfb-037308b35122}
+
+
+
+
diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters index bf1d90f583..df4d2aa8d4 100644 --- a/rpcs3/emucore.vcxproj.filters +++ b/rpcs3/emucore.vcxproj.filters @@ -75,6 +75,9 @@ {652ce43e-72db-42cd-831a-0e194f67e731} + + {bc97b324-1eea-445a-8fa9-6fc49e3df47c} + @@ -868,7 +871,10 @@ Utilities - + + + Utilities + Utilities @@ -1758,7 +1764,7 @@ Emu\GPU\RSX\Overlays - + Utilities @@ -1782,6 +1788,17 @@ Utilities + + + Emu\GPU\RSX\Common + + + Emu\GPU\RSX\Common\Interpreter + + + Emu\GPU\RSX\Common\Interpreter + + \ No newline at end of file