From 63235b3f304b3180cb8d142c34939dc6237e5886 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Mon, 19 Jun 2023 23:07:17 +0300 Subject: [PATCH 01/25] rsx: Add the glsl files - Generated from inline strings in GLSLCommon.cpp --- .../GLSLSnippets/RSXProg/RSXDefines2.glsl | 10 ++ .../RSXProg/RSXFragmentPrologue.glsl | 42 ++++++++ .../RSXFragmentTextureDepthConversion.glsl | 69 +++++++++++++ .../RSXProg/RSXFragmentTextureMSAAOps.glsl | 18 ++++ .../RSXFragmentTextureMSAAOpsInternal.glsl | 94 ++++++++++++++++++ .../RSXProg/RSXFragmentTextureOps.glsl | 98 +++++++++++++++++++ .../RSXProg/RSXProgramCommon.glsl | 27 +++++ .../RSXProg/RSXVertexPrologue.glsl | 58 +++++++++++ rpcs3/emucore.vcxproj | 8 ++ rpcs3/emucore.vcxproj.filters | 27 +++++ 10 files changed, 451 insertions(+) create mode 100644 rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXDefines2.glsl create mode 100644 rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentPrologue.glsl create mode 100644 rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureDepthConversion.glsl create mode 100644 rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureMSAAOps.glsl create mode 100644 rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureMSAAOpsInternal.glsl create mode 100644 rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureOps.glsl create mode 100644 rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXProgramCommon.glsl create mode 100644 rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexPrologue.glsl diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXDefines2.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXDefines2.glsl new file mode 100644 index 000000000000..eaf21d252c22 --- /dev/null +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXDefines2.glsl @@ -0,0 +1,10 @@ +R"( +// Small structures that should be defined before any backend logic +struct sampler_info +{ + vec4 scale_bias; + uint remap; + uint flags; +}; + +)" diff --git 
a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentPrologue.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentPrologue.glsl new file mode 100644 index 000000000000..b3c98e087034 --- /dev/null +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentPrologue.glsl @@ -0,0 +1,44 @@ +R"( + +#ifdef _32_BIT_OUTPUT +// Default. Used when we're not utilizing native fp16 +#define round_to_8bit(v4) (floor(fma(v4, vec4(255.), vec4(0.5))) / vec4(255.)) +#else +// FP16 version +#define round_to_8bit(v4) (floor(fma(v4, f16vec4(255.), f16vec4(0.5))) / f16vec4(255.)) +#endif + +// Workaround for broken early discard in some drivers +#ifdef _DISABLE_EARLY_DISCARD +bool _fragment_discard = false; +#define _kill() _fragment_discard = true +#else +#define _kill() discard +#endif + +#ifdef _ENABLE_WPOS +vec4 get_wpos() +{ + float abs_scale = abs(wpos_scale); + return (gl_FragCoord * vec4(abs_scale, wpos_scale, 1., 1.)) + vec4(0., wpos_bias, 0., 0.); +} +#endif + +// Required by all fragment shaders for alpha test +bool comparison_passes(const in float a, const in float b, const in uint func) +{ + switch (func) + { + default: + case 0: return false; //never + case 1: return (CMP_FIXUP(a) < CMP_FIXUP(b)); //less + case 2: return (CMP_FIXUP(a) == CMP_FIXUP(b)); //equal + case 3: return (CMP_FIXUP(a) <= CMP_FIXUP(b)); //lequal + case 4: return (CMP_FIXUP(a) > CMP_FIXUP(b)); //greater + case 5: return (CMP_FIXUP(a) != CMP_FIXUP(b)); //nequal + case 6: return (CMP_FIXUP(a) >= CMP_FIXUP(b)); //gequal + case 7: return true; //always + } +} + +)" diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureDepthConversion.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureDepthConversion.glsl new file mode 100644 index 000000000000..68cea535ce05 --- /dev/null +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureDepthConversion.glsl @@ -0,0 +1,69 @@ +R"( +#define ZS_READ(index, coord) vec2(texture(TEX_NAME(index), coord).r, float(texture(TEX_NAME_STENCIL(index), coord).x)) +#define TEX1D_Z24X8_RGBA8(index, coord1) 
process_texel(convert_z24x8_to_rgba8(ZS_READ(index, COORD_SCALE1(index, coord1)), texture_parameters[index].remap, TEX_FLAGS(index)), TEX_FLAGS(index)) +#define TEX2D_Z24X8_RGBA8(index, coord2) process_texel(convert_z24x8_to_rgba8(ZS_READ(index, COORD_SCALE2(index, coord2)), texture_parameters[index].remap, TEX_FLAGS(index)), TEX_FLAGS(index)) +#define TEX3D_Z24X8_RGBA8(index, coord3) process_texel(convert_z24x8_to_rgba8(ZS_READ(index, COORD_SCALE3(index, coord3)), texture_parameters[index].remap, TEX_FLAGS(index)), TEX_FLAGS(index)) + +// NOTE: Memory layout is fetched as byteswapped BGRA [GBAR] (GOW collection, DS2, DeS) +// The A component (Z) is useless (should contain stencil8 or just 1) +vec4 decode_depth24(const in float depth_value, const in bool depth_float) +{ + uint value; + if (!depth_float) + { + value = uint(depth_value * 16777215.); + } + else + { + value = _get_bits(floatBitsToUint(depth_value), 7, 24); + } + + uint b = _get_bits(value, 0, 8); + uint g = _get_bits(value, 8, 8); + uint r = _get_bits(value, 16, 8); + const vec4 color = vec4(float(g), float(b) , 1., float(r)); + const vec4 scale = vec4(255., 255., 1., 255.); + return color / scale; +} + +vec4 remap_vector(const in vec4 color, const in uint remap) +{ + vec4 result; + if (_get_bits(remap, 0, 8) == 0xE4) + { + result = color; + } + else + { + uvec4 remap_channel = uvec4(remap) >> uvec4(2, 4, 6, 0); + remap_channel &= 3; + remap_channel = (remap_channel + 3) % 4; // Map A-R-G-B to R-G-B-A + + // Generate remapped result + result.a = color[remap_channel.a]; + result.r = color[remap_channel.r]; + result.g = color[remap_channel.g]; + result.b = color[remap_channel.b]; + } + + if (_get_bits(remap, 8, 8) == 0xAA) + return result; + + uvec4 remap_select = uvec4(remap) >> uvec4(10, 12, 14, 8); + remap_select &= 3; + bvec4 choice = lessThan(remap_select, uvec4(2)); + return _select(result, vec4(remap_select), choice); +} + +vec4 convert_z24x8_to_rgba8(const in vec2 depth_stencil, const in uint 
remap, const in uint flags) +{ + vec4 result = decode_depth24(depth_stencil.x, _test_bit(flags, DEPTH_FLOAT)); + result.z = depth_stencil.y / 255.; + + if (remap == 0xAAE4) + return result; + + return remap_vector(result, remap); +} + +)" diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureMSAAOps.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureMSAAOps.glsl new file mode 100644 index 000000000000..786605a3c077 --- /dev/null +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureMSAAOps.glsl @@ -0,0 +1,18 @@ +R"( +#define ZCOMPARE_FUNC(index) _get_bits(TEX_FLAGS(index), DEPTH_COMPARE, 3) +#define ZS_READ_MS(index, coord) vec2(sampleTexture2DMS(TEX_NAME(index), coord, index).r, float(sampleTexture2DMS(TEX_NAME_STENCIL(index), coord, index).x)) +#define TEX2D_MS(index, coord2) process_texel(sampleTexture2DMS(TEX_NAME(index), coord2, index), TEX_FLAGS(index)) +#define TEX2D_SHADOW_MS(index, coord3) vec4(comparison_passes(sampleTexture2DMS(TEX_NAME(index), coord3.xy, index).x, coord3.z, ZCOMPARE_FUNC(index))) +#define TEX2D_SHADOWPROJ_MS(index, coord4) TEX2D_SHADOW_MS(index, (coord4.xyz / coord4.w)) +#define TEX2D_Z24X8_RGBA8_MS(index, coord2) process_texel(convert_z24x8_to_rgba8(ZS_READ_MS(index, coord2), texture_parameters[index].remap, TEX_FLAGS(index)), TEX_FLAGS(index)) + +vec3 compute2x2DownsampleWeights(const in float coord, const in float uv_step, const in float actual_step) +{ + const float next_sample_point = coord + actual_step; + const float next_coord_step = fma(floor(coord / uv_step), uv_step, uv_step); + const float next_coord_step_plus_one = next_coord_step + uv_step; + vec3 weights = vec3(next_coord_step, min(next_coord_step_plus_one, next_sample_point), max(next_coord_step_plus_one, next_sample_point)) - vec3(coord, next_coord_step, next_coord_step_plus_one); + return weights / actual_step; +} + +)" diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureMSAAOpsInternal.glsl 
b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureMSAAOpsInternal.glsl new file mode 100644 index 000000000000..7f9370ba4997 --- /dev/null +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureMSAAOpsInternal.glsl @@ -0,0 +1,94 @@ +R"( +vec4 texelFetch2DMS(in _MSAA_SAMPLER_TYPE_ tex, const in vec2 sample_count, const in ivec2 icoords, const in int index, const in ivec2 offset) +{ + const vec2 resolve_coords = vec2(icoords + offset); + const vec2 aa_coords = floor(resolve_coords / sample_count); // AA coords = real_coords / sample_count + const vec2 sample_loc = fma(aa_coords, -sample_count, resolve_coords); // Sample ID = real_coords % sample_count + const float sample_index = fma(sample_loc.y, sample_count.y, sample_loc.x); + return texelFetch(tex, ivec2(aa_coords), int(sample_index)); +} + +vec4 sampleTexture2DMS(in _MSAA_SAMPLER_TYPE_ tex, const in vec2 coords, const in int index) +{ + const uint flags = TEX_FLAGS(index); + const vec2 normalized_coords = COORD_SCALE2(index, coords); + const vec2 sample_count = vec2(2., textureSamples(tex) * 0.5); + const vec2 image_size = textureSize(tex) * sample_count; + const ivec2 icoords = ivec2(normalized_coords * image_size); + const vec4 sample0 = texelFetch2DMS(tex, sample_count, icoords, index, ivec2(0)); + + if (_get_bits(flags, FILTERED_MAG_BIT, 2) == 0) + { + return sample0; + } + + // Bilinear scaling, with upto 2x2 downscaling with simple weights + const vec2 uv_step = 1.0 / vec2(image_size); + const vec2 actual_step = vec2(dFdx(normalized_coords.x), dFdy(normalized_coords.y)); + + const bvec2 no_filter = lessThan(abs(uv_step - actual_step), vec2(0.000001)); + if (no_filter.x && no_filter.y) + { + return sample0; + } + + vec4 a, b; + float factor; + const vec4 sample2 = texelFetch2DMS(tex, sample_count, icoords, index, ivec2(0, 1)); // Top left + + if (no_filter.x) + { + // No scaling, 1:1 + a = sample0; + b = sample2; + } + else + { + // Filter required, sample more data + const vec4 
sample1 = texelFetch2DMS(tex, sample_count, icoords, index, ivec2(1, 0)); // Bottom right + const vec4 sample3 = texelFetch2DMS(tex, sample_count, icoords, index, ivec2(1, 1)); // Top right + + if (actual_step.x > uv_step.x) + { + // Downscale in X, centered + const vec3 weights = compute2x2DownsampleWeights(normalized_coords.x, uv_step.x, actual_step.x); + + const vec4 sample4 = texelFetch2DMS(tex, sample_count, icoords, index, ivec2(2, 0)); // Further bottom right + a = fma(sample0, weights.xxxx, sample1 * weights.y) + (sample4 * weights.z); // Weighted sum + + if (!no_filter.y) + { + const vec4 sample5 = texelFetch2DMS(tex, sample_count, icoords, index, ivec2(2, 1)); // Further top right + b = fma(sample2, weights.xxxx, sample3 * weights.y) + (sample5 * weights.z); // Weighted sum + } + } + else if (actual_step.x < uv_step.x) + { + // Upscale in X + factor = fract(normalized_coords.x * image_size.x); + a = mix(sample0, sample1, factor); + b = mix(sample2, sample3, factor); + } + } + + if (no_filter.y) + { + // 1:1 no scale + return a; + } + else if (actual_step.y > uv_step.y) + { + // Downscale in Y + const vec3 weights = compute2x2DownsampleWeights(normalized_coords.y, uv_step.y, actual_step.y); + // We only have 2 rows computed for performance reasons, so combine rows 1 and 2 + return a * weights.x + b * (weights.y + weights.z); + } + else if (actual_step.y < uv_step.y) + { + // Upscale in Y + factor = fract(normalized_coords.y * image_size.y); + return mix(a, b, factor); + } +} + +)" diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureOps.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureOps.glsl new file mode 100644 index 000000000000..df897eeef42c --- /dev/null +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureOps.glsl @@ -0,0 +1,98 @@ +R"( +#ifdef _ENABLE_TEXTURE_EXPAND + uint _texture_flag_override = 0; + #define _enable_texture_expand() _texture_flag_override = SIGN_EXPAND_MASK + #define 
_disable_texture_expand() _texture_flag_override = 0 + #define TEX_FLAGS(index) (texture_parameters[index].flags | _texture_flag_override) +#else + #define TEX_FLAGS(index) texture_parameters[index].flags +#endif + +#define TEX_NAME(index) tex##index +#define TEX_NAME_STENCIL(index) tex##index##_stencil + +#define COORD_SCALE1(index, coord1) ((coord1 + texture_parameters[index].scale_bias.w) * texture_parameters[index].scale_bias.x) +#define COORD_SCALE2(index, coord2) ((coord2 + texture_parameters[index].scale_bias.w) * texture_parameters[index].scale_bias.xy) +#define COORD_SCALE3(index, coord3) ((coord3 + texture_parameters[index].scale_bias.w) * texture_parameters[index].scale_bias.xyz) + +#define TEX1D(index, coord1) process_texel(texture(TEX_NAME(index), COORD_SCALE1(index, coord1)), TEX_FLAGS(index)) +#define TEX1D_BIAS(index, coord1, bias) process_texel(texture(TEX_NAME(index), COORD_SCALE1(index, coord1), bias), TEX_FLAGS(index)) +#define TEX1D_LOD(index, coord1, lod) process_texel(textureLod(TEX_NAME(index), COORD_SCALE1(index, coord1), lod), TEX_FLAGS(index)) +#define TEX1D_GRAD(index, coord1, dpdx, dpdy) process_texel(textureGrad(TEX_NAME(index), COORD_SCALE1(index, coord1), dpdx, dpdy), TEX_FLAGS(index)) +#define TEX1D_PROJ(index, coord4) process_texel(textureProj(TEX_NAME(index), vec2(COORD_SCALE1(index, coord4.x), coord4.w)), TEX_FLAGS(index)) + +#define TEX2D(index, coord2) process_texel(texture(TEX_NAME(index), COORD_SCALE2(index, coord2)), TEX_FLAGS(index)) +#define TEX2D_BIAS(index, coord2, bias) process_texel(texture(TEX_NAME(index), COORD_SCALE2(index, coord2), bias), TEX_FLAGS(index)) +#define TEX2D_LOD(index, coord2, lod) process_texel(textureLod(TEX_NAME(index), COORD_SCALE2(index, coord2), lod), TEX_FLAGS(index)) +#define TEX2D_GRAD(index, coord2, dpdx, dpdy) process_texel(textureGrad(TEX_NAME(index), COORD_SCALE2(index, coord2), dpdx, dpdy), TEX_FLAGS(index)) +#define TEX2D_PROJ(index, coord4) process_texel(textureProj(TEX_NAME(index), 
vec4(COORD_SCALE2(index, coord4.xy), coord4.z, coord4.w)), TEX_FLAGS(index)) + +#ifdef _EMULATED_TEXSHADOW + #define SHADOW_COORD(index, coord3) vec3(COORD_SCALE2(index, coord3.xy), _test_bit(TEX_FLAGS(index), DEPTH_FLOAT)? coord3.z : min(float(coord3.z), 1.0)) + #define SHADOW_COORD4(index, coord4) vec4(SHADOW_COORD(index, coord4.xyz), coord4.w) + #define SHADOW_COORD_PROJ(index, coord4) vec4(COORD_SCALE2(index, coord4.xy), _test_bit(TEX_FLAGS(index), DEPTH_FLOAT)? coord4.z : min(coord4.z, coord4.w), coord4.w) + + #define TEX2D_SHADOW(index, coord3) texture(TEX_NAME(index), SHADOW_COORD(index, coord3)) + #define TEX3D_SHADOW(index, coord4) texture(TEX_NAME(index), SHADOW_COORD4(index, coord4)) + #define TEX2D_SHADOWPROJ(index, coord4) textureProj(TEX_NAME(index), SHADOW_COORD_PROJ(index, coord4)) +#else + #define TEX2D_SHADOW(index, coord3) texture(TEX_NAME(index), vec3(COORD_SCALE2(index, coord3.xy), coord3.z)) + #define TEX3D_SHADOW(index, coord4) texture(TEX_NAME(index), vec4(COORD_SCALE3(index, coord4.xyz), coord4.w)) + #define TEX2D_SHADOWPROJ(index, coord4) textureProj(TEX_NAME(index), vec4(COORD_SCALE2(index, coord4.xy), coord4.zw)) +#endif + +#define TEX3D(index, coord3) process_texel(texture(TEX_NAME(index), COORD_SCALE3(index, coord3)), TEX_FLAGS(index)) +#define TEX3D_BIAS(index, coord3, bias) process_texel(texture(TEX_NAME(index), COORD_SCALE3(index, coord3), bias), TEX_FLAGS(index)) +#define TEX3D_LOD(index, coord3, lod) process_texel(textureLod(TEX_NAME(index), COORD_SCALE3(index, coord3), lod), TEX_FLAGS(index)) +#define TEX3D_GRAD(index, coord3, dpdx, dpdy) process_texel(textureGrad(TEX_NAME(index), COORD_SCALE3(index, coord3), dpdx, dpdy), TEX_FLAGS(index)) +#define TEX3D_PROJ(index, coord4) process_texel(texture(TEX_NAME(index), COORD_SCALE3(index, coord4.xyz) / coord4.w), TEX_FLAGS(index)) + +vec4 process_texel(in vec4 rgba, const in uint control_bits) +{ + if (control_bits == 0) + { + return rgba; + } + + if (_test_bit(control_bits, ALPHAKILL)) 
+ { + // Alphakill + if (rgba.a < 0.000001) + { + _kill(); + return rgba; + } + } + + if (_test_bit(control_bits, RENORMALIZE)) + { + // Renormalize to 8-bit (PS3) accuracy + rgba = floor(rgba * 255.); + rgba /= 255.; + } + + uvec4 mask; + vec4 convert; + uint op_mask = control_bits & uint(SIGN_EXPAND_MASK); + + if (op_mask != 0) + { + // Expand to signed normalized + mask = uvec4(op_mask) & uvec4(EXPAND_R_MASK, EXPAND_G_MASK, EXPAND_B_MASK, EXPAND_A_MASK); + convert = (rgba * 2.f - 1.f); + rgba = _select(rgba, convert, notEqual(mask, uvec4(0))); + } + + op_mask = control_bits & uint(GAMMA_CTRL_MASK); + if (op_mask != 0u) + { + // Gamma correction + mask = uvec4(op_mask) & uvec4(GAMMA_R_MASK, GAMMA_G_MASK, GAMMA_B_MASK, GAMMA_A_MASK); + convert = srgb_to_linear(rgba); + return _select(rgba, convert, notEqual(mask, uvec4(0))); + } + + return rgba; +} + +)" diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXProgramCommon.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXProgramCommon.glsl new file mode 100644 index 000000000000..887212f1d2cf --- /dev/null +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXProgramCommon.glsl @@ -0,0 +1,27 @@ +R"( +#define _select mix +#define _saturate(x) clamp(x, 0., 1.) +#define _get_bits(x, off, count) bitfieldExtract(x, off, count) +#define _set_bits(x, y, off, count) bitfieldInsert(x, y, off, count) +#define _test_bit(x, y) (_get_bits(x, y, 1) != 0) +#define _rand(seed) fract(sin(dot(seed.xy, vec2(12.9898f, 78.233f))) * 43758.5453f) + +#ifdef _GPU_LOW_PRECISION_COMPARE +#define CMP_FIXUP(a) (sign(a) * 16. 
+ a) +#else +#define CMP_FIXUP(a) (a) +#endif + +#ifdef _ENABLE_LIT_EMULATION +vec4 lit_legacy(const in vec4 val) +{ + vec4 clamped_val = vec4(max(val.xy, vec2(0.)), val.zw); + return vec4( + 1., + clamped_val.x, + exp2(clamped_val.w * log2(max(clamped_val.y, 0.0000000001))) * sign(clamped_val.x), + 1.); +} +#endif + +)" \ No newline at end of file diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexPrologue.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexPrologue.glsl new file mode 100644 index 000000000000..23e272625fc0 --- /dev/null +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexPrologue.glsl @@ -0,0 +1,58 @@ +R"( +#ifdef _FORCE_POSITION_INVARIANCE +invariant gl_Position; +#endif + +#ifdef _EMULATE_ZCLIP_XFORM_STANDARD +// Technically the depth value here is the 'final' depth that should be stored in the Z buffer. +// Forward mapping eqn is d' = d * (f - n) + n, where d' is the stored Z value (this) and d is the normalized API value. +vec4 apply_zclip_xform( + const in vec4 pos, + const in float near_plane, + const in float far_plane) +{ + if (pos.w != 0.0) + { + const float real_n = min(far_plane, near_plane); + const float real_f = max(far_plane, near_plane); + const double depth_range = double(real_f - real_n); + const double inv_range = (depth_range > 0.000001) ? 
(1.0 / (depth_range * pos.w)) : 0.0; + const double actual_d = (double(pos.z) - double(real_n * pos.w)) * inv_range; + const double nearest_d = floor(actual_d + 0.5); + const double epsilon = (inv_range * pos.w) / 16777215.; // Epsilon value is the minimum discernable change in Z that should affect the stored Z + const double d = _select(actual_d, nearest_d, abs(actual_d - nearest_d) < epsilon); + return vec4(pos.xy, float(d * pos.w), pos.w); + } + else + { + return pos; // Only values where Z=0 can ever pass this clip + } +} +#elif defined(_EMULATE_ZCLIP_XFORM_FALLBACK) +vec4 apply_zclip_xform( + const in vec4 pos, + const in float near_plane, + const in float far_plane) +{ + float d = float(pos.z / pos.w); + if (d < 0.f && d >= near_plane) + { + // Clamp + d = 0.f; + } + else if (d > 1.f && d <= far_plane) + { + // Compress Z and store towards highest end of the range + d = min(1., 0.99 + (0.01 * (pos.z - near_plane) / (far_plane - near_plane))); + } + else // This catch-all also handles w=0 since d=inf + { + return pos; + } + + return vec4(pos.x, pos.y, d * pos.w, pos.w); +} +#endif + + +)" diff --git a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj index 7298809bc156..8d3d6c04d16b 100644 --- a/rpcs3/emucore.vcxproj +++ b/rpcs3/emucore.vcxproj @@ -904,6 +904,14 @@ + + + + + + + + diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters index a3505ea837f7..d219df8c1839 100644 --- a/rpcs3/emucore.vcxproj.filters +++ b/rpcs3/emucore.vcxproj.filters @@ -85,6 +85,9 @@ {017e5a5d-b190-4032-baed-57f8020861a5} + + {f990b0be-adbd-4a1c-b8c7-d6f963d5b629} + @@ -2403,5 +2406,29 @@ Emu\GPU\RSX\Program\Snippets + + Emu\GPU\RSX\Program\Snippets\RSXProg + + + Emu\GPU\RSX\Program\Snippets\RSXProg + + + Emu\GPU\RSX\Program\Snippets\RSXProg + + + Emu\GPU\RSX\Program\Snippets\RSXProg + + + Emu\GPU\RSX\Program\Snippets\RSXProg + + + Emu\GPU\RSX\Program\Snippets\RSXProg + + + Emu\GPU\RSX\Program\Snippets\RSXProg + + + Emu\GPU\RSX\Program\Snippets\RSXProg + \ No 
newline at end of file From 4b2d99d1d562b0149f350537abaab3f5dfd7d213 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Tue, 20 Jun 2023 00:20:51 +0300 Subject: [PATCH 02/25] rsx: Switch common codegen to use the glsl scripts --- rpcs3/Emu/RSX/GL/GLFragmentProgram.cpp | 4 +- rpcs3/Emu/RSX/Program/GLSLCommon.cpp | 645 ++++-------------- rpcs3/Emu/RSX/Program/GLSLCommon.h | 7 - .../RSXProg/RSXFragmentPrologue.glsl | 73 +- .../RSXProg/RSXProgramCommon.glsl | 2 +- .../RSXProg/RSXVertexPrologue.glsl | 2 +- rpcs3/Emu/RSX/Program/GLSLTypes.h | 1 + rpcs3/Emu/RSX/VK/VKFragmentProgram.cpp | 4 +- 8 files changed, 193 insertions(+), 545 deletions(-) diff --git a/rpcs3/Emu/RSX/GL/GLFragmentProgram.cpp b/rpcs3/Emu/RSX/GL/GLFragmentProgram.cpp index a0e610bae55b..23c4f9095499 100644 --- a/rpcs3/Emu/RSX/GL/GLFragmentProgram.cpp +++ b/rpcs3/Emu/RSX/GL/GLFragmentProgram.cpp @@ -191,6 +191,7 @@ void GLFragmentDecompilerThread::insertGlobalFunctions(std::stringstream &OS) m_shader_props.require_texture_expand = properties.has_exp_tex_op; m_shader_props.require_srgb_to_linear = properties.has_upg; m_shader_props.require_linear_to_srgb = properties.has_pkg; + m_shader_props.require_fog_read = properties.in_register_mask & in_fogc; m_shader_props.emulate_coverage_tests = true; // g_cfg.video.antialiasing_level == msaa_level::none; m_shader_props.emulate_shadow_compare = device_props.emulate_depth_compare; m_shader_props.low_precision_tests = ::gl::get_driver_caps().vendor_NVIDIA && !(m_prog.ctrl & RSX_SHADER_CONTROL_ATTRIBUTE_INTERPOLATION); @@ -203,9 +204,6 @@ void GLFragmentDecompilerThread::insertGlobalFunctions(std::stringstream &OS) void GLFragmentDecompilerThread::insertMainStart(std::stringstream & OS) { - if (properties.in_register_mask & in_fogc) - program_common::insert_fog_declaration(OS); - std::set output_registers; if (m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS) { diff --git a/rpcs3/Emu/RSX/Program/GLSLCommon.cpp b/rpcs3/Emu/RSX/Program/GLSLCommon.cpp index 
180ea9d74b57..6a511b56ca2e 100644 --- a/rpcs3/Emu/RSX/Program/GLSLCommon.cpp +++ b/rpcs3/Emu/RSX/Program/GLSLCommon.cpp @@ -20,104 +20,14 @@ namespace program_common OS << "\n"; } - void insert_compare_op(std::ostream& OS) + void define_glsl_switches(std::ostream& OS, std::vector& enums) { - OS << - "bool comparison_passes(const in float a, const in float b, const in uint func)\n" - "{\n" - " switch (func)\n" - " {\n" - " default:\n" - " case 0: return false; //never\n" - " case 1: return (CMP_FIXUP(a) < CMP_FIXUP(b)); //less\n" - " case 2: return (CMP_FIXUP(a) == CMP_FIXUP(b)); //equal\n" - " case 3: return (CMP_FIXUP(a) <= CMP_FIXUP(b)); //lequal\n" - " case 4: return (CMP_FIXUP(a) > CMP_FIXUP(b)); //greater\n" - " case 5: return (CMP_FIXUP(a) != CMP_FIXUP(b)); //nequal\n" - " case 6: return (CMP_FIXUP(a) >= CMP_FIXUP(b)); //gequal\n" - " case 7: return true; //always\n" - " }\n" - "}\n\n"; - } - - void insert_compare_op_vector(std::ostream& OS) - { - OS << - "bvec4 comparison_passes(const in vec4 a, const in vec4 b, const in uint func)\n" - "{\n" - " switch (func)\n" - " {\n" - " default:\n" - " case 0: return bvec4(false); //never\n" - " case 1: return lessThan(CMP_FIXUP(a), CMP_FIXUP(b)); //less\n" - " case 2: return equal(CMP_FIXUP(a), CMP_FIXUP(b)); //equal\n" - " case 3: return lessThanEqual(CMP_FIXUP(a), CMP_FIXUP(b)); //lequal\n" - " case 4: return greaterThan(CMP_FIXUP(a), CMP_FIXUP(b)); //greater\n" - " case 5: return notEqual(CMP_FIXUP(a), CMP_FIXUP(b)); //nequal\n" - " case 6: return greaterThanEqual(CMP_FIXUP(a), CMP_FIXUP(b)); //gequal\n" - " case 7: return bvec4(true); //always\n" - " }\n" - "}\n\n"; - } - - void insert_fog_declaration(std::ostream& OS, std::string_view wide_vector_type, std::string_view input_coord) - { - define_glsl_constants(OS, - { - { "FOG_LINEAR", rsx::fog_mode::linear }, - { "FOG_EXP", rsx::fog_mode::exponential }, - { "FOG_EXP2", rsx::fog_mode::exponential2 }, - { "FOG_LINEAR_ABS", rsx::fog_mode::linear_abs }, - { 
"FOG_EXP_ABS", rsx::fog_mode::exponential_abs }, - { "FOG_EXP2_ABS", rsx::fog_mode::exponential2_abs } - }); - - std::string template_body = "$T fetch_fog_value(const in uint mode)\n"; - - template_body += - "{\n" - " $T result = $T($I.x, 0., 0., 0.);\n" - " switch(mode)\n" - " {\n" - " default:\n" - " return result;\n" - " case FOG_LINEAR:\n" - " //linear\n" - " result.y = fog_param1 * $I.x + (fog_param0 - 1.);\n" - " break;\n" - " case FOG_EXP:\n" - " //exponential\n" - " result.y = exp(11.084 * (fog_param1 * $I.x + fog_param0 - 1.5));\n" - " break;\n" - " case FOG_EXP2:\n" - " //exponential2\n" - " result.y = exp(-pow(4.709 * (fog_param1 * $I.x + fog_param0 - 1.5), 2.));\n" - " break;\n" - " case FOG_EXP_ABS:\n" - " //exponential_abs\n" - " result.y = exp(11.084 * (fog_param1 * abs($I.x) + fog_param0 - 1.5));\n" - " break;\n" - " case FOG_EXP2_ABS:\n" - " //exponential2_abs\n" - " result.y = exp(-pow(4.709 * (fog_param1 * abs($I.x) + fog_param0 - 1.5), 2.));\n" - " break;\n" - " case FOG_LINEAR_ABS:\n" - " //linear_abs\n" - " result.y = fog_param1 * abs($I.x) + (fog_param0 - 1.);\n" - " break;\n" - " }\n" - "\n" - " result.y = clamp(result.y, 0., 1.);\n" - " return result;\n" - "}\n\n"; - - std::pair replacements[] = + for (const auto& e : enums) { - std::make_pair("$T", std::string(wide_vector_type)), - std::make_pair("$I", std::string(input_coord)) - }; + OS << "#define " << e << "\n"; + } - OS << fmt::replace_all(template_body, replacements); + OS << "\n"; } } @@ -522,6 +432,17 @@ namespace glsl void insert_glsl_legacy_function(std::ostream& OS, const shader_properties& props) { + std::vector enabled_options; + if (props.low_precision_tests) + { + enabled_options.push_back("_GPU_LOW_PRECISION_COMPARE"); + } + + if (props.require_lit_emulation) + { + enabled_options.push_back("_ENABLE_LIT_EMULATION"); + } + OS << "#define _select mix\n"; OS << "#define _saturate(x) clamp(x, 0., 1.)\n"; OS << "#define _get_bits(x, off, count) bitfieldExtract(x, off, count)\n"; 
@@ -529,508 +450,182 @@ namespace glsl OS << "#define _test_bit(x, y) (_get_bits(x, y, 1) != 0)\n"; OS << "#define _rand(seed) fract(sin(dot(seed.xy, vec2(12.9898f, 78.233f))) * 43758.5453f)\n\n"; - if (props.low_precision_tests) - { - OS << "#define CMP_FIXUP(a) (sign(a) * 16. + a)\n\n"; - } - else - { - OS << "#define CMP_FIXUP(a) (a)\n\n"; - } if (props.domain == glsl::program_domain::glsl_fragment_program) { OS << "// ROP control\n"; - OS << "#define ALPHA_TEST_ENABLE_BIT " << rsx::ROP_control_bits::ALPHA_TEST_ENABLE_BIT << "\n"; - OS << "#define SRGB_FRAMEBUFFER_BIT " << rsx::ROP_control_bits::SRGB_FRAMEBUFFER_BIT << "\n"; - OS << "#define ALPHA_TO_COVERAGE_ENABLE_BIT " << rsx::ROP_control_bits::ALPHA_TO_COVERAGE_ENABLE_BIT << "\n"; - OS << "#define MSAA_WRITE_ENABLE_BIT " << rsx::ROP_control_bits::MSAA_WRITE_ENABLE_BIT << "\n"; - OS << "#define INT_FRAMEBUFFER_BIT " << rsx::ROP_control_bits::INT_FRAMEBUFFER_BIT << "\n"; - OS << "#define POLYGON_STIPPLE_ENABLE_BIT " << rsx::ROP_control_bits::POLYGON_STIPPLE_ENABLE_BIT << "\n"; - OS << "#define ALPHA_TEST_FUNC_OFFSET " << rsx::ROP_control_bits::ALPHA_FUNC_OFFSET << "\n"; - OS << "#define ALPHA_TEST_FUNC_LENGTH " << rsx::ROP_control_bits::ALPHA_FUNC_NUM_BITS << "\n"; - OS << "#define MSAA_SAMPLE_CTRL_OFFSET " << rsx::ROP_control_bits::MSAA_SAMPLE_CTRL_OFFSET << "\n"; - OS << "#define MSAA_SAMPLE_CTRL_LENGTH " << rsx::ROP_control_bits::MSAA_SAMPLE_CTRL_NUM_BITS << "\n"; - OS << "#define ROP_CMD_MASK " << rsx::ROP_control_bits::ROP_CMD_MASK << "\n\n"; - - // 8-bit rounding/quantization + program_common::define_glsl_constants(OS, { - const auto _16bit_outputs = (!props.fp32_outputs && props.supports_native_fp16); - const auto _255 = _16bit_outputs ? "f16vec4(255.)" : "vec4(255.)"; - const auto _1_over_2 = _16bit_outputs ? 
"f16vec4(0.5)" : "vec4(0.5)"; - OS << "#define round_to_8bit(v4) (floor(fma(v4, " << _255 << ", " << _1_over_2 << ")) / " << _255 << ")\n\n"; - } - - OS << "// Workaround for broken early discard in some drivers\n"; - if (props.disable_early_discard) - { - OS << "bool _fragment_discard = false;\n"; - OS << "#define _kill() _fragment_discard = true\n\n"; - } - else + { "ALPHA_TEST_ENABLE_BIT ", rsx::ROP_control_bits::ALPHA_TEST_ENABLE_BIT }, + { "SRGB_FRAMEBUFFER_BIT ", rsx::ROP_control_bits::SRGB_FRAMEBUFFER_BIT }, + { "ALPHA_TO_COVERAGE_ENABLE_BIT ", rsx::ROP_control_bits::ALPHA_TO_COVERAGE_ENABLE_BIT }, + { "MSAA_WRITE_ENABLE_BIT ", rsx::ROP_control_bits::MSAA_WRITE_ENABLE_BIT }, + { "INT_FRAMEBUFFER_BIT ", rsx::ROP_control_bits::INT_FRAMEBUFFER_BIT }, + { "POLYGON_STIPPLE_ENABLE_BIT ", rsx::ROP_control_bits::POLYGON_STIPPLE_ENABLE_BIT }, + { "ALPHA_TEST_FUNC_OFFSET ", rsx::ROP_control_bits::ALPHA_FUNC_OFFSET }, + { "ALPHA_TEST_FUNC_LENGTH ", rsx::ROP_control_bits::ALPHA_FUNC_NUM_BITS }, + { "MSAA_SAMPLE_CTRL_OFFSET ", rsx::ROP_control_bits::MSAA_SAMPLE_CTRL_OFFSET }, + { "MSAA_SAMPLE_CTRL_LENGTH ", rsx::ROP_control_bits::MSAA_SAMPLE_CTRL_NUM_BITS }, + { "ROP_CMD_MASK ", rsx::ROP_control_bits::ROP_CMD_MASK } + }); + + if (props.fp32_outputs || !props.supports_native_fp16) { - OS << "#define _kill() discard\n\n"; + enabled_options.push_back("_32_BIT_OUTPUT"); } - if (props.require_texture_ops) + if (props.disable_early_discard) { - // Declare special texture control flags - OS << "#define GAMMA_R_MASK (1 << " << rsx::texture_control_bits::GAMMA_R << ")\n"; - OS << "#define GAMMA_G_MASK (1 << " << rsx::texture_control_bits::GAMMA_G << ")\n"; - OS << "#define GAMMA_B_MASK (1 << " << rsx::texture_control_bits::GAMMA_B << ")\n"; - OS << "#define GAMMA_A_MASK (1 << " << rsx::texture_control_bits::GAMMA_A << ")\n"; - OS << "#define EXPAND_R_MASK (1 << " << rsx::texture_control_bits::EXPAND_R << ")\n"; - OS << "#define EXPAND_G_MASK (1 << " << 
rsx::texture_control_bits::EXPAND_G << ")\n"; - OS << "#define EXPAND_B_MASK (1 << " << rsx::texture_control_bits::EXPAND_B << ")\n"; - OS << "#define EXPAND_A_MASK (1 << " << rsx::texture_control_bits::EXPAND_A << ")\n\n"; - - OS << "#define ALPHAKILL " << rsx::texture_control_bits::ALPHAKILL << "\n"; - OS << "#define RENORMALIZE " << rsx::texture_control_bits::RENORMALIZE << "\n"; - OS << "#define DEPTH_FLOAT " << rsx::texture_control_bits::DEPTH_FLOAT << "\n"; - OS << "#define DEPTH_COMPARE " << rsx::texture_control_bits::DEPTH_COMPARE_OP << "\n"; - OS << "#define FILTERED_MAG_BIT " << rsx::texture_control_bits::FILTERED_MAG << "\n"; - OS << "#define FILTERED_MIN_BIT " << rsx::texture_control_bits::FILTERED_MIN << "\n"; - OS << "#define INT_COORDS_BIT " << rsx::texture_control_bits::UNNORMALIZED_COORDS << "\n"; - OS << "#define GAMMA_CTRL_MASK (GAMMA_R_MASK|GAMMA_G_MASK|GAMMA_B_MASK|GAMMA_A_MASK)\n"; - OS << "#define SIGN_EXPAND_MASK (EXPAND_R_MASK|EXPAND_G_MASK|EXPAND_B_MASK|EXPAND_A_MASK)\n"; - OS << "#define FILTERED_MASK (FILTERED_MAG_BIT|FILTERED_MIN_BIT)\n\n"; + enabled_options.push_back("_DISABLE_EARLY_DISCARD"); } } - if (props.require_lit_emulation) - { - OS << - "vec4 lit_legacy(const in vec4 val)" - "{\n" - " vec4 clamped_val = val;\n" - " clamped_val.x = max(val.x, 0.);\n" - " clamped_val.y = max(val.y, 0.);\n" - " vec4 result;\n" - " result.x = 1.;\n" - " result.w = 1.;\n" - " result.y = clamped_val.x;\n" - " result.z = clamped_val.x > 0. ? 
exp(clamped_val.w * log(max(clamped_val.y, 0.0000000001))) : 0.;\n" - " return result;\n" - "}\n\n"; - } + // Import common header + program_common::define_glsl_switches(OS, enabled_options); + enabled_options.clear(); + + OS << + #include "GLSLSnippets/RSXProg/RSXProgramCommon.glsl" + ; if (props.domain == glsl::program_domain::glsl_vertex_program) { if (props.require_explicit_invariance) { - // PS3 has shader invariance, but we don't really care about most attributes outside ATTR0 - OS << "invariant gl_Position;\n\n"; + enabled_options.push_back("_FORCE_POSITION_INVARIANCE"); } if (props.emulate_zclip_transform) { if (props.emulate_depth_clip_only) { - // Technically the depth value here is the 'final' depth that should be stored in the Z buffer. - // Forward mapping eqn is d' = d * (f - n) + n, where d' is the stored Z value (this) and d is the normalized API value. - OS << - "vec4 apply_zclip_xform(const in vec4 pos, const in float near_plane, const in float far_plane)\n" - "{\n" - " if (pos.w != 0.0)\n" - " {\n" - " const float real_n = min(far_plane, near_plane);\n" - " const float real_f = max(far_plane, near_plane);\n" - " const double depth_range = double(real_f - real_n);\n" - " const double inv_range = (depth_range > 0.000001) ? 
(1.0 / (depth_range * pos.w)) : 0.0;\n" - " const double actual_d = (double(pos.z) - double(real_n * pos.w)) * inv_range;\n" - " const double nearest_d = floor(actual_d + 0.5);\n" - " const double epsilon = (inv_range * pos.w) / 16777215.;\n" // Epsilon value is the minimum discernable change in Z that should affect the stored Z - " const double d = _select(actual_d, nearest_d, abs(actual_d - nearest_d) < epsilon);\n" - " return vec4(pos.xy, float(d * pos.w), pos.w);\n" - " }\n" - " else\n" - " {\n" - " return pos;\n" // Only values where Z=0 can ever pass this clip - " }\n" - "}\n\n"; + enabled_options.push_back("_EMULATE_ZCLIP_XFORM_STANDARD"); } else { - OS << - "vec4 apply_zclip_xform(const in vec4 pos, const in float near_plane, const in float far_plane)\n" - "{\n" - " float d = float(pos.z / pos.w);\n" - " if (d < 0.f && d >= near_plane)\n" - " {\n" - " // Clamp\n" - " d = 0.f;\n" - " }\n" - " else if (d > 1.f && d <= far_plane)\n" - " {\n" - " // Compress Z and store towards highest end of the range\n" - " d = min(1., 0.99 + (0.01 * (pos.z - near_plane) / (far_plane - near_plane)));\n" - " }\n" - " else\n" // This catch-call also handles w=0 since d=inf - " {\n" - " return pos;\n" - " }\n" - "\n" - " return vec4(pos.x, pos.y, d * pos.w, pos.w);\n" - "}\n\n"; + enabled_options.push_back("_EMULATE_ZCLIP_XFORM_FALLBACK"); } } + // Import vertex header + program_common::define_glsl_switches(OS, enabled_options); + + OS << + #include "GLSLSnippets/RSXProg/RSXVertexPrologue.glsl" + ; + return; } - program_common::insert_compare_op(OS); - if (props.emulate_coverage_tests) { - // Purely stochastic - OS << - "bool coverage_test_passes(const in vec4 _sample)\n" - "{\n" - " float random = _rand(gl_FragCoord);\n" - " return (_sample.a > random);\n" - "}\n\n"; + enabled_options.push_back("_EMULATE_COVERAGE_TEST"); } if (!props.fp32_outputs || props.require_linear_to_srgb) { - OS << - "vec4 linear_to_srgb(const in vec4 cl)\n" - "{\n" - " vec4 low = cl * 12.92;\n" - " vec4 
high = 1.055 * pow(cl, vec4(1. / 2.4)) - 0.055;\n" - " bvec4 selection = lessThan(cl, vec4(0.0031308));\n" - " return clamp(mix(high, low, selection), 0., 1.);\n" - "}\n\n"; + enabled_options.push_back("_ENABLE_LINEAR_TO_SRGB"); } if (props.require_texture_ops || props.require_srgb_to_linear) { - OS << - "vec4 srgb_to_linear(const in vec4 cs)\n" - "{\n" - " vec4 a = cs / 12.92;\n" - " vec4 b = pow((cs + 0.055) / 1.055, vec4(2.4));\n" - " return _select(a, b, greaterThan(cs, vec4(0.04045)));\n" - "}\n\n"; + enabled_options.push_back("_ENABLE_SRGB_TO_LINEAR"); } - if (props.require_depth_conversion) + if (props.require_wpos) { - ensure(props.require_texture_ops); - - //NOTE: Memory layout is fetched as byteswapped BGRA [GBAR] (GOW collection, DS2, DeS) - //The A component (Z) is useless (should contain stencil8 or just 1) - OS << - "vec4 decode_depth24(const in float depth_value, const in bool depth_float)\n" - "{\n" - " uint value;\n" - " if (!depth_float)\n" - " value = uint(depth_value * 16777215.);\n" - " else\n" - " value = _get_bits(floatBitsToUint(depth_value), 7, 24);\n" - "\n" - " uint b = _get_bits(value, 0, 8);\n" - " uint g = _get_bits(value, 8, 8);\n" - " uint r = _get_bits(value, 16, 8);\n" - " return vec4(float(g)/255., float(b)/255., 1., float(r)/255.);\n" - "}\n\n" - - "vec4 remap_vector(const in vec4 color, const in uint remap)\n" - "{\n" - " vec4 result;\n" - " if (_get_bits(remap, 0, 8) == 0xE4)\n" - " {\n" - " result = color;\n" - " }\n" - " else\n" - " {\n" - " uvec4 remap_channel = uvec4(remap) >> uvec4(2, 4, 6, 0);\n" - " remap_channel &= 3;\n" - " remap_channel = (remap_channel + 3) % 4; // Map A-R-G-B to R-G-B-A\n\n" - - " // Generate remapped result\n" - " result.a = color[remap_channel.a];\n" - " result.r = color[remap_channel.r];\n" - " result.g = color[remap_channel.g];\n" - " result.b = color[remap_channel.b];\n" - " }\n\n" - - " if (_get_bits(remap, 8, 8) == 0xAA)\n" - " return result;\n\n" - - " uvec4 remap_select = uvec4(remap) >> 
uvec4(10, 12, 14, 8);\n" - " remap_select &= 3;\n" - " bvec4 choice = lessThan(remap_select, uvec4(2));\n" - " return _select(result, vec4(remap_select), choice);\n" - "}\n\n" + enabled_options.push_back("_ENABLE_WPOS"); + } - "vec4 convert_z24x8_to_rgba8(const in vec2 depth_stencil, const in uint remap, const in uint flags)\n" - "{\n" - " vec4 result = decode_depth24(depth_stencil.x, _test_bit(flags, DEPTH_FLOAT));\n" - " result.z = depth_stencil.y / 255.;\n\n" + if (props.require_fog_read) + { + program_common::define_glsl_constants(OS, + { + { "FOG_LINEAR", rsx::fog_mode::linear }, + { "FOG_EXP", rsx::fog_mode::exponential }, + { "FOG_EXP2", rsx::fog_mode::exponential2 }, + { "FOG_LINEAR_ABS", rsx::fog_mode::linear_abs }, + { "FOG_EXP_ABS", rsx::fog_mode::exponential_abs }, + { "FOG_EXP2_ABS", rsx::fog_mode::exponential2_abs }, + }); + + enabled_options.push_back("_ENABLE_FOG_READ"); + } - " if (remap == 0xAAE4)\n" - " return result;\n\n" + // Import fragment header + program_common::define_glsl_switches(OS, enabled_options); + enabled_options.clear(); - " return remap_vector(result, remap);\n" - "}\n\n"; - } + OS << + #include "GLSLSnippets/RSXProg/RSXFragmentPrologue.glsl" + ; if (props.require_texture_ops) { - OS << - - //TODO: Move all the texture read control operations here - "vec4 process_texel(in vec4 rgba, const in uint control_bits)\n" - "{\n" - " if (control_bits == 0)\n" - " {\n" - " return rgba;\n" - " }\n" - "\n" - " if (_test_bit(control_bits, ALPHAKILL))\n" - " {\n" - " // Alphakill\n" - " if (rgba.a < 0.000001)\n" - " {\n" - " _kill();\n" - " return rgba;\n" - " }\n" - " }\n" - "\n" - " if (_test_bit(control_bits, RENORMALIZE))\n" - " {\n" - " // Renormalize to 8-bit (PS3) accuracy\n" - " rgba = floor(rgba * 255.);\n" - " rgba /= 255.;\n" - " }\n" - "\n" - " uvec4 mask;\n" - " vec4 convert;\n" - " uint op_mask = control_bits & uint(SIGN_EXPAND_MASK);\n" - "\n" - " if (op_mask != 0)\n" - " {\n" - " // Expand to signed normalized\n" - " mask = 
uvec4(op_mask) & uvec4(EXPAND_R_MASK, EXPAND_G_MASK, EXPAND_B_MASK, EXPAND_A_MASK);\n" - " convert = (rgba * 2.f - 1.f);\n" - " rgba = _select(rgba, convert, notEqual(mask, uvec4(0)));\n" - " }\n" - "\n" - " op_mask = control_bits & uint(GAMMA_CTRL_MASK);\n" - " if (op_mask != 0u)\n" - " {\n" - " // Gamma correction\n" - " mask = uvec4(op_mask) & uvec4(GAMMA_R_MASK, GAMMA_G_MASK, GAMMA_B_MASK, GAMMA_A_MASK);\n" - " convert = srgb_to_linear(rgba);\n" - " return _select(rgba, convert, notEqual(mask, uvec4(0)));\n" - " }\n" - "\n" - " return rgba;\n" - "}\n\n"; + // Declare special texture control flags + OS << "#define GAMMA_R_MASK (1 << " << rsx::texture_control_bits::GAMMA_R << ")\n"; + OS << "#define GAMMA_G_MASK (1 << " << rsx::texture_control_bits::GAMMA_G << ")\n"; + OS << "#define GAMMA_B_MASK (1 << " << rsx::texture_control_bits::GAMMA_B << ")\n"; + OS << "#define GAMMA_A_MASK (1 << " << rsx::texture_control_bits::GAMMA_A << ")\n"; + OS << "#define EXPAND_R_MASK (1 << " << rsx::texture_control_bits::EXPAND_R << ")\n"; + OS << "#define EXPAND_G_MASK (1 << " << rsx::texture_control_bits::EXPAND_G << ")\n"; + OS << "#define EXPAND_B_MASK (1 << " << rsx::texture_control_bits::EXPAND_B << ")\n"; + OS << "#define EXPAND_A_MASK (1 << " << rsx::texture_control_bits::EXPAND_A << ")\n\n"; + + OS << "#define ALPHAKILL " << rsx::texture_control_bits::ALPHAKILL << "\n"; + OS << "#define RENORMALIZE " << rsx::texture_control_bits::RENORMALIZE << "\n"; + OS << "#define DEPTH_FLOAT " << rsx::texture_control_bits::DEPTH_FLOAT << "\n"; + OS << "#define DEPTH_COMPARE " << rsx::texture_control_bits::DEPTH_COMPARE_OP << "\n"; + OS << "#define FILTERED_MAG_BIT " << rsx::texture_control_bits::FILTERED_MAG << "\n"; + OS << "#define FILTERED_MIN_BIT " << rsx::texture_control_bits::FILTERED_MIN << "\n"; + OS << "#define INT_COORDS_BIT " << rsx::texture_control_bits::UNNORMALIZED_COORDS << "\n"; + OS << "#define GAMMA_CTRL_MASK (GAMMA_R_MASK|GAMMA_G_MASK|GAMMA_B_MASK|GAMMA_A_MASK)\n"; 
+ OS << "#define SIGN_EXPAND_MASK (EXPAND_R_MASK|EXPAND_G_MASK|EXPAND_B_MASK|EXPAND_A_MASK)\n"; + OS << "#define FILTERED_MASK (FILTERED_MAG_BIT|FILTERED_MIN_BIT)\n\n"; if (props.require_texture_expand) { - OS << - "uint _texture_flag_override = 0;\n" - "#define _enable_texture_expand() _texture_flag_override = SIGN_EXPAND_MASK\n" - "#define _disable_texture_expand() _texture_flag_override = 0\n" - "#define TEX_FLAGS(index) (texture_parameters[index].flags | _texture_flag_override)\n"; + enabled_options.push_back("_ENABLE_TEXTURE_EXPAND"); } - else - { - OS << - "#define TEX_FLAGS(index) texture_parameters[index].flags\n"; - } - - OS << - "#define TEX_NAME(index) tex##index\n" - "#define TEX_NAME_STENCIL(index) tex##index##_stencil\n\n" - - "#define COORD_SCALE1(index, coord1) ((coord1 + texture_parameters[index].scale_bias.w) * texture_parameters[index].scale_bias.x)\n" - "#define COORD_SCALE2(index, coord2) ((coord2 + texture_parameters[index].scale_bias.w) * texture_parameters[index].scale_bias.xy)\n" - "#define COORD_SCALE3(index, coord3) ((coord3 + texture_parameters[index].scale_bias.w) * texture_parameters[index].scale_bias.xyz)\n\n" - - "#define TEX1D(index, coord1) process_texel(texture(TEX_NAME(index), COORD_SCALE1(index, coord1)), TEX_FLAGS(index))\n" - "#define TEX1D_BIAS(index, coord1, bias) process_texel(texture(TEX_NAME(index), COORD_SCALE1(index, coord1), bias), TEX_FLAGS(index))\n" - "#define TEX1D_LOD(index, coord1, lod) process_texel(textureLod(TEX_NAME(index), COORD_SCALE1(index, coord1), lod), TEX_FLAGS(index))\n" - "#define TEX1D_GRAD(index, coord1, dpdx, dpdy) process_texel(textureGrad(TEX_NAME(index), COORD_SCALE1(index, coord1), dpdx, dpdy), TEX_FLAGS(index))\n" - "#define TEX1D_PROJ(index, coord4) process_texel(textureProj(TEX_NAME(index), vec2(COORD_SCALE1(index, coord4.x), coord4.w)), TEX_FLAGS(index))\n" - - "#define TEX2D(index, coord2) process_texel(texture(TEX_NAME(index), COORD_SCALE2(index, coord2)), TEX_FLAGS(index))\n" - "#define 
TEX2D_BIAS(index, coord2, bias) process_texel(texture(TEX_NAME(index), COORD_SCALE2(index, coord2), bias), TEX_FLAGS(index))\n" - "#define TEX2D_LOD(index, coord2, lod) process_texel(textureLod(TEX_NAME(index), COORD_SCALE2(index, coord2), lod), TEX_FLAGS(index))\n" - "#define TEX2D_GRAD(index, coord2, dpdx, dpdy) process_texel(textureGrad(TEX_NAME(index), COORD_SCALE2(index, coord2), dpdx, dpdy), TEX_FLAGS(index))\n" - "#define TEX2D_PROJ(index, coord4) process_texel(textureProj(TEX_NAME(index), vec4(COORD_SCALE2(index, coord4.xy), coord4.z, coord4.w)), TEX_FLAGS(index))\n\n"; if (props.emulate_shadow_compare) { - OS << - "#define SHADOW_COORD(index, coord3) vec3(COORD_SCALE2(index, coord3.xy), _test_bit(TEX_FLAGS(index), DEPTH_FLOAT)? coord3.z : min(float(coord3.z), 1.0))\n" - "#define SHADOW_COORD4(index, coord4) vec4(SHADOW_COORD(index, coord4.xyz), coord4.w)\n" - "#define SHADOW_COORD_PROJ(index, coord4) vec4(COORD_SCALE2(index, coord4.xy), _test_bit(TEX_FLAGS(index), DEPTH_FLOAT)? 
coord4.z : min(coord4.z, coord4.w), coord4.w)\n\n" - - "#define TEX2D_SHADOW(index, coord3) texture(TEX_NAME(index), SHADOW_COORD(index, coord3))\n" - "#define TEX3D_SHADOW(index, coord4) texture(TEX_NAME(index), SHADOW_COORD4(index, coord4))\n" - "#define TEX2D_SHADOWPROJ(index, coord4) textureProj(TEX_NAME(index), SHADOW_COORD_PROJ(index, coord4))\n"; - } - else - { - OS << - "#define TEX2D_SHADOW(index, coord3) texture(TEX_NAME(index), vec3(COORD_SCALE2(index, coord3.xy), coord3.z))\n" - "#define TEX3D_SHADOW(index, coord4) texture(TEX_NAME(index), vec4(COORD_SCALE3(index, coord4.xyz), coord4.w))\n" - "#define TEX2D_SHADOWPROJ(index, coord4) textureProj(TEX_NAME(index), vec4(COORD_SCALE2(index, coord4.xy), coord4.zw))\n"; + enabled_options.push_back("_EMULATED_TEXSHADOW"); } + program_common::define_glsl_switches(OS, enabled_options); + enabled_options.clear(); + OS << - "#define TEX3D(index, coord3) process_texel(texture(TEX_NAME(index), COORD_SCALE3(index, coord3)), TEX_FLAGS(index))\n" - "#define TEX3D_BIAS(index, coord3, bias) process_texel(texture(TEX_NAME(index), COORD_SCALE3(index, coord3), bias), TEX_FLAGS(index))\n" - "#define TEX3D_LOD(index, coord3, lod) process_texel(textureLod(TEX_NAME(index), COORD_SCALE3(index, coord3), lod), TEX_FLAGS(index))\n" - "#define TEX3D_GRAD(index, coord3, dpdx, dpdy) process_texel(textureGrad(TEX_NAME(index), COORD_SCALE3(index, coord3), dpdx, dpdy), TEX_FLAGS(index))\n" - "#define TEX3D_PROJ(index, coord4) process_texel(texture(TEX_NAME(index), COORD_SCALE3(index, coord4.xyz) / coord4.w), TEX_FLAGS(index))\n\n"; + #include "GLSLSnippets/RSXProg/RSXFragmentTextureOps.glsl" + ; if (props.require_depth_conversion) { OS << - "#define ZS_READ(index, coord) vec2(texture(TEX_NAME(index), coord).r, float(texture(TEX_NAME_STENCIL(index), coord).x))\n" - "#define TEX1D_Z24X8_RGBA8(index, coord1) process_texel(convert_z24x8_to_rgba8(ZS_READ(index, COORD_SCALE1(index, coord1)), texture_parameters[index].remap, TEX_FLAGS(index)), 
TEX_FLAGS(index))\n" - "#define TEX2D_Z24X8_RGBA8(index, coord2) process_texel(convert_z24x8_to_rgba8(ZS_READ(index, COORD_SCALE2(index, coord2)), texture_parameters[index].remap, TEX_FLAGS(index)), TEX_FLAGS(index))\n" - "#define TEX3D_Z24X8_RGBA8(index, coord3) process_texel(convert_z24x8_to_rgba8(ZS_READ(index, COORD_SCALE3(index, coord3)), texture_parameters[index].remap, TEX_FLAGS(index)), TEX_FLAGS(index))\n\n"; + #include "GLSLSnippets/RSXProg/RSXFragmentTextureDepthConversion.glsl" + ; } if (props.require_msaa_ops) { OS << - "#define ZCOMPARE_FUNC(index) _get_bits(TEX_FLAGS(index), DEPTH_COMPARE, 3)\n" - "#define ZS_READ_MS(index, coord) vec2(sampleTexture2DMS(TEX_NAME(index), coord, index).r, float(sampleTexture2DMS(TEX_NAME_STENCIL(index), coord, index).x))\n" - "#define TEX2D_MS(index, coord2) process_texel(sampleTexture2DMS(TEX_NAME(index), coord2, index), TEX_FLAGS(index))\n" - "#define TEX2D_SHADOW_MS(index, coord3) vec4(comparison_passes(sampleTexture2DMS(TEX_NAME(index), coord3.xy, index).x, coord3.z, ZCOMPARE_FUNC(index)))\n" - "#define TEX2D_SHADOWPROJ_MS(index, coord4) TEX2D_SHADOW_MS(index, (coord4.xyz / coord4.w))\n" - "#define TEX2D_Z24X8_RGBA8_MS(index, coord2) process_texel(convert_z24x8_to_rgba8(ZS_READ_MS(index, coord2), texture_parameters[index].remap, TEX_FLAGS(index)), TEX_FLAGS(index))\n\n"; + #include "GLSLSnippets/RSXProg/RSXFragmentTextureMSAAOps.glsl" + ; - OS << - "vec3 compute2x2DownsampleWeights(const in float coord, const in float uv_step, const in float actual_step)" - "{\n" - " const float next_sample_point = coord + actual_step;\n" - " const float next_coord_step = fma(floor(coord / uv_step), uv_step, uv_step);\n" - " const float next_coord_step_plus_one = next_coord_step + uv_step;\n" - " vec3 weights = vec3(next_coord_step, min(next_coord_step_plus_one, next_sample_point), max(next_coord_step_plus_one, next_sample_point)) - vec3(coord, next_coord_step, next_coord_step_plus_one);\n" - " return weights / actual_step;\n" - 
"}\n\n"; - - auto insert_msaa_sample_code = [&OS](const std::string_view& sampler_type) - { - OS << - "vec4 texelFetch2DMS(in " << sampler_type << " tex, const in vec2 sample_count, const in ivec2 icoords, const in int index, const in ivec2 offset)\n" - "{\n" - " const vec2 resolve_coords = vec2(icoords + offset);\n" - " const vec2 aa_coords = floor(resolve_coords / sample_count);\n" // AA coords = real_coords / sample_count - " const vec2 sample_loc = fma(aa_coords, -sample_count, resolve_coords);\n" // Sample ID = real_coords % sample_count - " const float sample_index = fma(sample_loc.y, sample_count.y, sample_loc.x);\n" - " return texelFetch(tex, ivec2(aa_coords), int(sample_index));\n" - "}\n\n" - - "vec4 sampleTexture2DMS(in " << sampler_type << " tex, const in vec2 coords, const in int index)\n" - "{\n" - " const uint flags = TEX_FLAGS(index);\n" - " const vec2 normalized_coords = COORD_SCALE2(index, coords);\n" - " const vec2 sample_count = vec2(2., textureSamples(tex) * 0.5);\n" - " const vec2 image_size = textureSize(tex) * sample_count;\n" - " const ivec2 icoords = ivec2(normalized_coords * image_size);\n" - " const vec4 sample0 = texelFetch2DMS(tex, sample_count, icoords, index, ivec2(0));\n" - "\n" - " if (_get_bits(flags, FILTERED_MAG_BIT, 2) == 0)\n" - " {\n" - " return sample0;\n" - " }\n" - "\n" - " // Bilinear scaling, with upto 2x2 downscaling with simple weights\n" - " const vec2 uv_step = 1.0 / vec2(image_size);\n" - " const vec2 actual_step = vec2(dFdx(normalized_coords.x), dFdy(normalized_coords.y));\n" - "\n" - " const bvec2 no_filter = lessThan(abs(uv_step - actual_step), vec2(0.000001));\n" - " if (no_filter.x && no_filter.y)\n" - " {\n" - " return sample0;\n" - " }\n" - "\n" - " vec4 a, b;\n" - " float factor;\n" - " const vec4 sample2 = texelFetch2DMS(tex, sample_count, icoords, index, ivec2(0, 1)); // Top left\n" - "\n" - " if (no_filter.x)\n" - " {\n" - " // No scaling, 1:1\n" - " a = sample0;\n" - " b = sample2;\n" - " }\n" - " 
else\n" - " {\n" - " // Filter required, sample more data\n" - " const vec4 sample1 = texelFetch2DMS(tex, sample_count, icoords, index, ivec2(1, 0)); // Bottom right\n" - " const vec4 sample3 = texelFetch2DMS(tex, sample_count, icoords, index, ivec2(1, 1)); // Top right\n" - "\n" - " if (actual_step.x > uv_step.x)\n" - " {\n" - " // Downscale in X, centered\n" - " const vec3 weights = compute2x2DownsampleWeights(normalized_coords.x, uv_step.x, actual_step.x);\n" - "\n" - " const vec4 sample4 = texelFetch2DMS(tex, sample_count, icoords, index, ivec2(2, 0)); // Further bottom right\n" - " a = fma(sample0, weights.xxxx, sample1 * weights.y) + (sample4 * weights.z); // Weighted sum\n" - "\n" - " if (!no_filter.y)\n" - " {\n" - " const vec4 sample5 = texelFetch2DMS(tex, sample_count, icoords, index, ivec2(2, 1)); // Further top right\n" - " b = fma(sample2, weights.xxxx, sample3 * weights.y) + (sample5 * weights.z); // Weighted sum\n" - " }\n" - " }\n" - " else if (actual_step.x < uv_step.x)\n" - " {\n" - " // Upscale in X\n" - " factor = fract(normalized_coords.x * image_size.x);\n" - " a = mix(sample0, sample1, factor);\n" - " b = mix(sample2, sample3, factor);\n" - " }\n" - " }\n" - "\n" - " if (no_filter.y)\n" - " {\n" - " // 1:1 no scale\n" - " return a;\n" - " }\n" - " else if (actual_step.y > uv_step.y)\n" - " {\n" - " // Downscale in Y\n" - " const vec3 weights = compute2x2DownsampleWeights(normalized_coords.y, uv_step.y, actual_step.y);\n" - " // We only have 2 rows computed for performance reasons, so combine rows 1 and 2\n" - " return a * weights.x + b * (weights.y + weights.z);\n" - " }\n" - " else if (actual_step.y < uv_step.y)\n" - " {\n" - " // Upscale in Y\n" - " factor = fract(normalized_coords.y * image_size.y);\n" - " return mix(a, b, factor);\n" - " }\n" - "}\n\n"; - }; - - insert_msaa_sample_code("sampler2DMS"); + // Generate multiple versions of the actual sampler code. 
+ // We could use defines to generate these, but I don't trust some OpenGL compilers to do the right thing. + const std::string_view msaa_sampling_impl = + #include "GLSLSnippets/RSXProg/RSXFragmentTextureMSAAOpsInternal.glsl" + ; + + OS << fmt::replace_all(msaa_sampling_impl, "_MSAA_SAMPLER_TYPE_", "sampler2DMS"); if (props.require_depth_conversion) { - insert_msaa_sample_code("usampler2DMS"); + OS << fmt::replace_all(msaa_sampling_impl, "_MSAA_SAMPLER_TYPE_", "usampler2DMS"); } } } - - if (props.require_wpos) - { - OS << - "vec4 get_wpos()\n" - "{\n" - " float abs_scale = abs(wpos_scale);\n" - " return (gl_FragCoord * vec4(abs_scale, wpos_scale, 1., 1.)) + vec4(0., wpos_bias, 0., 0.);\n" - "}\n\n"; - } } std::string getFunctionImpl(FUNCTION f) @@ -1152,12 +747,8 @@ namespace glsl // Global types and stuff // Must be compatible with std140 packing rules OS << - "struct sampler_info\n" - "{\n" - " vec4 scale_bias;\n" - " uint remap;\n" - " uint flags;\n" - "};\n\n"; + #include "GLSLSnippets/RSXProg/RSXDefines2.glsl" + ; } void insert_fragment_shader_inputs_block( diff --git a/rpcs3/Emu/RSX/Program/GLSLCommon.h b/rpcs3/Emu/RSX/Program/GLSLCommon.h index 4346b3d5fed4..dba5c1c63034 100644 --- a/rpcs3/Emu/RSX/Program/GLSLCommon.h +++ b/rpcs3/Emu/RSX/Program/GLSLCommon.h @@ -76,13 +76,6 @@ namespace rsx }; } -namespace program_common -{ - void insert_compare_op(std::ostream& OS, bool low_precision); - void insert_compare_op_vector(std::ostream& OS); - void insert_fog_declaration(std::ostream& OS, std::string_view vector_type = "vec4", std::string_view input_coord = "fog_c"); -} - namespace glsl { struct two_sided_lighting_config diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentPrologue.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentPrologue.glsl index b3c98e087034..01e83f3c2d3f 100644 --- a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentPrologue.glsl +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentPrologue.glsl @@ -1,5 
+1,4 @@ R"( - #ifdef _32_BIT_OUTPUT // Default. Used when we're not utilizing native fp16 #define round_to_8bit(v4) (floor(fma(v4, vec4(255.), vec4(0.5))) / vec4(255.)) @@ -9,9 +8,10 @@ R"( #endif #ifdef _DISABLE_EARLY_DISCARD -#define kill() _fragment_discard = true +bool _fragment_discard = false; +#define _kill() _fragment_discard = true #else -#define kill() discard +#define _kill() discard #endif #ifdef _ENABLE_WPOS @@ -22,6 +22,73 @@ vec4 get_wpos() } #endif +#ifdef _ENABLE_FOG_READ +vec4 fetch_fog_value(const in uint mode) +{ + vec4 result = vec4(fog_c.x, 0., 0., 0.); + switch(mode) + { + default: + return result; + case FOG_LINEAR: + // linear + result.y = fog_param1 * fog_c.x + (fog_param0 - 1.); + break; + case FOG_EXP: + // exponential + result.y = exp(11.084 * (fog_param1 * fog_c.x + fog_param0 - 1.5)); + break; + case FOG_EXP2: + // exponential2 + result.y = exp(-pow(4.709 * (fog_param1 * fog_c.x + fog_param0 - 1.5), 2.)); + break; + case FOG_EXP_ABS: + // exponential_abs + result.y = exp(11.084 * (fog_param1 * abs(fog_c.x) + fog_param0 - 1.5)); + break; + case FOG_EXP2_ABS: + // exponential2_abs + result.y = exp(-pow(4.709 * (fog_param1 * abs(fog_c.x) + fog_param0 - 1.5), 2.)); + break; + case FOG_LINEAR_ABS: + // linear_abs + result.y = fog_param1 * abs(fog_c.x) + (fog_param0 - 1.); + break; + } + + result.y = clamp(result.y, 0., 1.); + return result; +} +#endif + +#ifdef _EMULATE_COVERAGE_TEST +// Purely stochastic +bool coverage_test_passes(const in vec4 _sample) +{ + float random = _rand(gl_FragCoord); + return (_sample.a > random); +} +#endif + +#ifdef _ENABLE_LINEAR_TO_SRGB +vec4 linear_to_srgb(const in vec4 cl) +{ + vec4 low = cl * 12.92; + vec4 high = 1.055 * pow(cl, vec4(1. 
/ 2.4)) - 0.055; + bvec4 selection = lessThan(cl, vec4(0.0031308)); + return clamp(mix(high, low, selection), 0., 1.); +} +#endif + +#ifdef _ENABLE_SRGB_TO_LINEAR +vec4 srgb_to_linear(const in vec4 cs) +{ + vec4 a = cs / 12.92; + vec4 b = pow((cs + 0.055) / 1.055, vec4(2.4)); + return _select(a, b, greaterThan(cs, vec4(0.04045))); +} +#endif + // Required by all fragment shaders for alpha test bool comparison_passes(const in float a, const in float b, const in uint func) { diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXProgramCommon.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXProgramCommon.glsl index 887212f1d2cf..4e9574a4e55e 100644 --- a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXProgramCommon.glsl +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXProgramCommon.glsl @@ -24,4 +24,4 @@ vec4 lit_legacy(const in vec4 val) } #endif -)" \ No newline at end of file +)" diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexPrologue.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexPrologue.glsl index 23e272625fc0..334280265215 100644 --- a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexPrologue.glsl +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexPrologue.glsl @@ -1,5 +1,6 @@ R"( #ifdef _FORCE_POSITION_INVARIANCE +// PS3 has shader invariance, but we don't really care about most attributes outside ATTR0 invariant gl_Position; #endif @@ -54,5 +55,4 @@ vec4 apply_zclip_xform( }\n #endif - )" diff --git a/rpcs3/Emu/RSX/Program/GLSLTypes.h b/rpcs3/Emu/RSX/Program/GLSLTypes.h index 7bfb84f8ce3e..4e31a369efef 100644 --- a/rpcs3/Emu/RSX/Program/GLSLTypes.h +++ b/rpcs3/Emu/RSX/Program/GLSLTypes.h @@ -32,6 +32,7 @@ namespace glsl bool require_srgb_to_linear : 1; bool require_linear_to_srgb : 1; bool require_explicit_invariance: 1; + bool require_fog_read : 1; bool emulate_coverage_tests : 1; bool emulate_shadow_compare : 1; bool emulate_zclip_transform : 1; diff --git a/rpcs3/Emu/RSX/VK/VKFragmentProgram.cpp 
b/rpcs3/Emu/RSX/VK/VKFragmentProgram.cpp index 412b939fe78f..dd7a83d8cd9b 100644 --- a/rpcs3/Emu/RSX/VK/VKFragmentProgram.cpp +++ b/rpcs3/Emu/RSX/VK/VKFragmentProgram.cpp @@ -243,6 +243,7 @@ void VKFragmentDecompilerThread::insertGlobalFunctions(std::stringstream &OS) m_shader_props.require_texture_expand = properties.has_exp_tex_op; m_shader_props.require_srgb_to_linear = properties.has_upg; m_shader_props.require_linear_to_srgb = properties.has_pkg; + m_shader_props.require_fog_read = properties.in_register_mask & in_fogc; m_shader_props.emulate_coverage_tests = g_cfg.video.antialiasing_level == msaa_level::none; m_shader_props.emulate_shadow_compare = device_props.emulate_depth_compare; m_shader_props.low_precision_tests = device_props.has_low_precision_rounding && !(m_prog.ctrl & RSX_SHADER_CONTROL_ATTRIBUTE_INTERPOLATION); @@ -255,9 +256,6 @@ void VKFragmentDecompilerThread::insertGlobalFunctions(std::stringstream &OS) void VKFragmentDecompilerThread::insertMainStart(std::stringstream & OS) { - if (properties.in_register_mask & in_fogc) - program_common::insert_fog_declaration(OS); - std::set output_registers; if (m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS) { From d789ec84e4a91750eb09fbee6de69fa4d2a35182 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Tue, 20 Jun 2023 01:40:35 +0300 Subject: [PATCH 03/25] rsx: Migrate vertex fetch out of the cpp file --- rpcs3/Emu/RSX/Program/GLSLCommon.cpp | 204 +---------------- .../GLSLSnippets/RSXProg/RSXVertexFetch.glsl | 209 ++++++++++++++++++ rpcs3/emucore.vcxproj | 1 + rpcs3/emucore.vcxproj.filters | 3 + 4 files changed, 216 insertions(+), 201 deletions(-) create mode 100644 rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexFetch.glsl diff --git a/rpcs3/Emu/RSX/Program/GLSLCommon.cpp b/rpcs3/Emu/RSX/Program/GLSLCommon.cpp index 6a511b56ca2e..369933cf2a3b 100644 --- a/rpcs3/Emu/RSX/Program/GLSLCommon.cpp +++ b/rpcs3/Emu/RSX/Program/GLSLCommon.cpp @@ -129,210 +129,12 @@ namespace glsl // Note: Tested on Mesa iris with 
HD 530 and compilant path works fine, may be a bug on Windows proprietary drivers if (!glsl4_compliant) { - OS << - "void mov(inout uvec4 vector, const in int index, const in uint scalar)\n" - "{\n" - " switch(index)\n" - " {\n" - " case 0: vector.x = scalar; return;\n" - " case 1: vector.y = scalar; return;\n" - " case 2: vector.z = scalar; return;\n" - " case 3: vector.w = scalar; return;\n" - " }\n" - "}\n\n" - - "uint ref(const in uvec4 vector, const in int index)\n" - "{\n" - " switch(index)\n" - " {\n" - " case 0: return vector.x;\n" - " case 1: return vector.y;\n" - " case 2: return vector.z;\n" - " case 3: return vector.w;\n" - " }\n" - "}\n\n"; - } - else - { - OS << - "#define mov(v, i, s) v[i] = s\n" - "#define ref(v, i) v[i]\n\n"; + OS << "#define _INTEL_GLSL\n"; } OS << - "struct attribute_desc\n" - "{\n" - " uint type;\n" - " uint attribute_size;\n" - " uint starting_offset;\n" - " uint stride;\n" - " uint frequency;\n" - " bool swap_bytes;\n" - " bool is_volatile;\n" - " bool modulo;\n" - "};\n\n" - - "uint gen_bits(const in uint x, const in uint y, const in uint z, const in uint w, const in bool swap)\n" - "{\n" - " return (swap) ?\n" - " _set_bits(_set_bits(_set_bits(w, z, 8, 8), y, 16, 8), x, 24, 8) :\n" - " _set_bits(_set_bits(_set_bits(x, y, 8, 8), z, 16, 8), w, 24, 8);\n" - "}\n\n" - - "uint gen_bits(const in uint x, const in uint y, const in bool swap)\n" - "{\n" - " return (swap)? _set_bits(y, x, 8, 8) : _set_bits(x, y, 8, 8);\n" - "}\n\n" - - // NOTE: (int(n) or int(n)) is broken on some NVIDIA and INTEL hardware when the sign bit is involved. - // See https://github.com/RPCS3/rpcs3/issues/8990 - "vec4 sext(const in ivec4 bits)\n" - "{\n" - " // convert raw 16 bit values into signed 32-bit float4 counterpart\n" - " bvec4 sign_check = lessThan(bits, ivec4(0x8000));\n" - " return _select(bits - 65536, bits, sign_check);\n" - "}\n\n" - - "float sext(const in int bits)\n" - "{\n" - " return (bits < 0x8000) ? 
float(bits) : float(bits - 65536); \n" - "}\n\n" - - "vec4 fetch_attribute(const in attribute_desc desc, const in int vertex_id, usamplerBuffer input_stream)\n" - "{\n" - " const int elem_size_table[] = { 0, 2, 4, 2, 1, 2, 4, 1 };\n" - " const float scaling_table[] = { 1., 32767.5, 1., 1., 255., 1., 32767., 1. };\n" - " const int elem_size = elem_size_table[desc.type];\n" - " const vec4 scale = scaling_table[desc.type].xxxx;\n\n" - - " uvec4 tmp, result = uvec4(0u);\n" - " vec4 ret;\n" - " int n, i = int((vertex_id * desc.stride) + desc.starting_offset);\n\n" - - " for (n = 0; n < desc.attribute_size; n++)\n" - " {\n" - " tmp.x = texelFetch(input_stream, i++).x;\n" - " if (elem_size == 2)\n" - " {\n" - " tmp.y = texelFetch(input_stream, i++).x;\n" - " tmp.x = gen_bits(tmp.x, tmp.y, desc.swap_bytes);\n" - " }\n" - " else if (elem_size == 4)\n" - " {\n" - " tmp.y = texelFetch(input_stream, i++).x;\n" - " tmp.z = texelFetch(input_stream, i++).x;\n" - " tmp.w = texelFetch(input_stream, i++).x;\n" - " tmp.x = gen_bits(tmp.x, tmp.y, tmp.z, tmp.w, desc.swap_bytes);\n" - " }\n\n" - - " mov(result, n, tmp.x);\n" - " }\n\n" - - " // Actual decoding step is done in vector space, outside the loop\n" - " if (desc.type == VTX_FMT_SNORM16 || desc.type == VTX_FMT_SINT16)\n" - " {\n" - " ret = sext(ivec4(result));\n" - " ret = fma(vec4(0.5), vec4(desc.type == VTX_FMT_SNORM16), ret);\n" - " }\n" - " else if (desc.type == VTX_FMT_FLOAT32)\n" - " {\n" - " ret = uintBitsToFloat(result);\n" - " }\n" - " else if (desc.type == VTX_FMT_FLOAT16)\n" - " {\n" - " tmp.x = _set_bits(result.x, result.y, 16, 16);\n" - " tmp.y = _set_bits(result.z, result.w, 16, 16);\n" - " ret.xy = unpackHalf2x16(tmp.x);\n" - " ret.zw = unpackHalf2x16(tmp.y);\n" - " }\n" - " else if (elem_size == 1) //(desc.type == VTX_FMT_UINT8 || desc.type == VTX_FMT_UNORM8)\n" - " {\n" - " // Ignore bswap on single byte channels\n" - " ret = vec4(result);\n" - " }\n" - " else //if (desc.type == VTX_FMT_COMP32)\n" - " {\n" - " 
result = uvec4(_get_bits(result.x, 0, 11),\n" - " _get_bits(result.x, 11, 11),\n" - " _get_bits(result.x, 22, 10),\n" - " uint(scale.x));\n" - " ret = sext(ivec4(result) << ivec4(5, 5, 6, 0));\n" - " }\n\n" - - " if (desc.attribute_size < 4)\n" - " {\n" - " ret.w = scale.x;\n" - " }\n\n" - - " return ret / scale; \n" - "}\n\n" - - "attribute_desc fetch_desc(const in int location)\n" - "{\n" - " // Each descriptor is 64 bits wide\n" - " // [0-8] attribute stride\n" - " // [8-24] attribute divisor\n" - " // [24-27] attribute type\n" - " // [27-30] attribute size\n" - " // [30-31] reserved\n" - " // [32-60] starting offset\n" - " // [60-61] swap bytes flag\n" - " // [61-62] volatile flag\n" - " // [62-63] modulo enable flag\n\n"; - - if (rules == glsl_rules_opengl4) - { - // Data is packed into a ubo - OS << - " int block = (location >> 1);\n" - " int sub_block = (location & 1) << 1;\n" - " uvec2 attrib = uvec2(\n" - " ref(input_attributes_blob[block], sub_block + 0),\n" - " ref(input_attributes_blob[block], sub_block + 1));\n\n"; - } - else - { - // Fetch parameters streamed separately from draw parameters - OS << - " uvec2 attrib = texelFetch(vertex_layout_stream, location + int(layout_ptr_offset)).xy;\n\n"; - } - - OS << - " attribute_desc result;\n" - " result.stride = _get_bits(attrib.x, 0, 8);\n" - " result.frequency = _get_bits(attrib.x, 8, 16);\n" - " result.type = _get_bits(attrib.x, 24, 3);\n" - " result.attribute_size = _get_bits(attrib.x, 27, 3);\n" - " result.starting_offset = _get_bits(attrib.y, 0, 29);\n" - " result.swap_bytes = _test_bit(attrib.y, 29);\n" - " result.is_volatile = _test_bit(attrib.y, 30);\n" - " result.modulo = _test_bit(attrib.y, 31);\n" - " return result;\n" - "}\n\n" - - "vec4 read_location(const in int location)\n" - "{\n" - " attribute_desc desc = fetch_desc(location);\n" - " int vertex_id = " << vertex_id_name << " - int(vertex_base_index);\n" - " if (desc.frequency == 0)\n" - " {\n" - " vertex_id = 0;\n" - " }\n" - " else if 
(desc.modulo)\n" - " {\n" - " //if a vertex modifier is active; vertex_base must be 0 and is ignored\n" - " vertex_id = (" << vertex_id_name << " + int(vertex_index_offset)) % int(desc.frequency);\n" - " }\n" - " else\n" - " {\n" - " vertex_id /= int(desc.frequency); \n" - " }\n\n" - - " if (desc.is_volatile)\n" - " return fetch_attribute(desc, vertex_id, volatile_input_stream);\n" - " else\n" - " return fetch_attribute(desc, vertex_id, persistent_input_stream);\n" - "}\n\n"; + #include "GLSLSnippets/RSXProg/RSXVertexFetch.glsl" + ; } void insert_rop_init(std::ostream& OS) diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexFetch.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexFetch.glsl new file mode 100644 index 000000000000..c0be5e8786fd --- /dev/null +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexFetch.glsl @@ -0,0 +1,209 @@ +R"( +#ifdef _INTEL_GLSL +// For intel GPUs which cannot access vectors in indexed mode (driver bug? or glsl version too low?) 
+// Note: Tested on Mesa iris with HD 530 and compliant path works fine, may be a bug on Windows proprietary drivers +void mov(inout uvec4 vector, const in int index, const in uint scalar) +{ + switch(index) + { + case 0: vector.x = scalar; return; + case 1: vector.y = scalar; return; + case 2: vector.z = scalar; return; + case 3: vector.w = scalar; return; + } +} + +uint ref(const in uvec4 vector, const in int index) +{ + switch(index) + { + case 0: return vector.x; + case 1: return vector.y; + case 2: return vector.z; + case 3: return vector.w; + } +} +#else +#define mov(v, i, s) v[i] = s +#define ref(v, i) v[i] +#endif + +#ifdef VULKAN +#define _gl_VertexID gl_VertexIndex +#else +#define _gl_VertexID gl_VertexID +#endif + +struct attribute_desc +{ + uint type; + uint attribute_size; + uint starting_offset; + uint stride; + uint frequency; + bool swap_bytes; + bool is_volatile; + bool modulo; +}; + +uint gen_bits(const in uint x, const in uint y, const in uint z, const in uint w, const in bool swap) +{ + return (swap) ? + _set_bits(_set_bits(_set_bits(w, z, 8, 8), y, 16, 8), x, 24, 8) : + _set_bits(_set_bits(_set_bits(x, y, 8, 8), z, 16, 8), w, 24, 8); +} + +uint gen_bits(const in uint x, const in uint y, const in bool swap) +{ + return (swap)? _set_bits(y, x, 8, 8) : _set_bits(x, y, 8, 8); +} + +// NOTE: (int(n) or int(n)) is broken on some NVIDIA and INTEL hardware when the sign bit is involved. +// See https://github.com/RPCS3/rpcs3/issues/8990 +vec4 sext(const in ivec4 bits) +{ + // convert raw 16 bit values into signed 32-bit float4 counterpart + bvec4 sign_check = lessThan(bits, ivec4(0x8000)); + return _select(bits - 65536, bits, sign_check); +} + +float sext(const in int bits) +{ + return (bits < 0x8000) ? 
float(bits) : float(bits - 65536); +} + +vec4 fetch_attribute(const in attribute_desc desc, const in int vertex_id, usamplerBuffer input_stream) +{ + const int elem_size_table[] = { 0, 2, 4, 2, 1, 2, 4, 1 }; + const float scaling_table[] = { 1., 32767.5, 1., 1., 255., 1., 32767., 1. }; + const int elem_size = elem_size_table[desc.type]; + const vec4 scale = scaling_table[desc.type].xxxx; + + uvec4 tmp, result = uvec4(0u); + vec4 ret; + int n, i = int((vertex_id * desc.stride) + desc.starting_offset); + + for (n = 0; n < desc.attribute_size; n++) + { + tmp.x = texelFetch(input_stream, i++).x; + if (elem_size == 2) + { + tmp.y = texelFetch(input_stream, i++).x; + tmp.x = gen_bits(tmp.x, tmp.y, desc.swap_bytes); + } + else if (elem_size == 4) + { + tmp.y = texelFetch(input_stream, i++).x; + tmp.z = texelFetch(input_stream, i++).x; + tmp.w = texelFetch(input_stream, i++).x; + tmp.x = gen_bits(tmp.x, tmp.y, tmp.z, tmp.w, desc.swap_bytes); + } + + mov(result, n, tmp.x); + } + + // Actual decoding step is done in vector space, outside the loop + if (desc.type == VTX_FMT_SNORM16 || desc.type == VTX_FMT_SINT16) + { + ret = sext(ivec4(result)); + ret = fma(vec4(0.5), vec4(desc.type == VTX_FMT_SNORM16), ret); + } + else if (desc.type == VTX_FMT_FLOAT32) + { + ret = uintBitsToFloat(result); + } + else if (desc.type == VTX_FMT_FLOAT16) + { + tmp.x = _set_bits(result.x, result.y, 16, 16); + tmp.y = _set_bits(result.z, result.w, 16, 16); + ret.xy = unpackHalf2x16(tmp.x); + ret.zw = unpackHalf2x16(tmp.y); + } + else if (elem_size == 1) // (desc.type == VTX_FMT_UINT8 || desc.type == VTX_FMT_UNORM8) + { + // Ignore bswap on single byte channels + ret = vec4(result); + } + else // if (desc.type == VTX_FMT_COMP32) + { + result = uvec4(_get_bits(result.x, 0, 11), + _get_bits(result.x, 11, 11), + _get_bits(result.x, 22, 10), + uint(scale.x)); + ret = sext(ivec4(result) << ivec4(5, 5, 6, 0)); + } + + if (desc.attribute_size < 4) + { + ret.w = scale.x; + } + + return ret / scale; +} + 
+attribute_desc fetch_desc(const in int location) +{ + // Each descriptor is 64 bits wide (attrib.x = bits 0-31, attrib.y = bits 32-63); ranges are [first bit - one past last bit] + // [0-8] attribute stride + // [8-24] attribute divisor (stored in desc.frequency) + // [24-27] attribute type + // [27-30] attribute size + // [30-32] reserved + // [32-61] starting offset + // [61-62] swap bytes flag + // [62-63] volatile flag + // [63-64] modulo enable flag + +#ifdef VULKAN + // Fetch parameters streamed separately from draw parameters + uvec2 attrib = texelFetch(vertex_layout_stream, location + int(layout_ptr_offset)).xy; +#else + // Data is packed into a ubo + int block = (location >> 1); + int sub_block = (location & 1) << 1; + uvec2 attrib = uvec2( + ref(input_attributes_blob[block], sub_block + 0), + ref(input_attributes_blob[block], sub_block + 1)); +#endif + + attribute_desc result; + result.stride = _get_bits(attrib.x, 0, 8); + result.frequency = _get_bits(attrib.x, 8, 16); + result.type = _get_bits(attrib.x, 24, 3); + result.attribute_size = _get_bits(attrib.x, 27, 3); + result.starting_offset = _get_bits(attrib.y, 0, 29); + result.swap_bytes = _test_bit(attrib.y, 29); + result.is_volatile = _test_bit(attrib.y, 30); + result.modulo = _test_bit(attrib.y, 31); + return result; +} + +vec4 read_location(const in int location) +{ + attribute_desc desc = fetch_desc(location); + int vertex_id = _gl_VertexID - int(vertex_base_index); + if (desc.frequency == 0) + { + vertex_id = 0; + } + else if (desc.modulo) + { + // if a vertex modifier is active; vertex_base must be 0 and is ignored + vertex_id = (_gl_VertexID + int(vertex_index_offset)) % int(desc.frequency); + } + else + { + vertex_id /= int(desc.frequency); + } + + if (desc.is_volatile) + { + return fetch_attribute(desc, vertex_id, volatile_input_stream); + } + else + { + return fetch_attribute(desc, vertex_id, persistent_input_stream); + } +} + +)" diff --git a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj index 8d3d6c04d16b..17051eb3054b 100644 --- a/rpcs3/emucore.vcxproj +++ b/rpcs3/emucore.vcxproj @@ -911,6 +911,7 @@ + 
diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters index d219df8c1839..0adf76dedf67 100644 --- a/rpcs3/emucore.vcxproj.filters +++ b/rpcs3/emucore.vcxproj.filters @@ -2430,5 +2430,8 @@ Emu\GPU\RSX\Program\Snippets\RSXProg + + Emu\GPU\RSX\Program\Snippets\RSXProg + \ No newline at end of file From 30326640ad3438f5f16ca510d14036640b7336a1 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Tue, 20 Jun 2023 01:51:46 +0300 Subject: [PATCH 04/25] rsx: Fix fragment program codegen --- rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp | 2 +- .../Program/GLSLSnippets/RSXProg/RSXFragmentTextureMSAAOps.glsl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp index 12275494db47..f644bd95ecf3 100644 --- a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp +++ b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp @@ -1069,7 +1069,7 @@ bool FragmentProgramDecompiler::handle_sct_scb(u32 opcode) SetDst("_builtin_lit($0)"); properties.has_lit_op = true; return true; - case RSX_FP_OPCODE_LIF: SetDst("$Ty(1.0, $0.y, ($0.y > 0 ? pow(2.0, $0.w) : 0.0), 1.0)", OPFLAGS::op_extern); return true; + case RSX_FP_OPCODE_LIF: SetDst("$Ty(1.0, $0.y, ($0.y > 0 ? exp2($0.w) : 0.0), 1.0)", OPFLAGS::op_extern); return true; case RSX_FP_OPCODE_LRP: SetDst("$Ty($2 * (1 - $0) + $1 * $0)", OPFLAGS::skip_type_cast); return true; case RSX_FP_OPCODE_LG2: SetDst("_builtin_log2($0.x).xxxx"); return true; // Pack operations. 
See https://www.khronos.org/registry/OpenGL/extensions/NV/NV_fragment_program.txt diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureMSAAOps.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureMSAAOps.glsl index 786605a3c077..5a3a38539e27 100644 --- a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureMSAAOps.glsl +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureMSAAOps.glsl @@ -4,7 +4,7 @@ R"( #define TEX2D_MS(index, coord2) process_texel(sampleTexture2DMS(TEX_NAME(index), coord2, index), TEX_FLAGS(index)) #define TEX2D_SHADOW_MS(index, coord3) vec4(comparison_passes(sampleTexture2DMS(TEX_NAME(index), coord3.xy, index).x, coord3.z, ZCOMPARE_FUNC(index))) #define TEX2D_SHADOWPROJ_MS(index, coord4) TEX2D_SHADOW_MS(index, (coord4.xyz / coord4.w)) -#define TEX2D_Z24X8_RGBA8_MS(index, coord2) process_texel(convert_z24x8_to_rgba8(ZS_READ_MS(index, coord2), texture_parameters[index].remap, TEX_FLAGS(index)), TEX_FLAGS(index))\n; +#define TEX2D_Z24X8_RGBA8_MS(index, coord2) process_texel(convert_z24x8_to_rgba8(ZS_READ_MS(index, coord2), texture_parameters[index].remap, TEX_FLAGS(index)), TEX_FLAGS(index)) vec3 compute2x2DownsampleWeights(const in float coord, const in float uv_step, const in float actual_step) { From c03be3d8b7658bde68f8a5bd5e020e7c0076b2a9 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Tue, 20 Jun 2023 14:54:32 +0300 Subject: [PATCH 05/25] rsx: Rework texture coordinate handling to support clamping and a more sane scale-bias setup --- rpcs3/Emu/RSX/Common/TextureUtils.h | 14 +- rpcs3/Emu/RSX/Common/texture_cache.h | 22 +++- rpcs3/Emu/RSX/GL/GLFragmentProgram.cpp | 5 +- rpcs3/Emu/RSX/GL/GLGSRender.cpp | 4 +- .../RSX/Program/FragmentProgramDecompiler.cpp | 4 + .../RSX/Program/FragmentProgramDecompiler.h | 4 + rpcs3/Emu/RSX/Program/GLSLCommon.cpp | 99 +++++++++------ rpcs3/Emu/RSX/Program/GLSLCommon.h | 1 + .../GLSLSnippets/RSXProg/RSXDefines2.glsl | 10 +- 
.../RSXFragmentTextureDepthConversion.glsl | 6 +- .../RSXProg/RSXFragmentTextureMSAAOps.glsl | 4 +- .../RSXProg/RSXFragmentTextureOps.glsl | 120 ++++++++++++++---- rpcs3/Emu/RSX/Program/GLSLTypes.h | 15 ++- rpcs3/Emu/RSX/Program/program_util.h | 5 +- rpcs3/Emu/RSX/RSXThread.cpp | 16 ++- rpcs3/Emu/RSX/VK/VKFragmentProgram.cpp | 5 +- rpcs3/Emu/RSX/VK/VKGSRender.cpp | 6 +- 17 files changed, 240 insertions(+), 100 deletions(-) diff --git a/rpcs3/Emu/RSX/Common/TextureUtils.h b/rpcs3/Emu/RSX/Common/TextureUtils.h index dd64418dd351..de2cbf5f445e 100644 --- a/rpcs3/Emu/RSX/Common/TextureUtils.h +++ b/rpcs3/Emu/RSX/Common/TextureUtils.h @@ -133,9 +133,17 @@ namespace rsx u8 samples = 1; u32 ref_address = 0; u64 surface_cache_tag = 0; - f32 scale_x = 1.f; - f32 scale_y = 1.f; - f32 scale_z = 1.f; + +#pragma pack(push, 1) + struct + { + f32 scale[3]; + f32 bias[3]; + f32 clamp_min[2]; + f32 clamp_max[2]; + bool clamp = false; + } texcoord_xform; +#pragma pack(pop) virtual ~sampled_image_descriptor_base() = default; virtual u32 encoded_component_map() const = 0; diff --git a/rpcs3/Emu/RSX/Common/texture_cache.h b/rpcs3/Emu/RSX/Common/texture_cache.h index 479590f8557a..e822d4e5dc4f 100644 --- a/rpcs3/Emu/RSX/Common/texture_cache.h +++ b/rpcs3/Emu/RSX/Common/texture_cache.h @@ -179,11 +179,16 @@ namespace rsx upload_context = ctx; format_class = ftype; is_cyclic_reference = cyclic_reference; - scale_x = scale.width; - scale_y = scale.height; - scale_z = scale.depth; image_type = type; samples = msaa_samples; + + texcoord_xform.scale[0] = scale.width; + texcoord_xform.scale[1] = scale.height; + texcoord_xform.scale[2] = scale.depth; + texcoord_xform.bias[0] = 0.; + texcoord_xform.bias[1] = 0.; + texcoord_xform.bias[2] = 0.; + texcoord_xform.clamp = false; } sampled_image_descriptor(image_resource_type external_handle, deferred_request_command reason, @@ -196,10 +201,15 @@ namespace rsx image_handle = 0; upload_context = ctx; format_class = ftype; - scale_x = scale.width; - 
scale_y = scale.height; - scale_z = scale.depth; image_type = type; + + texcoord_xform.scale[0] = scale.width; + texcoord_xform.scale[1] = scale.height; + texcoord_xform.scale[2] = scale.depth; + texcoord_xform.bias[0] = 0.; + texcoord_xform.bias[1] = 0.; + texcoord_xform.bias[2] = 0.; + texcoord_xform.clamp = false; } void simplify() diff --git a/rpcs3/Emu/RSX/GL/GLFragmentProgram.cpp b/rpcs3/Emu/RSX/GL/GLFragmentProgram.cpp index 23c4f9095499..933459af3dbc 100644 --- a/rpcs3/Emu/RSX/GL/GLFragmentProgram.cpp +++ b/rpcs3/Emu/RSX/GL/GLFragmentProgram.cpp @@ -187,7 +187,7 @@ void GLFragmentDecompilerThread::insertGlobalFunctions(std::stringstream &OS) m_shader_props.require_depth_conversion = properties.redirected_sampler_mask != 0; m_shader_props.require_wpos = !!(properties.in_register_mask & in_wpos); m_shader_props.require_texture_ops = properties.has_tex_op; - m_shader_props.require_shadow_ops = properties.shadow_sampler_mask != 0; + m_shader_props.require_tex_shadow_ops = properties.shadow_sampler_mask != 0; m_shader_props.require_texture_expand = properties.has_exp_tex_op; m_shader_props.require_srgb_to_linear = properties.has_upg; m_shader_props.require_linear_to_srgb = properties.has_pkg; @@ -198,6 +198,9 @@ void GLFragmentDecompilerThread::insertGlobalFunctions(std::stringstream &OS) m_shader_props.disable_early_discard = !::gl::get_driver_caps().vendor_NVIDIA; m_shader_props.supports_native_fp16 = device_props.has_native_half_support; m_shader_props.ROP_output_rounding = ::gl::get_driver_caps().vendor_NVIDIA; + m_shader_props.require_tex1D_ops = properties.has_tex1D; + m_shader_props.require_tex2D_ops = properties.has_tex2D; + m_shader_props.require_tex3D_ops = properties.has_tex3D; glsl::insert_glsl_legacy_function(OS, m_shader_props); } diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp index ca1a1f4dba8a..d6c50ddecade 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp +++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp @@ -867,10 +867,10 @@ void 
GLGSRender::load_program_env() if (update_fragment_texture_env) { // Fragment texture parameters - auto mapping = m_texture_parameters_buffer->alloc_from_heap(512, m_uniform_buffer_offset_align); + auto mapping = m_texture_parameters_buffer->alloc_from_heap(768, m_uniform_buffer_offset_align); current_fragment_program.texture_params.write_to(mapping.first, current_fp_metadata.referenced_textures_mask); - m_texture_parameters_buffer->bind_range(GL_FRAGMENT_TEXTURE_PARAMS_BIND_SLOT, mapping.second, 512); + m_texture_parameters_buffer->bind_range(GL_FRAGMENT_TEXTURE_PARAMS_BIND_SLOT, mapping.second, 768); } if (update_raster_env) diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp index f644bd95ecf3..78cb3479ec79 100644 --- a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp +++ b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp @@ -253,15 +253,19 @@ std::string FragmentProgramDecompiler::AddTex() switch (m_prog.get_texture_dimension(dst.tex_num)) { case rsx::texture_dimension_extended::texture_dimension_1d: + properties.has_tex1D = true; sampler = "sampler1D"; break; case rsx::texture_dimension_extended::texture_dimension_cubemap: + properties.has_tex3D = true; sampler = "samplerCube"; break; case rsx::texture_dimension_extended::texture_dimension_2d: + properties.has_tex2D = true; sampler = "sampler2D"; break; case rsx::texture_dimension_extended::texture_dimension_3d: + properties.has_tex3D = true; sampler = "sampler3D"; break; } diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h index c275eb2623dc..e866a240c7e1 100644 --- a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h +++ b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h @@ -288,6 +288,10 @@ class FragmentProgramDecompiler bool has_pkg = false; bool has_upg = false; bool has_dynamic_register_load = false; + + bool has_tex1D = false; + bool has_tex2D = false; + bool 
has_tex3D = false; } properties; diff --git a/rpcs3/Emu/RSX/Program/GLSLCommon.cpp b/rpcs3/Emu/RSX/Program/GLSLCommon.cpp index 369933cf2a3b..f62ac0db0945 100644 --- a/rpcs3/Emu/RSX/Program/GLSLCommon.cpp +++ b/rpcs3/Emu/RSX/Program/GLSLCommon.cpp @@ -119,10 +119,10 @@ namespace glsl { "VTX_FMT_SNORM16", RSX_VERTEX_BASE_TYPE_SNORM16 }, { "VTX_FMT_FLOAT32", RSX_VERTEX_BASE_TYPE_FLOAT }, { "VTX_FMT_FLOAT16", RSX_VERTEX_BASE_TYPE_HALF_FLOAT }, - { "VTX_FMT_UNORM8", RSX_VERTEX_BASE_TYPE_UNORM8 }, - { "VTX_FMT_SINT16", RSX_VERTEX_BASE_TYPE_SINT16 }, - { "VTX_FMT_COMP32", RSX_VERTEX_BASE_TYPE_CMP32 }, - { "VTX_FMT_UINT8", RSX_VERTEX_BASE_TYPE_UINT8 } + { "VTX_FMT_UNORM8 ", RSX_VERTEX_BASE_TYPE_UNORM8 }, + { "VTX_FMT_SINT16 ", RSX_VERTEX_BASE_TYPE_SINT16 }, + { "VTX_FMT_COMP32 ", RSX_VERTEX_BASE_TYPE_CMP32 }, + { "VTX_FMT_UINT8 ", RSX_VERTEX_BASE_TYPE_UINT8 } }); // For intel GPUs which cannot access vectors in indexed mode (driver bug? or glsl version too low?) @@ -258,17 +258,17 @@ namespace glsl OS << "// ROP control\n"; program_common::define_glsl_constants(OS, { - { "ALPHA_TEST_ENABLE_BIT ", rsx::ROP_control_bits::ALPHA_TEST_ENABLE_BIT }, - { "SRGB_FRAMEBUFFER_BIT ", rsx::ROP_control_bits::SRGB_FRAMEBUFFER_BIT }, - { "ALPHA_TO_COVERAGE_ENABLE_BIT ", rsx::ROP_control_bits::ALPHA_TO_COVERAGE_ENABLE_BIT }, - { "MSAA_WRITE_ENABLE_BIT ", rsx::ROP_control_bits::MSAA_WRITE_ENABLE_BIT }, - { "INT_FRAMEBUFFER_BIT ", rsx::ROP_control_bits::INT_FRAMEBUFFER_BIT }, - { "POLYGON_STIPPLE_ENABLE_BIT ", rsx::ROP_control_bits::POLYGON_STIPPLE_ENABLE_BIT }, - { "ALPHA_TEST_FUNC_OFFSET ", rsx::ROP_control_bits::ALPHA_FUNC_OFFSET }, - { "ALPHA_TEST_FUNC_LENGTH ", rsx::ROP_control_bits::ALPHA_FUNC_NUM_BITS }, - { "MSAA_SAMPLE_CTRL_OFFSET ", rsx::ROP_control_bits::MSAA_SAMPLE_CTRL_OFFSET }, - { "MSAA_SAMPLE_CTRL_LENGTH ", rsx::ROP_control_bits::MSAA_SAMPLE_CTRL_NUM_BITS }, - { "ROP_CMD_MASK ", rsx::ROP_control_bits::ROP_CMD_MASK } + { "ALPHA_TEST_ENABLE_BIT ", 
rsx::ROP_control_bits::ALPHA_TEST_ENABLE_BIT }, + { "SRGB_FRAMEBUFFER_BIT ", rsx::ROP_control_bits::SRGB_FRAMEBUFFER_BIT }, + { "ALPHA_TO_COVERAGE_ENABLE_BIT", rsx::ROP_control_bits::ALPHA_TO_COVERAGE_ENABLE_BIT }, + { "MSAA_WRITE_ENABLE_BIT ", rsx::ROP_control_bits::MSAA_WRITE_ENABLE_BIT }, + { "INT_FRAMEBUFFER_BIT ", rsx::ROP_control_bits::INT_FRAMEBUFFER_BIT }, + { "POLYGON_STIPPLE_ENABLE_BIT ", rsx::ROP_control_bits::POLYGON_STIPPLE_ENABLE_BIT }, + { "ALPHA_TEST_FUNC_OFFSET ", rsx::ROP_control_bits::ALPHA_FUNC_OFFSET }, + { "ALPHA_TEST_FUNC_LENGTH ", rsx::ROP_control_bits::ALPHA_FUNC_NUM_BITS }, + { "MSAA_SAMPLE_CTRL_OFFSET ", rsx::ROP_control_bits::MSAA_SAMPLE_CTRL_OFFSET }, + { "MSAA_SAMPLE_CTRL_LENGTH ", rsx::ROP_control_bits::MSAA_SAMPLE_CTRL_NUM_BITS }, + { "ROP_CMD_MASK ", rsx::ROP_control_bits::ROP_CMD_MASK } }); if (props.fp32_outputs || !props.supports_native_fp16) @@ -343,12 +343,12 @@ namespace glsl { program_common::define_glsl_constants(OS, { - { "FOG_LINEAR", rsx::fog_mode::linear }, - { "FOG_EXP", rsx::fog_mode::exponential }, - { "FOG_EXP2", rsx::fog_mode::exponential2 }, + { "FOG_LINEAR ", rsx::fog_mode::linear }, + { "FOG_EXP ", rsx::fog_mode::exponential }, + { "FOG_EXP2 ", rsx::fog_mode::exponential2 }, { "FOG_LINEAR_ABS", rsx::fog_mode::linear_abs }, - { "FOG_EXP_ABS", rsx::fog_mode::exponential_abs }, - { "FOG_EXP2_ABS", rsx::fog_mode::exponential2_abs }, + { "FOG_EXP_ABS ", rsx::fog_mode::exponential_abs }, + { "FOG_EXP2_ABS ", rsx::fog_mode::exponential2_abs }, }); enabled_options.push_back("_ENABLE_FOG_READ"); @@ -365,25 +365,26 @@ namespace glsl if (props.require_texture_ops) { // Declare special texture control flags - OS << "#define GAMMA_R_MASK (1 << " << rsx::texture_control_bits::GAMMA_R << ")\n"; - OS << "#define GAMMA_G_MASK (1 << " << rsx::texture_control_bits::GAMMA_G << ")\n"; - OS << "#define GAMMA_B_MASK (1 << " << rsx::texture_control_bits::GAMMA_B << ")\n"; - OS << "#define GAMMA_A_MASK (1 << " << 
rsx::texture_control_bits::GAMMA_A << ")\n"; - OS << "#define EXPAND_R_MASK (1 << " << rsx::texture_control_bits::EXPAND_R << ")\n"; - OS << "#define EXPAND_G_MASK (1 << " << rsx::texture_control_bits::EXPAND_G << ")\n"; - OS << "#define EXPAND_B_MASK (1 << " << rsx::texture_control_bits::EXPAND_B << ")\n"; - OS << "#define EXPAND_A_MASK (1 << " << rsx::texture_control_bits::EXPAND_A << ")\n\n"; - - OS << "#define ALPHAKILL " << rsx::texture_control_bits::ALPHAKILL << "\n"; - OS << "#define RENORMALIZE " << rsx::texture_control_bits::RENORMALIZE << "\n"; - OS << "#define DEPTH_FLOAT " << rsx::texture_control_bits::DEPTH_FLOAT << "\n"; - OS << "#define DEPTH_COMPARE " << rsx::texture_control_bits::DEPTH_COMPARE_OP << "\n"; - OS << "#define FILTERED_MAG_BIT " << rsx::texture_control_bits::FILTERED_MAG << "\n"; - OS << "#define FILTERED_MIN_BIT " << rsx::texture_control_bits::FILTERED_MIN << "\n"; - OS << "#define INT_COORDS_BIT " << rsx::texture_control_bits::UNNORMALIZED_COORDS << "\n"; - OS << "#define GAMMA_CTRL_MASK (GAMMA_R_MASK|GAMMA_G_MASK|GAMMA_B_MASK|GAMMA_A_MASK)\n"; - OS << "#define SIGN_EXPAND_MASK (EXPAND_R_MASK|EXPAND_G_MASK|EXPAND_B_MASK|EXPAND_A_MASK)\n"; - OS << "#define FILTERED_MASK (FILTERED_MAG_BIT|FILTERED_MIN_BIT)\n\n"; + program_common::define_glsl_constants(OS, + { + { "GAMMA_R_BIT " , rsx::texture_control_bits::GAMMA_R }, + { "GAMMA_G_BIT " , rsx::texture_control_bits::GAMMA_G }, + { "GAMMA_B_BIT " , rsx::texture_control_bits::GAMMA_B }, + { "GAMMA_A_BIT " , rsx::texture_control_bits::GAMMA_A }, + { "EXPAND_R_BIT" , rsx::texture_control_bits::EXPAND_R }, + { "EXPAND_G_BIT" , rsx::texture_control_bits::EXPAND_G }, + { "EXPAND_B_BIT" , rsx::texture_control_bits::EXPAND_B }, + { "EXPAND_A_BIT" , rsx::texture_control_bits::EXPAND_A }, + + { "ALPHAKILL ", rsx::texture_control_bits::ALPHAKILL }, + { "RENORMALIZE ", rsx::texture_control_bits::RENORMALIZE }, + { "DEPTH_FLOAT ", rsx::texture_control_bits::DEPTH_FLOAT }, + { "DEPTH_COMPARE", 
rsx::texture_control_bits::DEPTH_COMPARE_OP }, + { "FILTERED_MAG_BIT", rsx::texture_control_bits::FILTERED_MAG }, + { "FILTERED_MIN_BIT", rsx::texture_control_bits::FILTERED_MIN }, + { "INT_COORDS_BIT ", rsx::texture_control_bits::UNNORMALIZED_COORDS }, + { "CLAMP_COORDS_BIT", rsx::texture_control_bits::CLAMP_TEXCOORDS_BIT } + }); if (props.require_texture_expand) { @@ -395,6 +396,26 @@ namespace glsl enabled_options.push_back("_EMULATED_TEXSHADOW"); } + if (props.require_tex_shadow_ops) + { + enabled_options.push_back("_ENABLE_SHADOW"); + } + + if (props.require_tex1D_ops) + { + enabled_options.push_back("_ENABLE_TEX1D"); + } + + if (props.require_tex2D_ops) + { + enabled_options.push_back("_ENABLE_TEX2D"); + } + + if (props.require_tex3D_ops) + { + enabled_options.push_back("_ENABLE_TEX3D"); + } + program_common::define_glsl_switches(OS, enabled_options); enabled_options.clear(); diff --git a/rpcs3/Emu/RSX/Program/GLSLCommon.h b/rpcs3/Emu/RSX/Program/GLSLCommon.h index dba5c1c63034..0ad55477aedd 100644 --- a/rpcs3/Emu/RSX/Program/GLSLCommon.h +++ b/rpcs3/Emu/RSX/Program/GLSLCommon.h @@ -29,6 +29,7 @@ namespace rsx FILTERED_MAG, FILTERED_MIN, UNNORMALIZED_COORDS, + CLAMP_TEXCOORDS_BIT, GAMMA_CTRL_MASK = (1 << GAMMA_R) | (1 << GAMMA_G) | (1 << GAMMA_B) | (1 << GAMMA_A), EXPAND_MASK = (1 << EXPAND_R) | (1 << EXPAND_G) | (1 << EXPAND_B) | (1 << EXPAND_A), diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXDefines2.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXDefines2.glsl index eaf21d252c22..2b07c25f6f9a 100644 --- a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXDefines2.glsl +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXDefines2.glsl @@ -1,10 +1,14 @@ R"( // Small structures that should be defined before any backend logic +// Avoid arrays and sub-vec4 members because of std140 padding constraints struct sampler_info { - vec4 scale_bias; - uint remap; - uint flags; + float scale_x, scale_y, scale_z; // 12 + float bias_x, bias_y, bias_z; // 24 + 
float clamp_min_x, clamp_min_y; // 32 + float clamp_max_x, clamp_max_y; // 40 + uint remap; // 44 + uint flags; // 48 }; )" diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureDepthConversion.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureDepthConversion.glsl index 68cea535ce05..01eb79803f15 100644 --- a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureDepthConversion.glsl +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureDepthConversion.glsl @@ -1,8 +1,8 @@ R"( #define ZS_READ(index, coord) vec2(texture(TEX_NAME(index), coord).r, float(texture(TEX_NAME_STENCIL(index), coord).x)) -#define TEX1D_Z24X8_RGBA8(index, coord1) process_texel(convert_z24x8_to_rgba8(ZS_READ(index, COORD_SCALE1(index, coord1)), texture_parameters[index].remap, TEX_FLAGS(index)), TEX_FLAGS(index)) -#define TEX2D_Z24X8_RGBA8(index, coord2) process_texel(convert_z24x8_to_rgba8(ZS_READ(index, COORD_SCALE2(index, coord2)), texture_parameters[index].remap, TEX_FLAGS(index)), TEX_FLAGS(index)) -#define TEX3D_Z24X8_RGBA8(index, coord3) process_texel(convert_z24x8_to_rgba8(ZS_READ(index, COORD_SCALE3(index, coord3)), texture_parameters[index].remap, TEX_FLAGS(index)), TEX_FLAGS(index)) +#define TEX1D_Z24X8_RGBA8(index, coord1) _process_texel(convert_z24x8_to_rgba8(ZS_READ(index, COORD_SCALE1(index, coord1)), texture_parameters[index].remap, TEX_FLAGS(index)), TEX_FLAGS(index)) +#define TEX2D_Z24X8_RGBA8(index, coord2) _process_texel(convert_z24x8_to_rgba8(ZS_READ(index, COORD_SCALE2(index, coord2)), texture_parameters[index].remap, TEX_FLAGS(index)), TEX_FLAGS(index)) +#define TEX3D_Z24X8_RGBA8(index, coord3) _process_texel(convert_z24x8_to_rgba8(ZS_READ(index, COORD_SCALE3(index, coord3)), texture_parameters[index].remap, TEX_FLAGS(index)), TEX_FLAGS(index)) // NOTE: Memory layout is fetched as byteswapped BGRA [GBAR] (GOW collection, DS2, DeS) // The A component (Z) is useless (should contain stencil8 or just 1) diff --git 
a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureMSAAOps.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureMSAAOps.glsl index 5a3a38539e27..2ceef0d6a2b6 100644 --- a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureMSAAOps.glsl +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureMSAAOps.glsl @@ -1,10 +1,10 @@ R"( #define ZCOMPARE_FUNC(index) _get_bits(TEX_FLAGS(index), DEPTH_COMPARE, 3) #define ZS_READ_MS(index, coord) vec2(sampleTexture2DMS(TEX_NAME(index), coord, index).r, float(sampleTexture2DMS(TEX_NAME_STENCIL(index), coord, index).x)) -#define TEX2D_MS(index, coord2) process_texel(sampleTexture2DMS(TEX_NAME(index), coord2, index), TEX_FLAGS(index)) +#define TEX2D_MS(index, coord2) _process_texel(sampleTexture2DMS(TEX_NAME(index), coord2, index), TEX_FLAGS(index)) #define TEX2D_SHADOW_MS(index, coord3) vec4(comparison_passes(sampleTexture2DMS(TEX_NAME(index), coord3.xy, index).x, coord3.z, ZCOMPARE_FUNC(index))) #define TEX2D_SHADOWPROJ_MS(index, coord4) TEX2D_SHADOW_MS(index, (coord4.xyz / coord4.w)) -#define TEX2D_Z24X8_RGBA8_MS(index, coord2) process_texel(convert_z24x8_to_rgba8(ZS_READ_MS(index, coord2), texture_parameters[index].remap, TEX_FLAGS(index)), TEX_FLAGS(index)) +#define TEX2D_Z24X8_RGBA8_MS(index, coord2) _process_texel(convert_z24x8_to_rgba8(ZS_READ_MS(index, coord2), texture_parameters[index].remap, TEX_FLAGS(index)), TEX_FLAGS(index)) vec3 compute2x2DownsampleWeights(const in float coord, const in float uv_step, const in float actual_step) { diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureOps.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureOps.glsl index df897eeef42c..d8a127ad1e27 100644 --- a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureOps.glsl +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureOps.glsl @@ -1,4 +1,17 @@ R"( +#define GAMMA_R_MASK (1 << GAMMA_R_BIT) +#define GAMMA_G_MASK (1 << GAMMA_G_BIT) 
+#define GAMMA_B_MASK (1 << GAMMA_B_BIT) +#define GAMMA_A_MASK (1 << GAMMA_A_BIT) +#define EXPAND_R_MASK (1 << EXPAND_R_BIT) +#define EXPAND_G_MASK (1 << EXPAND_G_BIT) +#define EXPAND_B_MASK (1 << EXPAND_B_BIT) +#define EXPAND_A_MASK (1 << EXPAND_A_BIT) + +#define GAMMA_CTRL_MASK (GAMMA_R_MASK | GAMMA_G_MASK | GAMMA_B_MASK | GAMMA_A_MASK) +#define SIGN_EXPAND_MASK (EXPAND_R_MASK | EXPAND_G_MASK | EXPAND_B_MASK | EXPAND_A_MASK) +#define FILTERED_MASK (FILTERED_MAG_BIT | FILTERED_MIN_BIT) + #ifdef _ENABLE_TEXTURE_EXPAND uint _texture_flag_override = 0; #define _enable_texture_expand() _texture_flag_override = SIGN_EXPAND_MASK @@ -11,22 +24,27 @@ R"( #define TEX_NAME(index) tex##index #define TEX_NAME_STENCIL(index) tex##index##_stencil -#define COORD_SCALE1(index, coord1) ((coord1 + texture_parameters[index].scale_bias.w) * texture_parameters[index].scale_bias.x) -#define COORD_SCALE2(index, coord2) ((coord2 + texture_parameters[index].scale_bias.w) * texture_parameters[index].scale_bias.xy) -#define COORD_SCALE3(index, coord3) ((coord3 + texture_parameters[index].scale_bias.w) * texture_parameters[index].scale_bias.xyz) +#define COORD_SCALE1(index, coord1) _texcoord_xform(coord1, texture_parameters[index]) +#define COORD_SCALE2(index, coord2) _texcoord_xform(coord2, texture_parameters[index]) +#define COORD_SCALE3(index, coord3) _texcoord_xform(coord3, texture_parameters[index]) -#define TEX1D(index, coord1) process_texel(texture(TEX_NAME(index), COORD_SCALE1(index, coord1)), TEX_FLAGS(index)) -#define TEX1D_BIAS(index, coord1, bias) process_texel(texture(TEX_NAME(index), COORD_SCALE1(index, coord1), bias), TEX_FLAGS(index)) -#define TEX1D_LOD(index, coord1, lod) process_texel(textureLod(TEX_NAME(index), COORD_SCALE1(index, coord1), lod), TEX_FLAGS(index)) -#define TEX1D_GRAD(index, coord1, dpdx, dpdy) process_texel(textureGrad(TEX_NAME(index), COORD_SCALE1(index, coord1), dpdx, dpdy), TEX_FLAGS(index)) -#define TEX1D_PROJ(index, coord4) 
process_texel(textureProj(TEX_NAME(index), vec2(COORD_SCALE1(index, coord4.x), coord4.w)), TEX_FLAGS(index)) +#ifdef _ENABLE_TEX1D +#define TEX1D(index, coord1) _process_texel(texture(TEX_NAME(index), COORD_SCALE1(index, coord1)), TEX_FLAGS(index)) +#define TEX1D_BIAS(index, coord1, bias) _process_texel(texture(TEX_NAME(index), COORD_SCALE1(index, coord1), bias), TEX_FLAGS(index)) +#define TEX1D_LOD(index, coord1, lod) _process_texel(textureLod(TEX_NAME(index), COORD_SCALE1(index, coord1), lod), TEX_FLAGS(index)) +#define TEX1D_GRAD(index, coord1, dpdx, dpdy) _process_texel(textureGrad(TEX_NAME(index), COORD_SCALE1(index, coord1), dpdx, dpdy), TEX_FLAGS(index)) +#define TEX1D_PROJ(index, coord4) _process_texel(textureProj(TEX_NAME(index), vec2(COORD_SCALE1(index, coord4.x), coord4.w)), TEX_FLAGS(index)) +#endif -#define TEX2D(index, coord2) process_texel(texture(TEX_NAME(index), COORD_SCALE2(index, coord2)), TEX_FLAGS(index)) -#define TEX2D_BIAS(index, coord2, bias) process_texel(texture(TEX_NAME(index), COORD_SCALE2(index, coord2), bias), TEX_FLAGS(index)) -#define TEX2D_LOD(index, coord2, lod) process_texel(textureLod(TEX_NAME(index), COORD_SCALE2(index, coord2), lod), TEX_FLAGS(index)) -#define TEX2D_GRAD(index, coord2, dpdx, dpdy) process_texel(textureGrad(TEX_NAME(index), COORD_SCALE2(index, coord2), dpdx, dpdy), TEX_FLAGS(index)) -#define TEX2D_PROJ(index, coord4) process_texel(textureProj(TEX_NAME(index), vec4(COORD_SCALE2(index, coord4.xy), coord4.z, coord4.w)), TEX_FLAGS(index)) +#ifdef _ENABLE_TEX2D +#define TEX2D(index, coord2) _process_texel(texture(TEX_NAME(index), COORD_SCALE2(index, coord2)), TEX_FLAGS(index)) +#define TEX2D_BIAS(index, coord2, bias) _process_texel(texture(TEX_NAME(index), COORD_SCALE2(index, coord2), bias), TEX_FLAGS(index)) +#define TEX2D_LOD(index, coord2, lod) _process_texel(textureLod(TEX_NAME(index), COORD_SCALE2(index, coord2), lod), TEX_FLAGS(index)) +#define TEX2D_GRAD(index, coord2, dpdx, dpdy) 
_process_texel(textureGrad(TEX_NAME(index), COORD_SCALE2(index, coord2), dpdx, dpdy), TEX_FLAGS(index)) +#define TEX2D_PROJ(index, coord4) _process_texel(textureProj(TEX_NAME(index), vec4(COORD_SCALE2(index, coord4.xy), coord4.z, coord4.w)), TEX_FLAGS(index)) +#endif +#ifdef _ENABLE_SHADOW #ifdef _EMULATED_TEXSHADOW #define SHADOW_COORD(index, coord3) vec3(COORD_SCALE2(index, coord3.xy), _test_bit(TEX_FLAGS(index), DEPTH_FLOAT)? coord3.z : min(float(coord3.z), 1.0)) #define SHADOW_COORD4(index, coord4) vec4(SHADOW_COORD(index, coord4.xyz), coord4.w) @@ -40,20 +58,72 @@ R"( #define TEX3D_SHADOW(index, coord4) texture(TEX_NAME(index), vec4(COORD_SCALE3(index, coord4.xyz), coord4.w)) #define TEX2D_SHADOWPROJ(index, coord4) textureProj(TEX_NAME(index), vec4(COORD_SCALE2(index, coord4.xy), coord4.zw)) #endif +#endif + +#ifdef _ENABLE_TEX3D +#define TEX3D(index, coord3) _process_texel(texture(TEX_NAME(index), COORD_SCALE3(index, coord3)), TEX_FLAGS(index)) +#define TEX3D_BIAS(index, coord3, bias) _process_texel(texture(TEX_NAME(index), COORD_SCALE3(index, coord3), bias), TEX_FLAGS(index)) +#define TEX3D_LOD(index, coord3, lod) _process_texel(textureLod(TEX_NAME(index), COORD_SCALE3(index, coord3), lod), TEX_FLAGS(index)) +#define TEX3D_GRAD(index, coord3, dpdx, dpdy) _process_texel(textureGrad(TEX_NAME(index), COORD_SCALE3(index, coord3), dpdx, dpdy), TEX_FLAGS(index)) +#define TEX3D_PROJ(index, coord4) _process_texel(texture(TEX_NAME(index), COORD_SCALE3(index, coord4.xyz) / coord4.w), TEX_FLAGS(index)) +#endif + +#ifdef _ENABLE_TEX1D +float _texcoord_xform(const in float coord, const in sampler_info params) +{ + float result = fma(coord, params.scale_x, params.bias_x); + if (_test_bit(params.flags, CLAMP_COORDS_BIT)) + { + result = clamp(result, params.clamp_min_x, params.clamp_max_x); + } + + return result; +} +#endif + +#ifdef _ENABLE_TEX2D +vec2 _texcoord_xform(const in vec2 coord, const in sampler_info params) +{ + float result = fma( + coord, + 
vec2(params.scale_x, params.scale_y), + vec2(params.bias_x, params.bias_y) + ); + + if (_test_bit(params.flags, CLAMP_COORDS_BIT)) + { + result = clamp( + result, + vec2(params.clamp_min_x, params.clamp_min_y), + vec2(params.clamp_max_x, params.clamp_max_y) + ); + } + + return result; +} +#endif + +#ifdef _ENABLE_TEX3D +vec3 _texcoord_xform(const in vec3 coord, const in sampler_info params) +{ + float result = fma( + coord, + vec3(params.scale_x, params.scale_y, params.scale_z), + vec3(params.bias_x, params.bias_y, params.bias_z) + ); -#define TEX3D(index, coord3) process_texel(texture(TEX_NAME(index), COORD_SCALE3(index, coord3)), TEX_FLAGS(index)) -#define TEX3D_BIAS(index, coord3, bias) process_texel(texture(TEX_NAME(index), COORD_SCALE3(index, coord3), bias), TEX_FLAGS(index)) -#define TEX3D_LOD(index, coord3, lod) process_texel(textureLod(TEX_NAME(index), COORD_SCALE3(index, coord3), lod), TEX_FLAGS(index)) -#define TEX3D_GRAD(index, coord3, dpdx, dpdy) process_texel(textureGrad(TEX_NAME(index), COORD_SCALE3(index, coord3), dpdx, dpdy), TEX_FLAGS(index)) -#define TEX3D_PROJ(index, coord4) process_texel(texture(TEX_NAME(index), COORD_SCALE3(index, coord4.xyz) / coord4.w), TEX_FLAGS(index)) + // NOTE: Coordinate clamping not supported for CUBE and 3D textures + return result; +} +#endif -vec4 process_texel(in vec4 rgba, const in uint control_bits) +vec4 _process_texel(in vec4 rgba, const in uint control_bits) { if (control_bits == 0) { return rgba; } - + if (_test_bit(control_bits, ALPHAKILL)) { // Alphakill @@ -63,18 +133,18 @@ vec4 process_texel(in vec4 rgba, const in uint control_bits) return rgba; } } - + if (_test_bit(control_bits, RENORMALIZE)) { // Renormalize to 8-bit (PS3) accuracy rgba = floor(rgba * 255.); rgba /= 255.; } - + uvec4 mask; vec4 convert; uint op_mask = control_bits & uint(SIGN_EXPAND_MASK); - + if (op_mask != 0) { // Expand to signed normalized @@ -82,7 +152,7 @@ vec4 process_texel(in vec4 rgba, const in uint control_bits) convert = 
(rgba * 2.f - 1.f); rgba = _select(rgba, convert, notEqual(mask, uvec4(0))); } - + op_mask = control_bits & uint(GAMMA_CTRL_MASK); if (op_mask != 0u) { @@ -91,7 +161,7 @@ vec4 process_texel(in vec4 rgba, const in uint control_bits) convert = srgb_to_linear(rgba); return _select(rgba, convert, notEqual(mask, uvec4(0))); } - + return rgba; } diff --git a/rpcs3/Emu/RSX/Program/GLSLTypes.h b/rpcs3/Emu/RSX/Program/GLSLTypes.h index 4e31a369efef..0de51066fc85 100644 --- a/rpcs3/Emu/RSX/Program/GLSLTypes.h +++ b/rpcs3/Emu/RSX/Program/GLSLTypes.h @@ -24,11 +24,6 @@ namespace glsl // Only relevant for fragment programs bool fp32_outputs : 1; bool require_wpos : 1; - bool require_depth_conversion : 1; - bool require_texture_ops : 1; - bool require_shadow_ops : 1; - bool require_msaa_ops : 1; - bool require_texture_expand : 1; bool require_srgb_to_linear : 1; bool require_linear_to_srgb : 1; bool require_explicit_invariance: 1; @@ -41,5 +36,15 @@ namespace glsl bool disable_early_discard : 1; bool supports_native_fp16 : 1; bool ROP_output_rounding : 1; + + // Texturing spec + bool require_texture_ops : 1; // Global switch to enable/disable all texture code + bool require_depth_conversion : 1; // Include DSV<->RTV bitcast emulation + bool require_tex_shadow_ops : 1; // Include shadow compare emulation + bool require_msaa_ops : 1; // Include MSAA<->Resolved bitcast emulation + bool require_texture_expand : 1; // Include sign-expansion emulation + bool require_tex1D_ops : 1; // Include 1D texture stuff + bool require_tex2D_ops : 1; // Include 2D texture stuff + bool require_tex3D_ops : 1; // Include 3D texture stuff (including cubemap) }; }; diff --git a/rpcs3/Emu/RSX/Program/program_util.h b/rpcs3/Emu/RSX/Program/program_util.h index d6fd99110125..79742901803b 100644 --- a/rpcs3/Emu/RSX/Program/program_util.h +++ b/rpcs3/Emu/RSX/Program/program_util.h @@ -17,10 +17,11 @@ namespace rsx struct TIU_slot { float scale[3]; - float subpixel_bias; + float bias[3]; + float 
clamp_min[2]; + float clamp_max[2]; u32 remap; u32 control; - u32 padding[2]; } slots_[16]; // QT headers will collide with any variable named 'slots' because reasons diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index 259a39067881..283cee7b847a 100644 --- a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -2296,10 +2296,7 @@ namespace rsx if (tex.enabled() && sampler_descriptors[i]->format_class != RSX_FORMAT_CLASS_UNDEFINED) { - current_fragment_program.texture_params[i].scale[0] = sampler_descriptors[i]->scale_x; - current_fragment_program.texture_params[i].scale[1] = sampler_descriptors[i]->scale_y; - current_fragment_program.texture_params[i].scale[2] = sampler_descriptors[i]->scale_z; - current_fragment_program.texture_params[i].subpixel_bias = 0.f; + std::memcpy(current_fragment_program.texture_params[i].scale, sampler_descriptors[i]->texcoord_xform.scale, 10 * sizeof(float)); current_fragment_program.texture_params[i].remap = tex.remap(); m_graphics_state |= rsx::pipeline_state::fragment_texture_state_dirty; @@ -2307,6 +2304,11 @@ namespace rsx u32 texture_control = 0; current_fp_texture_state.set_dimension(sampler_descriptors[i]->image_type, i); + if (sampler_descriptors[i]->texcoord_xform.clamp) + { + texture_control |= (1 << rsx::texture_control_bits::CLAMP_TEXCOORDS_BIT); + } + if (tex.alpha_kill_enabled()) { //alphakill can be ignored unless a valid comparison function is set @@ -2324,7 +2326,11 @@ namespace rsx { // Subpixel offset so that (X + bias) * scale will round correctly. // This is done to work around fdiv precision issues in some GPUs (NVIDIA) - current_fragment_program.texture_params[i].subpixel_bias = 0.01f; + // We apply the simplification where (x + bias) * z = xz + zbias here. 
+ const auto subpixel_bias = 0.01f; + current_fragment_program.texture_params[i].bias[0] += (subpixel_bias * current_fragment_program.texture_params[i].scale[0]); + current_fragment_program.texture_params[i].bias[1] += (subpixel_bias * current_fragment_program.texture_params[i].scale[1]); + current_fragment_program.texture_params[i].bias[2] += (subpixel_bias * current_fragment_program.texture_params[i].scale[2]); } } diff --git a/rpcs3/Emu/RSX/VK/VKFragmentProgram.cpp b/rpcs3/Emu/RSX/VK/VKFragmentProgram.cpp index dd7a83d8cd9b..e2e9b1039cc1 100644 --- a/rpcs3/Emu/RSX/VK/VKFragmentProgram.cpp +++ b/rpcs3/Emu/RSX/VK/VKFragmentProgram.cpp @@ -238,7 +238,7 @@ void VKFragmentDecompilerThread::insertGlobalFunctions(std::stringstream &OS) m_shader_props.require_depth_conversion = properties.redirected_sampler_mask != 0; m_shader_props.require_wpos = !!(properties.in_register_mask & in_wpos); m_shader_props.require_texture_ops = properties.has_tex_op; - m_shader_props.require_shadow_ops = properties.shadow_sampler_mask != 0; + m_shader_props.require_tex_shadow_ops = properties.shadow_sampler_mask != 0; m_shader_props.require_msaa_ops = m_prog.texture_state.multisampled_textures != 0; m_shader_props.require_texture_expand = properties.has_exp_tex_op; m_shader_props.require_srgb_to_linear = properties.has_upg; @@ -250,6 +250,9 @@ void VKFragmentDecompilerThread::insertGlobalFunctions(std::stringstream &OS) m_shader_props.disable_early_discard = vk::get_driver_vendor() != vk::driver_vendor::NVIDIA; m_shader_props.supports_native_fp16 = device_props.has_native_half_support; m_shader_props.ROP_output_rounding = vk::get_driver_vendor() == vk::driver_vendor::NVIDIA; + m_shader_props.require_tex1D_ops = properties.has_tex1D; + m_shader_props.require_tex2D_ops = properties.has_tex2D; + m_shader_props.require_tex3D_ops = properties.has_tex3D; glsl::insert_glsl_legacy_function(OS, m_shader_props); } diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp index 
f5cf2de26af7..56a38e6345b0 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp +++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp @@ -2176,12 +2176,12 @@ void VKGSRender::load_program_env() { check_heap_status(VK_HEAP_CHECK_TEXTURE_ENV_STORAGE); - auto mem = m_fragment_texture_params_ring_info.alloc<256>(512); - auto buf = m_fragment_texture_params_ring_info.map(mem, 512); + auto mem = m_fragment_texture_params_ring_info.alloc<256>(768); + auto buf = m_fragment_texture_params_ring_info.map(mem, 768); current_fragment_program.texture_params.write_to(buf, current_fp_metadata.referenced_textures_mask); m_fragment_texture_params_ring_info.unmap(); - m_fragment_texture_params_buffer_info = { m_fragment_texture_params_ring_info.heap->value, mem, 512 }; + m_fragment_texture_params_buffer_info = { m_fragment_texture_params_ring_info.heap->value, mem, 768 }; } if (update_raster_env) From 8ba6bf0b24c5b71e251d1569d8c93962a4e7fc5b Mon Sep 17 00:00:00 2001 From: kd-11 Date: Tue, 20 Jun 2023 15:01:58 +0300 Subject: [PATCH 06/25] rsx: Fix shader compilation when texture ops are referenced --- .../GLSLSnippets/RSXProg/RSXFragmentTextureOps.glsl | 4 ++-- rpcs3/Emu/RSX/Program/GLSLTypes.h | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureOps.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureOps.glsl index d8a127ad1e27..3cf30f86aa76 100644 --- a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureOps.glsl +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentTextureOps.glsl @@ -84,7 +84,7 @@ float _texcoord_xform(const in float coord, const in sampler_info params) #ifdef _ENABLE_TEX2D vec2 _texcoord_xform(const in vec2 coord, const in sampler_info params) { - float result = fma( + vec2 result = fma( coord, vec2(params.scale_x, params.scale_y), vec2(params.bias_x, params.bias_y) @@ -106,7 +106,7 @@ vec2 _texcoord_xform(const in vec2 coord, const in sampler_info params) #ifdef 
_ENABLE_TEX3D vec3 _texcoord_xform(const in vec3 coord, const in sampler_info params) { - float result = fma( + vec3 result = fma( coord, vec3(params.scale_x, params.scale_y, params.scale_z), vec3(params.bias_x, params.bias_y, params.bias_z) diff --git a/rpcs3/Emu/RSX/Program/GLSLTypes.h b/rpcs3/Emu/RSX/Program/GLSLTypes.h index 0de51066fc85..41b192fa68f8 100644 --- a/rpcs3/Emu/RSX/Program/GLSLTypes.h +++ b/rpcs3/Emu/RSX/Program/GLSLTypes.h @@ -18,20 +18,21 @@ namespace glsl struct shader_properties { glsl::program_domain domain : 3; + // Applicable in vertex stage bool require_lit_emulation : 1; + bool require_explicit_invariance : 1; + bool emulate_zclip_transform : 1; + bool emulate_depth_clip_only : 1; // Only relevant for fragment programs bool fp32_outputs : 1; bool require_wpos : 1; bool require_srgb_to_linear : 1; bool require_linear_to_srgb : 1; - bool require_explicit_invariance: 1; bool require_fog_read : 1; bool emulate_coverage_tests : 1; bool emulate_shadow_compare : 1; - bool emulate_zclip_transform : 1; - bool emulate_depth_clip_only : 1; bool low_precision_tests : 1; bool disable_early_discard : 1; bool supports_native_fp16 : 1; From cc30127cc57a78d92011f62b0f89a724da118e61 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Tue, 20 Jun 2023 15:15:38 +0300 Subject: [PATCH 07/25] rsx: Minor optimization to speed up texture state sync a bit given the larger structure in use now --- rpcs3/Emu/RSX/RSXThread.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index 283cee7b847a..4f367fd42211 100644 --- a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -2296,7 +2296,7 @@ namespace rsx if (tex.enabled() && sampler_descriptors[i]->format_class != RSX_FORMAT_CLASS_UNDEFINED) { - std::memcpy(current_fragment_program.texture_params[i].scale, sampler_descriptors[i]->texcoord_xform.scale, 10 * sizeof(float)); + std::memcpy(current_fragment_program.texture_params[i].scale, 
sampler_descriptors[i]->texcoord_xform.scale, 6 * sizeof(float)); current_fragment_program.texture_params[i].remap = tex.remap(); m_graphics_state |= rsx::pipeline_state::fragment_texture_state_dirty; @@ -2306,6 +2306,7 @@ namespace rsx if (sampler_descriptors[i]->texcoord_xform.clamp) { + std::memcpy(current_fragment_program.texture_params[i].clamp_min, sampler_descriptors[i]->texcoord_xform.clamp_min, 4 * sizeof(float)); texture_control |= (1 << rsx::texture_control_bits::CLAMP_TEXCOORDS_BIT); } From 7acec35c8566ccbbefa8ff9b92c6bd245ac3ee45 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Tue, 20 Jun 2023 19:09:15 +0300 Subject: [PATCH 08/25] rsx: Implement GPU copy ellision through coordinate transform - TODO: Some corner cases still exist where format may not be a match after a cache merge. --- rpcs3/Emu/RSX/Common/texture_cache.h | 41 +++++++++-- rpcs3/Emu/RSX/Common/texture_cache_helpers.h | 71 ++++++++++++++++++-- rpcs3/Emu/RSX/GL/GLTextureCache.h | 3 +- rpcs3/Emu/RSX/VK/VKTextureCache.h | 3 +- 4 files changed, 104 insertions(+), 14 deletions(-) diff --git a/rpcs3/Emu/RSX/Common/texture_cache.h b/rpcs3/Emu/RSX/Common/texture_cache.h index e822d4e5dc4f..f41e48ee3b54 100644 --- a/rpcs3/Emu/RSX/Common/texture_cache.h +++ b/rpcs3/Emu/RSX/Common/texture_cache.h @@ -26,6 +26,7 @@ namespace rsx using image_view_type = typename traits::image_view_type; using image_storage_type = typename traits::image_storage_type; using texture_format = typename traits::texture_format; + using viewable_image_type = typename traits::viewable_image_type; using predictor_type = texture_cache_predictor; using ranged_storage = rsx::ranged_storage; @@ -161,6 +162,11 @@ namespace rsx { static_cast(*this) = attr; } + + viewable_image_type as_viewable() const + { + return static_cast<viewable_image_type>(external_handle); + } }; struct sampled_image_descriptor : public sampled_image_descriptor_base @@ -1904,6 +1910,17 @@ namespace rsx auto new_attr = attr; new_attr.gcm_format = gcm_format; + if 
(last->get_gcm_format() == attr.gcm_format && attr.edge_clamped) + { + // Clipped view + auto viewed_image = last->get_raw_texture(); + sampled_image_descriptor result = { viewed_image->get_view(encoded_remap, remap), last->get_context(), + viewed_image->format_class(), scale, extended_dimension, false, viewed_image->samples() }; + + helpers::calculate_sample_clip_parameters(result, position2i(0, 0), size2i(attr.width, attr.height), size2i(normalized_width, last->get_height())); + return result; + } + return { last->get_raw_texture(), deferred_request_command::copy_image_static, new_attr, {}, last->get_context(), classify_format(gcm_format), scale, extended_dimension, remap }; } @@ -1912,15 +1929,27 @@ namespace rsx auto result = helpers::merge_cache_resources( cmd, overlapping_fbos, overlapping_locals, attr, scale, extended_dimension, encoded_remap, remap, _pool); + const bool is_simple_subresource_copy = + (result.external_subresource_desc.op == deferred_request_command::copy_image_static) || + (result.external_subresource_desc.op == deferred_request_command::copy_image_dynamic); + + if (is_simple_subresource_copy && attr.edge_clamped) + { + helpers::convert_image_copy_to_clip_descriptor( + result, + position2i(result.external_subresource_desc.x, result.external_subresource_desc.y), + size2i(result.external_subresource_desc.width, result.external_subresource_desc.width), + size2i(result.external_subresource_desc.external_handle->width(), result.external_subresource_desc.external_handle->height()), + encoded_remap, remap, false /*FIXME*/); + + return result; + } + if (options.skip_texture_merge) { - switch (result.external_subresource_desc.op) + if (is_simple_subresource_copy) { - case deferred_request_command::copy_image_static: - case deferred_request_command::copy_image_dynamic: return result; - default: - break; } return {}; @@ -2146,12 +2175,14 @@ namespace rsx attributes.depth = 1; attributes.height = 1; attributes.slice_h = 1; + attributes.edge_clamped = 
(tex.wrap_s() == rsx::texture_wrap_mode::clamp_to_edge); scale.height = scale.depth = 0.f; subsurface_count = 1; required_surface_height = 1; break; case rsx::texture_dimension_extended::texture_dimension_2d: attributes.depth = 1; + attributes.edge_clamped = (tex.wrap_s() == rsx::texture_wrap_mode::clamp_to_edge && tex.wrap_t() == rsx::texture_wrap_mode::clamp_to_edge); scale.depth = 0.f; subsurface_count = options.is_compressed_format? 1 : tex.get_exact_mipmap_count(); attributes.slice_h = required_surface_height = attributes.height; diff --git a/rpcs3/Emu/RSX/Common/texture_cache_helpers.h b/rpcs3/Emu/RSX/Common/texture_cache_helpers.h index 0686a84e539d..79038b9bd88d 100644 --- a/rpcs3/Emu/RSX/Common/texture_cache_helpers.h +++ b/rpcs3/Emu/RSX/Common/texture_cache_helpers.h @@ -58,6 +58,7 @@ namespace rsx u16 slice_h; u8 bpp; bool swizzled; + bool edge_clamped; }; struct blit_op_result @@ -501,6 +502,47 @@ namespace rsx return false; } + template <typename sampled_image_descriptor> + void calculate_sample_clip_parameters( + sampled_image_descriptor& desc, + const position2i& offset, + const size2i& desired_dimensions, + const size2i& actual_dimensions) + { + const f32 scale_x = f32(desired_dimensions.width) / actual_dimensions.width; + const f32 scale_y = f32(desired_dimensions.height) / actual_dimensions.height; + const f32 offset_x = f32(offset.x) / actual_dimensions.width; + const f32 offset_y = f32(offset.y) / actual_dimensions.height; + + desc.texcoord_xform.scale[0] *= scale_x; + desc.texcoord_xform.scale[1] *= scale_y; + desc.texcoord_xform.bias[0] += offset_x; + desc.texcoord_xform.bias[1] += offset_y; + desc.texcoord_xform.clamp_min[0] = offset_x; + desc.texcoord_xform.clamp_min[1] = offset_y; + desc.texcoord_xform.clamp_max[0] = offset_x + scale_x; + desc.texcoord_xform.clamp_max[1] = offset_y + scale_y; + desc.texcoord_xform.clamp = true; + } + + template <typename sampled_image_descriptor> + void convert_image_copy_to_clip_descriptor( + sampled_image_descriptor& desc, + const position2i& offset, + const size2i& 
desired_dimensions, + const size2i& actual_dimensions, + u32 encoded_remap, + const texture_channel_remap_t& decoded_remap, + bool cyclic_reference) + { + desc.image_handle = desc.external_subresource_desc.as_viewable()->get_view(encoded_remap, decoded_remap); + desc.is_cyclic_reference = cyclic_reference; + desc.samples = desc.external_subresource_desc.external_handle->samples(); + desc.external_subresource_desc = {}; + + calculate_sample_clip_parameters(desc, offset, desired_dimensions, actual_dimensions); + } + template sampled_image_descriptor process_framebuffer_resource_fast(commandbuffer_type& cmd, render_target_type texptr, @@ -557,22 +599,30 @@ namespace rsx ensure(attr.height == 1); } - bool requires_processing = false; + // A GPU operation must be performed on the data before sampling. Implies transfer_read access. + bool requires_processing = force_convert; + // A GPU clip operation may be performed by combining texture coordinate scaling with a clamp. + bool requires_clip = false; + rsx::surface_access access_type = rsx::surface_access::shader_read; - if (attr.width != surface_width || attr.height != surface_height || force_convert) + if (attr.width != surface_width || attr.height != surface_height) { - // A GPU operation must be performed on the data before sampling. Implies transfer_read access - requires_processing = true; + // If we can get away with clip only, do it + if (attr.edge_clamped) + requires_clip = true; + else + requires_processing = true; } - else if (surface_is_rop_target && g_cfg.video.strict_rendering_mode) + + if (surface_is_rop_target && g_cfg.video.strict_rendering_mode) { // Framebuffer feedback avoidance. 
For MSAA, we do not need to make copies; just use the resolve target if (texptr->samples() == 1) { requires_processing = true; } - else + else if (!requires_processing) { // Select resolve target instead of MSAA image access_type = rsx::surface_access::transfer_read; @@ -592,8 +642,15 @@ namespace rsx texptr->memory_barrier(cmd, access_type); auto viewed_surface = texptr->get_surface(access_type); - return { viewed_surface->get_view(encoded_remap, decoded_remap), texture_upload_context::framebuffer_storage, + sampled_image_descriptor result = { viewed_surface->get_view(encoded_remap, decoded_remap), texture_upload_context::framebuffer_storage, texptr->format_class(), scale, rsx::texture_dimension_extended::texture_dimension_2d, surface_is_rop_target, viewed_surface->samples() }; + + if (requires_clip) + { + calculate_sample_clip_parameters(result, position2i(0, 0), size2i(attr.width, attr.height), size2i(surface_width, surface_height)); + } + + return result; } texptr->memory_barrier(cmd, rsx::surface_access::transfer_read); diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.h b/rpcs3/Emu/RSX/GL/GLTextureCache.h index 62ecfc2e1e42..9d60684d2a0e 100644 --- a/rpcs3/Emu/RSX/GL/GLTextureCache.h +++ b/rpcs3/Emu/RSX/GL/GLTextureCache.h @@ -26,6 +26,7 @@ namespace gl using image_view_type = gl::texture_view*; using image_storage_type = gl::texture; using texture_format = gl::texture::format; + using viewable_image_type = gl::viewable_image*; }; class cached_texture_section : public rsx::cached_texture_section @@ -388,7 +389,7 @@ namespace gl return vram_texture->get_view(remap_encoding, remap); } - gl::texture* get_raw_texture() const + gl::viewable_image* get_raw_texture() const { return managed_texture.get(); } diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.h b/rpcs3/Emu/RSX/VK/VKTextureCache.h index 5d16b47c4543..e2aace583251 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.h +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.h @@ -28,6 +28,7 @@ namespace vk using image_view_type = 
vk::image_view*; using image_storage_type = vk::image; using texture_format = VkFormat; + using viewable_image_type = vk::viewable_image*; }; class cached_texture_section : public rsx::cached_texture_section @@ -153,7 +154,7 @@ namespace vk return vram_texture->get_view(0xAAE4, rsx::default_remap_vector); } - vk::image* get_raw_texture() + vk::viewable_image* get_raw_texture() { return managed_texture.get(); } From a3858db76f3fc6b96a0b18b6911c188c42491d3f Mon Sep 17 00:00:00 2001 From: kd-11 Date: Tue, 20 Jun 2023 19:21:24 +0300 Subject: [PATCH 09/25] rsx: Implement atlas trivialization pass - Decompose sequences of copies into the minimally required set. This does 2 things: a. Reduces GPU workload by doing less transfers b. Allows the clipping optimization to kick in and skip a ton of GPU work --- rpcs3/Emu/RSX/Common/texture_cache.h | 51 +++++++++++++++++++++------- 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/rpcs3/Emu/RSX/Common/texture_cache.h b/rpcs3/Emu/RSX/Common/texture_cache.h index f41e48ee3b54..a10293590832 100644 --- a/rpcs3/Emu/RSX/Common/texture_cache.h +++ b/rpcs3/Emu/RSX/Common/texture_cache.h @@ -218,24 +218,51 @@ namespace rsx texcoord_xform.clamp = false; } + bool section_fills_target(const copy_region_descriptor& cpy) const + { + return cpy.dst_x == 0 && cpy.dst_y == 0 && + cpy.dst_w == external_subresource_desc.width && cpy.dst_h == external_subresource_desc.height && + cpy.src_w == cpy.dst_w && cpy.src_h == cpy.dst_h; + } + void simplify() { - // Optimizations in the straightforward methods copy_image_static and copy_image_dynamic make them preferred over the atlas method - if (external_subresource_desc.op == deferred_request_command::atlas_gather && - external_subresource_desc.sections_to_copy.size() == 1) + if (external_subresource_desc.op != deferred_request_command::atlas_gather) + { + // Only atlas simplification supported for now + return; + } + + auto& sections = external_subresource_desc.sections_to_copy; + if 
(sections.size() > 1) { // Check if the subresource fills the target, if so, change the command to copy_image_static - const auto &cpy = external_subresource_desc.sections_to_copy.front(); - if (cpy.dst_x == 0 && cpy.dst_y == 0 && - cpy.dst_w == external_subresource_desc.width && cpy.dst_h == external_subresource_desc.height && - cpy.src_w == cpy.dst_w && cpy.src_h == cpy.dst_h) + // GPU image copies are expensive, cull unnecessary transfers if possible + for (auto idx = sections.size() - 1; idx >= 1; idx--) { - external_subresource_desc.external_handle = cpy.src; - external_subresource_desc.x = cpy.src_x; - external_subresource_desc.y = cpy.src_y; - external_subresource_desc.op = deferred_request_command::copy_image_static; + if (section_fills_target(sections[idx])) + { + const auto remaining = sections.size() - idx; + std::memcpy( + sections.data(), + &sections[idx], + remaining * sizeof(sections[0]) + ); + sections.resize(remaining); + break; + } } } + + // Optimizations in the straightforward methods copy_image_static and copy_image_dynamic make them preferred over the atlas method + if (sections.size() == 1 && section_fills_target(sections[0])) + { + // Change the command to copy_image_static + const auto cpy = sections[0]; + external_subresource_desc.external_handle = cpy.src; + external_subresource_desc.x = cpy.src_x; + external_subresource_desc.y = cpy.src_y; + external_subresource_desc.op = deferred_request_command::copy_image_static; + } } // Returns true if at least threshold% is covered in pixels From eaaee13201e36ef710a26200affadabe8c7ee9e0 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Tue, 20 Jun 2023 20:59:32 +0300 Subject: [PATCH 10/25] vk: Do not clear memory if we're going to overwrite it with a copy anyway --- rpcs3/Emu/RSX/VK/VKTextureCache.cpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp index 55635389da05..5b8c30c7bcff 100644 ---
a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp @@ -730,15 +730,18 @@ namespace vk VkImageSubresourceRange dst_range = { dst_aspect, 0, 1, 0, 1 }; vk::change_image_layout(cmd, image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, dst_range); - if (!(dst_aspect & VK_IMAGE_ASPECT_DEPTH_BIT)) + if (sections_to_copy[0].dst_w != width || sections_to_copy[0].dst_h != height) { - VkClearColorValue clear = {}; - vkCmdClearColorImage(cmd, image->value, image->current_layout, &clear, 1, &dst_range); - } - else - { - VkClearDepthStencilValue clear = { 1.f, 0 }; - vkCmdClearDepthStencilImage(cmd, image->value, image->current_layout, &clear, 1, &dst_range); + if (!(dst_aspect & VK_IMAGE_ASPECT_DEPTH_BIT)) + { + VkClearColorValue clear = {}; + vkCmdClearColorImage(cmd, image->value, image->current_layout, &clear, 1, &dst_range); + } + else + { + VkClearDepthStencilValue clear = { 1.f, 0 }; + vkCmdClearDepthStencilImage(cmd, image->value, image->current_layout, &clear, 1, &dst_range); + } } copy_transfer_regions_impl(cmd, image, sections_to_copy); From 6e61e6811d5b57a2772741693e96392b5f070a32 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Tue, 20 Jun 2023 21:00:07 +0300 Subject: [PATCH 11/25] rsx: Trivially detect single-image-blit transfer operations --- rpcs3/Emu/RSX/Common/texture_cache.h | 30 ++++++++++++++------ rpcs3/Emu/RSX/Common/texture_cache_helpers.h | 3 +- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/rpcs3/Emu/RSX/Common/texture_cache.h b/rpcs3/Emu/RSX/Common/texture_cache.h index a10293590832..5d3c374b5ddf 100644 --- a/rpcs3/Emu/RSX/Common/texture_cache.h +++ b/rpcs3/Emu/RSX/Common/texture_cache.h @@ -218,11 +218,15 @@ namespace rsx texcoord_xform.clamp = false; } - bool section_fills_target(const copy_region_descriptor& cpy) const + inline bool section_fills_target(const copy_region_descriptor& cpy) const { return cpy.dst_x == 0 && cpy.dst_y == 0 && - cpy.dst_w == external_subresource_desc.width && cpy.dst_h == 
external_subresource_desc.height && - cpy.src_w == cpy.dst_w && cpy.src_h == cpy.dst_h; + cpy.dst_w == external_subresource_desc.width && cpy.dst_h == external_subresource_desc.height; + } + + inline bool section_is_transfer_only(const copy_region_descriptor& cpy) const + { + return cpy.src_w == cpy.dst_w && cpy.src_h == cpy.dst_h; } void simplify() @@ -256,12 +260,21 @@ namespace rsx // Optimizations in the straightforward methods copy_image_static and copy_image_dynamic make them preferred over the atlas method if (sections.size() == 1 && section_fills_target(sections[0])) { - // Change the command to copy_image_static const auto cpy = sections[0]; - external_subresource_desc.external_handle = cpy.src; - external_subresource_desc.x = cpy.src_x; - external_subresource_desc.y = cpy.src_y; - external_subresource_desc.op = deferred_request_command::copy_image_static; + if (section_is_transfer_only(cpy)) + { + // Change the command to copy_image_static + external_subresource_desc.external_handle = cpy.src; + external_subresource_desc.x = cpy.src_x; + external_subresource_desc.y = cpy.src_y; + external_subresource_desc.op = deferred_request_command::copy_image_static; + } + else + { + // Blit op is a semantic variant of the copy and atlas ops. + // We can simply reuse the atlas handler for this for now, but this allows simplification. 
+ external_subresource_desc.op = deferred_request_command::blit_image_static; + } } } @@ -1693,6 +1706,7 @@ namespace rsx break; } case deferred_request_command::atlas_gather: + case deferred_request_command::blit_image_static: { result = generate_atlas_from_images(cmd, desc.gcm_format, desc.width, desc.height, desc.sections_to_copy, desc.remap); break; diff --git a/rpcs3/Emu/RSX/Common/texture_cache_helpers.h b/rpcs3/Emu/RSX/Common/texture_cache_helpers.h index 79038b9bd88d..1f7a86824068 100644 --- a/rpcs3/Emu/RSX/Common/texture_cache_helpers.h +++ b/rpcs3/Emu/RSX/Common/texture_cache_helpers.h @@ -43,7 +43,8 @@ namespace rsx atlas_gather, // Provided list of sections generates a texture atlas _3d_gather, // Provided list of sections generates a 3D array _3d_unwrap, // One large texture provided to be partitioned into a 3D array - mipmap_gather // Provided list of sections to be reassembled as mipmap levels of the same texture + mipmap_gather, // Provided list of sections to be reassembled as mipmap levels of the same texture + blit_image_static, // Variant of the copy command that does scaling instead of copying }; struct image_section_attributes_t From e50ff86223c4a5917c17a06936f5d26311569a95 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Tue, 20 Jun 2023 22:00:54 +0300 Subject: [PATCH 12/25] rsx: Lower single-image-blit to a scaled coordinate fetch --- rpcs3/Emu/RSX/Common/texture_cache.h | 27 +++++++++++++----- rpcs3/Emu/RSX/Common/texture_cache_helpers.h | 29 ++++++++++++++++++++ 2 files changed, 49 insertions(+), 7 deletions(-) diff --git a/rpcs3/Emu/RSX/Common/texture_cache.h b/rpcs3/Emu/RSX/Common/texture_cache.h index 5d3c374b5ddf..d85f9c5c7d31 100644 --- a/rpcs3/Emu/RSX/Common/texture_cache.h +++ b/rpcs3/Emu/RSX/Common/texture_cache.h @@ -1972,16 +1972,29 @@ namespace rsx const bool is_simple_subresource_copy = (result.external_subresource_desc.op == deferred_request_command::copy_image_static) || - (result.external_subresource_desc.op == 
deferred_request_command::copy_image_dynamic); + (result.external_subresource_desc.op == deferred_request_command::copy_image_dynamic) || + (result.external_subresource_desc.op == deferred_request_command::blit_image_static); + // FIXME: We need to check if the formats are compatible here! if (is_simple_subresource_copy && attr.edge_clamped) { - helpers::convert_image_copy_to_clip_descriptor( - result, - position2i(result.external_subresource_desc.x, result.external_subresource_desc.y), - size2i(result.external_subresource_desc.width, result.external_subresource_desc.width), - size2i(result.external_subresource_desc.external_handle->width(), result.external_subresource_desc.external_handle->height()), - encoded_remap, remap, false /*FIXME*/); + if (result.external_subresource_desc.op != deferred_request_command::blit_image_static) [[ likely ]] + { + helpers::convert_image_copy_to_clip_descriptor( + result, + position2i(result.external_subresource_desc.x, result.external_subresource_desc.y), + size2i(result.external_subresource_desc.width, result.external_subresource_desc.height), + size2i(result.external_subresource_desc.external_handle->width(), result.external_subresource_desc.external_handle->height()), + encoded_remap, remap, false /* FIXME */); + } + else + { + helpers::convert_image_blit_to_clip_descriptor( + result, + encoded_remap, + remap, + false /* FIXME */); + } return result; } diff --git a/rpcs3/Emu/RSX/Common/texture_cache_helpers.h b/rpcs3/Emu/RSX/Common/texture_cache_helpers.h index 1f7a86824068..58cda5f0a848 100644 --- a/rpcs3/Emu/RSX/Common/texture_cache_helpers.h +++ b/rpcs3/Emu/RSX/Common/texture_cache_helpers.h @@ -544,6 +544,35 @@ namespace rsx calculate_sample_clip_parameters(desc, offset, desired_dimensions, actual_dimensions); } + template + void convert_image_blit_to_clip_descriptor( + sampled_image_descriptor& desc, + u32 encoded_remap, + const texture_channel_remap_t& decoded_remap, + bool cyclic_reference) + { + const auto& section = 
desc.external_subresource_desc.sections_to_copy[0]; + + // Our "desired" output is the source window, and the "actual" output is the real size + const auto aa_scale_x = section.src->samples() % 2; + const auto aa_scale_y = section.src->samples() / 2; + const auto surface_width = section.src->width() * aa_scale_x; + const auto surface_height = section.src->height() * aa_scale_y; + + // First, we convert this descriptor to a copy descriptor + desc.external_subresource_desc.external_handle = section.src; + + // Now apply conversion + convert_image_copy_to_clip_descriptor( + desc, + position2i(section.src_x, section.src_y), + size2i(section.src_w, section.src_h), + size2i(surface_width, surface_height), + encoded_remap, + decoded_remap, + cyclic_reference); + } + template sampled_image_descriptor process_framebuffer_resource_fast(commandbuffer_type& cmd, render_target_type texptr, From 9f9bb41767f3cbca47033c1fd4f6d092db9b9b7d Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sat, 1 Jul 2023 01:45:46 +0300 Subject: [PATCH 13/25] rsx: Formatting and tidying changes --- .../GLSLSnippets/RSXProg/RSXFragmentPrologue.glsl | 6 +++--- .../Program/GLSLSnippets/RSXProg/RSXVertexFetch.glsl | 11 ++++++----- rpcs3/Emu/RSX/RSXThread.cpp | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentPrologue.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentPrologue.glsl index 01e83f3c2d3f..cd42b11be2ee 100644 --- a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentPrologue.glsl +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXFragmentPrologue.glsl @@ -50,7 +50,7 @@ vec4 fetch_fog_value(const in uint mode) // exponential2_abs result.y = exp(-pow(4.709 * (fog_param1 * abs(fog_c.x) + fog_param0 - 1.5), 2.)); break; - case FOG_LINEAR_ABS: + case FOG_LINEAR_ABS: // linear_abs result.y = fog_param1 * abs(fog_c.x) + (fog_param0 - 1.); break; @@ -65,8 +65,8 @@ vec4 fetch_fog_value(const in uint mode) // Purely 
stochastic bool coverage_test_passes(const in vec4 _sample) { - float random = _rand(gl_FragCoord); - return (_sample.a > random); + float random_val = _rand(gl_FragCoord); + return (_sample.a > random_val); } #endif diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexFetch.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexFetch.glsl index c0be5e8786fd..66c4a6072b97 100644 --- a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexFetch.glsl +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexFetch.glsl @@ -55,7 +55,7 @@ uint gen_bits(const in uint x, const in uint y, const in uint z, const in uint w uint gen_bits(const in uint x, const in uint y, const in bool swap) { - return (swap)? _set_bits(y, x, 8, 8) : _set_bits(x, y, 8, 8); + return (swap) ? _set_bits(y, x, 8, 8) : _set_bits(x, y, 8, 8); } // NOTE: (int(n) or int(n)) is broken on some NVIDIA and INTEL hardware when the sign bit is involved. @@ -159,8 +159,8 @@ attribute_desc fetch_desc(const in int location) uvec2 attrib = texelFetch(vertex_layout_stream, location + int(layout_ptr_offset)).xy; #else // Data is packed into a ubo - int block = (location >> 1); - int sub_block = (location & 1) << 1; + const int block = (location >> 1); + const int sub_block = (location & 1) << 1; uvec2 attrib = uvec2( ref(input_attributes_blob[block], sub_block + 0), ref(input_attributes_blob[block], sub_block + 1)); @@ -180,8 +180,9 @@ attribute_desc fetch_desc(const in int location) vec4 read_location(const in int location) { + int vertex_id; attribute_desc desc = fetch_desc(location); - int vertex_id = _gl_VertexID - int(vertex_base_index); + if (desc.frequency == 0) { vertex_id = 0; @@ -193,7 +194,7 @@ vec4 read_location(const in int location) } else { - vertex_id /= int(desc.frequency); + vertex_id = (_gl_VertexID - int(vertex_base_index)) / int(desc.frequency); } if (desc.is_volatile) diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index 4f367fd42211..12437359e038 100644 
--- a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -2328,7 +2328,7 @@ namespace rsx // Subpixel offset so that (X + bias) * scale will round correctly. // This is done to work around fdiv precision issues in some GPUs (NVIDIA) // We apply the simplification where (x + bias) * z = xz + zbias here. - const auto subpixel_bias = 0.01f; + constexpr auto subpixel_bias = 0.01f; current_fragment_program.texture_params[i].bias[0] += (subpixel_bias * current_fragment_program.texture_params[i].scale[0]); current_fragment_program.texture_params[i].bias[1] += (subpixel_bias * current_fragment_program.texture_params[i].scale[1]); current_fragment_program.texture_params[i].bias[2] += (subpixel_bias * current_fragment_program.texture_params[i].scale[2]); From 41bc9c7b43c4a00452365c14c53caa7494ef4ce9 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sat, 1 Jul 2023 01:49:32 +0300 Subject: [PATCH 14/25] vk: Fix namespace pollution in sync module --- rpcs3/Emu/RSX/VK/vkutils/sync.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/rpcs3/Emu/RSX/VK/vkutils/sync.cpp b/rpcs3/Emu/RSX/VK/vkutils/sync.cpp index 187185047b1e..238f5f80b21d 100644 --- a/rpcs3/Emu/RSX/VK/vkutils/sync.cpp +++ b/rpcs3/Emu/RSX/VK/vkutils/sync.cpp @@ -2,6 +2,7 @@ #include "buffer_object.h" #include "commands.h" #include "device.h" +#include "garbage_collector.h" #include "sync.h" #include "shared.h" @@ -10,9 +11,6 @@ #include "util/sysinfo.hpp" #include "util/asm.hpp" -// FIXME: namespace pollution -#include "../VKResourceManager.h" - namespace vk { namespace globals @@ -413,7 +411,7 @@ namespace vk if (m_buffer) { m_buffer->unmap(); - vk::get_resource_manager()->dispose(m_buffer); + vk::get_gc()->dispose(m_buffer); } m_buffer = std::make_unique @@ -537,7 +535,7 @@ namespace vk auto result = std::make_unique(globals::get_shared_marker_pool(dev), message); result->signal(cmd, dependency); - vk::get_resource_manager()->dispose(result); + vk::get_gc()->dispose(result); } 
debug_marker_scope::debug_marker_scope(const vk::command_buffer& cmd, const std::string& message) From d8e2e48fc5a25c71913bfa651ad0074182dbe84d Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sat, 1 Jul 2023 02:35:52 +0300 Subject: [PATCH 15/25] rsx: Use designated initializer for the dangerous copy_region_descriptor type - We need to move more dangerous braced initializations to c++20 - Also adds a base address variable which will come in handy --- rpcs3/Emu/RSX/Common/texture_cache.h | 40 ++++--- rpcs3/Emu/RSX/Common/texture_cache_helpers.h | 115 ++++++++++--------- rpcs3/Emu/RSX/GL/GLTextureCache.cpp | 13 ++- rpcs3/Emu/RSX/GL/GLTextureCache.h | 11 +- rpcs3/Emu/RSX/VK/VKTextureCache.cpp | 24 ++-- 5 files changed, 117 insertions(+), 86 deletions(-) diff --git a/rpcs3/Emu/RSX/Common/texture_cache.h b/rpcs3/Emu/RSX/Common/texture_cache.h index d85f9c5c7d31..11c88cba566f 100644 --- a/rpcs3/Emu/RSX/Common/texture_cache.h +++ b/rpcs3/Emu/RSX/Common/texture_cache.h @@ -143,6 +143,7 @@ namespace rsx std::vector sections_to_copy; texture_channel_remap_t remap; deferred_request_command op = deferred_request_command::nop; + u32 external_ref_addr = 0; u16 x = 0; u16 y = 0; @@ -274,6 +275,7 @@ namespace rsx // Blit op is a semantic variant of the copy and atlas ops. // We can simply reuse the atlas handler for this for now, but this allows simplification. 
external_subresource_desc.op = deferred_request_command::blit_image_static; + external_subresource_desc.external_ref_addr = cpy.base_addr; } } } @@ -1666,13 +1668,18 @@ namespace rsx { sections[n] = { - desc.external_handle, - surface_transform::coordinate_transform, - 0, - 0, static_cast(desc.slice_h * n), - 0, 0, n, - desc.width, desc.height, - desc.width, desc.height + .src = desc.external_handle, + .xform = surface_transform::coordinate_transform, + .level = 0, + .src_x = 0, + .src_y = static_cast(desc.slice_h * n), + .dst_x = 0, + .dst_y = 0, + .dst_z = n, + .src_w = desc.width, + .src_h = desc.height, + .dst_w = desc.width, + .dst_h = desc.height }; } @@ -1692,13 +1699,18 @@ namespace rsx { sections[n] = { - desc.external_handle, - surface_transform::coordinate_transform, - 0, - 0, static_cast(desc.slice_h * n), - 0, 0, n, - desc.width, desc.height, - desc.width, desc.height + .src = desc.external_handle, + .xform = surface_transform::coordinate_transform, + .level = 0, + .src_x = 0, + .src_y = static_cast(desc.slice_h * n), + .dst_x = 0, + .dst_y = 0, + .dst_z = n, + .src_w = desc.width, + .src_h = desc.height, + .dst_w = desc.width, + .dst_h = desc.height }; } diff --git a/rpcs3/Emu/RSX/Common/texture_cache_helpers.h b/rpcs3/Emu/RSX/Common/texture_cache_helpers.h index 58cda5f0a848..ae7ba000ecaa 100644 --- a/rpcs3/Emu/RSX/Common/texture_cache_helpers.h +++ b/rpcs3/Emu/RSX/Common/texture_cache_helpers.h @@ -20,6 +20,7 @@ namespace rsx { image_resource_type src; flags32_t xform; + u32 base_addr; u8 level; u16 src_x; u16 src_y; @@ -315,16 +316,19 @@ namespace rsx out.push_back ({ - section.surface->get_surface(rsx::surface_access::transfer_read), - surface_transform::identity, - 0, - static_cast(src_x), - static_cast(src_y), - static_cast(dst_x), - static_cast(dst_y), - slice, - src_width, src_height, - dst_width, dst_height + .src = section.surface->get_surface(rsx::surface_access::transfer_read), + .xform = surface_transform::identity, + .base_addr = 
section.base_addr, + .level = 0, + .src_x = static_cast(src_x), + .src_y = static_cast(src_y), + .dst_x = static_cast(dst_x), + .dst_y = static_cast(dst_y), + .dst_z = slice, + .src_w = src_width, + .src_h = src_height, + .dst_w = dst_width, + .dst_h = dst_height }); }; @@ -378,36 +382,37 @@ namespace rsx out.push_back ({ - section->get_raw_texture(), - surface_transform::identity, - 0, - static_cast(src_offset.x), // src.x - static_cast(src_offset.y), // src.y - _dst_x, // dst.x - _dst_y, // dst.y - slice, - src_w, - height, - _dst_w, - _dst_h + .src = section->get_raw_texture(), + .xform = surface_transform::identity, + .base_addr = address, + .level = 0, + .src_x = static_cast(src_offset.x), // src.x + .src_y = static_cast(src_offset.y), // src.y + .dst_x = _dst_x, // dst.x + .dst_y = _dst_y, // dst.y + .dst_z = slice, + .src_w = src_w, + .src_h = height, + .dst_w = _dst_w, + .dst_h = _dst_h }); } else { out.push_back ({ - section->get_raw_texture(), - surface_transform::identity, - 0, - static_cast(src_offset.x), // src.x - static_cast(src_offset.y), // src.y - static_cast(dst_offset.x), // dst.x - static_cast(dst_y - dst_slice_begin), // dst.y - 0, - src_w, - height, - dst_w, - height + .src = section->get_raw_texture(), + .xform = surface_transform::identity, + .level = 0, + .src_x = static_cast(src_offset.x), // src.x + .src_y = static_cast(src_offset.y), // src.y + .dst_x = static_cast(dst_offset.x), // dst.x + .dst_y = static_cast(dst_y - dst_slice_begin), // dst.y + .dst_z = 0, + .src_w = src_w, + .src_h = height, + .dst_w = dst_w, + .dst_h = height }); } }; @@ -561,6 +566,7 @@ namespace rsx // First, we convert this descriptor to a copy descriptor desc.external_subresource_desc.external_handle = section.src; + desc.external_subresource_desc.external_ref_addr = section.base_addr; // Now apply conversion convert_image_copy_to_clip_descriptor( @@ -828,12 +834,14 @@ namespace rsx { if (level.image_handle) { - copy_region_descriptor_type mip{}; - mip.src = 
level.image_handle->image(); - mip.xform = surface_transform::coordinate_transform; - mip.level = mipmap_level; - mip.dst_w = attr.width; - mip.dst_h = attr.height; + copy_region_descriptor_type mip + { + .src = level.image_handle->image(), + .xform = surface_transform::coordinate_transform, + .level = mipmap_level, + .dst_w = attr.width, + .dst_h = attr.height + }; // "Fast" framebuffer results are a perfect match for attr so we do not store transfer sizes // Calculate transfer dimensions from attr @@ -856,18 +864,21 @@ namespace rsx case deferred_request_command::copy_image_dynamic: case deferred_request_command::copy_image_static: { - copy_region_descriptor_type mip{}; - mip.src = level.external_subresource_desc.external_handle; - mip.xform = surface_transform::coordinate_transform; - mip.level = mipmap_level; - mip.dst_w = attr.width; - mip.dst_h = attr.height; - - // NOTE: gather_texture_slices pre-applies resolution scaling - mip.src_x = level.external_subresource_desc.x; - mip.src_y = level.external_subresource_desc.y; - mip.src_w = level.external_subresource_desc.width; - mip.src_h = level.external_subresource_desc.height; + copy_region_descriptor_type mip + { + .src = level.external_subresource_desc.external_handle, + .xform = surface_transform::coordinate_transform, + .level = mipmap_level, + + // NOTE: gather_texture_slices pre-applies resolution scaling + .src_x = level.external_subresource_desc.x, + .src_y = level.external_subresource_desc.y, + .src_w = level.external_subresource_desc.width, + .src_h = level.external_subresource_desc.height, + + .dst_w = attr.width, + .dst_h = attr.height + }; sections.push_back(mip); break; diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.cpp b/rpcs3/Emu/RSX/GL/GLTextureCache.cpp index edf113f12977..e5a288870872 100644 --- a/rpcs3/Emu/RSX/GL/GLTextureCache.cpp +++ b/rpcs3/Emu/RSX/GL/GLTextureCache.cpp @@ -161,11 +161,14 @@ namespace gl { std::vector region = {{ - src, - rsx::surface_transform::coordinate_transform, - 0, - 
x, y, 0, 0, 0, - width, height, width, height + .src = src, + .xform = rsx::surface_transform::coordinate_transform, + .src_x = x, + .src_y = y, + .src_w = width, + .src_h = height, + .dst_w = width, + .dst_h = height }}; copy_transfer_regions_impl(cmd, dst, region); diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.h b/rpcs3/Emu/RSX/GL/GLTextureCache.h index 9d60684d2a0e..aee6909a1bca 100644 --- a/rpcs3/Emu/RSX/GL/GLTextureCache.h +++ b/rpcs3/Emu/RSX/GL/GLTextureCache.h @@ -591,11 +591,12 @@ namespace gl { std::vector region = {{ - src, - rsx::surface_transform::identity, - 0, - 0, 0, 0, 0, 0, - width, height, width, height + .src = src, + .xform = rsx::surface_transform::identity, + .src_w = width, + .src_h = height, + .dst_w = width, + .dst_h = height }}; copy_transfer_regions_impl(cmd, dst->image(), region); diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp index 5b8c30c7bcff..c732428daf91 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp @@ -612,11 +612,14 @@ namespace vk { std::vector region = { { - source, - rsx::surface_transform::coordinate_transform, - 0, - x, y, 0, 0, 0, - w, h, w, h + .src = source, + .xform = rsx::surface_transform::coordinate_transform, + .src_x = x, + .src_y = y, + .src_w = w, + .src_h = h, + .dst_w = w, + .dst_h = h } }; vk::change_image_layout(cmd, image.get(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); @@ -800,11 +803,12 @@ namespace vk { std::vector region = { { - src, - rsx::surface_transform::identity, - 0, - 0, 0, 0, 0, 0, - width, height, width, height + .src = src, + .xform = rsx::surface_transform::identity, + .src_w = width, + .src_h = height, + .dst_w = width, + .dst_h = height } }; auto dst = dst_view->image(); From 1ed1ff7a4b9eae847722987b2b27cea6dde81974 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sat, 1 Jul 2023 03:22:12 +0300 Subject: [PATCH 16/25] rsx: Properly fill in the cyclic ref field for framebuffer references --- 
rpcs3/Emu/RSX/Common/texture_cache.h | 8 +++++--- rpcs3/Emu/RSX/Common/texture_cache_helpers.h | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/rpcs3/Emu/RSX/Common/texture_cache.h b/rpcs3/Emu/RSX/Common/texture_cache.h index 11c88cba566f..13c8ddb46766 100644 --- a/rpcs3/Emu/RSX/Common/texture_cache.h +++ b/rpcs3/Emu/RSX/Common/texture_cache.h @@ -262,6 +262,8 @@ namespace rsx if (sections.size() == 1 && section_fills_target(sections[0])) { const auto cpy = sections[0]; + external_subresource_desc.external_ref_addr = cpy.base_addr; + if (section_is_transfer_only(cpy)) { // Change the command to copy_image_static @@ -275,7 +277,6 @@ namespace rsx // Blit op is a semantic variant of the copy and atlas ops. // We can simply reuse the atlas handler for this for now, but this allows simplification. external_subresource_desc.op = deferred_request_command::blit_image_static; - external_subresource_desc.external_ref_addr = cpy.base_addr; } } } @@ -1997,7 +1998,7 @@ namespace rsx position2i(result.external_subresource_desc.x, result.external_subresource_desc.y), size2i(result.external_subresource_desc.width, result.external_subresource_desc.height), size2i(result.external_subresource_desc.external_handle->width(), result.external_subresource_desc.external_handle->height()), - encoded_remap, remap, false /* FIXME */); + encoded_remap, remap, false); } else { @@ -2005,9 +2006,10 @@ namespace rsx result, encoded_remap, remap, - false /* FIXME */); + false); } + result.is_cyclic_reference = !!result.ref_address && m_rtts.address_is_bound(result.ref_address); return result; } diff --git a/rpcs3/Emu/RSX/Common/texture_cache_helpers.h b/rpcs3/Emu/RSX/Common/texture_cache_helpers.h index ae7ba000ecaa..37a2371afe93 100644 --- a/rpcs3/Emu/RSX/Common/texture_cache_helpers.h +++ b/rpcs3/Emu/RSX/Common/texture_cache_helpers.h @@ -318,7 +318,7 @@ namespace rsx ({ .src = section.surface->get_surface(rsx::surface_access::transfer_read), .xform = 
surface_transform::identity, - .base_addr = section.base_addr, + .base_addr = section.base_address, .level = 0, .src_x = static_cast(src_x), .src_y = static_cast(src_y), @@ -384,7 +384,6 @@ namespace rsx ({ .src = section->get_raw_texture(), .xform = surface_transform::identity, - .base_addr = address, .level = 0, .src_x = static_cast(src_offset.x), // src.x .src_y = static_cast(src_offset.y), // src.y @@ -542,6 +541,7 @@ namespace rsx bool cyclic_reference) { desc.image_handle = desc.external_subresource_desc.as_viewable()->get_view(encoded_remap, decoded_remap); + desc.ref_address = desc.external_subresource_desc.external_ref_addr; desc.is_cyclic_reference = cyclic_reference; desc.samples = desc.external_subresource_desc.external_handle->samples(); desc.external_subresource_desc = {}; From e668a7af4c38b2f9137bfd076a95f227b836826e Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sun, 2 Jul 2023 04:16:59 +0300 Subject: [PATCH 17/25] rsx: Force compatibility checks when applying fast-path optimizations --- rpcs3/Emu/RSX/Common/texture_cache.h | 21 +++++++++++++++++++-- rpcs3/Emu/RSX/GL/GLTextureCache.h | 4 ++-- rpcs3/Emu/RSX/VK/VKTextureCache.cpp | 2 +- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/rpcs3/Emu/RSX/Common/texture_cache.h b/rpcs3/Emu/RSX/Common/texture_cache.h index 13c8ddb46766..fa330457b69d 100644 --- a/rpcs3/Emu/RSX/Common/texture_cache.h +++ b/rpcs3/Emu/RSX/Common/texture_cache.h @@ -168,6 +168,22 @@ namespace rsx { return static_cast(external_handle); } + + image_resource_type src0() const + { + if (external_handle) + { + return external_handle; + } + + if (!sections_to_copy.empty()) + { + return sections_to_copy[0].src; + } + + // Return typed null + return external_handle; + } }; struct sampled_image_descriptor : public sampled_image_descriptor_base @@ -1988,8 +2004,9 @@ namespace rsx (result.external_subresource_desc.op == deferred_request_command::copy_image_dynamic) || (result.external_subresource_desc.op == 
deferred_request_command::blit_image_static); - // FIXME: We need to check if the formats are compatible here! - if (is_simple_subresource_copy && attr.edge_clamped) + if (is_simple_subresource_copy && + attr.edge_clamped && + render_target_format_is_compatible(result.external_subresource_desc.src0(), attr.gcm_format)) { if (result.external_subresource_desc.op != deferred_request_command::blit_image_static) [[ likely ]] { diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.h b/rpcs3/Emu/RSX/GL/GLTextureCache.h index aee6909a1bca..1503c62a9412 100644 --- a/rpcs3/Emu/RSX/GL/GLTextureCache.h +++ b/rpcs3/Emu/RSX/GL/GLTextureCache.h @@ -765,8 +765,8 @@ namespace gl switch (gcm_format) { default: - //TODO - // warn_once("Format incompatibility detected, reporting failure to force data copy (GL_INTERNAL_FORMAT=0x%X, GCM_FORMAT=0x%X)", static_cast(ifmt), gcm_format); + // TODO + err_once("Format incompatibility detected, reporting failure to force data copy (GL_INTERNAL_FORMAT=0x%X, GCM_FORMAT=0x%X)", static_cast(ifmt), gcm_format); return false; case CELL_GCM_TEXTURE_W16_Z16_Y16_X16_FLOAT: return (ifmt == gl::texture::internal_format::rgba16f); diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp index c732428daf91..b0344fc9e677 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp @@ -1112,7 +1112,7 @@ namespace vk { default: //TODO - // warn_once("Format incompatibility detected, reporting failure to force data copy (VK_FORMAT=0x%X, GCM_FORMAT=0x%X)", static_cast(vk_format), gcm_format); + err_once("Format incompatibility detected, reporting failure to force data copy (VK_FORMAT=0x%X, GCM_FORMAT=0x%X)", static_cast(vk_format), gcm_format); return false; #ifndef __APPLE__ case CELL_GCM_TEXTURE_R5G6B5: From f14beb4bd2d51cd816ad8372f01b1cd38cb1da59 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Mon, 3 Jul 2023 20:29:13 +0300 Subject: [PATCH 18/25] rsx: Insert missing texture barrier --- 
rpcs3/Emu/RSX/Common/texture_cache.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/rpcs3/Emu/RSX/Common/texture_cache.h b/rpcs3/Emu/RSX/Common/texture_cache.h index fa330457b69d..19eb1c21afac 100644 --- a/rpcs3/Emu/RSX/Common/texture_cache.h +++ b/rpcs3/Emu/RSX/Common/texture_cache.h @@ -2026,7 +2026,14 @@ namespace rsx false); } - result.is_cyclic_reference = !!result.ref_address && m_rtts.address_is_bound(result.ref_address); + if (!!result.ref_address && m_rtts.address_is_bound(result.ref_address)) + { + result.is_cyclic_reference = true; + + auto texptr = ensure(m_rtts.get_surface_at(result.ref_address)); + insert_texture_barrier(cmd, texptr); + } + return result; } From 6d1e25d6f5f6a85d4064a8e5c0d7b832694f23dd Mon Sep 17 00:00:00 2001 From: kd-11 Date: Mon, 3 Jul 2023 20:57:57 +0300 Subject: [PATCH 19/25] rsx: Fix sampled descriptor address corruption --- rpcs3/Emu/RSX/Common/texture_cache.h | 11 ++++++++--- rpcs3/Emu/RSX/Common/texture_cache_helpers.h | 4 ++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/rpcs3/Emu/RSX/Common/texture_cache.h b/rpcs3/Emu/RSX/Common/texture_cache.h index 19eb1c21afac..8e8cbe1912a2 100644 --- a/rpcs3/Emu/RSX/Common/texture_cache.h +++ b/rpcs3/Emu/RSX/Common/texture_cache.h @@ -2004,8 +2004,9 @@ namespace rsx (result.external_subresource_desc.op == deferred_request_command::copy_image_dynamic) || (result.external_subresource_desc.op == deferred_request_command::blit_image_static); - if (is_simple_subresource_copy && - attr.edge_clamped && + if (attr.edge_clamped && + !g_cfg.video.strict_rendering_mode && + is_simple_subresource_copy && render_target_format_is_compatible(result.external_subresource_desc.src0(), attr.gcm_format)) { if (result.external_subresource_desc.op != deferred_request_command::blit_image_static) [[ likely ]] @@ -2327,7 +2328,11 @@ namespace rsx result.external_subresource_desc.cache_range = lookup_range; } - result.ref_address = attributes.address; + if 
(!result.ref_address) + { + result.ref_address = attributes.address; + } + result.surface_cache_tag = m_rtts.write_tag; if (subsurface_count == 1) diff --git a/rpcs3/Emu/RSX/Common/texture_cache_helpers.h b/rpcs3/Emu/RSX/Common/texture_cache_helpers.h index 37a2371afe93..52a73a5bdfb9 100644 --- a/rpcs3/Emu/RSX/Common/texture_cache_helpers.h +++ b/rpcs3/Emu/RSX/Common/texture_cache_helpers.h @@ -646,9 +646,13 @@ namespace rsx { // If we can get away with clip only, do it if (attr.edge_clamped) + { requires_clip = true; + } else + { requires_processing = true; + } } if (surface_is_rop_target && g_cfg.video.strict_rendering_mode) From da1ac18f12c5ffa6abadce443e7fbc49c7887571 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Mon, 3 Jul 2023 21:29:01 +0300 Subject: [PATCH 20/25] vk/gl: Fix debug overlay stats --- rpcs3/Emu/RSX/Common/texture_cache.h | 11 +++++++++++ rpcs3/Emu/RSX/GL/GLPresent.cpp | 7 ++++--- rpcs3/Emu/RSX/VK/VKPresent.cpp | 5 +++-- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/rpcs3/Emu/RSX/Common/texture_cache.h b/rpcs3/Emu/RSX/Common/texture_cache.h index 8e8cbe1912a2..d0c6c73ff77d 100644 --- a/rpcs3/Emu/RSX/Common/texture_cache.h +++ b/rpcs3/Emu/RSX/Common/texture_cache.h @@ -452,6 +452,7 @@ namespace rsx atomic_t m_unavoidable_hard_faults_this_frame = { 0 }; atomic_t m_texture_upload_calls_this_frame = { 0 }; atomic_t m_texture_upload_misses_this_frame = { 0 }; + atomic_t m_texture_copies_ellided_this_frame = { 0 }; static const u32 m_predict_max_flushes_per_frame = 50; // Above this number the predictions are disabled // Invalidation @@ -2327,6 +2328,10 @@ namespace rsx // Deferred reconstruct result.external_subresource_desc.cache_range = lookup_range; } + else if (result.texcoord_xform.clamp) + { + m_texture_copies_ellided_this_frame++; + } if (!result.ref_address) { @@ -3421,6 +3426,7 @@ namespace rsx m_unavoidable_hard_faults_this_frame.store(0u); m_texture_upload_calls_this_frame.store(0u); 
m_texture_upload_misses_this_frame.store(0u); + m_texture_copies_ellided_this_frame.store(0u); } void on_flush() @@ -3503,5 +3509,10 @@ namespace rsx { return (m_texture_upload_calls_this_frame)? (m_texture_upload_misses_this_frame * 100 / m_texture_upload_calls_this_frame) : 0; } + + u32 get_texture_copies_ellided_this_frame() const + { + return m_texture_copies_ellided_this_frame; + } }; } diff --git a/rpcs3/Emu/RSX/GL/GLPresent.cpp b/rpcs3/Emu/RSX/GL/GLPresent.cpp index 82cfb6cf4c51..b99a230c2a7f 100644 --- a/rpcs3/Emu/RSX/GL/GLPresent.cpp +++ b/rpcs3/Emu/RSX/GL/GLPresent.cpp @@ -342,7 +342,7 @@ void GLGSRender::flip(const rsx::display_flip_info_t& info) int y_loc = 0; const auto println = [&](const std::string& text) { - m_text_printer.print_text(cmd, 4, 0, width, height, text); + m_text_printer.print_text(cmd, 4, y_loc, width, height, text); y_loc += 16; }; @@ -364,17 +364,18 @@ void GLGSRender::flip(const rsx::display_flip_info_t& info) const auto num_texture_upload = m_gl_texture_cache.get_texture_upload_calls_this_frame(); const auto num_texture_upload_miss = m_gl_texture_cache.get_texture_upload_misses_this_frame(); const auto texture_upload_miss_ratio = m_gl_texture_cache.get_texture_upload_miss_percentage(); + const auto texture_copies_ellided = m_gl_texture_cache.get_texture_copies_ellided_this_frame(); println(fmt::format("Unreleased textures: %7d", num_dirty_textures)); println(fmt::format("Texture memory: %12dM", texture_memory_size)); println(fmt::format("Flush requests: %12d = %2d (%3d%%) hard faults, %2d unavoidable, %2d misprediction(s), %2d speculation(s)", num_flushes, num_misses, cache_miss_ratio, num_unavoidable, num_mispredict, num_speculate)); - println(fmt::format("Texture uploads: %15u (%u from CPU - %02u%%)", num_texture_upload, num_texture_upload_miss, texture_upload_miss_ratio)); + println(fmt::format("Texture uploads: %11u (%u from CPU - %02u%%, %u copies avoided)", num_texture_upload, num_texture_upload_miss, 
texture_upload_miss_ratio, texture_copies_ellided)); const auto vertex_cache_hit_count = (info.stats.vertex_cache_request_count - info.stats.vertex_cache_miss_count); const auto vertex_cache_hit_ratio = info.stats.vertex_cache_request_count ? (vertex_cache_hit_count * 100) / info.stats.vertex_cache_request_count : 0; - println(fmt::format("Vertex cache hits: %12u/%u (%u%%)", vertex_cache_hit_count, info.stats.vertex_cache_request_count, vertex_cache_hit_ratio)); + println(fmt::format("Vertex cache hits: %9u/%u (%u%%)", vertex_cache_hit_count, info.stats.vertex_cache_request_count, vertex_cache_hit_ratio)); } if (gl::debug::g_vis_texture) diff --git a/rpcs3/Emu/RSX/VK/VKPresent.cpp b/rpcs3/Emu/RSX/VK/VKPresent.cpp index 09d422dd2008..9ff2dee4f780 100644 --- a/rpcs3/Emu/RSX/VK/VKPresent.cpp +++ b/rpcs3/Emu/RSX/VK/VKPresent.cpp @@ -801,18 +801,19 @@ void VKGSRender::flip(const rsx::display_flip_info_t& info) const auto num_texture_upload = m_texture_cache.get_texture_upload_calls_this_frame(); const auto num_texture_upload_miss = m_texture_cache.get_texture_upload_misses_this_frame(); const auto texture_upload_miss_ratio = m_texture_cache.get_texture_upload_miss_percentage(); + const auto texture_copies_ellided = m_texture_cache.get_texture_copies_ellided_this_frame(); println(fmt::format("Unreleased textures: %8d", num_dirty_textures)); println(fmt::format("Texture cache memory: %7dM", texture_memory_size)); println(fmt::format("Temporary texture memory: %3dM", tmp_texture_memory_size)); println(fmt::format("Flush requests: %13d = %2d (%3d%%) hard faults, %2d unavoidable, %2d misprediction(s), %2d speculation(s)", num_flushes, num_misses, cache_miss_ratio, num_unavoidable, num_mispredict, num_speculate)); - println(fmt::format("Texture uploads: %14u (%u from CPU - %02u%%)", num_texture_upload, num_texture_upload_miss, texture_upload_miss_ratio)); + println(fmt::format("Texture uploads: %12u (%u from CPU - %02u%%, %u copies avoided)", num_texture_upload, 
num_texture_upload_miss, texture_upload_miss_ratio, texture_copies_ellided)); const auto vertex_cache_hit_count = (info.stats.vertex_cache_request_count - info.stats.vertex_cache_miss_count); const auto vertex_cache_hit_ratio = info.stats.vertex_cache_request_count ? (vertex_cache_hit_count * 100) / info.stats.vertex_cache_request_count : 0; - println(fmt::format("Vertex cache hits: %12u/%u (%u%%)", vertex_cache_hit_count, info.stats.vertex_cache_request_count, vertex_cache_hit_ratio)); + println(fmt::format("Vertex cache hits: %10u/%u (%u%%)", vertex_cache_hit_count, info.stats.vertex_cache_request_count, vertex_cache_hit_ratio)); } direct_fbo->release(); From cb7734b86c7ad7237ce4f6e28cb23b069fd43ccd Mon Sep 17 00:00:00 2001 From: kd-11 Date: Mon, 3 Jul 2023 21:29:23 +0300 Subject: [PATCH 21/25] rsx: Fix vp codegen when unrestricted depth range extension is absent --- .../Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexPrologue.glsl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexPrologue.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexPrologue.glsl index 334280265215..fd8c9f596dd1 100644 --- a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexPrologue.glsl +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexPrologue.glsl @@ -52,7 +52,7 @@ vec4 apply_zclip_xform( } return vec4(pos.x, pos.y, d * pos.w, pos.w); -}\n +} #endif )" From e10d36156031e63d980afe985f833b96a965b994 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Mon, 3 Jul 2023 22:07:31 +0300 Subject: [PATCH 22/25] rsx: Fix shader interpreter compilation --- .../GLSLInterpreter/FragmentInterpreter.glsl | 105 ++++++++++++++---- 1 file changed, 86 insertions(+), 19 deletions(-) diff --git a/rpcs3/Emu/RSX/Program/GLSLInterpreter/FragmentInterpreter.glsl b/rpcs3/Emu/RSX/Program/GLSLInterpreter/FragmentInterpreter.glsl index 8229eb42ee42..ab799b8c80fa 100644 --- a/rpcs3/Emu/RSX/Program/GLSLInterpreter/FragmentInterpreter.glsl +++ 
b/rpcs3/Emu/RSX/Program/GLSLInterpreter/FragmentInterpreter.glsl @@ -298,6 +298,57 @@ bool check_cond() #ifdef WITH_TEXTURES +#define RSX_SAMPLE_TEXTURE_1D 0 +#define RSX_SAMPLE_TEXTURE_2D 1 +#define RSX_SAMPLE_TEXTURE_CUBE 2 +#define RSX_SAMPLE_TEXTURE_3D 3 + +// FIXME: Remove when codegen is unified +#define CLAMP_COORDS_BIT 17 + +float _texcoord_xform(const in float coord, const in sampler_info params) +{ + float result = fma(coord, params.scale_x, params.bias_x); + if (TEST_BIT(params.flags, CLAMP_COORDS_BIT)) + { + result = clamp(result, params.clamp_min_x, params.clamp_max_x); + } + + return result; +} + +vec2 _texcoord_xform(const in vec2 coord, const in sampler_info params) +{ + vec2 result = fma( + coord, + vec2(params.scale_x, params.scale_y), + vec2(params.bias_x, params.bias_y) + ); + + if (TEST_BIT(params.flags, CLAMP_COORDS_BIT)) + { + result = clamp( + result, + vec2(params.clamp_min_x, params.clamp_min_y), + vec2(params.clamp_max_x, params.clamp_max_y) + ); + } + + return result; +} + +vec3 _texcoord_xform(const in vec3 coord, const in sampler_info params) +{ + vec3 result = fma( + coord, + vec3(params.scale_x, params.scale_y, params.scale_z), + vec3(params.bias_x, params.bias_y, params.bias_z) + ); + + // NOTE: Coordinate clamping not supported for CUBE and 3D textures + return result; +} + vec4 _texture(in vec4 coord, float bias) { ur0 = GET_BITS(0, 17, 4); @@ -308,18 +359,25 @@ vec4 _texture(in vec4 coord, float bias) ur1 = ur0 + ur0; const uint type = bitfieldExtract(texture_control, int(ur1), 2); - coord.xyz = (coord.xyz + texture_parameters[ur0].scale_bias.w) * texture_parameters[ur0].scale_bias.xyz; switch (type) { - case 0: - vr0 = texture(SAMPLER1D(ur0), coord.x, bias); break; - case 1: - vr0 = texture(SAMPLER2D(ur0), coord.xy, bias); break; - case 2: - vr0 = texture(SAMPLERCUBE(ur0), coord.xyz, bias); break; - case 3: - vr0 = texture(SAMPLER3D(ur0), coord.xyz, bias); break; + case RSX_SAMPLE_TEXTURE_1D: + coord.x = _texcoord_xform(coord.x, 
texture_parameters[ur0]); + vr0 = texture(SAMPLER1D(ur0), coord.x, bias); + break; + case RSX_SAMPLE_TEXTURE_2D: + coord.xy = _texcoord_xform(coord.xy, texture_parameters[ur0]); + vr0 = texture(SAMPLER2D(ur0), coord.xy, bias); + break; + case RSX_SAMPLE_TEXTURE_CUBE: + coord.xyz = _texcoord_xform(coord.xyz, texture_parameters[ur0]); + vr0 = texture(SAMPLERCUBE(ur0), coord.xyz, bias); + break; + case RSX_SAMPLE_TEXTURE_3D: + coord.xyz = _texcoord_xform(coord.xyz, texture_parameters[ur0]); + vr0 = texture(SAMPLER3D(ur0), coord.xyz, bias); + break; } if (TEST_BIT(0, 21)) @@ -340,23 +398,32 @@ vec4 _textureLod(in vec4 coord, float lod) ur1 = ur0 + ur0; const uint type = bitfieldExtract(texture_control, int(ur1), 2); - coord.xyz = (coord.xyz + texture_parameters[ur0].scale_bias.w) * texture_parameters[ur0].scale_bias.xyz; switch (type) { - case 0: - vr0 = textureLod(SAMPLER1D(ur0), coord.x, lod); break; - case 1: - vr0 = textureLod(SAMPLER2D(ur0), coord.xy, lod); break; - case 2: - vr0 = textureLod(SAMPLERCUBE(ur0), coord.xyz, lod); break; - case 3: - vr0 = textureLod(SAMPLER3D(ur0), coord.xyz, lod); break; + case RSX_SAMPLE_TEXTURE_1D: + coord.x = _texcoord_xform(coord.x, texture_parameters[ur0]); + vr0 = textureLod(SAMPLER1D(ur0), coord.x, lod); + break; + case RSX_SAMPLE_TEXTURE_2D: + coord.xy = _texcoord_xform(coord.xy, texture_parameters[ur0]); + vr0 = textureLod(SAMPLER2D(ur0), coord.xy, lod); + break; + case RSX_SAMPLE_TEXTURE_CUBE: + coord.xyz = _texcoord_xform(coord.xyz, texture_parameters[ur0]); + vr0 = textureLod(SAMPLERCUBE(ur0), coord.xyz, lod); + break; + case RSX_SAMPLE_TEXTURE_3D: + coord.xyz = _texcoord_xform(coord.xyz, texture_parameters[ur0]); + vr0 = textureLod(SAMPLER3D(ur0), coord.xyz, lod); + break; } if (TEST_BIT(0, 21)) { - vr0 = vr0 * 2. 
- 1.; + // Normal-expand, v = 2v - 1 + vr0 += vr0; + vr0 -= 1.; } return vr0; From 024e90ca24a450de2ccc56f18ae7c079dabefaa6 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Mon, 3 Jul 2023 22:11:16 +0300 Subject: [PATCH 23/25] rsx: Fix shader interpreter - It broke ages ago --- rpcs3/Emu/RSX/Common/bitfield.hpp | 5 +++++ rpcs3/Emu/RSX/GL/GLDraw.cpp | 2 +- rpcs3/Emu/RSX/VK/VKDraw.cpp | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/rpcs3/Emu/RSX/Common/bitfield.hpp b/rpcs3/Emu/RSX/Common/bitfield.hpp index d3c1f9d2caf7..35906909d2ff 100644 --- a/rpcs3/Emu/RSX/Common/bitfield.hpp +++ b/rpcs3/Emu/RSX/Common/bitfield.hpp @@ -107,6 +107,11 @@ namespace rsx public: bitmask_t() = default; + bitmask_type load() const + { + return m_data; + } + bool operator & (bitmask_type mask) const { return !!(m_data & mask); diff --git a/rpcs3/Emu/RSX/GL/GLDraw.cpp b/rpcs3/Emu/RSX/GL/GLDraw.cpp index 31eb2c80b0c1..360c252a83d8 100644 --- a/rpcs3/Emu/RSX/GL/GLDraw.cpp +++ b/rpcs3/Emu/RSX/GL/GLDraw.cpp @@ -649,7 +649,7 @@ void GLGSRender::emit_geometry(u32 sub_index) void GLGSRender::begin() { // Save shader state now before prefetch and loading happens - m_interpreter_state = (m_graphics_state & rsx::pipeline_state::invalidate_pipeline_bits); + m_interpreter_state = (m_graphics_state.load() & rsx::pipeline_state::invalidate_pipeline_bits); rsx::thread::begin(); diff --git a/rpcs3/Emu/RSX/VK/VKDraw.cpp b/rpcs3/Emu/RSX/VK/VKDraw.cpp index bca96c887d63..3b9c11fb1991 100644 --- a/rpcs3/Emu/RSX/VK/VKDraw.cpp +++ b/rpcs3/Emu/RSX/VK/VKDraw.cpp @@ -923,7 +923,7 @@ void VKGSRender::emit_geometry(u32 sub_index) void VKGSRender::begin() { // Save shader state now before prefetch and loading happens - m_interpreter_state = (m_graphics_state & rsx::pipeline_state::invalidate_pipeline_bits); + m_interpreter_state = (m_graphics_state.load() & rsx::pipeline_state::invalidate_pipeline_bits); rsx::thread::begin(); From 4529ea581cda450bc21afb9fcb363954287423f0 Mon Sep 17 00:00:00 2001 From: 
kd-11 Date: Tue, 4 Jul 2023 00:50:25 +0300 Subject: [PATCH 24/25] vk: Disable anisotropy if the anisotropic level is meaningless --- rpcs3/Emu/RSX/VK/vkutils/sampler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rpcs3/Emu/RSX/VK/vkutils/sampler.cpp b/rpcs3/Emu/RSX/VK/vkutils/sampler.cpp index 66575a5f2663..92aee578bc1b 100644 --- a/rpcs3/Emu/RSX/VK/vkutils/sampler.cpp +++ b/rpcs3/Emu/RSX/VK/vkutils/sampler.cpp @@ -51,7 +51,7 @@ namespace vk info.addressModeU = clamp_u; info.addressModeV = clamp_v; info.addressModeW = clamp_w; - info.anisotropyEnable = dev.get_anisotropic_filtering_support(); + info.anisotropyEnable = max_anisotropy >= 2. && dev.get_anisotropic_filtering_support(); info.compareEnable = depth_compare; info.unnormalizedCoordinates = unnormalized_coordinates; info.mipLodBias = mipLodBias; From 2bbe71137a8118825ab4e8c84a1ea52308f9879e Mon Sep 17 00:00:00 2001 From: kd-11 Date: Tue, 4 Jul 2023 00:50:44 +0300 Subject: [PATCH 25/25] rsx: Fix virtual coordinate clamping --- rpcs3/Emu/RSX/Common/texture_cache_helpers.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/rpcs3/Emu/RSX/Common/texture_cache_helpers.h b/rpcs3/Emu/RSX/Common/texture_cache_helpers.h index 52a73a5bdfb9..9eb7157ad2be 100644 --- a/rpcs3/Emu/RSX/Common/texture_cache_helpers.h +++ b/rpcs3/Emu/RSX/Common/texture_cache_helpers.h @@ -518,6 +518,8 @@ namespace rsx const f32 scale_y = f32(desired_dimensions.height) / actual_dimensions.height; const f32 offset_x = f32(offset.x) / actual_dimensions.width; const f32 offset_y = f32(offset.y) / actual_dimensions.height; + const f32 limit_x = f32(offset.x + desired_dimensions.width - 1) / actual_dimensions.width; + const f32 limit_y = f32(offset.y + desired_dimensions.height - 1) / actual_dimensions.height; desc.texcoord_xform.scale[0] *= scale_x; desc.texcoord_xform.scale[1] *= scale_y; @@ -525,8 +527,8 @@ namespace rsx desc.texcoord_xform.bias[1] += offset_y; desc.texcoord_xform.clamp_min[0] = 
offset_x; desc.texcoord_xform.clamp_min[1] = offset_y; - desc.texcoord_xform.clamp_max[0] = offset_x + scale_x; - desc.texcoord_xform.clamp_max[1] = offset_y + scale_y; + desc.texcoord_xform.clamp_max[0] = limit_x; + desc.texcoord_xform.clamp_max[1] = limit_y; desc.texcoord_xform.clamp = true; }