Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gl: Avoid UBO/SSBO binding index collisions #12676

Merged
merged 2 commits into from
Sep 18, 2022

Conversation

kd-11
Copy link
Contributor

@kd-11 kd-11 commented Sep 18, 2022

  • Some drivers don't like this. Actually only Radeonsi. For some reason their slots aren't duplicated across different targets, so writing slot 0 UBO erases slot 0 SSBO and vice-versa 🤦‍♂️
  • Almost all GPUs going back 15 years have a large number of UBO slots but limited SSBO slots. Move UBO slots up as we have tons more headroom there. e.g The 6600M has like 75 UBO slots but only 8 SSBO slots.

Fixes #12476

@Darkhost1999
Copy link
Contributor

Just testing NFS Rivals. Haven't tried anything else yet
Master
RPCS3.log
PR
RPCS3.zip

Really long error

·F 0:00:50.146410 {RSX [0x001fd40]} RSX: Compilation failed: 0(8) : error C3012: invalid value '8' for layout qualifier 'binding'
0(25) : error C1154: non constant expression in layout value

source:
#version 450
layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;

#define IMAGE_LOCATION(x) (x + 8)
#define SSBO_LOCATION(x) (x + 2)

layout(binding=IMAGE_LOCATION(0)) uniform writeonly restrict image2D output2D;

#define FMT_GL_RGBA8 0x8058
#define FMT_GL_BGRA8 0x80E1
#define FMT_GL_R8 0x8229
#define FMT_GL_R16 0x822A
#define FMT_GL_R32F 0x822E
#define FMT_GL_RG8 0x822B
#define FMT_GL_RG8_SNORM 0x8F95
#define FMT_GL_RG16 0x822C
#define FMT_GL_RG16F 0x822F
#define FMT_GL_RGBA16F 0x881A
#define FMT_GL_RGBA32F 0x8814

#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8
#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24

layout(binding=SSBO_LOCATION(0), std430) readonly restrict buffer RawDataBlock
{
uint data[];
};

#if USE_UBO
layout(%push_block) uniform UnpackConfiguration
{
uint swap_bytes;
uint src_pitch;
uint format;
uint reserved;
ivec2 region_offset;
ivec2 region_size;
};
#else
uniform uint swap_bytes;
uniform uint src_pitch;
uniform uint format;
uniform ivec2 region_offset;
uniform ivec2 region_size;
#endif

uint linear_invocation_id()
{
uint size_in_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);
return (gl_GlobalInvocationID.y * size_in_x) + gl_GlobalInvocationID.x;
}

ivec2 linear_id_to_output_coord(uint index)
{
return ivec2(int(index % src_pitch), int(index / src_pitch));
}

// Decoders. Beware of multi-wide swapped types (e.g swap(16x2) != swap(32x1))
uint readUint8(const in uint address)
{
const uint block = address / 4;
const uint offset = address % 4;
return bitfieldExtract(data[block], int(offset) * 8, 8);
}

uint readUint16(const in uint address)
{
const uint block = address / 2;
const uint offset = address % 2;
const uint value = bitfieldExtract(data[block], int(offset) * 16, 16);

if (swap_bytes != 0)
{
	return bswap_u16(value);
}

return value;

}

uint readUint32(const in uint address)
{
const uint value = data[address];
return (swap_bytes != 0) ? bswap_u32(value) : value;
}

uvec2 readUint8x2(const in uint address)
{
const uint raw = readUint16(address);
return uvec2(bitfieldExtract(raw, 0, 8), bitfieldExtract(raw, 8, 8));
}

ivec2 readInt8x2(const in uint address)
{
const ivec2 raw = ivec2(readUint8x2(address));
return raw - (ivec2(greaterThan(raw, ivec2(127))) * 256);
}

#define readFixed8(address) readUint8(address) / 255.f
#define readFixed8x2(address) readUint8x2(address) / 255.f
#define readFixed8x2Snorm(address) readInt8x2(address) / 127.f

vec4 readFixed8x4(const in uint address)
{
const uint raw = readUint32(address);
return uvec4(
bitfieldExtract(raw, 0, 8),
bitfieldExtract(raw, 8, 8),
bitfieldExtract(raw, 16, 8),
bitfieldExtract(raw, 24, 8)
) / 255.f;
}

#define readFixed16(address) readUint16(uint(address)) / 65535.f
#define readFixed16x2(address) vec2(readFixed16(address * 2 + 0), readFixed16(address * 2 + 1))
#define readFixed16x4(address) vec4(readFixed16(address * 4 + 0), readFixed16(address * 4 + 1), readFixed16(address * 4 + 2), readFixed16(address * 4 + 3))

#define readFloat16(address) unpackHalf2x16(readUint16(uint(address))).x
#define readFloat16x2(address) vec2(readFloat16(address * 2 + 0), readFloat16(address * 2 + 1))
#define readFloat16x4(address) vec4(readFloat16(address * 4 + 0), readFloat16(address * 4 + 1), readFloat16(address * 4 + 2), readFloat16(address * 4 + 3))

#define readFloat32(address) uintBitsToFloat(readUint32(address))
#define readFloat32x4(address) uintBitsToFloat(uvec4(readUint32(address * 4 + 0), readUint32(address * 4 + 1), readUint32(address * 4 + 2), readUint32(address * 4 + 3)))

#define KERNEL_SIZE 8

void write_output(const in uint invocation_id)
{
vec4 outColor;
uint utmp;

switch (format)
{
// Simple color
case FMT_GL_RGBA8:
	outColor = readFixed8x4(invocation_id);
	break;
case FMT_GL_BGRA8:
	outColor = readFixed8x4(invocation_id).bgra;
	break;
case FMT_GL_R8:
	outColor.r = readFixed8(invocation_id);
	break;
case FMT_GL_R16:
	outColor.r = readFixed16(invocation_id);
	break;
case FMT_GL_R32F:
	outColor.r = readFloat32(invocation_id);
	break;
case FMT_GL_RG8:
	outColor.rg = readFixed8x2(invocation_id);
	break;
case FMT_GL_RG8_SNORM:
	outColor.rg = readFixed8x2Snorm(invocation_id);
	break;
case FMT_GL_RG16:
	outColor.rg = readFixed16x2(invocation_id);
	break;
case FMT_GL_RG16F:
	outColor.rg = readFloat16x2(invocation_id);
	break;
case FMT_GL_RGBA16F:
	outColor = readFloat16x4(invocation_id);
	break;
case FMT_GL_RGBA32F:
	outColor = readFloat32x4(invocation_id);
	break;
}

const ivec2 coord = linear_id_to_output_coord(invocation_id);
if (any(greaterThan(coord, region_size)))
{
	return;
}

imageStore(output2D, coord + region_offset, outColor);

}

void main()
{
uint index = linear_invocation_id() * KERNEL_SIZE;

for (int loop = 0; loop < KERNEL_SIZE; ++loop, ++index)
{
	write_output(index);
}

}

·W 0:00:50.146754 {RSX [0x001fd40]} SYS: Emulation has been frozen! You can either use debugger tools to inspect current emulation state or terminate it.
·F 0:00:50.147796 {RSX [0x001fd40]} RSX: Linkage failed: Compute info

@Darkhost1999
Copy link
Contributor

Darkhost1999 commented Sep 18, 2022

K everything with OpenGL is outputting that error right after SPU cache before anything related to the game

- Some drivers don't like this. Actually only RADV.
- Almost all GPUs going back 15 years have a large number of UBO slots but limited SSBO slots.
  Move UBO slots up as we have tons more headroom there.
NVIDIA only supports 8 compute image slots even on modern GPUs.
@kd-11 kd-11 force-pushed the ogl-regression-fix branch from 446e9fd to 68e8ee4 Compare September 18, 2022 21:32
@kd-11
Copy link
Contributor Author

kd-11 commented Sep 18, 2022

NVIDIA only supports 8 compute image slots. Fixed now.

@Darkhost1999
Copy link
Contributor

I was reading about that only has 8. I was actually being directed towards that not being true however when I was reading and don't know where to confirm.

@kd-11
Copy link
Contributor Author

kd-11 commented Sep 18, 2022

https://opengl.gpuinfo.org/displayreport.php?id=7899 GL_MAX_COMPUTE_IMAGE_UNIFORMS = 8
image

@kd-11
Copy link
Contributor Author

kd-11 commented Sep 18, 2022

AMD always has 32 since GCN1 so the limit of 8 for new NVIDIA cards was unexpected.

@kd-11 kd-11 merged commit 79f2c21 into RPCS3:master Sep 18, 2022
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

Regression: Broken graphics in OpenGL on NieR Replicant (#12454)
3 participants