feat: add ssgi and terrain occlusion (#255)

doodlum · Apr 8, 2024 · 35e6bd3 · 35e6bd3
1 parent 00ddda5
commit 35e6bd3
Show file tree

Hide file tree

Showing 30 changed files with 3,507 additions and 259 deletions.
diff --git a/...s/Shaders/CloudShadows/CloudShadows.hlsli → ...adows/Shaders/CloudShadows/output.cs.hlsl b/...s/Shaders/CloudShadows/CloudShadows.hlsli → ...adows/Shaders/CloudShadows/output.cs.hlsl
@@ -1,3 +1,6 @@
+#include "../Common/DeferredShared.hlsl"
+#include "../Common/VR.hlsl"
+
 struct PerPassCloudShadow
 {
 	uint EnableCloudShadows;
@@ -8,13 +11,17 @@ struct PerPassCloudShadow
 	float EffectMix;
 
 	float TransparencyPower;
-	float AbsorptionAmbient;
 
 	float RcpHPlusR;
 };
 
-StructuredBuffer<PerPassCloudShadow> perPassCloudShadow : register(t23);
-TextureCube<float4> cloudShadows : register(t40);
+StructuredBuffer<PerPassCloudShadow> perPassCloudShadow : register(t0);  // Cloud-shadow settings; only element [0] is read by the code visible here
+TextureCube<float4> cloudShadows : register(t1);  // Cube map sampled by direction; its .w channel drives the shadow amount
+Texture2D<unorm half> TexDepth : register(t2);  // Per-pixel depth, read by main and used as NDC z
+
+RWTexture2D<unorm float> RWTexShadowMask : register(u0);  // Shadow mask that main reads, scales by the cloud-shadow factor and writes back
+
+SamplerState defaultSampler;  // No explicit register; used for the cube-map lookup in getCloudShadowMult
 
 float3 getCloudShadowSampleDir(float3 rel_pos, float3 eye_to_sun)
 {
@@ -38,13 +45,39 @@ float3 getCloudShadowSampleDirFlatEarth(float3 rel_pos, float3 eye_to_sun)
 	return v;
 }
 
-float3 getCloudShadowMult(float3 rel_pos, float3 eye_to_sun, SamplerState samp)
+float3 getCloudShadowMult(float3 rel_pos, float3 eye_to_sun)  // Returns lerp(1, 1 - alpha, EffectMix) replicated into a float3; alpha comes from the cube sample's .w raised to TransparencyPower
 {
 	// float3 cloudSampleDir = getCloudShadowSampleDirFlatEarth(rel_pos, eye_to_sun).xyz;
 	float3 cloudSampleDir = getCloudShadowSampleDir(rel_pos, eye_to_sun).xyz;
 
-	float4 cloudCubeSample = cloudShadows.Sample(samp, cloudSampleDir);
+	float4 cloudCubeSample = cloudShadows.SampleLevel(defaultSampler, cloudSampleDir, 0);  // Explicit mip 0 because this file is compiled as a compute shader. TODO Sample in pixel shader
 	float alpha = pow(saturate(cloudCubeSample.w), perPassCloudShadow[0].TransparencyPower);
 
 	return lerp(1.0, 1.0 - alpha, perPassCloudShadow[0].EffectMix);
+}
+
+// Compute-shader entry point, one thread per pixel: reconstructs a position from the depth
+// buffer and multiplies the existing shadow-mask value by the cloud-shadow factor there.
+[numthreads(32, 32, 1)] void main(uint2 dtid : SV_DispatchThreadID) {
+	// UV of the pixel centre.
+	float2 uv = (dtid + .5) * RcpBufferDim;
+#ifdef VR
+	// Pixels with uv.x > 0.5 use eye 1, all others eye 0. The x component is selected
+	// explicitly: comparing the whole float2 produces a bool2 that would only be implicitly
+	// truncated to its first component.
+	const uint eyeIndex = uv.x > .5;
+#else
+	const uint eyeIndex = 0;
+#endif
+
+	// NOTE(review): ConvertToStereoUV is declared outside this file. The per-eye inverse
+	// projection below needs a per-eye UV; confirm this helper maps in that direction.
+	float3 ndc = float3(ConvertToStereoUV(uv, eyeIndex), 1);
+	ndc = ndc * 2 - 1;
+	ndc.y = -ndc.y;
+	ndc.z = TexDepth[dtid];
+
+	// Leave the mask untouched where depth exceeds 0.9999 (presumably sky / far plane - confirm).
+	if (ndc.z > 0.9999)
+		return;
+
+	// Unproject: NDC -> view space -> world space, then apply the perspective divide.
+	// NOTE(review): getCloudShadowMult names its parameter rel_pos; confirm InvViewMatrix
+	// yields the position in the space that function expects.
+	float4 worldPos = mul(InvViewMatrix[eyeIndex], mul(InvProjMatrix[eyeIndex], float4(ndc, 1)));
+	worldPos.xyz /= worldPos.w;
+
+	// Rotate the view-space light direction with the upper 3x3 of the inverse view matrix.
+	float3 dirLightDirWS = mul((float3x3)InvViewMatrix[eyeIndex], DirLightDirectionVS[eyeIndex].xyz);
+	// getCloudShadowMult returns one scalar replicated into a float3, so take .x explicitly
+	// rather than relying on implicit float3 -> float truncation.
+	float cloudShadow = getCloudShadowMult(worldPos.xyz, dirLightDirWS).x;
+
+	half shadow = RWTexShadowMask[dtid];
+	RWTexShadowMask[dtid] = shadow * cloudShadow;
 }
diff --git a/features/Screen Space GI/Shaders/Features/ScreenSpaceGI.ini b/features/Screen Space GI/Shaders/Features/ScreenSpaceGI.ini
@@ -0,0 +1,2 @@
+[Info]
+Version = 2-9-0
diff --git a/features/Screen Space GI/Shaders/ScreenSpaceGI/common.hlsli b/features/Screen Space GI/Shaders/ScreenSpaceGI/common.hlsli
@@ -0,0 +1,204 @@
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Copyright (C) 2016-2021, Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// XeGTAO is based on GTAO/GTSO "Jimenez et al. / Practical Real-Time Strategies for Accurate Indirect Occlusion",
+// https://www.activision.com/cdn/research/Practical_Real_Time_Strategies_for_Accurate_Indirect_Occlusion_NEW%20VERSION_COLOR.pdf
+//
+// Implementation:  Filip Strugar (filip.strugar@intel.com), Steve Mccalla <stephen.mccalla@intel.com>         (\_/)
+// Version:         (see XeGTAO.h)                                                                            (='.'=)
+// Details:         https://github.com/GameTechDev/XeGTAO                                                     (")_(")
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// with additional edits by FiveLimbedCat/ProfJack
+
+#ifndef SSGI_COMMON
+#define SSGI_COMMON
+
+// Low-precision float aliases (lpfloat*). USE_HALF_FLOAT_PRECISION defaults to 1 when the
+// including shader has not defined it; defining it as 0 makes every alias a plain 32-bit type.
+#if !defined(USE_HALF_FLOAT_PRECISION)
+#	define USE_HALF_FLOAT_PRECISION 1
+#endif
+
+#if (USE_HALF_FLOAT_PRECISION == 0)
+// Full-precision aliases.
+typedef float lpfloat;
+typedef float2 lpfloat2;
+typedef float3 lpfloat3;
+typedef float4 lpfloat4;
+typedef float3x3 lpfloat3x3;
+#else
+#	if 1  // old fp16 approach (<SM6.2)
+typedef min16float lpfloat;
+typedef min16float2 lpfloat2;
+typedef min16float3 lpfloat3;
+typedef min16float4 lpfloat4;
+typedef min16float3x3 lpfloat3x3;
+#	else  // new fp16 approach (requires SM6.2 and -enable-16bit-types) - WARNING: perf degradation noticed on some HW, while the old (min16float) path is mostly at least a minor perf gain so this is more useful for quality testing
+typedef float16_t lpfloat;
+typedef float16_t2 lpfloat2;
+typedef float16_t3 lpfloat3;
+typedef float16_t4 lpfloat4;
+typedef float16_t3x3 lpfloat3x3;
+#	endif
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+#include "../Common/DeferredShared.hlsl"
+
+cbuffer SSGICB : register(b1)  // SSGI constants bound to constant-buffer slot b1; most fields are not referenced in this file
+{
+	float4x4 PrevInvViewMat[2];  // presumably the previous frame's inverse view matrix, one per eye - TODO confirm
+	float4 DepthUnpackConsts;  // xy = (mul, add) for eye 0, zw = (mul, add) for eye 1; consumed by ScreenToViewDepth
+	float4 NDCToViewMul;  // xy for eye 0, zw for eye 1; consumed by ScreenToViewPosition
+	float4 NDCToViewAdd;  // xy for eye 0, zw for eye 1; consumed by ScreenToViewPosition
+	float4 NDCToViewMul_x_PixelSize;
+
+	float2 FrameDim;
+	float2 RcpFrameDim;
+	uint FrameIndex;
+
+	uint NumSlices;
+	uint NumSteps;
+	float DepthMIPSamplingOffset;
+
+	float EffectRadius;
+	float EffectFalloffRange;
+	float ThinOccluderCompensation;
+	float Thickness;
+	float2 DepthFadeRange;
+	float DepthFadeScaleConst;
+
+	float BackfaceStrength;
+	float GIBounceFade;
+	float GIDistanceCompensation;
+	float GICompensationMaxDist;
+
+	float AOPower;
+	float GIStrength;
+
+	float DepthDisocclusion;
+	uint MaxAccumFrames;
+
+	float pad;  // fills the last 16-byte register under default HLSL packing; presumably mirrors a CPU-side struct - TODO confirm
+};
+
+SamplerState samplerPointClamp : register(s0);  // slot s0; point filter + clamp addressing judging by the name - the state itself is set outside this file
+SamplerState samplerLinearClamp : register(s1);  // slot s1; linear filter + clamp addressing judging by the name - the state itself is set outside this file
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Resolution-dependent helpers.
+//   res_scale    - 1.0 normally, 0.5 when HALF_RES is defined.
+//   READ_DEPTH   - direct texel read normally; Load from mip 1 with HALF_RES.
+//   FULLRES_LOAD - direct texel read normally; SampleLevel at mip 0 by uv with HALF_RES.
+#ifndef HALF_RES
+static const float res_scale = 1.0;
+#	define READ_DEPTH(tex, px) tex[px]
+#	define FULLRES_LOAD(tex, px, uv, samp) tex[px]
+#else
+static const float res_scale = 0.5;
+#	define READ_DEPTH(tex, px) tex.Load(int3(px, 1))
+#	define FULLRES_LOAD(tex, px, uv, samp) tex.SampleLevel(samp, uv, 0)
+#endif
+
+// Eye index for a UV: with VR defined, uv.x > 0.5 selects eye 1 and anything else eye 0;
+// without VR the result is always 0.
+// The argument is parenthesized so that expression arguments such as GET_EYE_IDX(a + b)
+// select .x of the whole expression rather than of its last operand.
+#ifdef VR
+#	define GET_EYE_IDX(uv) ((uv).x > 0.5)
+#else
+#	define GET_EYE_IDX(uv) (0)
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+// True when x compares neither less than, greater than, nor equal to zero (i.e. x is NaN).
+// Each use of the argument is parenthesized so that expression arguments expand correctly.
+#define ISNAN(x) (!((x) < 0.f || (x) > 0.f || (x) == 0.f))
+
+// Approximate square root computed on the float's bit pattern: the bits are shifted right by
+// one and a magic constant is added before reinterpreting the result as a float.
+// References: http://h14s.p5r.org/2012/09/0x5f3759df.html, [Drobot2014a] Low Level Optimizations for GCN, https://blog.selfshadow.com/publications/s2016-shading-course/activision/s2016_pbs_activision_occlusion.pdf slide 63
+lpfloat FastSqrt(float x)
+{
+	const int halvedBits = asint(x) >> 1;
+	const float approxRoot = asfloat(0x1fbd1df5 + halvedBits);
+	return (lpfloat)approxRoot;
+}
+
+// Approximate arccosine: input in [-1, 1], output in [0, PI].
+// The approximation is evaluated for |inX| and mirrored around PI/2 for negative inputs.
+// From https://seblagarde.wordpress.com/2014/12/01/inverse-trigonometric-functions-gpu-optimization-for-amd-gcn-architecture/
+lpfloat FastACos(lpfloat inX)
+{
+	const lpfloat PI = 3.141593;
+	const lpfloat HALF_PI = 1.570796;
+
+	const lpfloat absX = abs(inX);
+	lpfloat acosOfAbs = -0.156583 * absX + HALF_PI;
+	acosOfAbs *= FastSqrt(1.0 - absX);
+
+	if (inX >= 0)
+		return acosOfAbs;
+	return PI - acosOfAbs;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Converts screen XY plus view-space depth into a view-space position.
+// The scale/offset pair comes from the .xy lanes of NDCToViewMul / NDCToViewAdd for eye 0
+// and from the .zw lanes for any other eye index.
+float3 ScreenToViewPosition(const float2 screenPos, const float viewspaceDepth, const uint eyeIndex)
+{
+	float2 scale = NDCToViewMul.xy;
+	float2 offset = NDCToViewAdd.xy;
+	if (eyeIndex != 0) {
+		scale = NDCToViewMul.zw;
+		offset = NDCToViewAdd.zw;
+	}
+
+	const float2 viewXY = (scale * screenPos.xy + offset) * viewspaceDepth;
+	return float3(viewXY, viewspaceDepth);
+}
+
+// Converts a screen-space depth value into view-space depth using the per-eye pair stored in
+// DepthUnpackConsts (.xy for eye 0, .zw otherwise): result = mul / (add - screenDepth).
+float ScreenToViewDepth(const float screenDepth, const uint eyeIndex)
+{
+	const float2 unpack = (eyeIndex != 0) ? DepthUnpackConsts.zw : DepthUnpackConsts.xy;
+
+	// Optimised version of "-cameraClipNear / (cameraClipFar - projDepth * (cameraClipFar - cameraClipNear)) * cameraClipFar"
+	return unpack.x / (unpack.y - screenDepth);
+}
+
+// Transforms a point by invView as a homogeneous position (w = 1) and divides by the resulting w.
+float3 ViewToWorldPosition(const float3 pos, const float4x4 invView)
+{
+	const float4 homogeneous = mul(invView, float4(pos, 1));
+	return homogeneous.xyz / homogeneous.w;
+}
+
+// Transforms a direction by the upper-left 3x3 of invView, so no translation is applied.
+float3 ViewToWorldVector(const float3 vec, const float4x4 invView)
+{
+	const float3x3 linearPart = (float3x3)invView;
+	return mul(linearPart, vec);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Builds a 3x3 matrix that rotates `from` onto `to` (assumes both are normalized - TODO confirm).
+// Method: "Efficiently building a matrix to rotate one vector to another"
+// http://cs.brown.edu/research/pubs/pdfs/1999/Moller-1999-EBA.pdf / https://dl.acm.org/doi/10.1080/10867651.1999.10487509
+// (using https://github.com/assimp/assimp/blob/master/include/assimp/matrix3x3.inl#L275 as a code reference as it seems to be best)
+lpfloat3x3 RotFromToMatrix(lpfloat3 from, lpfloat3 to)
+{
+	const lpfloat cosAngle = dot(from, to);
+	const lpfloat absCosAngle = abs(cosAngle);
+
+	// Vectors that are almost parallel or almost opposite both yield the identity matrix.
+	// WARNING: This has not been tested/worked through, especially not for 16bit floats; seems to work in our special use case (from is always {0, 0, -1}) but wouldn't use it in general
+	if (absCosAngle > lpfloat(1.0 - 0.0003))
+		return lpfloat3x3(1, 0, 0, 0, 1, 0, 0, 0, 1);
+
+	const lpfloat3 axis = cross(from, to);
+
+	// Hand optimized version (9 mults less); the 1 / (1 + cos) factor is an optimization by Gottfried Chen.
+	const lpfloat k = (1.0) / (1.0 + cosAngle);
+	const lpfloat kx = k * axis.x;
+	const lpfloat kz = k * axis.z;
+	const lpfloat kxy = kx * axis.y;
+	const lpfloat kxz = kx * axis.z;
+	const lpfloat kyz = kz * axis.y;
+
+	// Scalars are listed row by row: [0][0], [0][1], [0][2], [1][0], ...
+	return lpfloat3x3(
+		cosAngle + kx * axis.x, kxy - axis.z, kxz + axis.y,
+		kxy + axis.z, cosAngle + k * axis.y * axis.y, kyz - axis.x,
+		kxz - axis.y, kyz + axis.x, cosAngle + kz * axis.z);
+}
+
+#endif