From 1ebb3ce6b5e00c2ee6c5f9597858830aca306262 Mon Sep 17 00:00:00 2001
From: dzhdan <dzhdan@nvidia.com>
Date: Fri, 14 Jun 2024 13:54:38 +0800
Subject: [PATCH] v4.8.2:

HIGHLIGHTS:

- SIGMA improvements

DETAILS:

- SIGMA: fixed 1 pixel wide blur on shadows with penumbra size < 1 pixel
- SIGMA: better behavior for multi-layered shadows (a narrow penumbra inside a wide penumbra)
- SIGMA: reduced potential flickering
- SIGMA: fixed suboptimal output of the blur pass affecting TS pass
- SIGMA: improved weights for moments calculations in TS pass
- NRD: resolved some TODOs
- updated deps
- updated docs
---
 CMakeLists.txt                                |   5 +-
 External/MathLib                              |   2 +-
 Include/NRD.h                                 |   4 +-
 Include/NRDSettings.h                         |   2 +-
 README.md                                     |  17 ++-
 Resources/Version.h                           |   2 +-
 Shaders/Include/Common.hlsli                  |   6 +-
 Shaders/Include/NRD.hlsli                     | 108 +++++++++---------
 Shaders/Include/REBLUR_Common.hlsli           |   4 +-
 Shaders/Include/REBLUR_Config.hlsli           |   3 +-
 Shaders/Include/REBLUR_HistoryFix.hlsli       |   6 +-
 .../Include/REBLUR_TemporalAccumulation.hlsli |   2 +-
 .../REBLUR_TemporalStabilization.hlsli        |   4 +-
 Shaders/Include/RELAX_Config.hlsli            |   1 -
 .../Include/RELAX_TemporalAccumulation.hlsli  |   6 +-
 Shaders/Include/SIGMA_Blur.hlsli              |  72 +++++++-----
 Shaders/Include/SIGMA_Config.hlsli            |  14 ++-
 .../Include/SIGMA_TemporalStabilization.hlsli |  92 +++++++--------
 Shaders/Source/RELAX_Validation.cs.hlsl       |   7 +-
 Source/Sigma.cpp                              |   1 +
 20 files changed, 187 insertions(+), 171 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8553cd1..9c627c7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -195,10 +195,7 @@ if (NOT NRD_DISABLE_SHADER_COMPILATION)
 
     if (NRD_EMBEDS_SPIRV_SHADERS)
         set (SHADERMAKE_COMMANDS ${SHADERMAKE_COMMANDS} COMMAND ShaderMake -p SPIRV --compiler "${DXC_SPIRV_PATH}" ${SHADERMAKE_GENERAL_ARGS}
-            --sRegShift 100
-            --tRegShift 200
-            --bRegShift 300
-            --uRegShift 400
+            --sRegShift 100 --tRegShift 200 --bRegShift 300 --uRegShift 400
         )
     endif ()
 
diff --git a/External/MathLib b/External/MathLib
index 903f7ac..63c68ad 160000
--- a/External/MathLib
+++ b/External/MathLib
@@ -1 +1 @@
-Subproject commit 903f7ac918e63e3704de7a621deae6139575b887
+Subproject commit 63c68ad9811c069fde848922df5e5b5475750a1a
diff --git a/Include/NRD.h b/Include/NRD.h
index dffec3d..bc888b5 100644
--- a/Include/NRD.h
+++ b/Include/NRD.h
@@ -29,8 +29,8 @@ license agreement from NVIDIA CORPORATION is strictly prohibited.
 
 #define NRD_VERSION_MAJOR 4
 #define NRD_VERSION_MINOR 8
-#define NRD_VERSION_BUILD 1
-#define NRD_VERSION_DATE "16 May 2024"
+#define NRD_VERSION_BUILD 2
+#define NRD_VERSION_DATE "14 June 2024"
 
 #if defined(_MSC_VER)
     #define NRD_CALL __fastcall
diff --git a/Include/NRDSettings.h b/Include/NRDSettings.h
index 83c11db..9fdd960 100644
--- a/Include/NRDSettings.h
+++ b/Include/NRDSettings.h
@@ -115,7 +115,7 @@ namespace nrd
         // (ms) - user provided if > 0, otherwise - tracked internally
         float timeDeltaBetweenFrames = 0.0f;
 
-        // (units) > 0 - use TLAS or tracing range (max value = NRD_FP16_MAX / NRD_FP16_VIEWZ_SCALE - 1 = 524031)
+        // (units) > 0 - use TLAS or tracing range
         float denoisingRange = 500000.0f;
 
         // (normalized %) - if relative distance difference is greater than threshold, history gets reset (0.5-2.5% works well)
diff --git a/README.md b/README.md
index e99058d..294703b 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# NVIDIA REAL-TIME DENOISERS v4.8.1 (NRD)
+# NVIDIA REAL-TIME DENOISERS v4.8.2 (NRD)
 
 [![Build NRD SDK](https://github.com/NVIDIAGameWorks/RayTracingDenoiser/actions/workflows/build.yml/badge.svg)](https://github.com/NVIDIAGameWorks/RayTracingDenoiser/actions/workflows/build.yml)
 
@@ -168,15 +168,18 @@ IN_NORMAL_ROUGHNESS = GetNormalAndRoughnessAt( A );
 IN_MV = GetMotionAt( A );
 ```
 
-See `NRDDescs.h` for more details and descriptions of other inputs and outputs.
+See `NRDDescs.h` and `NRD.hlsli` for more details and descriptions of other inputs and outputs.
 
 # NOISY INPUTS
 
 NRD sample is a good start to familiarize yourself with input requirements and best practices, but main requirements can be summarized to:
 
+Radiance:
 - Since *NRD* denoisers accumulate signals for a limited number of frames, the input signal must converge *reasonably* well for this number of frames. `REFERENCE` denoiser can be used to estimate temporal signal quality
 - Since *NRD* denoisers process signals spatially, high-energy fireflies in the input signal should be avoided. Most of them can be removed by enabling anti-firefly filter in *NRD*, but it will only work if the "background" signal is confident. The worst case is having a single pixel with high energy divided by a very small PDF to represent the lack of energy in neighboring non-representative (black) pixels
 - Radiance must be separated into diffuse and specular at primary hit (or secondary hit in case of *PSR*)
+
+Hit distance:
 - `hitT` can't be negative
 - `hitT` must not include primary hit distance
 - `hitT` for the first bounce after the primary hit or *PSR* must be provided "as is"
@@ -191,9 +194,15 @@ NRD sample is a good start to familiarize yourself with input requirements and b
   - `hitDistanceReconstructionMode` must be set to something other than `OFF`, but bear in mind that the search area is limited to 3x3 or 5x5. In other words, it's the application's responsibility to guarantee a valid sample in this area. It can be achieved by clamping probabilities and using Bayer-like dithering (see the sample for more details)
   - Pre-pass must be enabled (i.e. `diffusePrepassBlurRadius` and `specularPrepassBlurRadius` must be set to 20-70 pixels) to compensate entropy increase, since radiance in valid samples is divided by probability to compensate 0 values in some neighbors
 - Probabilistic sampling for 2nd+ bounces is absolutely acceptable
-- in case of many paths per pixel `hitT` for specular must be "averaged" by `NRD_FrontEnd_SpecHitDistAveraging_*` functions from `NRD.hlsli`
+- In case of many paths per pixel `hitT` for specular must be "averaged" by `NRD_FrontEnd_SpecHitDistAveraging_*` functions from `NRD.hlsli`
+- For *REBLUR* hit distance must be normalized using `REBLUR_FrontEnd_GetNormHitDist`
+
+Distance to occluder:
+- `NoL <= 0` - 0 (it's very important!)
+- `NoL > 0, hit` - hit distance
+- `NoL > 0, miss` - >= NRD_FP16_MAX
 
-See `NRDDescs.h` for more details and descriptions of other inputs and outputs.
+See `NRDDescs.h` and `NRD.hlsli` for more details and descriptions of other inputs and outputs.
 
 # IMPROVING OUTPUT QUALITY
 
diff --git a/Resources/Version.h b/Resources/Version.h
index 38c0b3e..60206da 100644
--- a/Resources/Version.h
+++ b/Resources/Version.h
@@ -23,6 +23,6 @@ Versioning rules:
 
 #define VERSION_MAJOR                   4
 #define VERSION_MINOR                   8
-#define VERSION_BUILD                   1
+#define VERSION_BUILD                   2
 
 #define VERSION_STRING STR(VERSION_MAJOR.VERSION_MINOR.VERSION_BUILD encoding=NRD_NORMAL_ENCODING.NRD_ROUGHNESS_ENCODING)
diff --git a/Shaders/Include/Common.hlsli b/Shaders/Include/Common.hlsli
index 6c1063e..0093c2e 100644
--- a/Shaders/Include/Common.hlsli
+++ b/Shaders/Include/Common.hlsli
@@ -12,9 +12,9 @@ license agreement from NVIDIA CORPORATION is strictly prohibited.
 
 // Constants
 
-#define NRD_NONE                                                0
-#define NRD_FRAME                                               1
-#define NRD_PIXEL                                               2
+#define NRD_NONE                                                0 // bad
+#define NRD_FRAME                                               1 // good
+#define NRD_PIXEL                                               2 // better, but leads to divergence
 #define NRD_RANDOM                                              3 // for experiments only
 
 // FP16
diff --git a/Shaders/Include/NRD.hlsli b/Shaders/Include/NRD.hlsli
index 0140b9a..3624c32 100644
--- a/Shaders/Include/NRD.hlsli
+++ b/Shaders/Include/NRD.hlsli
@@ -16,54 +16,56 @@ license agreement from NVIDIA CORPORATION is strictly prohibited.
 // INPUT PARAMETERS
 //=================================================================================================================================
 /*
-float3 radiance:
-    - radiance should not include material information ( use material de-modulation to decouple materials )
-    - radiance should not be premultiplied by "exposure"
-    - for Primary Surface Replacements ( PSR ) throughput should be de-modulated as much as possible ( see test 184 from the sample and TraceOpaque.hlsl )
-    - for diffuse rays
-        - use COS-distribution ( or custom importance sampling )
-        - if radiance is the result of path tracing, pass normalized hit distance as the sum of 1-all hits (always ignore primary hit!)
-    - for specular
-        - use VNDF sampling ( or custom importance sampling )
-            - most advanced v3 version: https://gpuopen.com/download/publications/Bounded_VNDF_Sampling_for_Smith-GGX_Reflections.pdf
-        - if radiance is the result of path tracing, pass hit distance for the 1st bounce for the first time (always ignore primary hit!)
-
-float hitDist:
-    - can't be negative
-    - must not include primary hit distance
-    - for the first bounce after the primary hit or PSR must be provided "as is"
-    - for susequent bounces must be adjusted by curvature and lobe energy dissipation on the application side
-    - must be explicitly set to 0 for rays pointing inside the surface ( better to nopt cast such rays )
-
-float normHitDist:
-    - normalized hit distance, gotten by using "REBLUR_FrontEnd_GetNormHitDist"
-    - REBLUR must be aware of the normalization function via "nrd::HitDistanceParameters"
-    - by definition, normalized hit distance is AO ( ambient occlusion ) for diffuse and SO ( specular occlusion ) for specular
-    - AO can be used to emulate 2nd+ diffuse bounces
-    - SO can be used to adjust IBL lighting
-    - ".w" channel of diffuse / specular output is AO / SO
-    - if you don't know which normalization function to choose use default values of "nrd::HitDistanceParameters"
-
-float roughness:
-    - "linear roughness" = sqrt( "m" ), where "m" = "alpha" - GGX roughness
-    - usage: "isDiffuse ? 1.0 : roughness"
-
-float normal:
-    - world-space normal
-
-float viewZ:
-    - linear view space Z for primary rays ( linearized camera depth )
-
-float distanceToOccluder:
-    - distance to occluder, must follow the rules:
-        - NoL <= 0         - 0 ( it's very important )
-        - NoL > 0 ( hit )  - hit distance
-        - NoL > 0 ( miss ) - >= NRD_FP16_MAX
-
-float tanOfLightAngularRadius:
-    - tan( lightAngularSize * 0.5 )
-    - angular size is computed from the shadow receiving point
-    - in other words, tanOfLightAngularRadius = lightRadius / distanceToLight
+NON-NOISY INPUTS:
+    float viewZ:
+        - linear view space Z for primary rays ( linearized camera depth )
+
+    float normal:
+        - world-space normal
+
+    float roughness:
+        - "linear roughness" = sqrt( "m" ), where "m" = "alpha" - GGX roughness
+        - usage: "isDiffuse ? 1.0 : roughness"
+
+    float tanOfLightAngularRadius:
+        - tan( lightAngularSize * 0.5 )
+        - angular size is computed from the shadow receiving point
+        - in other words, tanOfLightAngularRadius = lightRadius / distanceToLight
+
+NOISY INPUTS:
+    float3 radiance:
+        - radiance should not include material information ( use material de-modulation to decouple materials )
+        - radiance should not be premultiplied by "exposure"
+        - for Primary Surface Replacements ( PSR ) throughput should be de-modulated as much as possible ( see test 184 from the sample and TraceOpaque.hlsl )
+        - for diffuse rays
+            - use COS-distribution ( or custom importance sampling )
+            - if radiance is the result of path tracing, pass normalized hit distance as the sum of 1-all hits (always ignore primary hit!)
+        - for specular
+            - use VNDF sampling ( or custom importance sampling )
+                - most advanced v3 version: https://gpuopen.com/download/publications/Bounded_VNDF_Sampling_for_Smith-GGX_Reflections.pdf
+            - if radiance is the result of path tracing, pass hit distance for the 1st bounce for the first time (always ignore primary hit!)
+
+    float hitDist:
+        - can't be negative
+        - must not include primary hit distance
+        - for the first bounce after the primary hit or PSR must be provided "as is"
+        - for subsequent bounces must be adjusted by curvature and lobe energy dissipation on the application side
+        - must be explicitly set to 0 for rays pointing inside the surface ( better to not cast such rays )
+
+    float normHitDist:
+        - logically same as "hitDist", but normalized to [0; 1] range using "REBLUR_FrontEnd_GetNormHitDist"
+        - REBLUR must be aware of the normalization function via "nrd::HitDistanceParameters"
+        - by definition, normalized hit distance is AO ( ambient occlusion ) for diffuse and SO ( specular occlusion ) for specular
+        - AO can be used to emulate 2nd+ diffuse bounces
+        - SO can be used to adjust IBL lighting
+        - ".w" channel of diffuse / specular output is AO / SO
+        - if you don't know which normalization function to choose use default values of "nrd::HitDistanceParameters"
+
+    float distanceToOccluder:
+        - distance to occluder, must follow the rules:
+            - NoL <= 0         - 0 ( it's very important )
+            - NoL > 0 ( hit )  - hit distance
+            - NoL > 0 ( miss ) - >= NRD_FP16_MAX
 */
 
 #ifndef NRD_INCLUDED
@@ -275,7 +277,6 @@ float tanOfLightAngularRadius:
 #define NRD_ROUGHNESS_ENCODING_SQRT_LINEAR                                              2 // sqrt( linearRoughness )
 
 #define NRD_FP16_MAX                                                                    65504.0
-#define NRD_FP16_VIEWZ_SCALE                                                            0.125 // TODO: tuned for meters, needs to be scaled down for cm and mm
 #define NRD_PI                                                                          3.14159265358979323846
 #define NRD_EPS                                                                         1e-6
 #define NRD_REJITTER_VIEWZ_THRESHOLD                                                    0.01 // normalized %
@@ -627,11 +628,12 @@ void NRD_FrontEnd_SpecHitDistAveraging_End( inout float accumulatedSpecHitDist )
 //=================================================================================================================================
 
 // This function returns AO / SO which REBLUR can decode back to "hit distance" internally
-float REBLUR_FrontEnd_GetNormHitDist( float hitDist, float viewZ, float4 hitDistParams, float roughness )
+float REBLUR_FrontEnd_GetNormHitDist( float hitDist, float viewZ, float4 hitDistParams, float roughness, float trimmingThreshold = 0.0 )
 {
-    // TODO: Sampling can produce rays pointing inside surface, i.e. "hitDist = 0". But due to ray offsetting
-    // actual "hitDist" can be a very small value in this case. Since NRD handles "hitDist = 0" case, should be
-    // small "hitDist" values trimmed to 0?
+    // Sampling can produce rays pointing inside the surface, i.e. "hitDist = 0". But due to ray offsetting the actual "hitDist"
+    // can be a very small value in this case. Since NRD has been designed to handle the "hitDist = 0" case, such accidentally
+    // small "hitDist" values ( below "trimmingThreshold" ) are better trimmed to 0
+    hitDist = hitDist < trimmingThreshold ? 0.0 : hitDist;
 
     float f = _REBLUR_GetHitDistanceNormalization( viewZ, hitDistParams, roughness );
 
diff --git a/Shaders/Include/REBLUR_Common.hlsli b/Shaders/Include/REBLUR_Common.hlsli
index 0864f69..b2ad6d5 100644
--- a/Shaders/Include/REBLUR_Common.hlsli
+++ b/Shaders/Include/REBLUR_Common.hlsli
@@ -22,8 +22,8 @@ license agreement from NVIDIA CORPORATION is strictly prohibited.
 
 // Internal data ( from the previous frame )
 
-#define REBLUR_PackViewZ( p )                           min( p * NRD_FP16_VIEWZ_SCALE, NRD_FP16_MAX )
-#define REBLUR_UnpackViewZ( p )                         ( p / NRD_FP16_VIEWZ_SCALE )
+#define REBLUR_PackViewZ( p )                           min( p * REBLUR_FP16_VIEWZ_SCALE, NRD_FP16_MAX )
+#define REBLUR_UnpackViewZ( p )                         ( p / REBLUR_FP16_VIEWZ_SCALE )
 
 float4 PackNormalRoughness( float4 p )
 {
diff --git a/Shaders/Include/REBLUR_Config.hlsli b/Shaders/Include/REBLUR_Config.hlsli
index 351c260..391ba92 100644
--- a/Shaders/Include/REBLUR_Config.hlsli
+++ b/Shaders/Include/REBLUR_Config.hlsli
@@ -54,7 +54,7 @@ license agreement from NVIDIA CORPORATION is strictly prohibited.
 #define REBLUR_POISSON_SAMPLE_NUM                               8
 #define REBLUR_POISSON_SAMPLES( i )                             g_Special8[ i ]
 
-#define REBLUR_PRE_BLUR_ROTATOR_MODE                            NRD_FRAME // TODO: others are expensive, but work better
+#define REBLUR_PRE_BLUR_ROTATOR_MODE                            NRD_FRAME
 #define REBLUR_PRE_BLUR_FRACTION_SCALE                          2.0
 #define REBLUR_PRE_BLUR_NON_LINEAR_ACCUM_SPEED                  ( 1.0 / ( 1.0 + 10.0 ) )
 
@@ -67,6 +67,7 @@ license agreement from NVIDIA CORPORATION is strictly prohibited.
 
 #define REBLUR_HIT_DIST_MIN_WEIGHT( smc )                       ( 0.1 * smc ) // was 0.1
 
+#define REBLUR_FP16_VIEWZ_SCALE                                 ( gViewZScale * 0.125) // TODO: tuned for meters, i.e. gViewZScale = 1.0
 #define REBLUR_MAX_PERCENT_OF_LOBE_VOLUME                       0.75
 #define REBLUR_VIRTUAL_MOTION_PREV_PREV_WEIGHT_ITERATION_NUM    1
 #define REBLUR_COLOR_CLAMPING_SIGMA_SCALE                       2.0 // using smaller values leads to bias if camera rotates slowly due to reprojection instabilities
diff --git a/Shaders/Include/REBLUR_HistoryFix.hlsli b/Shaders/Include/REBLUR_HistoryFix.hlsli
index 7f0518d..8e78efb 100644
--- a/Shaders/Include/REBLUR_HistoryFix.hlsli
+++ b/Shaders/Include/REBLUR_HistoryFix.hlsli
@@ -293,16 +293,12 @@ NRD_EXPORT void NRD_CS_MAIN( int2 threadPos : SV_GroupThreadId, int2 pixelPos :
         // Stride between taps
         float smc = GetSpecMagicCurve( roughness );
         float specStride = stride.y * float( frameNum.y < gHistoryFixFrameNum );
-        specStride *= lerp( 0.5, 1.0, smc ); // TODO: seems to work better than "minBlurRadius"
+        specStride *= lerp( 0.5, 1.0, smc ); // hand tuned
         specStride = floor( specStride );
 
         // History reconstruction
         if( specStride != 0 )
         {
-            // TODO: introduce IN_SECONDARY_ROUGHNESS:
-            //  - to allow blur on diffuse-like surfaces in reflection
-            //  - use "hitDistanceWeight" only for very low primary roughness to avoid color bleeding from one surface to another
-
             int specStridei = int( specStride + 0.5 );
 
             // Parameters
diff --git a/Shaders/Include/REBLUR_TemporalAccumulation.hlsli b/Shaders/Include/REBLUR_TemporalAccumulation.hlsli
index 270d527..3ace181 100644
--- a/Shaders/Include/REBLUR_TemporalAccumulation.hlsli
+++ b/Shaders/Include/REBLUR_TemporalAccumulation.hlsli
@@ -76,7 +76,7 @@ NRD_EXPORT void NRD_CS_MAIN( int2 threadPos : SV_GroupThreadId, int2 pixelPos :
             float4 normalAndRoughness = s_Normal_Roughness[ pos.y ][ pos.x ];
 
             // Average normal
-            if( i < 2 && j < 2 ) // TODO: is backward 2x2 OK?
+            if( i < 2 && j < 2 )
                 Navg += normalAndRoughness.xyz;
 
             #ifdef REBLUR_SPECULAR
diff --git a/Shaders/Include/REBLUR_TemporalStabilization.hlsli b/Shaders/Include/REBLUR_TemporalStabilization.hlsli
index fc5063e..b43d294 100644
--- a/Shaders/Include/REBLUR_TemporalStabilization.hlsli
+++ b/Shaders/Include/REBLUR_TemporalStabilization.hlsli
@@ -8,8 +8,6 @@ distribution of this software and related documentation without an express
 license agreement from NVIDIA CORPORATION is strictly prohibited.
 */
 
-// TODO: add REBLUR_OCCLUSION support to TemporalStabilization?
-
 groupshared float4 s_Diff[ BUFFER_Y ][ BUFFER_X ];
 groupshared float4 s_Spec[ BUFFER_Y ][ BUFFER_X ];
 
@@ -321,7 +319,7 @@ NRD_EXPORT void NRD_CS_MAIN( int2 threadPos : SV_GroupThreadId, int2 pixelPos :
             float f = STL::Math::SmoothStep( gSpecProbabilityThresholdsForMvModification.x, gSpecProbabilityThresholdsForMvModification.y, specProb );
             if( STL::Rng::Hash::GetFloat( ) < f )
             {
-                float3 specMv = Xvirtual - X; // TODO: world-space delta fits badly into FP16
+                float3 specMv = Xvirtual - X; // world-space delta fits badly into FP16! Prefer 2.5D motion!
                 if( gMvScale.w == 0.0 )
                 {
                     specMv.xy = vmbPixelUv - pixelUv;
diff --git a/Shaders/Include/RELAX_Config.hlsli b/Shaders/Include/RELAX_Config.hlsli
index d7ec060..1e0019e 100644
--- a/Shaders/Include/RELAX_Config.hlsli
+++ b/Shaders/Include/RELAX_Config.hlsli
@@ -12,7 +12,6 @@ license agreement from NVIDIA CORPORATION is strictly prohibited.
 
 // Settings
 #define RELAX_MAX_ACCUM_FRAME_NUM                           255
-#define RELAX_SPEC_DOMINANT_DIRECTION                       STL_SPECULAR_DOMINANT_DIRECTION_G2 // TODO: move to Common, change REBLUR too
 #define RELAX_HIT_DIST_MIN_WEIGHT                           0.2 // Sacrifices spatial fidelity to improve temporal stability. Should be set to 0 for relatively clean input signals like RTXDI and 0.1 .. 0.2 for lower quality input signals
 #define RELAX_ANTILAG_ACCELERATION_AMOUNT_SCALE             10.0 // Multiplier used to put RelaxAntilagSettings::accelerationAmount to convenient [0; 1] range
 
diff --git a/Shaders/Include/RELAX_TemporalAccumulation.hlsli b/Shaders/Include/RELAX_TemporalAccumulation.hlsli
index bdaf2a5..07b7860 100644
--- a/Shaders/Include/RELAX_TemporalAccumulation.hlsli
+++ b/Shaders/Include/RELAX_TemporalAccumulation.hlsli
@@ -725,10 +725,6 @@ NRD_EXPORT void NRD_CS_MAIN(uint2 pixelPos : SV_DispatchThreadId, uint2 threadPo
     // Thin lens equation for adjusting reflection HitT
     float hitDistFocused = ApplyThinLensEquation(hitDist, curvature);
 
-    [flatten]
-    if (abs(hitDistFocused) < 0.001) // TODO: why?
-        hitDistFocused = 0.001;
-
     // Loading specular data based on virtual motion
     float4 prevSpecularIlluminationAnd2ndMomentVMB;
     float4 prevSpecularIlluminationAnd2ndMomentVMBResponsive;
@@ -770,7 +766,7 @@ NRD_EXPORT void NRD_CS_MAIN(uint2 pixelPos : SV_DispatchThreadId, uint2 threadPo
     );
 
     // Amount of virtual motion - dominant factor
-    float4 D = STL::ImportanceSampling::GetSpecularDominantDirection(currentNormal, V, currentRoughnessModified, RELAX_SPEC_DOMINANT_DIRECTION);
+    float4 D = STL::ImportanceSampling::GetSpecularDominantDirection(currentNormal, V, currentRoughnessModified, STL_SPECULAR_DOMINANT_DIRECTION_G2);
     float virtualHistoryAmount = VMBReprojectionFound * D.w;
 
     // Decreasing virtual history amount for ortho case
diff --git a/Shaders/Include/SIGMA_Blur.hlsli b/Shaders/Include/SIGMA_Blur.hlsli
index 86592b2..90cb22d 100644
--- a/Shaders/Include/SIGMA_Blur.hlsli
+++ b/Shaders/Include/SIGMA_Blur.hlsli
@@ -50,7 +50,7 @@ NRD_EXPORT void NRD_CS_MAIN( int2 threadPos : SV_GroupThreadId, int2 pixelPos :
     int2 smemPos = threadPos + BORDER;
     float2 centerData = s_Penumbra_ViewZ[ smemPos.y ][ smemPos.x ];
     float centerPenumbra = centerData.x;
-    float centerSignNoL = float( centerData.x != 0.0 );
+    float centerSignNoL = float( centerPenumbra != 0.0 );
     float viewZ = centerData.y;
 
     // Early out
@@ -72,7 +72,7 @@ NRD_EXPORT void NRD_CS_MAIN( int2 threadPos : SV_GroupThreadId, int2 pixelPos :
 
     if( ( tileValue == 0.0 && NRD_USE_TILE_CHECK ) || centerPenumbra == 0.0 )
     {
-        gOut_Penumbra[ pixelPos ] = 0;
+        gOut_Penumbra[ pixelPos ] = centerPenumbra;
         gOut_Shadow_Translucency[ pixelPos ] = PackShadow( s_Shadow_Translucency[ smemPos.y ][ smemPos.x ] );
 
         return;
@@ -87,13 +87,15 @@ NRD_EXPORT void NRD_CS_MAIN( int2 threadPos : SV_GroupThreadId, int2 pixelPos :
     float3 Nv = STL::Geometry::RotateVector( gWorldToView, N );
 
     // Parameters
-    float frustumSize = PixelRadiusToWorld( gUnproject, gOrthoMode, min( gRectSize.x, gRectSize.y ), viewZ ); // TODO: use GetFrustumSize
+    float unprojectZ = PixelRadiusToWorld( gUnproject, gOrthoMode, 1.0, viewZ );
+    float frustumSize = GetFrustumSize( gMinRectDimMulUnproject, gOrthoMode, viewZ );
     float2 geometryWeightParams = GetGeometryWeightParams( gPlaneDistSensitivity, frustumSize, Xv, Nv, 1.0 );
 
-    // Estimate average distance to occluder
+    // Estimate penumbra size and filter shadow ( pass 1: dense 3x3 or 5x5 )
     float2 sum = 0;
     float penumbra = 0;
     SIGMA_TYPE result = 0;
+    SIGMA_TYPE centerTap;
 
     [unroll]
     for( j = 0; j <= BORDER * 2; j++ )
@@ -104,12 +106,19 @@ NRD_EXPORT void NRD_CS_MAIN( int2 threadPos : SV_GroupThreadId, int2 pixelPos :
             int2 pos = threadPos + int2( i, j );
 
             float2 data = s_Penumbra_ViewZ[ pos.y ][ pos.x ];
-            float p = data.x;
-            float signNoL = float( p != 0.0 );
+            float penum = data.x;
             float z = data.y;
+            float signNoL = float( penum != 0.0 );
+
+            SIGMA_TYPE s = s_Shadow_Translucency[ pos.y ][ pos.x ];
 
-            float w = 1.0;
-            if( !( i == BORDER && j == BORDER ) )
+            float w;
+            if( i == BORDER && j == BORDER )
+            {
+                centerTap = s;
+                w = 1.0;
+            }
+            else
             {
                 float2 uv = pixelUv + float2( i - BORDER, j - BORDER ) * gRectSizeInv;
                 float3 Xvs = STL::Geometry::ReconstructViewPosition( uv, gFrustum, z, gOrthoMode );
@@ -119,25 +128,32 @@ NRD_EXPORT void NRD_CS_MAIN( int2 threadPos : SV_GroupThreadId, int2 pixelPos :
                 w *= GetGaussianWeight( length( float2( i - BORDER, j - BORDER ) / BORDER ) );
                 w *= float( z < gDenoisingRange );
                 w *= float( centerSignNoL == signNoL );
-            }
 
-            SIGMA_TYPE s = s_Shadow_Translucency[ pos.y ][ pos.x ];
-            s = Denanify( w, s );
+                s = Denanify( w, s );
+            }
 
             float2 ww = w;
-            ww.y *= !IsLit( p );
-            ww.y *= 1.0 / ( 1.0 + p * SIGMA_PENUMBRA_WEIGHT_SCALE ); // prefer smaller penumbra
+            ww.y *= !IsLit( penum );
+
+            float penumInPixels = penum / unprojectZ;
+            ww.y /= 1.0 + penumInPixels; // prefer smaller penumbra
 
             result += s * ww.x;
-            penumbra += p * ww.y;
+            penumbra += penum * ww.y;
             sum += ww;
         }
     }
 
-    result /= sum.x; // TODO: lerp to center if blur radius < BORDER
+    result /= sum.x;
+    sum.x = 1.0;
+
     penumbra /= max( sum.y, NRD_EPS ); // yes, without patching
+    sum.y = float( sum.y != 0.0 );
 
-    float invHitDist = 1.0 / max( penumbra, NRD_EPS );
+    // Avoid 1-pixel wide blur if penumbra size < 1 pixel
+    float penumbraInPixels = penumbra / unprojectZ;
+    float f = STL::Math::LinearStep( 0.75, 1.25, penumbraInPixels );
+    result = lerp( centerTap, result, f );
 
     // Tangent basis with anisotropy
     float3x3 mWorldToLocal = STL::Geometry::GetBasis( Nv );
@@ -158,7 +174,6 @@ NRD_EXPORT void NRD_CS_MAIN( int2 threadPos : SV_GroupThreadId, int2 pixelPos :
     }
 
     // Blur radius
-    float unprojectZ = PixelRadiusToWorld( gUnproject, gOrthoMode, 1.0, viewZ );
     float worldRadius = GetKernelRadiusInPixels( penumbra, unprojectZ, tileValue ) * unprojectZ;
 
     Tv *= worldRadius;
@@ -167,9 +182,8 @@ NRD_EXPORT void NRD_CS_MAIN( int2 threadPos : SV_GroupThreadId, int2 pixelPos :
     // Random rotation
     float4 rotator = GetBlurKernelRotation( SIGMA_ROTATOR_MODE, pixelPos, gRotator, gFrameIndex );
 
-    // Denoising
-    sum.x = 1.0;
-    sum.y = float( sum.y != 0.0 );
+    // Estimate penumbra size and filter shadow ( pass 2: sparse 8-taps )
+    float invEstimatedPenumbra = 1.0 / max( penumbra, NRD_EPS );
 
     [unroll]
     for( uint n = 0; n < SIGMA_POISSON_SAMPLE_NUM; n++ )
@@ -185,9 +199,9 @@ NRD_EXPORT void NRD_CS_MAIN( int2 threadPos : SV_GroupThreadId, int2 pixelPos :
         float2 uvScaled = ClampUvToViewport( uv );
 
         // Fetch data
-        float p = gIn_Penumbra.SampleLevel( gNearestClamp, uvScaled, 0 );
-        float signNoL = float( p != 0.0 );
+        float penum = gIn_Penumbra.SampleLevel( gNearestClamp, uvScaled, 0 );
         float z = UnpackViewZ( gIn_ViewZ.SampleLevel( gNearestClamp, WithRectOffset( uvScaled ), 0 ) );
+        float signNoL = float( penum != 0.0 );
 
         // Sample weight
         float3 Xvs = STL::Geometry::ReconstructViewPosition( uv, gFrustum, z, gOrthoMode );
@@ -200,15 +214,15 @@ NRD_EXPORT void NRD_CS_MAIN( int2 threadPos : SV_GroupThreadId, int2 pixelPos :
         w *= float( centerSignNoL == signNoL );
 
         // Avoid umbra leaking inside wide penumbra
-        float t = saturate( p * invHitDist );
-        w *= STL::Math::LinearStep( 0.0, 0.1, t );
+        float t = saturate( penum * invEstimatedPenumbra );
+        w *= STL::Math::SmoothStep( 0.0, 1.0, t ); // TODO: it works surprisingly well, keep an eye on it!
 
         // Fetch shadow
         SIGMA_TYPE s;
         #if( !defined SIGMA_FIRST_PASS || defined SIGMA_TRANSLUCENT )
             s = gIn_Shadow_Translucency.SampleLevel( gNearestClamp, uvScaled, 0 );
         #else
-            s = IsLit( p );
+            s = IsLit( penum );
         #endif
         s = Denanify( w, s );
 
@@ -218,11 +232,13 @@ NRD_EXPORT void NRD_CS_MAIN( int2 threadPos : SV_GroupThreadId, int2 pixelPos :
 
         // Accumulate
         float2 ww = w;
-        ww.y *= !IsLit( p );
-        ww.y *= 1.0 / ( 1.0 + p * SIGMA_PENUMBRA_WEIGHT_SCALE ); // prefer smaller penumbra
+        ww.y *= !IsLit( penum );
+
+        float penumInPixels = penum / unprojectZ;
+        ww.y /= 1.0 + penumInPixels; // prefer smaller penumbra
 
         result += s * ww.x;
-        penumbra += p * ww.y;
+        penumbra += penum * ww.y;
         sum += ww;
     }
 
diff --git a/Shaders/Include/SIGMA_Config.hlsli b/Shaders/Include/SIGMA_Config.hlsli
index 8b61452..f542398 100644
--- a/Shaders/Include/SIGMA_Config.hlsli
+++ b/Shaders/Include/SIGMA_Config.hlsli
@@ -16,17 +16,22 @@ license agreement from NVIDIA CORPORATION is strictly prohibited.
 #define SIGMA_5X5_BLUR_RADIUS_ESTIMATION_KERNEL         1 // helps to improve stability, but adds 10% of overhead
 
 // Switches ( default 0 )
-#define SIGMA_SHOW_TILES                                0
+#define SIGMA_SHOW                                      0 // 1 - tiles, 2 - history weight
 #define SIGMA_SHOW_PENUMBRA_SIZE                        0
 
 // Settings
 #define SIGMA_ROTATOR_MODE                              NRD_FRAME
 #define SIGMA_POISSON_SAMPLE_NUM                        8
 #define SIGMA_POISSON_SAMPLES                           g_Special8
-#define SIGMA_MAX_PIXEL_RADIUS                          16.0 // TODO: at least 32 needed for test 200
-#define SIGMA_PENUMBRA_WEIGHT_SCALE                     10.0
-#define SIGMA_MAX_SIGMA_SCALE                           3.0
+#define SIGMA_MAX_PIXEL_RADIUS                          32.0
+#define SIGMA_TS_SIGMA_SCALE                            3.0
+#define SIGMA_TS_MAX_HISTORY_WEIGHT                     0.95
+#define SIGMA_TS_Z_FALLOFF                              1.0 // exp2( -SIGMA_TS_Z_FALLOFF * dz )
 #define SIGMA_TS_MOTION_MAX_REUSE                       0.11
+#define SIGMA_TS_EARLY_OUT_THRESHOLD                    0.25
+#define SIGMA_ANTILAG_SIGMA_SCALE                       0.25
+#define SIGMA_ANTILAG_POWER                             1.0
+#define SIGMA_ANTILAG_EPS                               0.05
 
 // Data type
 #ifdef SIGMA_TRANSLUCENT
@@ -62,4 +67,5 @@ license agreement from NVIDIA CORPORATION is strictly prohibited.
     NRD_CONSTANT( float, gDebug ) \
     NRD_CONSTANT( float, gSplitScreen ) \
     NRD_CONSTANT( float, gViewZScale ) \
+    NRD_CONSTANT( float, gMinRectDimMulUnproject ) \
     NRD_CONSTANT( uint, gFrameIndex )
diff --git a/Shaders/Include/SIGMA_TemporalStabilization.hlsli b/Shaders/Include/SIGMA_TemporalStabilization.hlsli
index e7db0b8..598795d 100644
--- a/Shaders/Include/SIGMA_TemporalStabilization.hlsli
+++ b/Shaders/Include/SIGMA_TemporalStabilization.hlsli
@@ -50,7 +50,10 @@ NRD_EXPORT void NRD_CS_MAIN( int2 threadPos : SV_GroupThreadId, int2 pixelPos :
         return;
 
     // Early out
-    if( centerPenumbra == 0.0 && SIGMA_SHOW_TILES == 0 )
+    float unprojectZ = PixelRadiusToWorld( gUnproject, gOrthoMode, 1.0, viewZ ); // presumably the world-space size of a 1 pixel radius at "viewZ" - TODO confirm
+    float penumbraInPixels = centerPenumbra / unprojectZ; // center penumbra divided by that size, also reused for the history weight below
+
+    if( penumbraInPixels <= SIGMA_TS_EARLY_OUT_THRESHOLD && SIGMA_SHOW == 0 ) // never taken if a SIGMA_SHOW debug mode is enabled
     {
         gOut_Shadow_Translucency[ pixelPos ] = PackShadow( s_Shadow_Translucency[ smemPos.y ][ smemPos.x ] );
 
@@ -76,16 +79,18 @@ NRD_EXPORT void NRD_CS_MAIN( int2 threadPos : SV_GroupThreadId, int2 pixelPos :
             float2 data = s_Penumbra_ViewZ[ pos.y ][ pos.x ];
 
             SIGMA_TYPE s = s_Shadow_Translucency[ pos.y ][ pos.x ];
-            float signNoL = float( data.x != 0.0 );
+            float penum = data.x;
             float z = data.y;
+            float signNoL = float( penum != 0.0 );
 
             float w = 1.0;
             if( i == BORDER && j == BORDER )
                 input = s;
             else
             {
-                w = GetBilateralWeight( z, viewZ );
-                w *= saturate( 1.0 - abs( centerSignNoL - signNoL ) );
+                w = exp2( -SIGMA_TS_Z_FALLOFF * abs( z - viewZ ) ); // exponential falloff with the viewZ difference to the center
+                w *= float( z < gDenoisingRange ); // zero weight for neighbors at or beyond "gDenoisingRange"
+                w *= float( centerSignNoL == signNoL ); // zero weight if the neighbor "penum != 0" flag differs from the center one
 
                 if( z < viewZnearest )
                 {
@@ -123,49 +128,31 @@ NRD_EXPORT void NRD_CS_MAIN( int2 threadPos : SV_GroupThreadId, int2 pixelPos :
     history = max( history, 0.0 );
     history = SIGMA_BackEnd_UnpackShadow( history );
 
-    // Clamp history
-    float2 a = m1.xx;
-    float2 b = history.xx;
-
-    #ifdef SIGMA_TRANSLUCENT
-        a.y = STL::Color::Luminance( m1.yzw );
-        b.y = STL::Color::Luminance( history.yzw );
-    #endif
+    // Antilag
+    float fast = m1.x;
+    float slow = history.x;
 
-    float2 ratio = abs( a - b ) / ( min( a, b ) + 0.05 );
-    float2 ratioNorm = ratio / ( 1.0 + ratio );
-    float2 scale = lerp( SIGMA_MAX_SIGMA_SCALE, 1.0, STL::Math::Sqrt01( ratioNorm ) );
+    float a = abs( slow - fast ) - SIGMA_ANTILAG_SIGMA_SCALE * sigma.x - SIGMA_ANTILAG_EPS; // history vs current difference in excess of the tolerance ( <= 0 if within it )
+    float b = max( slow, fast ) + SIGMA_ANTILAG_SIGMA_SCALE * sigma.x + SIGMA_ANTILAG_EPS; // normalization: the larger of the two values plus the same tolerance
+    float antilag = a / b; // relative excess difference, converted to a history weight factor below
 
-    #ifdef SIGMA_TRANSLUCENT
-        sigma *= scale.xyyy;
-    #else
-        sigma *= scale.x;
-    #endif
+    antilag = STL::Math::SmoothStep01( 1.0 - antilag ); // inverted, so that a small difference keeps more history ( presumably the helper clamps to [0; 1] - TODO confirm )
+    antilag = STL::Math::Pow01( antilag, SIGMA_ANTILAG_POWER ); // response curve, the result multiplies "historyWeight" below
 
-    SIGMA_TYPE inputMin = m1 - sigma;
-    SIGMA_TYPE inputMax = m1 + sigma;
+    // Clamp history
+    SIGMA_TYPE inputMin = m1 - sigma * SIGMA_TS_SIGMA_SCALE;
+    SIGMA_TYPE inputMax = m1 + sigma * SIGMA_TS_SIGMA_SCALE;
     SIGMA_TYPE historyClamped = clamp( history, inputMin, inputMax );
 
     // History weight
-    float isInScreen = IsInScreenNearest( pixelUvPrev );
-    float motionLength = length( pixelUvPrev - pixelUv );
-    float2 historyWeight = 0.93 * lerp( 1.0, 0.7, ratioNorm );
-    historyWeight = lerp( historyWeight, 0.1, saturate( motionLength / SIGMA_TS_MOTION_MAX_REUSE ) );
-    historyWeight *= isInScreen;
+    float historyWeight = SIGMA_TS_MAX_HISTORY_WEIGHT; // starting value, only scaled by the factors below
+    historyWeight *= IsInScreenNearest( pixelUvPrev ); // presumably 0 if the previous frame position is off screen - TODO confirm
+    historyWeight *= antilag; // factor computed in the "Antilag" section above
+    historyWeight *= STL::Math::SmoothStep( SIGMA_TS_EARLY_OUT_THRESHOLD, 1.0, penumbraInPixels ); // presumably ramps up between the early out threshold and 1 pixel of penumbra
     historyWeight *= gStabilizationStrength;
 
-    // Reduce history in regions with hard shadows
-    float unprojectZ = PixelRadiusToWorld( gUnproject, gOrthoMode, 1.0, viewZ );
-    float pixelRadius = GetKernelRadiusInPixels( centerPenumbra, unprojectZ );
-    historyWeight *= STL::Math::LinearStep( 0.0, 0.5, pixelRadius );
-
     // Combine with current frame
-    SIGMA_TYPE result;
-    result.x = lerp( input.x, historyClamped.x, historyWeight.x );
-
-    #ifdef SIGMA_TRANSLUCENT
-        result.yzw = lerp( input.yzw, historyClamped.yzw, historyWeight.y );
-    #endif
+    SIGMA_TYPE result = lerp( input, historyClamped, historyWeight );
 
     // Reference
     #if( SIGMA_REFERENCE == 1 )
@@ -173,18 +160,25 @@ NRD_EXPORT void NRD_CS_MAIN( int2 threadPos : SV_GroupThreadId, int2 pixelPos :
     #endif
 
     // Debug
-    #if( SIGMA_SHOW_TILES == 1 )
-        float tileValue = gIn_Tiles[ pixelPos >> 4 ].x;
-        tileValue = float( tileValue != 0.0 ); // optional, just to show fully discarded tiles
-
-        #ifdef SIGMA_TRANSLUCENT
-            result = lerp( float4( 0, 0, 1, 0 ), result, tileValue );
-        #else
-            result = tileValue;
+    #if( SIGMA_SHOW != 0 )
+        #if( SIGMA_SHOW == 1 )
+            float tileValue = gIn_Tiles[ pixelPos >> 4 ].x;
+            tileValue = float( tileValue != 0.0 ); // optional, just to show fully discarded tiles
+
+            #ifdef SIGMA_TRANSLUCENT
+                result = lerp( float4( 0, 0, 1, 0 ), result, tileValue );
+            #else
+                result = tileValue;
+            #endif
+
+            // Show grid ( works badly with TAA )
+            result *= all( ( pixelPos & 15 ) != 0 );
+        #elif( SIGMA_SHOW == 2 )
+            // Show history weight in ".yzw" only, ".x" is left untouched because it is used in antilag computations ( nothing is shown if SIGMA_TRANSLUCENT is not defined )
+            #ifdef SIGMA_TRANSLUCENT
+                result.yzw = SIGMA_BackEnd_UnpackShadow( historyWeight );
+            #endif
         #endif
-
-        // Show grid (works badly with TAA)
-        result *= all( ( pixelPos & 15 ) != 0 );
     #endif
 
     // Output
diff --git a/Shaders/Source/RELAX_Validation.cs.hlsl b/Shaders/Source/RELAX_Validation.cs.hlsl
index f012e59..bc46853 100644
--- a/Shaders/Source/RELAX_Validation.cs.hlsl
+++ b/Shaders/Source/RELAX_Validation.cs.hlsl
@@ -45,10 +45,11 @@ NRD_EXPORT void NRD_CS_MAIN( uint2 pixelPos : SV_DispatchThreadId )
 
     float2 viewportUvScaled = viewportUv * gResolutionScale;
 
+    float4 normalAndRoughness = NRD_FrontEnd_UnpackNormalAndRoughness( gIn_Normal_Roughness.SampleLevel( gNearestClamp, WithRectOffset( viewportUvScaled ), 0 ) );
+    float viewZ = UnpackViewZ( gIn_ViewZ.SampleLevel( gNearestClamp, WithRectOffset( viewportUvScaled ), 0 ) );
+    float3 mv = gIn_Mv.SampleLevel( gNearestClamp, WithRectOffset( viewportUvScaled ), 0 ) * gMvScale.xyz;
+
     float historyLength = 255.0 * gIn_HistoryLength.SampleLevel( gNearestClamp, viewportUvScaled, 0 ) - 1.0;
-    float4 normalAndRoughness = NRD_FrontEnd_UnpackNormalAndRoughness( gIn_Normal_Roughness.SampleLevel( gNearestClamp, viewportUvScaled, 0 ) );
-    float viewZ = gIn_ViewZ.SampleLevel( gNearestClamp, viewportUvScaled, 0 );
-    float3 mv = gIn_Mv.SampleLevel( gNearestClamp, viewportUvScaled, 0 ) * gMvScale.xyz;
 
     float3 N = normalAndRoughness.xyz;
     float roughness = normalAndRoughness.w;
diff --git a/Source/Sigma.cpp b/Source/Sigma.cpp
index 9f073f0..5e6810b 100644
--- a/Source/Sigma.cpp
+++ b/Source/Sigma.cpp
@@ -123,6 +123,7 @@ void nrd::InstanceImpl::AddSharedConstants_Sigma(const SigmaSettings& settings,
     consts->gDebug                  = m_CommonSettings.debug;
     consts->gSplitScreen            = m_CommonSettings.splitScreen;
     consts->gViewZScale             = m_CommonSettings.viewZScale;
+    consts->gMinRectDimMulUnproject = (float)Min(rectW, rectH) * unproject;
     consts->gFrameIndex             = m_CommonSettings.frameIndex;
 }