From 40f080ed816a8f3bac8dabb93b3b4eb465f0e14f Mon Sep 17 00:00:00 2001
From: Eunjae Kim
Date: Thu, 24 Oct 2024 15:14:00 -0700
Subject: [PATCH] Avoid using float32 in normalization for mean/var and
 scale/bias parameters when force_float32_reductions=False

PiperOrigin-RevId: 689532230
---
 flax/linen/normalization.py | 30 +++++++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/flax/linen/normalization.py b/flax/linen/normalization.py
index bf172f2c1d..340b5d03a5 100644
--- a/flax/linen/normalization.py
+++ b/flax/linen/normalization.py
@@ -160,6 +160,7 @@ def _normalize(
   use_scale: bool,
   bias_init: Initializer,
   scale_init: Initializer,
+  force_float32_reductions: bool = True,
 ):
   """Normalizes the input of a normalization layer and optionally applies a
   learned scale and bias.
@@ -179,6 +180,9 @@ def _normalize(
     use_scale: If true, scale the output.
     bias_init: Initialization function for the bias term.
     scale_init: Initialization function for the scaling function.
+    force_float32_reductions: If false, the scale and bias parameters use the
+      param_dtype. Otherwise, they will have at least float32 precision due to
+      the mean and var being promoted to float32.
 
   Returns:
     The normalized input.
@@ -200,6 +204,8 @@ def _normalize(
     scale = mdl.param(
       'scale', scale_init, reduced_feature_shape, param_dtype
     ).reshape(feature_shape)
+    if not force_float32_reductions:
+      scale = jnp.asarray(scale, param_dtype)
     mul *= scale
     args.append(scale)
   y *= mul
@@ -207,6 +213,8 @@ def _normalize(
     bias = mdl.param(
       'bias', bias_init, reduced_feature_shape, param_dtype
     ).reshape(feature_shape)
+    if not force_float32_reductions:
+      bias = jnp.asarray(bias, param_dtype)
     y += bias
     args.append(bias)
   dtype = dtypes.canonicalize_dtype(*args, dtype=dtype)
@@ -346,7 +354,8 @@ def __call__(
       'batch_stats',
       'mean',
       lambda s: jnp.zeros(
-        s, jnp.float32 if self.force_float32_reductions else x.dtype
+        s,
+        jnp.float32 if self.force_float32_reductions else self.param_dtype,
       ),
       feature_shape,
     )
@@ -354,13 +363,23 @@ def __call__(
       'batch_stats',
       'var',
       lambda s: jnp.ones(
-        s, jnp.float32 if self.force_float32_reductions else x.dtype
+        s,
+        jnp.float32 if self.force_float32_reductions else self.param_dtype,
       ),
       feature_shape,
     )
 
     if use_running_average:
-      mean, var = ra_mean.value, ra_var.value
+      mean = (
+        ra_mean.value
+        if self.force_float32_reductions
+        else jnp.asarray(ra_mean.value, self.param_dtype)
+      )
+      var = (
+        ra_var.value
+        if self.force_float32_reductions
+        else jnp.asarray(ra_var.value, self.param_dtype)
+      )
     else:
       mean, var = _compute_stats(
         x,
@@ -393,6 +412,7 @@ def __call__(
       self.use_scale,
       self.bias_init,
       self.scale_init,
+      self.force_float32_reductions,
     )
 
 
@@ -509,6 +529,7 @@ def __call__(self, x, *, mask: jax.Array | None = None):
       self.use_scale,
       self.bias_init,
       self.scale_init,
+      self.force_float32_reductions,
     )
 
 
@@ -609,6 +630,7 @@ def __call__(self, x, *, mask: jax.Array | None = None):
       self.use_scale,
       initializers.zeros,
       self.scale_init,
+      self.force_float32_reductions,
     )
 
 
@@ -788,6 +810,7 @@ def __call__(self, x, *, mask: jax.Array | None = None):
       self.use_scale,
       self.bias_init,
       self.scale_init,
+      self.force_float32_reductions,
     )
 
 
@@ -912,6 +935,7 @@ def __call__(self, x, *, mask: jax.Array | None = None):
       self.use_scale,
       self.bias_init,
       self.scale_init,
+      self.force_float32_reductions,
     )
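
A minimal usage sketch of the intended effect on BatchNorm (not part of the patch): with force_float32_reductions=False, the running statistics are created in param_dtype rather than float32, and no float32 promotion enters through the mean/var or scale/bias operands. The bfloat16 shapes/dtypes below are illustrative assumptions, and the dtypes in the comments are expectations rather than verified output.

import jax
import jax.numpy as jnp
import flax.linen as nn

x = jnp.ones((4, 8), dtype=jnp.bfloat16)

# Keep parameters and running statistics in bfloat16 end to end.
bn = nn.BatchNorm(
  use_running_average=False,
  momentum=0.9,
  param_dtype=jnp.bfloat16,
  force_float32_reductions=False,
)
variables = bn.init(jax.random.PRNGKey(0), x)

print(variables['batch_stats']['mean'].dtype)  # expected: bfloat16 (param_dtype)
print(variables['params']['scale'].dtype)      # expected: bfloat16 (param_dtype)

# Training step: the updated batch_stats stay in param_dtype as well.
y, updates = bn.apply(variables, x, mutable=['batch_stats'])
print(y.dtype)  # expected: bfloat16, with no silent float32 promotion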
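
A corresponding sketch for the layers that go through _normalize (LayerNorm, RMSNorm, GroupNorm, InstanceNorm), which now forward force_float32_reductions so the scale/bias operands are kept in param_dtype before dtype canonicalization. Again illustrative only, assuming bfloat16 inputs and params; the commented dtypes are expectations.

import jax
import jax.numpy as jnp
import flax.linen as nn

x = jax.random.normal(jax.random.PRNGKey(0), (4, 8), dtype=jnp.bfloat16)

# dtype is left unset so the output dtype is inferred by promotion of
# x, mean, var, scale, and bias inside _normalize.
ln = nn.LayerNorm(
  param_dtype=jnp.bfloat16,
  force_float32_reductions=False,
)
variables = ln.init(jax.random.PRNGKey(1), x)
y = ln.apply(variables, x)

print(variables['params']['scale'].dtype)  # expected: bfloat16 (param_dtype)
print(y.dtype)  # expected: bfloat16, since mean/var and scale/bias stay in bfloat16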