[nnx] remove flagslib

cgarciae committed Mar 13, 2024
1 parent ce8a3c7 commit db0e96a

Showing 14 changed files with 163 additions and 129 deletions.
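In short: this commit deletes the global `nnx.flags` context manager (the `flaglib` module) and replaces it with plain module attributes. Flags such as `decode` and `deterministic` become constructor arguments or attributes, and the new `Module.set_attributes` method sets them recursively on nested Modules. A minimal before/after sketch of the migration, reusing the `Block` module from the new `set_attributes` docstring; the "before" form is shown as comments because `nnx.flags` no longer exists after this commit:

from flax.experimental import nnx

class Block(nnx.Module):
  def __init__(self, din, dout, *, rngs: nnx.Rngs):
    self.linear = nnx.Linear(din, dout, rngs=rngs)
    self.dropout = nnx.Dropout(0.5, deterministic=False)

block = Block(2, 5, rngs=nnx.Rngs(0))

# Before: a context manager set the flag globally for the enclosed calls, e.g.
#   with nnx.flags(deterministic=True):
#     ...
# After: the flag is an ordinary attribute, updated recursively on submodules.
block.set_attributes(deterministic=True)
assert block.dropout.deterministic is True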
1 change: 0 additions & 1 deletion flax/experimental/nnx/__init__.py
@@ -23,7 +23,6 @@
from .nnx.errors import TraceContextError as TraceContextError
from .nnx.filterlib import All as All
from .nnx.filterlib import Not as Not
-from .nnx.flaglib import flags as flags
from .nnx.graph_utils import GraphDef as GraphDef
from .nnx.helpers import Dict as Dict
from .nnx.helpers import Sequence as Sequence
16 changes: 11 additions & 5 deletions flax/experimental/nnx/examples/lm1b/models.py
@@ -33,7 +33,6 @@
from jax import lax

from flax.experimental import nnx
-from flax.experimental.nnx.nnx import flaglib
from flax.experimental.nnx.examples.lm1b.configs import default

Shape = tuple[int, ...]
@@ -126,9 +125,11 @@ def __init__(
self,
config: TransformerConfig,
*,
+decode: bool = False,
rngs: nnx.Rngs,
):
self.config = config
+self.decode = decode
self.pos_emb_shape = (1, config.max_len, config.emb_dim)

if config.posemb_init is not None:
@@ -168,7 +169,7 @@ def __call__(self, inputs: jax.Array, inputs_positions=None):
pos_embedding = self.pos_embedding.value

# We use a cache position index for tracking decoding position.
-if flaglib.flags.get('decode', False):
+if self.decode:
_, _, df = pos_embedding.shape
# equivalent to pos_embedding[:, i:i+1] but traceable
pos_embedding = lax.dynamic_slice(
@@ -336,9 +337,11 @@ def __init__(
config: TransformerConfig,
shared_embedding: nnx.Embed | None = None,
*,
+decode: bool = False,
rngs: nnx.Rngs,
):
self.config = config
+self.decode = decode
self.shared_embedding = shared_embedding

# Target Embedding
@@ -413,7 +416,7 @@ def __call__(
assert inputs.ndim == 2 # (batch, len)

y = inputs.astype('int32')
-if not flaglib.flags.get('decode', False):
+if not self.decode:
y = shift_inputs(y, segment_ids=inputs_segmentation)
y = self.output_embed(y)
y = self.posembed_output(y, inputs_positions=inputs_positions)
@@ -450,8 +453,11 @@ class TransformerLM(nnx.Module):
config: TransformerConfig dataclass containing hyperparameters.
"""

-def __init__(self, config: TransformerConfig, *, rngs: nnx.Rngs):
+def __init__(
+self, config: TransformerConfig, *, decode: bool = False, rngs: nnx.Rngs
+):
self.config = config
+self.decode = decode
self.decoder = Decoder(config=config, shared_embedding=None, rngs=rngs)

def __call__(
@@ -475,7 +481,7 @@ def __call__(
config = self.config

# Make padding attention masks.
-if flaglib.flags.get('decode', False):
+if self.decode:
# for fast autoregressive decoding we use no decoder mask
decoder_mask = None
else:
8 changes: 4 additions & 4 deletions flax/experimental/nnx/examples/lm1b/models_test.py
@@ -217,8 +217,8 @@ def test_forward_eval(self):
self.transfer_params(config, params_nnx, params_linen)
model_nnx.update(params_nnx)

-with nnx.flags(deterministic=True, decode=False):
-output_nnx = model_nnx(sample_inputs)
+model_nnx.set_attributes(deterministic=True, decode=False)
+output_nnx = model_nnx(sample_inputs)

output_linen: jax.Array = model_linen.apply(
{'params': params_linen}, sample_inputs
@@ -263,13 +263,13 @@ def test_forward_decode(self):
self.transfer_params(config, params_nnx, params_linen)
self.transfer_cache(config, cache_nnx, cache_linen)
model_nnx.update(params_nnx, cache_nnx)
+model_nnx.set_attributes(deterministic=True, decode=True)

outputs_nnx = []
outputs_linen = []

for inputs in ar_decode_inputs:
-with nnx.flags(deterministic=True, decode=True):
-output_nnx = model_nnx(inputs)
+output_nnx = model_nnx(inputs)
outputs_nnx.append(output_nnx)

output_linen: jax.Array
24 changes: 12 additions & 12 deletions flax/experimental/nnx/examples/lm1b/train.py
@@ -194,13 +194,13 @@ def train_step(
def loss_fn(params):
"""loss function used for training."""
module = state.graphdef.merge(params)
-with nnx.flags(deterministic=False, decode=False):
-logits = module(
-inputs,
-inputs_positions=inputs_positions,
-inputs_segmentation=inputs_segmentation,
-rngs=nnx.Rngs(dropout=dropout_rng),
-)
+module.set_attributes(deterministic=False, decode=False)
+logits = module(
+inputs,
+inputs_positions=inputs_positions,
+inputs_segmentation=inputs_segmentation,
+rngs=nnx.Rngs(dropout=dropout_rng),
+)

loss, weight_sum = compute_weighted_cross_entropy(
logits, inputs, weights, label_smoothing
@@ -229,8 +229,8 @@ def eval_step(
inputs = batch['inputs']
weights = jnp.where(inputs > 0, 1.0, 0.0)
module = static.merge(params)
-with nnx.flags(deterministic=True, decode=False):
-logits = module(inputs)
+module.set_attributes(deterministic=True, decode=False)
+logits = module(inputs)

return compute_metrics(logits, inputs, weights, label_smoothing)

@@ -261,8 +261,8 @@ def tokens_ids_to_logits(flat_ids, cache: nnx.State):
"""Token slice to logits from decoder model."""
# --> [batch * beam, 1, vocab]
module = static.merge(params, cache)
-with nnx.flags(deterministic=True, decode=True):
-logits = module(flat_ids)
+module.set_attributes(deterministic=True, decode=True)
+logits = module(flat_ids)
cache = module.extract(nnx.Cache)
# Remove singleton sequence-length dimension:
# [batch, 1, vocab] --> [batch, vocab]
@@ -538,7 +538,7 @@ def constructor(config: models.TransformerConfig, key: jax.Array):
predict_step,
in_axes=(
0,
-jax.tree_map(lambda x: None, state.params),
+jax.tree_util.tree_map(lambda x: None, state.params),
0,
None,
None,
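Aside from the flag migration, the last hunk above also replaces `jax.tree_map` with `jax.tree_util.tree_map`; the short alias is deprecated in recent JAX releases, while the namespaced function behaves the same. A tiny illustrative sketch (not part of the diff):

import jax
import jax.numpy as jnp

params = {'w': jnp.ones((2, 3)), 'b': jnp.zeros((3,))}
# Build a pytree with the same structure as `params` but None at every leaf,
# as done in the in_axes entry shown above.
axes = jax.tree_util.tree_map(lambda x: None, params)
print(axes)  # every array leaf is replaced by None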
@@ -80,8 +80,8 @@ def scan_fn(
model = ScanMLP(10, n_layers=5, rngs=nnx.Rngs(0))

x = jnp.ones((3, 10))
-with nnx.flags(deterministic=False):
-y = model(x, rngs=nnx.Rngs(dropout=1))
+model.set_attributes(deterministic=False)
+y = model(x, rngs=nnx.Rngs(dropout=1))

print(jax.tree_map(jnp.shape, model.get_state()))
print(y.shape)
79 changes: 0 additions & 79 deletions flax/experimental/nnx/nnx/flaglib.py

This file was deleted.

58 changes: 58 additions & 0 deletions flax/experimental/nnx/nnx/module.py
@@ -452,6 +452,64 @@ def modules(self) -> tp.Iterator[tuple[Path, Module]]:
if isinstance(value, Module):
yield path, value

+  def set_attributes(
+    self,
+    *filters: filterlib.Filter,
+    raise_if_not_found: bool = True,
+    **attributes: tp.Any,
+  ) -> None:
+    """Sets the attributes of nested Modules including the current Module.
+    If the attribute is not found in the Module, it is ignored.
+
+    Example::
+
+      >>> from flax.experimental import nnx
+      ...
+      >>> class Block(nnx.Module):
+      ...   def __init__(self, din, dout, *, rngs: nnx.Rngs):
+      ...     self.linear = nnx.Linear(din, dout, rngs=rngs)
+      ...     self.dropout = nnx.Dropout(0.5, deterministic=False)
+      ...     self.batch_norm = nnx.BatchNorm(10, use_running_average=False, rngs=rngs)
+      ...
+      >>> block = Block(2, 5, rngs=nnx.Rngs(0))
+      >>> block.dropout.deterministic, block.batch_norm.use_running_average
+      (False, False)
+      >>> block.set_attributes(deterministic=True, use_running_average=True)
+      >>> block.dropout.deterministic, block.batch_norm.use_running_average
+      (True, True)
+
+    ``Filter``s can be used to set the attributes of specific Modules::
+
+      >>> block = Block(2, 5, rngs=nnx.Rngs(0))
+      >>> block.set_attributes(nnx.Dropout, deterministic=True, use_running_average=True)
+      >>> # Only the dropout will be modified
+      >>> block.dropout.deterministic, block.batch_norm.use_running_average
+      (True, False)
+
+    Args:
+      *filters: Filters to select the Modules to set the attributes of.
+      raise_if_not_found: If True (default), raises a ValueError if at least one attribute
+        instance is not found in one of the selected Modules.
+      **attributes: The attributes to set.
+    """
+    remaining_attributes = set(attributes.keys())
+    if not filters:
+      filters = (True,)
+    predicates = tuple(map(filterlib.to_predicate, filters))
+    for path, module in self.modules():
+      for predicate in predicates:
+        if predicate(path, module):
+          for name, value in attributes.items():
+            if hasattr(module, name):
+              remaining_attributes.remove(name)
+              setattr(module, name, value)
+          break
+
+    if remaining_attributes and raise_if_not_found:
+      raise ValueError(
+        f'Could not find at least one instance of the following attributes: {remaining_attributes}'
+      )
+
def __init_subclass__(cls, experimental_pytree: bool = False) -> None:
super().__init_subclass__()

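Beyond the docstring example, a hedged usage sketch (illustrative class and values, not from this commit) of how filters interact with the new `raise_if_not_found` argument of `set_attributes`:

from flax.experimental import nnx

class Block(nnx.Module):
  def __init__(self, *, rngs: nnx.Rngs):
    self.dropout = nnx.Dropout(0.5, deterministic=False)
    self.batch_norm = nnx.BatchNorm(10, use_running_average=False, rngs=rngs)

block = Block(rngs=nnx.Rngs(0))

# Restrict the update to BatchNorm submodules. BatchNorm has no `deterministic`
# attribute, so without raise_if_not_found=False this would raise a ValueError.
block.set_attributes(
  nnx.BatchNorm,
  use_running_average=True,
  deterministic=True,
  raise_if_not_found=False,
)
assert block.batch_norm.use_running_average is True
assert block.dropout.deterministic is False  # Dropout was filtered out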
3 changes: 0 additions & 3 deletions flax/experimental/nnx/nnx/nn/attention.py
@@ -25,7 +25,6 @@

from flax.experimental import nnx
from flax.experimental.nnx.nnx import rnglib
-from flax.experimental.nnx.nnx import flaglib
from flax.experimental.nnx.nnx.module import Module, first_from
from flax.experimental.nnx.nnx.nn import initializers
from flax.experimental.nnx.nnx.nn.dtypes import promote_dtype
@@ -510,7 +509,6 @@ def __call__(
decode = first_from(
decode,
self.decode,
-flaglib.flags.get('decode'),
error_msg="""No `decode` argument was provided to MultiHeadAttention
as either a __call__ argument, class attribute, or nnx.flag.""",
)
@@ -557,7 +555,6 @@ def __call__(
deterministic = first_from(
deterministic,
self.deterministic,
-flaglib.flags.get('deterministic'),
error_msg="""No `deterministic` argument was provided to MultiHeadAttention
as either a __call__ argument, class attribute, or nnx.flag.""",
)
3 changes: 1 addition & 2 deletions flax/experimental/nnx/nnx/nn/normalization.py
@@ -19,7 +19,7 @@
from jax import lax

from flax.experimental import nnx
-from flax.experimental.nnx.nnx import flaglib, rnglib
+from flax.experimental.nnx.nnx import rnglib
from flax.experimental.nnx.nnx.module import Module, first_from
from flax.experimental.nnx.nnx.nn import dtypes, initializers
from flax.typing import (
@@ -283,7 +283,6 @@ def __call__(
use_running_average = first_from(
use_running_average,
self.use_running_average,
-flaglib.flags.get('use_running_average'),
error_msg="""No `use_running_average` argument was provided to BatchNorm
as either a __call__ argument, class attribute, or nnx.flag.""",
)
3 changes: 1 addition & 2 deletions flax/experimental/nnx/nnx/nn/stochastic.py
@@ -17,7 +17,7 @@
import jax.numpy as jnp
from jax import lax, random

-from flax.experimental.nnx.nnx import flaglib, rnglib
+from flax.experimental.nnx.nnx import rnglib
from flax.experimental.nnx.nnx.module import Module, first_from
import dataclasses

@@ -61,7 +61,6 @@ def __call__(
deterministic = first_from(
deterministic,
self.deterministic,
-flaglib.flags.get('deterministic'),
error_msg="""No `deterministic` argument was provided to Dropout
as either a __call__ argument, class attribute, or nnx.flag.""",
)
7 changes: 3 additions & 4 deletions flax/experimental/nnx/tests/nn/test_attention.py
@@ -65,9 +65,9 @@ def __call__(self, x, sow_weights=False):
),
rng,
)
+module.set_attributes(decode=False)

-with nnx.flags(decode=False):
-_ = module(x, True)
+_ = module(x, True)
intermediates = module.pop(nnx.Intermediate)
assert intermediates['attention_layers/0/attention_weights'].raw_value[
0
].shape == (4, 8, 6, 6)
@@ -77,8 +77,7 @@ def __call__(self, x, sow_weights=False):
0
].shape == (4, 8, 6, 6)

-with nnx.flags(decode=False):
-_ = module(x)
+_ = module(x)
intermediates = module.pop(nnx.Intermediate)
assert not intermediates # empty