Dataparallel Training #257

Merged
merged 14 commits on Apr 8, 2024
Changes from 12 commits
3 changes: 3 additions & 0 deletions apax/config/train_config.py
@@ -331,13 +331,16 @@ class Config(BaseModel, frozen=True, extra="forbid"):
callbacks: List of :class:`CallbackConfig <config.CallbackConfig>` configurations.
progress_bar: Progressbar configuration.
checkpoints: Checkpoint configuration.
data_parallel: Automatically uses all available GPUs for data-parallel training.
Set to False to force single-device training.
"""

n_epochs: PositiveInt
patience: Optional[PositiveInt] = None
seed: int = 1
n_models: int = 1
n_jitted_steps: int = 1
data_parallel: bool = True

data: DataConfig
model: ModelConfig = ModelConfig()
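For context, the new flag is a plain boolean on the training config with a default of True. A minimal standalone sketch of its behaviour, reproducing only this field (the real Config has many more required sections):

from pydantic import BaseModel


class TrainConfigSketch(BaseModel, frozen=True, extra="forbid"):
    # Set to False to force single-device training even when several GPUs are visible.
    data_parallel: bool = True


print(TrainConfigSketch())                     # data_parallel=True
print(TrainConfigSketch(data_parallel=False))  # disabled, e.g. via the config file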
24 changes: 16 additions & 8 deletions apax/data/input_pipeline.py
@@ -205,7 +205,7 @@ def __iter__(self):
space = self.n_data - self.count
self.enqueue(space)

def shuffle_and_batch(self):
def shuffle_and_batch(self, sharding=None):
"""Shuffles and batches the inputs/labels. This function prepares the
inputs and labels for the whole training and prefetches the data.

@@ -227,10 +227,12 @@ def shuffle_and_batch(self):
).batch(batch_size=self.batch_size)
if self.n_jit_steps > 1:
ds = ds.batch(batch_size=self.n_jit_steps)
ds = prefetch_to_single_device(ds.as_numpy_iterator(), 2)
ds = prefetch_to_single_device(
ds.as_numpy_iterator(), 2, sharding, n_step_jit=self.n_jit_steps > 1
)
return ds

def batch(self) -> Iterator[jax.Array]:
def batch(self, sharding=None) -> Iterator[jax.Array]:
ds = (
tf.data.Dataset.from_generator(
lambda: self, output_signature=self.make_signature()
@@ -239,7 +241,9 @@ def batch(self) -> Iterator[jax.Array]:
.repeat(self.n_epochs)
)
ds = ds.batch(batch_size=self.batch_size)
ds = prefetch_to_single_device(ds.as_numpy_iterator(), 2)
ds = prefetch_to_single_device(
ds.as_numpy_iterator(), 2, sharding, n_step_jit=self.n_jit_steps > 1
)
return ds

def cleanup(self):
@@ -265,7 +269,7 @@ def __iter__(self):
self.count = 0
self.enqueue(space)

def shuffle_and_batch(self):
def shuffle_and_batch(self, sharding=None):
"""Shuffles and batches the inputs/labels. This function prepares the
inputs and labels for the whole training and prefetches the data.

@@ -283,15 +287,19 @@ def shuffle_and_batch(self):
).batch(batch_size=self.batch_size)
if self.n_jit_steps > 1:
ds = ds.batch(batch_size=self.n_jit_steps)
ds = prefetch_to_single_device(ds.as_numpy_iterator(), 2)
ds = prefetch_to_single_device(
ds.as_numpy_iterator(), 2, sharding, n_step_jit=self.n_jit_steps > 1
)
return ds

def batch(self) -> Iterator[jax.Array]:
def batch(self, sharding=None) -> Iterator[jax.Array]:
ds = tf.data.Dataset.from_generator(
lambda: self, output_signature=self.make_signature()
)
ds = ds.batch(batch_size=self.batch_size)
ds = prefetch_to_single_device(ds.as_numpy_iterator(), 2)
ds = prefetch_to_single_device(
ds.as_numpy_iterator(), 2, sharding, n_step_jit=self.n_jit_steps > 1
)
return ds


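For reference, the batching pattern used in shuffle_and_batch — shuffle, batch, optionally stack n_jitted_steps batches into an extra leading axis, then expose a NumPy iterator to the prefetch helper — can be reproduced in isolation. The sketch below uses toy data and illustrative names, not the apax dataset classes:

import numpy as np
import tensorflow as tf

n_data, batch_size, n_jitted_steps = 32, 8, 2

ds = tf.data.Dataset.from_tensor_slices(np.arange(n_data, dtype=np.float32))
ds = ds.shuffle(buffer_size=n_data).batch(batch_size=batch_size)
if n_jitted_steps > 1:
    # Extra leading axis so one jitted call can step over several batches on device.
    ds = ds.batch(batch_size=n_jitted_steps)

it = ds.as_numpy_iterator()
print(next(it).shape)  # (2, 8) for this toy 1D data: (n_jitted_steps, batch_size)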
22 changes: 19 additions & 3 deletions apax/data/preprocessing.py
@@ -53,16 +53,32 @@ def get_shrink_wrapped_cell(positions):
return cell, cell_origin


def prefetch_to_single_device(iterator, size: int):
def prefetch_to_single_device(iterator, size: int, sharding=None, n_step_jit=False):
"""
inspired by
https://flax.readthedocs.io/en/latest/_modules/flax/jax_utils.html#prefetch_to_device
except it does not shard the data.
"""
queue = collections.deque()

def _prefetch(x):
return jnp.asarray(x)
if sharding:
n_devices = len(sharding._devices)
slice_start = 1
shape = [n_devices]
if n_step_jit:
# replicate over multi-batch axis
# data shape: njit x bs x ...
slice_start = 2
shape.insert(0, 1)

def _prefetch(x: jax.Array):
if sharding:
remaining_axes = [1] * len(x.shape[slice_start:])
final_shape = tuple(shape + remaining_axes)
x = jax.device_put(x, sharding.reshape(final_shape))
else:
x = jnp.asarray(x)
return x

def enqueue(n):
for data in itertools.islice(iterator, n):
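The sharded prefetch boils down to reshaping a 1D PositionalSharding so that the leading batch axis is split across devices while every remaining axis stays replicated, then placing each batch with jax.device_put before it enters the queue. A self-contained sketch under those assumptions — it reads the public sharding.shape rather than sharding._devices, and it also runs on a single device:

import collections
import itertools

import jax
import numpy as np
from jax.experimental import mesh_utils
from jax.sharding import PositionalSharding


def prefetch_sharded(iterator, size: int, sharding):
    """Prefetch `size` batches, placing each one according to `sharding`."""
    queue = collections.deque()
    n_devices = sharding.shape[0]

    def _put(x):
        # Split the leading (batch) axis across devices, replicate all other axes.
        final_shape = (n_devices,) + (1,) * (x.ndim - 1)
        return jax.device_put(x, sharding.reshape(final_shape))

    def enqueue(n):
        for data in itertools.islice(iterator, n):
            queue.append(_put(data))

    enqueue(size)
    while queue:
        yield queue.popleft()
        enqueue(1)


n_dev = len(jax.devices())
sharding = PositionalSharding(mesh_utils.create_device_mesh((n_dev,)))
batches = (np.ones((n_dev * 4, 3), dtype=np.float32) for _ in range(3))
for batch in prefetch_sharded(batches, 2, sharding):
    print(batch.shape, len(batch.devices()))  # batch axis is spread over all devices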
1 change: 1 addition & 0 deletions apax/train/run.py
@@ -150,5 +150,6 @@ def run(user_config, log_level="error"):
disable_pbar=config.progress_bar.disable_epoch_pbar,
disable_batch_pbar=config.progress_bar.disable_batch_pbar,
is_ensemble=config.n_models > 1,
data_parallel=config.data_parallel,
)
log.info("Finished training")
17 changes: 15 additions & 2 deletions apax/train/trainer.py
@@ -8,6 +8,8 @@
import jax.numpy as jnp
import numpy as np
from clu import metrics
from jax.experimental import mesh_utils
from jax.sharding import PositionalSharding
from tqdm import trange

from apax.data.input_pipeline import InMemoryDataset
@@ -31,6 +33,7 @@ def fit(
disable_pbar: bool = False,
disable_batch_pbar: bool = True,
is_ensemble=False,
data_parallel=True,
):
log.info("Beginning Training")
callbacks.on_train_begin()
@@ -51,12 +54,19 @@
f"n_epochs <= current epoch from checkpoint ({n_epochs} <= {start_epoch})"
)

devices = len(jax.devices())
if devices > 1 and data_parallel:
sharding = PositionalSharding(mesh_utils.create_device_mesh((devices,)))
state = jax.device_put(state, sharding.replicate())
else:
sharding = None

train_steps_per_epoch = train_ds.steps_per_epoch()
batch_train_ds = train_ds.shuffle_and_batch()
batch_train_ds = train_ds.shuffle_and_batch(sharding)

if val_ds is not None:
val_steps_per_epoch = val_ds.steps_per_epoch()
batch_val_ds = val_ds.batch()
batch_val_ds = val_ds.batch(sharding)

best_loss = np.inf
early_stopping_counter = 0
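On the trainer side the setup is the mirror image: the train state is replicated on every device, batches arrive split along their leading axis, and jax.jit partitions the computation according to those input shardings. A minimal sketch with a toy parameter pytree (illustrative names, not the apax train state):

import jax
import jax.numpy as jnp
from jax.experimental import mesh_utils
from jax.sharding import PositionalSharding

n_devices = len(jax.devices())
sharding = PositionalSharding(mesh_utils.create_device_mesh((n_devices,)))

# Replicate the parameters: every device holds an identical copy.
params = {"w": jnp.ones((4, 4)), "b": jnp.zeros((4,))}
params = jax.device_put(params, sharding.replicate())

# Shard the batch along its leading axis; the feature axis stays replicated.
batch = jnp.ones((n_devices * 8, 4))
batch = jax.device_put(batch, sharding.reshape((n_devices, 1)))


@jax.jit
def forward(params, x):
    return x @ params["w"] + params["b"]


out = forward(params, batch)  # jit follows the input shardings, so the matmul runs data parallel
print(out.shape, len(out.devices()))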
@@ -85,6 +95,9 @@
callbacks.on_train_batch_begin(batch=batch_idx)

batch = next(batch_train_ds)

# print(jax.tree_map(lambda x: x.devices(), batch))

Contributor: comment

(
(state, train_batch_metrics),
batch_loss,
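The commented-out print above is a quick way to verify where a batch actually landed; a standalone version, with an illustrative pytree in place of a real batch, looks like this:

import jax
import jax.numpy as jnp

# Toy stand-in for a batch pytree; real batches are nested containers of arrays.
batch = {"positions": jnp.ones((8, 3)), "numbers": jnp.zeros((8,), dtype=jnp.int32)}

# tree_util.tree_map is the non-deprecated spelling of jax.tree_map.
print(jax.tree_util.tree_map(lambda x: x.devices(), batch))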