
Fixing batch_dim_name attribute #20674

Merged
merged 11 commits on Jan 7, 2025
12 changes: 8 additions & 4 deletions keras/src/backend/jax/distribution_lib.py
@@ -100,7 +100,7 @@ def distribute_tensor(tensor, layout):
     return global_value


-def distribute_data_input(per_process_batch, layout):
+def distribute_data_input(per_process_batch, layout, batch_dim_name):
     """Distribute the input data with the corresponding layout.

     Note that the inputs here is a local worker batch. Within the local worker,
@@ -117,9 +117,13 @@ def distribute_data_input(per_process_batch, layout):
     if not isinstance(layout, jax.sharding.Sharding):
         layout = _to_jax_layout(layout)

-    mesh_shape = list(layout.mesh.shape.values())
-    num_model_replicas_total = mesh_shape[0]  # batch dimension of the mesh
-    mesh_model_dim_size = mesh_shape[1] if len(mesh_shape) > 1 else 1
+    num_model_replicas_total = layout.mesh.shape[batch_dim_name]
+
+    mesh_model_dim_size = 1
+    for name, dim_size in layout.mesh.shape.items():
+        if not name == batch_dim_name:
+            mesh_model_dim_size *= dim_size

     num_model_replicas_per_process = num_model_replicas_total / num_processes()
     per_process_batch_size = per_process_batch.shape[0]
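A worked illustration of what the new lines compute (my own example, not part of the PR), assuming an 8-device mesh with axes {"batch": 2, "model": 4}:

    # Stand-in for layout.mesh.shape, which maps mesh axis names to sizes.
    mesh_shape = {"batch": 2, "model": 4}
    batch_dim_name = "batch"

    num_model_replicas_total = mesh_shape[batch_dim_name]  # 2 data shards
    mesh_model_dim_size = 1
    for name, dim_size in mesh_shape.items():
        if name != batch_dim_name:
            mesh_model_dim_size *= dim_size  # 4 devices per model replica

    assert (num_model_replicas_total, mesh_model_dim_size) == (2, 4)

The previous code assumed the batch dimension was the first mesh axis; the new version looks it up by name instead.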
4 changes: 3 additions & 1 deletion keras/src/backend/jax/distribution_lib_test.py
@@ -337,7 +337,9 @@ def test_distribute_data_input(self):
             mesh, jax.sharding.PartitionSpec("batch", None)
         )

-        result = backend_dlib.distribute_data_input(per_process_batch, layout)
+        result = backend_dlib.distribute_data_input(
+            per_process_batch, layout, "batch"
+        )

         # Check the shape of the global batch array
         self.assertEqual(
8 changes: 6 additions & 2 deletions keras/src/backend/jax/trainer.py
@@ -1,5 +1,6 @@
 import collections
 import itertools
+from functools import partial

 import jax
 import numpy as np
@@ -988,15 +989,18 @@ def _get_jax_state(

 def _distribute_data(data, layouts=None):
     distribution = distribution_lib.distribution()
+
     if distribution is not None:
         if layouts is None:
             layouts = tree.map_structure(
                 lambda d: distribution.get_data_layout(d.shape),
                 data,
             )
-        return tree.map_structure(
-            jax_distribution_lib.distribute_data_input, data, layouts
+        jax_dist_data_input = partial(
+            jax_distribution_lib.distribute_data_input,
+            batch_dim_name=distribution._batch_dim_name,
Member (review comment on the line above):

Can we do this without accessing the private variable _batch_dim_name? Could we consider passing the batch_dim_name as an argument to the relevant functions? Or, maybe the distribution object provides a public method or property to access the batch dimension name?
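For illustration, such a public accessor could look roughly like this (a hypothetical sketch, not code from this PR or from Keras; the constructor signature is made up):

    class Distribution:
        def __init__(self, device_mesh, batch_dim_name=None):
            self._device_mesh = device_mesh
            self._batch_dim_name = batch_dim_name

        @property
        def batch_dim_name(self):
            # Read-only access, so the trainer can use
            # distribution.batch_dim_name instead of the private attribute.
            return self._batch_dim_name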

Contributor Author:

Yes, I'll think of a cleaner way.

The goal at this point is to get a second pair of eyes on this fix and validate that it is correct. See the use cases at the end of the intro paragraph. Also, since you implemented the multi-host code, could you check that this fix does not break it?

Member:

Sure, I'll run the internal multi-host test to make sure it still works.

Contributor Author:

This will also need more tests. The failure is not in a complex case; it should have been covered by tests. I can add a couple of tests on 8-core TPUs, but I'll let you extend them to multi-host settings.

But right now, what is your opinion on the case where model and data parallelism are used at the same time and the "batch" dimension is also a sharding dimension for the model, as is the default for Gemma and Llama? How should data batches be split in that case? (And I don't think my fix covers that case - I'm not sure I understand how that case makes sense.)

Member:

I ran the internal multi-host test with your changes and it passed! I think we should be able to merge this PR after updating private variable usage (_batch_dim_name).

Sharding along the batch dimension should work! Our multi-host tests cover that case and they pass! We test all the following configs:

  @parameterized.named_parameters([
      ("data_only", (8, 1), 2, False,),
      ("data_model", (4, 2), 2, False,),
      ("model_data", (2, 4), 4, False,),
      ("model_only", (1, 8), 8, True,),
  ])

Could you point me to the colab that shows sharding along the batch dimension doesn't work for a 2D mesh?

I think what is not supported yet is 3D+ mesh. I agree that this would be a great feature to have. Maybe we can create a feature request issue and plan for supporting it.

PS: US holidays will start tomorrow and I'll be back after the new year! Happy Holidays, Martin!

Contributor Author (@martin-gorner), Dec 27, 2024:

All the repro colabs are in the intro.

The tests may be passing but if we don't understand the use case, it could be by accident. The thing I do not understand and that the fix does not cover is:

    num_model_replicas_total = layout.mesh.shape[batch_dim_name]
    mesh_model_dim_size = nb_devices / num_model_replicas_total  # not actual code, but it amounts to this

It seems to me that these expressions assume that the model is NOT sharded on the 'batch' dimension. The expression num_model_replicas_total = layout.mesh.shape[batch_dim_name] is only true when the model is replicated on the 'batch' dimension and sharded on all other dimensions. If the model is also sharded on the 'batch' dimension, I'm not sure how many model replicas there are.
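To make this concrete, a small worked example (my own numbers, not from the discussion), assuming 8 devices on a mesh with axes {"batch": 2, "model": 4}:

    nb_devices = 8
    mesh = {"batch": 2, "model": 4}

    num_model_replicas_total = mesh["batch"]                       # 2
    mesh_model_dim_size = nb_devices // num_model_replicas_total   # 4

    # These values describe the layout only if the weights are replicated
    # across "batch" and sharded across "model". If a weight is also sharded
    # across "batch", a single copy of the model spans all 8 devices, so the
    # real replica count is 1, which the two expressions above cannot express.
    print(num_model_replicas_total, mesh_model_dim_size)  # 2 4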

Member:

The model should not be sharded on the batch dimension; it should only be sharded on the model dimension, while data is sharded on the batch dimension. The number of model replicas equals the size of the batch dimension (the number of data shards) because each data shard needs a full replica of the model to process it.

Contributor Author:

I tend to agree with you, but the default sharding for Gemma does shard the model on the "batch" dimension. See here. The code references this article as a rationale.

Contributor Author (@martin-gorner), Jan 3, 2025:

I went through the article and the only mention of sharding or replication I could find is this sentence: "Within a pod, we use 16-way model sharding and 16-way data replication for the 7B model." It does not say anything about sharding the model on the batch dimension.

The sharding on the batch dimension was added by Scott in PR1491. The PR discussion says "The new setting is based on the Gemma training script internally". Can you find those scripts to investigate? Are there other places we could check this? Maybe some JAX Gemma fine-tuning scripts?

Anyway, in the short term, I think we should just remove the batch dimension from the default layout map and then safely assume that the model is NOT sharded on the batch dim. This should work, I think, even for 3D sharding (model sharded on "model" and "sequence" dims, while data is sharded on the "batch" dim).
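A rough sketch of that proposal (my own illustration; the weight-path regexes are made up, not the actual Gemma defaults): a layout map that shards weights only on "model", leaving "batch" entirely to the data:

    import keras
    from keras import distribution

    mesh = distribution.DeviceMesh(
        shape=(2, 4),  # assumes 8 devices: 2-way data, 4-way model parallelism
        axis_names=("batch", "model"),
        devices=keras.distribution.list_devices(),
    )

    layout_map = distribution.LayoutMap(mesh)
    # Weights touch only the "model" axis; "batch" never appears here.
    layout_map["token_embedding/embeddings"] = (None, "model")
    layout_map["decoder_block.*attention.*(query|key|value)/kernel"] = (
        None, "model", None,
    )

    model_parallel = distribution.ModelParallel(
        layout_map=layout_map, batch_dim_name="batch"
    )
    keras.distribution.set_distribution(model_parallel)

With no weight sharded on "batch", num_model_replicas_total = layout.mesh.shape[batch_dim_name] holds by construction.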

         )
+        return tree.map_structure(jax_dist_data_input, data, layouts)

     return tree.map_structure(jax.device_put, data)

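As a side note on the mechanics of the trainer change, here is a toy example (my own, not Keras code) of the functools.partial pattern used there: the extra batch_dim_name argument is pinned so the resulting callable still fits tree.map_structure(fn, data, layouts):

    from functools import partial

    from keras import tree  # keras.tree maps a function over nested structures


    def distribute_data_input(batch, layout, batch_dim_name):
        # Stand-in for the real distribution function; just formats its inputs.
        return f"{batch} -> {layout} (batch axis: {batch_dim_name})"


    data = {"x": "x_batch", "y": "y_batch"}
    layouts = {"x": "x_layout", "y": "y_layout"}

    dist_fn = partial(distribute_data_input, batch_dim_name="batch")
    print(tree.map_structure(dist_fn, data, layouts))
    # {'x': 'x_batch -> x_layout (batch axis: batch)',
    #  'y': 'y_batch -> y_layout (batch axis: batch)'}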