huggingface · amyeroberts · Sep 7, 2023 · Aug 30, 2023 · Aug 30, 2023 · Aug 30, 2023
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
@@ -65,7 +65,7 @@
 from .modeling_utils import PreTrainedModel, load_sharded_checkpoint, unwrap_model
 from .models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_MAPPING_NAMES
 from .optimization import Adafactor, get_scheduler
-from .pytorch_utils import ALL_LAYERNORM_LAYERS
+from .pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_less_than_1_11
 from .tokenization_utils_base import PreTrainedTokenizerBase
 from .trainer_callback import (
     CallbackHandler,
@@ -82,6 +82,7 @@
     LabelSmoother,
     LengthGroupedSampler,
     SequentialDistributedSampler,
+    check_dataloader_randomsampler,
     distributed_broadcast_scalars,
     distributed_concat,
     find_batch_size,
@@ -215,6 +216,7 @@
 if TYPE_CHECKING:
     import optuna
 
+
 logger = logging.get_logger(__name__)
 
 
@@ -1782,8 +1784,17 @@ def _inner_training_loop(
         # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point.
         if not args.ignore_data_skip:
             for epoch in range(epochs_trained):
-                for _ in train_dataloader:
-                    break
+                sampler, is_random_sampler = check_dataloader_randomsampler(train_dataloader)
+
+                if is_torch_less_than_1_11 or not is_random_sampler:
+                    # We just need to begin an iteration to create the randomization of the sampler.
+                    for _ in train_dataloader:
+                        break
+                else:
+                    # Otherwise we need to iterate over the whole sampler, because it performs an additional
+                    # random operation at the very end.
+                    sampler = sampler if sampler is not None else []
+                    _ = list(sampler)
 
         total_batched_samples = 0
         for epoch in range(epochs_trained, num_train_epochs):

diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py
@@ -55,6 +55,14 @@
 logger = logging.get_logger(__name__)
 
 
+def check_dataloader_randomsampler(dataloader):
+    if hasattr(dataloader, "sampler") and isinstance(dataloader.sampler, RandomSampler):
+        return dataloader.sampler, True
+    if hasattr(dataloader, "batch_sampler"):
+        return check_dataloader_randomsampler(dataloader.batch_sampler)
+    return dataloader.sampler, False
-def check_dataloader_randomsampler(dataloader):
-    if hasattr(dataloader, "sampler") and isinstance(dataloader.sampler, RandomSampler):
-        return dataloader.sampler, True
-    if hasattr(dataloader, "batch_sampler"):
-        return check_dataloader_randomsampler(dataloader.batch_sampler)
-    return dataloader.sampler, False
+def get_dataloader_sampler(dataloader):
+    if hasattr(dataloader, "sampler"):
+        return dataloader.sampler
+    if hasattr(dataloader, "batch_sampler"):
+        return get_dataloader_sampler(dataloader.batch_sampler)
-def check_dataloader_randomsampler(dataloader):
-    if hasattr(dataloader, "sampler") and isinstance(dataloader.sampler, RandomSampler):
-        return dataloader.sampler, True
-    if hasattr(dataloader, "batch_sampler"):
-        return check_dataloader_randomsampler(dataloader.batch_sampler)
-    return dataloader.sampler, False
+def get_dataloader_sampler(dataloader):
+    if hasattr(dataloader, "sampler"):
+        return dataloader.sampler
+    if hasattr(dataloader, "batch_sampler"):
+        return get_dataloader_sampler(dataloader.batch_sampler)
+
+
 def atleast_1d(tensor_or_array: Union[torch.Tensor, np.ndarray]):
     if isinstance(tensor_or_array, torch.Tensor):
         if hasattr(torch, "atleast_1d"):