comments + raise error if sharding is ambiguous

huggingface · Jun 9, 2022 · 8f5579e · 8f5579e · github-actions · Jun 9, 2022
1 parent 54e9f39
commit 8f5579e
Show file tree

Hide file tree

Showing 2 changed files with 25 additions and 2 deletions.
diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py
@@ -85,6 +85,19 @@ def _shuffle_kwargs(rng: np.random.Generator, kwargs: dict) -> dict:
 
 def _shard_kwargs(shard_idx: int, kwargs: dict) -> dict:
     """Return a copy of the input kwargs but with only one shard"""
+    # Having lists of different sizes makes sharding ambiguous, raise an error in this case
+    # until we decide how to define sharding without ambiguity for users
+    lists_lengths = {key: len(value) for key, value in kwargs.items() if isinstance(value, list)}
+    if len(set(lists_lengths.values())) > 1:
+        raise RuntimeError(
+            (
+                "Sharding is ambiguous for this dataset: "
+                + "we found several data sources lists of different lengths, and we don't know over which list we should parallelize:\n"
+                + "\n".join(f"\t- key {key} has length {length}" for key, length in lists_lengths.items())
+                + "\nTo fix this, check the dataset script 'gen_kwargs' and make sure to use lists only for data sources, "
+                + "and use tuples otherwise. In the end there should only one single list, or several lists with the same length."
+            )
+        )
     return {key: [value[shard_idx]] if isinstance(value, list) else value for key, value in kwargs.items()}
 
 

diff --git a/src/datasets/utils/patching.py b/src/datasets/utils/patching.py
@@ -47,13 +47,18 @@ def __init__(self, obj, target: str, new, attrs=None):
 
     def __enter__(self):
         *submodules, target_attr = self.target.split(".")
+
         # Patch modules:
         # it's used to patch attributes of submodules like "os.path.join";
         # in this case we need to patch "os" and "os.path"
+
         for i in range(len(submodules)):
             submodule = import_module(".".join(submodules[: i + 1]))
+            # We iterate over all the globals in self.obj in case we find "os" or "os.path"
             for attr in self.obj.__dir__():
                 obj_attr = getattr(self.obj, attr)
+                # We don't check for the name of the global, but rather if its value *is* "os" or "os.path".
+                # This allows patching renamed modules like "from os import path as ospath".
                 if obj_attr is submodule or (
                     (isinstance(obj_attr, _PatchedModuleObj) and obj_attr._original_module is submodule)
                 ):
@@ -67,17 +72,22 @@ def __enter__(self):
                         patched = getattr(patched, key)
                     # finally set the target attribute
                     setattr(patched, target_attr, self.new)
+
         # Patch attribute itself:
         # it's used for builtins like "open",
         # and also to patch "os.path.join" we may also need to patch "join"
         # itself if it was imported as "from os.path import join".
-        if submodules:  # if it's an attribute of a submodule
+
+        if submodules:  # if it's an attribute of a submodule like "os.path.join"
             attr_value = getattr(import_module(".".join(submodules)), target_attr)
+            # We iterate over all the globals in self.obj in case we find "os.path.join"
             for attr in self.obj.__dir__():
+                # We don't check for the name of the global, but rather if its value *is* "os.path.join".
+                # This allows patching renamed attributes like "from os.path import join as pjoin".
                 if getattr(self.obj, attr) is attr_value:
                     self.original[attr] = getattr(self.obj, attr)
                     setattr(self.obj, attr, self.new)
-        elif target_attr in globals()["__builtins__"]:  # if it'a s builtin
+        elif target_attr in globals()["__builtins__"]:  # if it's a builtin like "open"
             self.original[target_attr] = globals()["__builtins__"][target_attr]
             setattr(self.obj, target_attr, self.new)
         else: