Reduce the number of commits in push_to_hub #6269

Merged
merged 19 commits on Oct 16, 2023
Changes from 1 commit
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -61,10 +61,10 @@ jobs:
      python -m spacy download fr_core_news_sm
  - name: Install dependencies (latest versions)
    if: ${{ matrix.deps_versions == 'deps-latest' }}
-   run: pip install --upgrade pyarrow huggingface-hub dill
+   run: pip install --upgrade pyarrow dill
  - name: Install dependencies (minimum versions)
    if: ${{ matrix.deps_versions != 'deps-latest' }}
-   run: pip install pyarrow==8.0.0 huggingface-hub==0.14.0 transformers dill==0.3.1.1
+   run: pip install pyarrow==8.0.0 transformers dill==0.3.1.1
  - name: Test with pytest
    run: |
      python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/
2 changes: 1 addition & 1 deletion setup.py
@@ -131,7 +131,7 @@
"aiohttp",
# To get datasets from the Datasets Hub on huggingface.co
# minimum 0.14.0 to support HfFileSystem
"huggingface-hub>=0.14.0,<1.0.0",
"huggingface_hub @ git+https://github.com/huggingface/huggingface_hub.git@preupload-files-before-commit",
# Utilities from PyPA to e.g., compare versions
"packaging",
# To parse YAML metadata from dataset cards
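
Since the requirement now points at an unreleased huggingface_hub branch rather than a published release, a quick runtime check can confirm that the installed build actually exposes the preupload API this PR relies on. This is only an illustrative sketch, not part of the change:

```python
# Sketch only: verify the installed huggingface_hub provides the preupload API
# used by this PR, whichever branch or release it came from.
import huggingface_hub

if not hasattr(huggingface_hub.HfApi, "preupload_lfs_files"):
    raise RuntimeError(
        f"huggingface_hub {huggingface_hub.__version__} has no HfApi.preupload_lfs_files; "
        "install the preupload-files-before-commit branch or a release that includes it"
    )
```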
39 changes: 24 additions & 15 deletions src/datasets/arrow_dataset.py
@@ -58,7 +58,14 @@
import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
- from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HfFolder
+ from huggingface_hub import (
+     CommitOperationAdd,
+     CommitOperationDelete,
+     DatasetCard,
+     DatasetCardData,
+     HfApi,
+     HfFolder,
+ )
from multiprocess import Pool
from requests import HTTPError

@@ -5293,6 +5300,7 @@ def path_in_repo(_index, shard):

uploaded_size = 0
shards_path_in_repo = []
+ operations = []
for index, shard in logging.tqdm(
enumerate(itertools.chain([first_shard], shards_iter)),
desc="Pushing dataset shards to the dataset hub",
@@ -5305,12 +5313,13 @@ def path_in_repo(_index, shard):
buffer = BytesIO()
shard.to_parquet(buffer)
uploaded_size += buffer.tell()
+ shard_addition = CommitOperationAdd(path_in_repo=shard_path_in_repo, path_or_fileobj=buffer)
+ api.preupload_lfs_files(repo_id, [shard_addition], token=token, repo_type="dataset", revision=branch)
_retry(
- api.upload_file,
+ api.preupload_lfs_files,
func_kwargs={
"path_or_fileobj": buffer.getvalue(),
"path_in_repo": shard_path_in_repo,
"repo_id": repo_id,
"additions": [shard_addition],
"token": token,
"repo_type": "dataset",
"revision": branch,
@@ -5321,6 +5330,7 @@ def path_in_repo(_index, shard):
max_retries=5,
max_wait_time=20.0,
)
+ operations.append(shard_addition)
shards_path_in_repo.append(shard_path_in_repo)

# Cleanup to remove unused files
@@ -5329,23 +5339,22 @@
for data_file in data_files
if data_file.startswith(f"{data_dir}/{split}-") and data_file not in shards_path_in_repo
]
+ for data_file in data_files_to_delete:
+     operations.append(CommitOperationDelete(path_in_repo=data_file))
download_config = DownloadConfig(token=token)
deleted_size = sum(
xgetsize(hf_hub_url(repo_id, data_file, revision=branch), download_config=download_config)
for data_file in data_files_to_delete
)

- def delete_file(file):
-     api.delete_file(file, repo_id=repo_id, token=token, repo_type="dataset", revision=branch)

- if len(data_files_to_delete):
-     for data_file in logging.tqdm(
-         data_files_to_delete,
-         desc="Deleting unused files from dataset repository",
-         total=len(data_files_to_delete),
-         disable=not logging.is_progress_bar_enabled(),
-     ):
-         delete_file(data_file)
+ api.create_commit(
+     repo_id,
+     operations=operations,
+     token=token,
+     repo_type="dataset",
+     revision=branch,
+     commit_message="Upload data files",
+ )

repo_files = list(set(files) - set(data_files_to_delete))
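
Taken together, the changes above stop committing every shard (and every deletion) individually: each shard is staged with preupload_lfs_files, the additions and deletions are gathered in operations, and a single create_commit finalizes them. Below is a self-contained sketch of that flow using public huggingface_hub calls; the repository name, file paths, and byte payloads are placeholders, and write access to the target dataset repo is assumed:

```python
from io import BytesIO

from huggingface_hub import CommitOperationAdd, CommitOperationDelete, HfApi

api = HfApi()
repo_id = "username/my-dataset"  # placeholder repo; requires write access and a valid token

operations = []
for i, payload in enumerate([b"shard 0 bytes", b"shard 1 bytes"]):  # placeholder shard contents
    addition = CommitOperationAdd(
        path_in_repo=f"data/train-{i:05d}-of-00002.parquet", path_or_fileobj=BytesIO(payload)
    )
    # Upload the LFS blob right away; no commit is created at this point.
    api.preupload_lfs_files(repo_id, [addition], repo_type="dataset")
    operations.append(addition)

# Stale shards left over from a previous push can be removed in the same commit.
operations.append(CommitOperationDelete(path_in_repo="data/train-00000-of-00005.parquet"))

# One commit covers every addition and deletion, instead of one commit per file.
api.create_commit(
    repo_id,
    operations=operations,
    commit_message="Upload data files",
    repo_type="dataset",
)
```

Because preupload hashes each file before transfer, blobs that already exist on the Hub are not re-uploaded, which appears to be what the updated skip-identical-files test below exercises.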

4 changes: 3 additions & 1 deletion tests/test_upstream_hub.py
@@ -453,7 +453,9 @@ def test_push_dataset_to_hub_custom_splits(self, temporary_repo):
def test_push_dataset_to_hub_skip_identical_files(self, temporary_repo):
ds = Dataset.from_dict({"x": list(range(1000)), "y": list(range(1000))})
with temporary_repo() as ds_name:
with patch("datasets.arrow_dataset.HfApi.upload_file", side_effect=self._api.upload_file) as mock_hf_api:
with patch(
"datasets.arrow_dataset.HfApi.preupload_lfs_files", side_effect=self._api.preupload_lfs_files
) as mock_hf_api:
# Initial push
ds.push_to_hub(ds_name, token=self._token, max_shard_size="1KB")
call_count_old = mock_hf_api.call_count
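
The patching pattern above keeps the real upload path intact while counting calls: because side_effect is the bound method of a real HfApi instance, every call is forwarded to the genuine API and still recorded by the mock. A rough sketch of the same pattern outside the test class, with placeholder repo and token values:

```python
from unittest.mock import patch

from huggingface_hub import HfApi

api = HfApi()

# side_effect=<real bound method> forwards every call to the genuine API while the
# mock still records how often push_to_hub triggered a shard pre-upload.
with patch(
    "datasets.arrow_dataset.HfApi.preupload_lfs_files", side_effect=api.preupload_lfs_files
) as mock_preupload:
    ...  # e.g. ds.push_to_hub("username/my-dataset", token="hf_xxx", max_shard_size="1KB")

print(mock_preupload.call_count)  # number of pre-upload calls made inside the block
```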
Expand Down
Loading