Reduce the number of commits in push_to_hub #6269

Merged
merged 19 commits on Oct 16, 2023
Changes from 1 commit
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -61,10 +61,10 @@ jobs:
      python -m spacy download fr_core_news_sm
  - name: Install dependencies (latest versions)
    if: ${{ matrix.deps_versions == 'deps-latest' }}
-   run: pip install --upgrade pyarrow huggingface-hub dill
+   run: pip install --upgrade pyarrow dill
  - name: Install dependencies (minimum versions)
    if: ${{ matrix.deps_versions != 'deps-latest' }}
-   run: pip install pyarrow==8.0.0 huggingface-hub==0.14.0 transformers dill==0.3.1.1
+   run: pip install pyarrow==8.0.0 transformers dill==0.3.1.1
  - name: Test with pytest
    run: |
      python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/
2 changes: 1 addition & 1 deletion setup.py
@@ -131,7 +131,7 @@
"aiohttp",
# To get datasets from the Datasets Hub on huggingface.co
# minimum 0.14.0 to support HfFileSystem
"huggingface-hub>=0.14.0,<1.0.0",
"huggingface_hub @ git+https://github.com/huggingface/huggingface_hub.git@preupload-files-before-commit",
# Utilities from PyPA to e.g., compare versions
"packaging",
# To parse YAML metadata from dataset cards
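
Since the requirement now points at an unreleased huggingface_hub branch rather than a published release, a quick runtime check can confirm that the installed build actually exposes the preupload API this PR relies on. This is only an illustrative sketch, not part of the change:

```python
# Sketch only: verify the installed huggingface_hub provides the preupload API
# used by this PR, whichever branch or release it came from.
import huggingface_hub

if not hasattr(huggingface_hub.HfApi, "preupload_lfs_files"):
    raise RuntimeError(
        f"huggingface_hub {huggingface_hub.__version__} has no HfApi.preupload_lfs_files; "
        "install the preupload-files-before-commit branch or a release that includes it"
    )
```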
39 changes: 24 additions & 15 deletions src/datasets/arrow_dataset.py
@@ -58,7 +58,14 @@
import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
- from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HfFolder
+ from huggingface_hub import (
+     CommitOperationAdd,
+     CommitOperationDelete,
+     DatasetCard,
+     DatasetCardData,
+     HfApi,
+     HfFolder,
+ )
from multiprocess import Pool
from requests import HTTPError

@@ -5293,6 +5300,7 @@ def path_in_repo(_index, shard):

uploaded_size = 0
shards_path_in_repo = []
+ operations = []
for index, shard in logging.tqdm(
enumerate(itertools.chain([first_shard], shards_iter)),
desc="Pushing dataset shards to the dataset hub",
@@ -5305,12 +5313,13 @@ def path_in_repo(_index, shard):
buffer = BytesIO()
shard.to_parquet(buffer)
uploaded_size += buffer.tell()
+ shard_addition = CommitOperationAdd(path_in_repo=shard_path_in_repo, path_or_fileobj=buffer)
+ api.preupload_lfs_files(repo_id, [shard_addition], token=token, repo_type="dataset", revision=branch)
_retry(
- api.upload_file,
+ api.preupload_lfs_files,
func_kwargs={
"path_or_fileobj": buffer.getvalue(),
"path_in_repo": shard_path_in_repo,
"repo_id": repo_id,
"additions": [shard_addition],
"token": token,
"repo_type": "dataset",
"revision": branch,
@@ -5321,6 +5330,7 @@ def path_in_repo(_index, shard):
max_retries=5,
max_wait_time=20.0,
)
+ operations.append(shard_addition)
shards_path_in_repo.append(shard_path_in_repo)

# Cleanup to remove unused files
@@ -5329,23 +5339,22 @@
for data_file in data_files
if data_file.startswith(f"{data_dir}/{split}-") and data_file not in shards_path_in_repo
]
+ for data_file in data_files_to_delete:
+     operations.append(CommitOperationDelete(path_in_repo=data_file))
download_config = DownloadConfig(token=token)
deleted_size = sum(
xgetsize(hf_hub_url(repo_id, data_file, revision=branch), download_config=download_config)
for data_file in data_files_to_delete
)

- def delete_file(file):
-     api.delete_file(file, repo_id=repo_id, token=token, repo_type="dataset", revision=branch)

- if len(data_files_to_delete):
-     for data_file in logging.tqdm(
-         data_files_to_delete,
-         desc="Deleting unused files from dataset repository",
-         total=len(data_files_to_delete),
-         disable=not logging.is_progress_bar_enabled(),
-     ):
-         delete_file(data_file)
+ api.create_commit(
+     repo_id,
+     operations=operations,
+     token=token,
+     repo_type="dataset",
+     revision=branch,
+     commit_message="Upload data files",
+ )

repo_files = list(set(files) - set(data_files_to_delete))
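
Taken together, the changes above stop committing every shard (and every deletion) individually: each shard is staged with preupload_lfs_files, the additions and deletions are gathered in operations, and a single create_commit finalizes them. Below is a self-contained sketch of that flow using public huggingface_hub calls; the repository name, file paths, and byte payloads are placeholders, and write access to the target dataset repo is assumed:

```python
from io import BytesIO

from huggingface_hub import CommitOperationAdd, CommitOperationDelete, HfApi

api = HfApi()
repo_id = "username/my-dataset"  # placeholder repo; requires write access and a valid token

operations = []
for i, payload in enumerate([b"shard 0 bytes", b"shard 1 bytes"]):  # placeholder shard contents
    addition = CommitOperationAdd(
        path_in_repo=f"data/train-{i:05d}-of-00002.parquet", path_or_fileobj=BytesIO(payload)
    )
    # Upload the LFS blob right away; no commit is created at this point.
    api.preupload_lfs_files(repo_id, [addition], repo_type="dataset")
    operations.append(addition)

# Stale shards left over from a previous push can be removed in the same commit.
operations.append(CommitOperationDelete(path_in_repo="data/train-00000-of-00005.parquet"))

# One commit covers every addition and deletion, instead of one commit per file.
api.create_commit(
    repo_id,
    operations=operations,
    commit_message="Upload data files",
    repo_type="dataset",
)
```

Because preupload hashes each file before transfer, blobs that already exist on the Hub are not re-uploaded, which appears to be what the updated skip-identical-files test below exercises.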

4 changes: 3 additions & 1 deletion tests/test_upstream_hub.py
@@ -453,7 +453,9 @@ def test_push_dataset_to_hub_custom_splits(self, temporary_repo):
def test_push_dataset_to_hub_skip_identical_files(self, temporary_repo):
ds = Dataset.from_dict({"x": list(range(1000)), "y": list(range(1000))})
with temporary_repo() as ds_name:
with patch("datasets.arrow_dataset.HfApi.upload_file", side_effect=self._api.upload_file) as mock_hf_api:
with patch(
"datasets.arrow_dataset.HfApi.preupload_lfs_files", side_effect=self._api.preupload_lfs_files
) as mock_hf_api:
# Initial push
ds.push_to_hub(ds_name, token=self._token, max_shard_size="1KB")
call_count_old = mock_hf_api.call_count
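
The patching pattern above keeps the real upload path intact while counting calls: because side_effect is the bound method of a real HfApi instance, every call is forwarded to the genuine API and still recorded by the mock. A rough sketch of the same pattern outside the test class, with placeholder repo and token values:

```python
from unittest.mock import patch

from huggingface_hub import HfApi

api = HfApi()

# side_effect=<real bound method> forwards every call to the genuine API while the
# mock still records how often push_to_hub triggered a shard pre-upload.
with patch(
    "datasets.arrow_dataset.HfApi.preupload_lfs_files", side_effect=api.preupload_lfs_files
) as mock_preupload:
    ...  # e.g. ds.push_to_hub("username/my-dataset", token="hf_xxx", max_shard_size="1KB")

print(mock_preupload.call_count)  # number of pre-upload calls made inside the block
```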
Expand Down
Loading