From d8c965b72075c9ec7cb9c4551a48693fd859707c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20GALLOU=C3=89DEC?= Date: Thu, 19 Oct 2023 09:49:52 +0200 Subject: [PATCH 1/4] fix commit message --- src/datasets/dataset_dict.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py index 23c8dcf6c75..632511fa35f 100644 --- a/src/datasets/dataset_dict.py +++ b/src/datasets/dataset_dict.py @@ -1801,13 +1801,13 @@ def push_to_hub( f"Number of files to upload is larger than {config.UPLOADS_MAX_NUMBER_PER_COMMIT}. Splitting the push into multiple commits." ) num_commits = math.ceil(len(additions) / config.UPLOADS_MAX_NUMBER_PER_COMMIT) + commit_message = commit_message if commit_message is not None else "Upload dataset" for i in range(0, num_commits): operations = additions[ i * config.UPLOADS_MAX_NUMBER_PER_COMMIT : (i + 1) * config.UPLOADS_MAX_NUMBER_PER_COMMIT ] + (deletions if i == 0 else []) - commit_message = ( - commit_message if commit_message is not None else "Upload dataset" - ) + f" (part {i:05d}-of-{num_commits:05d})" + part_number = f"{i:05d}-of-{num_commits:05d}" + commit_message = f"{commit_message} (part {part_number})" api.create_commit( repo_id, operations=operations, @@ -1824,6 +1824,7 @@ def push_to_hub( ) + class IterableDatasetDict(dict): def with_format( self, From 5188bf82663c4328e86161d58698988b148a9038 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20GALLOU=C3=89DEC?= Date: Thu, 19 Oct 2023 09:53:47 +0200 Subject: [PATCH 2/4] fix the fix --- src/datasets/dataset_dict.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py index 632511fa35f..bd4725833fa 100644 --- a/src/datasets/dataset_dict.py +++ b/src/datasets/dataset_dict.py @@ -1807,11 +1807,10 @@ def push_to_hub( i * config.UPLOADS_MAX_NUMBER_PER_COMMIT : (i + 1) * config.UPLOADS_MAX_NUMBER_PER_COMMIT ] + (deletions if i == 0 else []) part_number = f"{i:05d}-of-{num_commits:05d}" - commit_message = f"{commit_message} (part {part_number})" api.create_commit( repo_id, operations=operations, - commit_message=commit_message, + commit_message=f"{commit_message} (part {part_number})", token=token, repo_type="dataset", revision=revision, @@ -1824,7 +1823,6 @@ def push_to_hub( ) - class IterableDatasetDict(dict): def with_format( self, From 3477bf75d76dbfbe0423544045fa07524537f92a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20=C5=A0a=C5=A1ko?= Date: Thu, 19 Oct 2023 19:15:26 +0200 Subject: [PATCH 3/4] Fix dataset too --- src/datasets/arrow_dataset.py | 8 +++----- src/datasets/dataset_dict.py | 7 +++---- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 445dc7452d4..65658c9ea94 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -5513,11 +5513,12 @@ def push_to_hub( dataset_card = DatasetCard(f"---\n{dataset_card_data}\n---\n") if dataset_card is None else dataset_card additions.append(CommitOperationAdd(path_in_repo="README.md", path_or_fileobj=str(dataset_card).encode())) + commit_message = commit_message if commit_message is not None else "Upload dataset" if len(additions) <= config.UPLOADS_MAX_NUMBER_PER_COMMIT: api.create_commit( repo_id, operations=additions + deletions, - commit_message=commit_message if commit_message is not None else "Upload dataset", + commit_message=commit_message, token=token, repo_type="dataset", revision=revision, @@ -5532,13 +5533,10 @@ def push_to_hub( operations = additions[ i * config.UPLOADS_MAX_NUMBER_PER_COMMIT : (i + 1) * config.UPLOADS_MAX_NUMBER_PER_COMMIT ] + (deletions if i == 0 else []) - commit_message = ( - commit_message if commit_message is not None else "Upload dataset" - ) + f" (part {i:05d}-of-{num_commits:05d})" api.create_commit( repo_id, operations=operations, - commit_message=commit_message, + commit_message=commit_message + f" (part {i:05d}-of-{num_commits:05d})", token=token, repo_type="dataset", revision=revision, diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py index bd4725833fa..89c47a55d3c 100644 --- a/src/datasets/dataset_dict.py +++ b/src/datasets/dataset_dict.py @@ -1786,11 +1786,12 @@ def push_to_hub( dataset_card = DatasetCard(f"---\n{dataset_card_data}\n---\n") if dataset_card is None else dataset_card additions.append(CommitOperationAdd(path_in_repo="README.md", path_or_fileobj=str(dataset_card).encode())) + commit_message = commit_message if commit_message is not None else "Upload dataset" if len(additions) <= config.UPLOADS_MAX_NUMBER_PER_COMMIT: api.create_commit( repo_id, operations=additions + deletions, - commit_message=commit_message if commit_message is not None else "Upload dataset", + commit_message=commit_message, token=token, repo_type="dataset", revision=revision, @@ -1801,16 +1802,14 @@ def push_to_hub( f"Number of files to upload is larger than {config.UPLOADS_MAX_NUMBER_PER_COMMIT}. Splitting the push into multiple commits." ) num_commits = math.ceil(len(additions) / config.UPLOADS_MAX_NUMBER_PER_COMMIT) - commit_message = commit_message if commit_message is not None else "Upload dataset" for i in range(0, num_commits): operations = additions[ i * config.UPLOADS_MAX_NUMBER_PER_COMMIT : (i + 1) * config.UPLOADS_MAX_NUMBER_PER_COMMIT ] + (deletions if i == 0 else []) - part_number = f"{i:05d}-of-{num_commits:05d}" api.create_commit( repo_id, operations=operations, - commit_message=f"{commit_message} (part {part_number})", + commit_message=commit_message + f"{i:05d}-of-{num_commits:05d}", token=token, repo_type="dataset", revision=revision, From a69d636b33f420c7f595b8138057d4fd242f859e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Thu, 19 Oct 2023 19:34:34 +0200 Subject: [PATCH 4/4] Update src/datasets/dataset_dict.py --- src/datasets/dataset_dict.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py index 149303160e6..4ef3bd2ec60 100644 --- a/src/datasets/dataset_dict.py +++ b/src/datasets/dataset_dict.py @@ -1809,7 +1809,7 @@ def push_to_hub( api.create_commit( repo_id, operations=operations, - commit_message=commit_message + f"{i:05d}-of-{num_commits:05d}", + commit_message=commit_message + f" (part {i:05d}-of-{num_commits:05d})", token=token, repo_type="dataset", revision=revision,