From a9475d65f869b6f6f51e519bde0bfa39557d6a20 Mon Sep 17 00:00:00 2001
From: Lucain Pouget
Date: Tue, 5 Sep 2023 11:17:36 +0200
Subject: [PATCH 1/6] Implement super_squash_history endpoint

---
 docs/source/en/guides/upload.md | 18 +++----
 src/huggingface_hub/__init__.py |  2 +
 src/huggingface_hub/hf_api.py   | 86 +++++++++++++++++++++++++++++++++
 tests/test_hf_api.py            | 33 +++++++++++++
 4 files changed, 130 insertions(+), 9 deletions(-)

diff --git a/docs/source/en/guides/upload.md b/docs/source/en/guides/upload.md
index 9de33e6572..c897d471ca 100644
--- a/docs/source/en/guides/upload.md
+++ b/docs/source/en/guides/upload.md
@@ -384,14 +384,14 @@ getting an upload/push to fail at the end of the process or encountering a degra
 
 We gathered a list of tips and recommendations for structuring your repo.
 
-| Characteristic     | Recommended | Tips                                      |
-| ------------------ | ----------- | ----------------------------------------- |
-| Repo size          | -           | contact us for large repos (TBs of data)  |
-| Files per repo     | <100k       | merge data into fewer files               |
-| Entries per folder | <10k        | use subdirectories in repo                |
-| File size          | <5GB        | split data into chunked files             |
-| Commit size        | <100 files* | upload files in multiple commits          |
-| Commits per repo   | -           | upload multiple files per commit          |
+| Characteristic     | Recommended | Tips                                                    |
+| ------------------ | ----------- | ------------------------------------------------------- |
+| Repo size          | -           | contact us for large repos (TBs of data)                |
+| Files per repo     | <100k       | merge data into fewer files                             |
+| Entries per folder | <10k        | use subdirectories in repo                              |
+| File size          | <5GB        | split data into chunked files                           |
+| Commit size        | <100 files* | upload files in multiple commits                        |
+| Commits per repo   | -           | upload multiple files per commit and/or squash history  |
 
 _* Not relevant when using `git` CLI directly_
 
@@ -424,7 +424,7 @@ In all cases no single LFS file will be able to be >50GB. I.e. 50GB is the hard
 our experience, the user experience on the Hub starts to degrade after a few thousand commits. We are constantly
 working to improve the service, but one must always remember that a git repository is not meant to work as a database
 with a lot of writes. If your repo's history gets very large, it is always possible to squash all the commits to get a
-fresh start.
+fresh start using [`super_squash_history`]. This is a non-revertible operation.
 - **Number of operations per commit**: Once again, there is no hard limit here. When a commit is uploaded on the Hub,
 each git operation (addition or delete) is checked by the server. When a hundred LFS files are committed at once,
 each file is checked individually to ensure it's been correctly uploaded. When pushing data through HTTP with `huggingface_hub`,
diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py
index 126307c534..113a8cadd8 100644
--- a/src/huggingface_hub/__init__.py
+++ b/src/huggingface_hub/__init__.py
@@ -197,6 +197,7 @@
         "run_as_future",
         "set_space_sleep_time",
         "space_info",
+        "super_squash_history",
         "unlike",
         "update_repo_visibility",
         "upload_file",
@@ -505,6 +506,7 @@ def __dir__():
         run_as_future,  # noqa: F401
         set_space_sleep_time,  # noqa: F401
         space_info,  # noqa: F401
+        super_squash_history,  # noqa: F401
         unlike,  # noqa: F401
         update_repo_visibility,  # noqa: F401
         upload_file,  # noqa: F401
diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py
index 01156069a1..a585c40405 100644
--- a/src/huggingface_hub/hf_api.py
+++ b/src/huggingface_hub/hf_api.py
@@ -2355,6 +2355,91 @@ def list_repo_commits(
             )
         ]
 
+    @validate_hf_hub_args
+    def super_squash_history(
+        self,
+        repo_id: str,
+        *,
+        branch: Optional[str] = None,
+        commit_message: Optional[str] = None,
+        repo_type: Optional[str] = None,
+        token: Optional[str] = None,
+    ) -> None:
+        """Squash commit history on a branch for a repo on the Hub.
+
+        Squashing the repo history is useful when you know you'll make hundreds of commits and you don't want to
+        clutter the history. Squashing commits can only be performed from the head of a branch.
+
+        <Tip warning={true}>
+
+        Once squashed, the commit history cannot be retrieved. This is a non-revertible operation.
+
+        </Tip>
+
+        <Tip warning={true}>
+
+        Once the history of a branch has been squashed, it is not possible to merge it back into another branch since
+        their histories will have diverged.
+
+        </Tip>
+
+        Args:
+            repo_id (`str`):
+                A namespace (user or an organization) and a repo name separated by a `/`.
+            branch (`str`, *optional*):
+                The branch to squash. Defaults to the head of the `"main"` branch.
+            commit_message (`str`, *optional*):
+                The commit message to use for the squashed commit.
+            repo_type (`str`, *optional*):
+                Set to `"dataset"` or `"space"` if squashing the history of a dataset or a Space, `None` or `"model"`
+                if squashing the history of a model. Default is `None`.
+            token (`str`, *optional*):
+                A valid authentication token (see https://huggingface.co/settings/token). If the machine is logged in
+                (through `huggingface-cli login` or [`~huggingface_hub.login`]), token can be automatically retrieved
+                from the cache.
+
+        Raises:
+            [`~utils.RepositoryNotFoundError`]:
+                If repository is not found (error 404): wrong repo_id/repo_type, private but not authenticated or repo
+                does not exist.
+            [`~utils.RevisionNotFoundError`]:
+                If the branch to squash cannot be found.
+            [`~utils.BadRequestError`]:
+                If invalid reference for a branch. You cannot squash history on tags.
+
+        Example:
+        ```
+        >>> from huggingface_hub import HfApi
+        >>> api = HfApi()
+
+        # Create repo
+        >>> repo_id = api.create_repo("test-squash").repo_id
+
+        # Make a lot of commits.
+        >>> api.upload_file(repo_id=repo_id, path_in_repo="file.txt", path_or_fileobj=b"content")
+        >>> api.upload_file(repo_id=repo_id, path_in_repo="lfs.bin", path_or_fileobj=b"content")
+        >>> api.upload_file(repo_id=repo_id, path_in_repo="file.txt", path_or_fileobj=b"another_content")
+
+        # Squash history
+        >>> api.super_squash_history(repo_id=repo_id)
+        ```
+        """
+        if repo_type is None:
+            repo_type = REPO_TYPE_MODEL
+        if repo_type not in REPO_TYPES:
+            raise ValueError("Invalid repo type")
+        if branch is None:
+            branch = DEFAULT_REVISION
+
+        # Prepare request
+        url = f"{self.endpoint}/api/{repo_type}s/{repo_id}/super-squash/{branch}"
+        headers = self._build_hf_headers(token=token, is_write_action=True)
+        commit_message = commit_message or f"Super-squash branch '{branch}' using huggingface_hub"
+
+        # Super-squash
+        response = get_session().post(url=url, headers=headers, json={"message": commit_message})
+        hf_raise_for_status(response)
+
     @validate_hf_hub_args
     def create_repo(
         self,
@@ -5761,6 +5846,7 @@ def _parse_revision_from_pr_url(pr_url: str) -> str:
 create_repo = api.create_repo
 delete_repo = api.delete_repo
 update_repo_visibility = api.update_repo_visibility
+super_squash_history = api.super_squash_history
 move_repo = api.move_repo
 upload_file = api.upload_file
 upload_folder = api.upload_folder
diff --git a/tests/test_hf_api.py b/tests/test_hf_api.py
index e100cd1cbd..dc1b20c84d 100644
--- a/tests/test_hf_api.py
+++ b/tests/test_hf_api.py
@@ -2416,6 +2416,39 @@ def test_list_likes_on_production(self) -> None:
         self.assertGreater(len(likes.spaces), 0)
 
 
+class TestSquashHistory(HfApiCommonTest):
+    @use_tmp_repo()
+    def test_super_squash_history(self, repo_url: RepoUrl) -> None:
+        # Upload + update file on main
+        repo_id = repo_url.repo_id
+        self._api.upload_file(repo_id=repo_id, path_in_repo="file.txt", path_or_fileobj=b"content")
+        self._api.upload_file(repo_id=repo_id, path_in_repo="lfs.bin", path_or_fileobj=b"content")
+        self._api.upload_file(repo_id=repo_id, path_in_repo="file.txt", path_or_fileobj=b"another_content")
+
+        # Upload file on a new branch
+        self._api.create_branch(repo_id=repo_id, branch="v0.1", exist_ok=True)
+        self._api.upload_file(repo_id=repo_id, path_in_repo="file.txt", path_or_fileobj=b"foo", revision="v0.1")
+
+        # Squash history on main
+        self._api.super_squash_history(repo_id=repo_id)
+
+        # List history
+        squashed_main_commits = self._api.list_repo_commits(repo_id=repo_id, revision="main")
+        branch_commits = self._api.list_repo_commits(repo_id=repo_id, revision="v0.1")
+
+        # Main branch has been squashed but initial commits still exist on the other branch
+        self.assertEqual(len(squashed_main_commits), 1)
+        self.assertEqual(squashed_main_commits[0], "Super-squash branch 'main' using huggingface_hub")
+        self.assertEqual(len(branch_commits), 5)
+        self.assertEqual(branch_commits[-1].title, "initial commit")
+
+        # Squash history on branch
+        self._api.super_squash_history(repo_id=repo_id, branch="v0.1")
+        squashed_branch_commits = self._api.list_repo_commits(repo_id=repo_id, revision="v0.1")
+        self.assertEqual(len(squashed_branch_commits), 1)
+        self.assertEqual(squashed_branch_commits[0], "Super-squash branch 'v0.1' using huggingface_hub")
+
+
 @pytest.mark.usefixtures("fx_production_space")
 class TestSpaceAPIProduction(unittest.TestCase):
     """

From c190414f7e838dab0ca53a172f2fbd1795ee5de3 Mon Sep 17 00:00:00 2001
From: Lucain Pouget
Date: Tue, 5 Sep 2023 11:41:41 +0200
Subject: [PATCH 2/6] Add squash_history option to commit scheduler

---
 src/huggingface_hub/_commit_scheduler.py | 11 +++++++++-
 tests/test_commit_scheduler.py           | 26 ++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/src/huggingface_hub/_commit_scheduler.py b/src/huggingface_hub/_commit_scheduler.py
index e190693e38..80d8dac786 100644
--- a/src/huggingface_hub/_commit_scheduler.py
+++ b/src/huggingface_hub/_commit_scheduler.py
@@ -56,6 +56,9 @@ class CommitScheduler:
             If provided, only files matching at least one pattern are uploaded.
         ignore_patterns (`List[str]` or `str`, *optional*):
             If provided, files matching any of the patterns are not uploaded.
+        squash_history (`bool`, *optional*):
+            Whether to squash the history of the repo after each commit. Defaults to `False`. Squashing commits is
+            useful to avoid degraded performance on the repo when it grows too large.
         hf_api (`HfApi`, *optional*):
             The [`HfApi`] client to use to commit to the Hub. Can be set with custom settings (user agent, token,...).
 
@@ -90,6 +93,7 @@ def __init__(
         token: Optional[str] = None,
         allow_patterns: Optional[Union[List[str], str]] = None,
         ignore_patterns: Optional[Union[List[str], str]] = None,
+        squash_history: bool = False,
         hf_api: Optional["HfApi"] = None,
     ) -> None:
         self.api = hf_api or HfApi(token=token)
@@ -124,6 +128,7 @@ def __init__(
             raise ValueError(f"'every' must be a positive integer, not '{every}'.")
         self.lock = Lock()
         self.every = every
+        self.squash_history = squash_history
 
         logger.info(f"Scheduled job to push '{self.folder_path}' to '{self.repo_id}' every {self.every} minutes.")
         self._scheduler_thread = Thread(target=self._run_scheduler, daemon=True)
@@ -161,7 +166,11 @@ def _push_to_hub(self) -> Optional[CommitInfo]:
 
         logger.info("(Background) scheduled commit triggered.")
         try:
-            return self.push_to_hub()
+            value = self.push_to_hub()
+            if self.squash_history:
+                logger.info("(Background) squashing repo history.")
+                self.api.super_squash_history(repo_id=self.repo_id, repo_type=self.repo_type, branch=self.revision)
+            return value
         except Exception as e:
             logger.error(f"Error while pushing to Hub: {e}")  # Depending on the setup, error might be silenced
             raise
diff --git a/tests/test_commit_scheduler.py b/tests/test_commit_scheduler.py
index 8cf8f9175c..c2fe5cc275 100644
--- a/tests/test_commit_scheduler.py
+++ b/tests/test_commit_scheduler.py
@@ -134,6 +134,32 @@ def _download(filename: str, revision: str) -> Path:
         self.assertEqual(lfs_push2.read_text(), "binary content")
         self.assertEqual(lfs_push3.read_text(), "binary content updated")
 
+    def test_sync_and_squash_history(self) -> None:
+        """Test squash history when pushing to the Hub."""
+        watched_folder = self.cache_dir / "watched_folder"
+        watched_folder.mkdir(exist_ok=True, parents=True)
+        file_path = watched_folder / "file.txt"
+        with file_path.open("a") as f:
+            f.write("first line\n")
+
+        self.scheduler = CommitScheduler(
+            folder_path=watched_folder,
+            repo_id=self.repo_name,
+            every=1 / 60,  # every 1s
+            hf_api=self.api,
+            squash_history=True,
+        )
+
+        # At least 1 push to hub triggered
+        time.sleep(0.5)
+        self.scheduler.stop()
+        self.scheduler.last_future.result()
+
+        # Branch history has been squashed
+        commits = self.api.list_repo_commits(repo_id=self.scheduler.repo_id)
+        self.assertEqual(len(commits), 1)
+        self.assertEqual(commits[0].title, "Super-squash branch 'main' using huggingface_hub")
+
 
 @pytest.mark.usefixtures("fx_cache_dir")
 class TestPartialFileIO(unittest.TestCase):

From 8836b73fed9c8789491d6e208bee806365b1ff00 Mon Sep 17 00:00:00 2001
From: Lucain Pouget
Date: Tue, 5 Sep 2023 11:48:49 +0200
Subject: [PATCH 3/6] add super squash to HfSummaryWriter as well

---
 src/huggingface_hub/_tensorboard_logger.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/huggingface_hub/_tensorboard_logger.py b/src/huggingface_hub/_tensorboard_logger.py
index 87c5e7a53c..1abf55b931 100644
--- a/src/huggingface_hub/_tensorboard_logger.py
+++ b/src/huggingface_hub/_tensorboard_logger.py
@@ -54,6 +54,9 @@ class HFSummaryWriter(SummaryWriter):
             underlying `SummaryWriter` object.
         commit_every (`int` or `float`, *optional*):
             The frequency (in minutes) at which the logs will be pushed to the Hub. Defaults to 5 minutes.
+        squash_history (`bool`, *optional*):
+            Whether to squash the history of the repo after each commit. Defaults to `False`. Squashing commits is
+            useful to avoid degraded performance on the repo when it grows too large.
         repo_type (`str`, *optional*):
             The type of the repo to which the logs will be pushed. Defaults to "model".
         repo_revision (`str`, *optional*):
@@ -114,6 +117,7 @@ def __init__(
         *,
         logdir: Optional[str] = None,
         commit_every: Union[int, float] = 5,
+        squash_history: bool = False,
         repo_type: Optional[str] = None,
         repo_revision: Optional[str] = None,
         repo_private: bool = False,
@@ -148,8 +152,14 @@ def __init__(
             allow_patterns=repo_allow_patterns,
             ignore_patterns=repo_ignore_patterns,
             every=commit_every,
+            squash_history=squash_history,
         )
 
+        # Exposing some high-level info at root level
+        self.repo_id = self.scheduler.repo_id
+        self.repo_type = self.scheduler.repo_type
+        self.repo_revision = self.scheduler.repo_revision
+
     def __exit__(self, exc_type, exc_val, exc_tb):
         """Push to hub in a non-blocking way when exiting the logger's context manager."""
         super().__exit__(exc_type, exc_val, exc_tb)

From 0469ea87acb960114fc8d258c568b9c270ec155c Mon Sep 17 00:00:00 2001
From: Lucain Pouget
Date: Tue, 5 Sep 2023 11:51:23 +0200
Subject: [PATCH 4/6] make quality

---
 src/huggingface_hub/_tensorboard_logger.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/huggingface_hub/_tensorboard_logger.py b/src/huggingface_hub/_tensorboard_logger.py
index 1abf55b931..c48720c655 100644
--- a/src/huggingface_hub/_tensorboard_logger.py
+++ b/src/huggingface_hub/_tensorboard_logger.py
@@ -158,7 +158,7 @@ def __init__(
         # Exposing some high-level info at root level
         self.repo_id = self.scheduler.repo_id
         self.repo_type = self.scheduler.repo_type
-        self.repo_revision = self.scheduler.repo_revision
+        self.repo_revision = self.scheduler.revision
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         """Push to hub in a non-blocking way when exiting the logger's context manager."""

From 414b31d67446b3c22d0554b5d34e9bec7244c917 Mon Sep 17 00:00:00 2001
From: Lucain Pouget
Date: Tue, 5 Sep 2023 11:55:37 +0200
Subject: [PATCH 5/6] style doc

---
 src/huggingface_hub/hf_api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py
index a585c40405..747e033c2d 100644
--- a/src/huggingface_hub/hf_api.py
+++ b/src/huggingface_hub/hf_api.py
@@ -2408,7 +2408,7 @@ def super_squash_history(
                 If invalid reference for a branch. You cannot squash history on tags.
 
         Example:
-        ```
+        ```py
         >>> from huggingface_hub import HfApi
         >>> api = HfApi()
 

From bb6c3574d16a1dfa476be54192b4b9eb97427217 Mon Sep 17 00:00:00 2001
From: Lucain Pouget
Date: Tue, 5 Sep 2023 15:08:26 +0200
Subject: [PATCH 6/6] fix test

---
 tests/test_hf_api.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_hf_api.py b/tests/test_hf_api.py
index dc1b20c84d..518b4f7ad2 100644
--- a/tests/test_hf_api.py
+++ b/tests/test_hf_api.py
@@ -2438,7 +2438,7 @@ def test_super_squash_history(self, repo_url: RepoUrl) -> None:
 
         # Main branch has been squashed but initial commits still exist on the other branch
         self.assertEqual(len(squashed_main_commits), 1)
-        self.assertEqual(squashed_main_commits[0], "Super-squash branch 'main' using huggingface_hub")
+        self.assertEqual(squashed_main_commits[0].title, "Super-squash branch 'main' using huggingface_hub")
         self.assertEqual(len(branch_commits), 5)
         self.assertEqual(branch_commits[-1].title, "initial commit")
 
@@ -2446,7 +2446,7 @@ def test_super_squash_history(self, repo_url: RepoUrl) -> None:
         # Squash history on branch
         self._api.super_squash_history(repo_id=repo_id, branch="v0.1")
         squashed_branch_commits = self._api.list_repo_commits(repo_id=repo_id, revision="v0.1")
         self.assertEqual(len(squashed_branch_commits), 1)
-        self.assertEqual(squashed_branch_commits[0], "Super-squash branch 'v0.1' using huggingface_hub")
+        self.assertEqual(squashed_branch_commits[0].title, "Super-squash branch 'v0.1' using huggingface_hub")
 
 
 @pytest.mark.usefixtures("fx_production_space")
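
Taken together, the patches above expose the same squashing behaviour at three levels: a one-off `super_squash_history` call, a `squash_history` flag on `CommitScheduler`, and the same flag forwarded by `HFSummaryWriter`. Below is a minimal usage sketch, assuming a writable placeholder repo `username/my-model` and a local `watched_folder/` directory; everything else follows the signatures introduced in these patches.

```py
from huggingface_hub import CommitScheduler, HfApi, HFSummaryWriter

api = HfApi()

# One-off: collapse the full history of a branch into a single commit.
# Non-revertible: previous commits on the branch cannot be recovered.
api.super_squash_history(repo_id="username/my-model")  # placeholder repo_id

# Recurring: push a local folder on a schedule and squash after each commit
# so the repo history stays at a single commit (new `squash_history` flag).
scheduler = CommitScheduler(
    repo_id="username/my-model",
    folder_path="watched_folder/",  # placeholder local path
    every=10,  # minutes
    squash_history=True,
    hf_api=api,
)

# Same flag forwarded by the TensorBoard logger (patch 3); requires a
# SummaryWriter backend such as tensorboardX to be installed.
logger = HFSummaryWriter(repo_id="username/my-model", commit_every=5, squash_history=True)
```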