Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement super_squash_history in HfApi #1639

Merged
merged 7 commits into from
Sep 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions docs/source/en/guides/upload.md
Original file line number Diff line number Diff line change
Expand Up @@ -384,14 +384,14 @@ getting an upload/push to fail at the end of the process or encountering a degra
We gathered a list of tips and recommendations for structuring your repo.


| Characteristic | Recommended | Tips |
| ---------------- | ------------------ | ---------------------------------------- |
| Repo size | - | contact us for large repos (TBs of data) |
| Files per repo | <100k | merge data into fewer files |
| Entries per folder | <10k | use subdirectories in repo |
| File size | <5GB | split data into chunked files |
| Commit size | <100 files* | upload files in multiple commits |
| Commits per repo | - | upload multiple files per commit |
| Characteristic | Recommended | Tips |
| ---------------- | ------------------ | ------------------------------------------------------ |
| Repo size | - | contact us for large repos (TBs of data) |
| Files per repo | <100k | merge data into fewer files |
| Entries per folder | <10k | use subdirectories in repo |
| File size | <5GB | split data into chunked files |
| Commit size | <100 files* | upload files in multiple commits |
| Commits per repo | - | upload multiple files per commit and/or squash history |

_* Not relevant when using `git` CLI directly_

Expand Down Expand Up @@ -424,7 +424,7 @@ In all cases no single LFS file will be able to be >50GB. I.e. 50GB is the hard
our experience, the user experience on the Hub starts to degrade after a few thousand commits. We are constantly working to
improve the service, but one must always remember that a git repository is not meant to work as a database with a lot of
writes. If your repo's history gets very large, it is always possible to squash all the commits to get a
fresh start.
fresh start using [`super_squash_history`]. This is a non-revertible operation.
- **Number of operations per commit**: Once again, there is no hard limit here. When a commit is uploaded on the Hub, each
git operation (addition or delete) is checked by the server. When a hundred LFS files are committed at once,
each file is checked individually to ensure it's been correctly uploaded. When pushing data through HTTP with `huggingface_hub`,
Expand Down
2 changes: 2 additions & 0 deletions src/huggingface_hub/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@
"run_as_future",
"set_space_sleep_time",
"space_info",
"super_squash_history",
"unlike",
"update_repo_visibility",
"upload_file",
Expand Down Expand Up @@ -505,6 +506,7 @@ def __dir__():
run_as_future, # noqa: F401
set_space_sleep_time, # noqa: F401
space_info, # noqa: F401
super_squash_history, # noqa: F401
unlike, # noqa: F401
update_repo_visibility, # noqa: F401
upload_file, # noqa: F401
Expand Down
11 changes: 10 additions & 1 deletion src/huggingface_hub/_commit_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ class CommitScheduler:
If provided, only files matching at least one pattern are uploaded.
ignore_patterns (`List[str]` or `str`, *optional*):
If provided, files matching any of the patterns are not uploaded.
squash_history (`bool`, *optional*):
Whether to squash the history of the repo after each commit. Defaults to `False`. Squashing commits is
useful for avoiding degraded performance on the repo when it grows too large.
hf_api (`HfApi`, *optional*):
The [`HfApi`] client to use to commit to the Hub. Can be set with custom settings (user agent, token,...).

Expand Down Expand Up @@ -90,6 +93,7 @@ def __init__(
token: Optional[str] = None,
allow_patterns: Optional[Union[List[str], str]] = None,
ignore_patterns: Optional[Union[List[str], str]] = None,
squash_history: bool = False,
hf_api: Optional["HfApi"] = None,
) -> None:
self.api = hf_api or HfApi(token=token)
Expand Down Expand Up @@ -124,6 +128,7 @@ def __init__(
raise ValueError(f"'every' must be a positive integer, not '{every}'.")
self.lock = Lock()
self.every = every
self.squash_history = squash_history

logger.info(f"Scheduled job to push '{self.folder_path}' to '{self.repo_id}' every {self.every} minutes.")
self._scheduler_thread = Thread(target=self._run_scheduler, daemon=True)
Expand Down Expand Up @@ -161,7 +166,11 @@ def _push_to_hub(self) -> Optional[CommitInfo]:

logger.info("(Background) scheduled commit triggered.")
try:
return self.push_to_hub()
value = self.push_to_hub()
if self.squash_history:
logger.info("(Background) squashing repo history.")
self.api.super_squash_history(repo_id=self.repo_id, repo_type=self.repo_type, branch=self.revision)
return value
except Exception as e:
logger.error(f"Error while pushing to Hub: {e}") # Depending on the setup, error might be silenced
raise
Expand Down
10 changes: 10 additions & 0 deletions src/huggingface_hub/_tensorboard_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ class HFSummaryWriter(SummaryWriter):
underlying `SummaryWriter` object.
commit_every (`int` or `float`, *optional*):
The frequency (in minutes) at which the logs will be pushed to the Hub. Defaults to 5 minutes.
squash_history (`bool`, *optional*):
Whether to squash the history of the repo after each commit. Defaults to `False`. Squashing commits is
useful for avoiding degraded performance on the repo when it grows too large.
repo_type (`str`, *optional*):
The type of the repo to which the logs will be pushed. Defaults to "model".
repo_revision (`str`, *optional*):
Expand Down Expand Up @@ -114,6 +117,7 @@ def __init__(
*,
logdir: Optional[str] = None,
commit_every: Union[int, float] = 5,
squash_history: bool = False,
repo_type: Optional[str] = None,
repo_revision: Optional[str] = None,
repo_private: bool = False,
Expand Down Expand Up @@ -148,8 +152,14 @@ def __init__(
allow_patterns=repo_allow_patterns,
ignore_patterns=repo_ignore_patterns,
every=commit_every,
squash_history=squash_history,
)

# Exposing some high-level info at root level
self.repo_id = self.scheduler.repo_id
self.repo_type = self.scheduler.repo_type
self.repo_revision = self.scheduler.revision

def __exit__(self, exc_type, exc_val, exc_tb):
"""Push to hub in a non-blocking way when exiting the logger's context manager."""
super().__exit__(exc_type, exc_val, exc_tb)
Expand Down
86 changes: 86 additions & 0 deletions src/huggingface_hub/hf_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2355,6 +2355,91 @@ def list_repo_commits(
)
]

@validate_hf_hub_args
def super_squash_history(
    self,
    repo_id: str,
    *,
    branch: Optional[str] = None,
    commit_message: Optional[str] = None,
    repo_type: Optional[str] = None,
    token: Optional[str] = None,
) -> None:
    """Squash commit history on a branch for a repo on the Hub.

    Squashing the repo history is useful when you know you'll make hundreds of commits and you don't want to
    clutter the history. Squashing commits can only be performed from the head of a branch.

    <Tip warning={true}>

    Once squashed, the commit history cannot be retrieved. This is a non-revertible operation.

    </Tip>

    <Tip warning={true}>

    Once the history of a branch has been squashed, it is not possible to merge it back into another branch since
    their history will have diverged.

    </Tip>

    Args:
        repo_id (`str`):
            A namespace (user or an organization) and a repo name separated by a `/`.
        branch (`str`, *optional*):
            The branch to squash. Defaults to the head of the `"main"` branch.
        commit_message (`str`, *optional*):
            The commit message to use for the squashed commit.
        repo_type (`str`, *optional*):
            Set to `"dataset"` or `"space"` if squashing the history of a dataset or a Space, `None` or
            `"model"` if squashing the history of a model. Default is `None`.
        token (`str`, *optional*):
            A valid authentication token (see https://huggingface.co/settings/token). If the machine is logged in
            (through `huggingface-cli login` or [`~huggingface_hub.login`]), token can be automatically retrieved
            from the cache.

    Raises:
        [`~utils.RepositoryNotFoundError`]:
            If repository is not found (error 404): wrong repo_id/repo_type, private but not authenticated or repo
            does not exist.
        [`~utils.RevisionNotFoundError`]:
            If the branch to squash cannot be found.
        [`~utils.BadRequestError`]:
            If invalid reference for a branch. You cannot squash history on tags.

    Example:
    ```py
    >>> from huggingface_hub import HfApi
    >>> api = HfApi()

    # Create repo
    >>> repo_id = api.create_repo("test-squash").repo_id

    # Make a lot of commits.
    >>> api.upload_file(repo_id=repo_id, path_in_repo="file.txt", path_or_fileobj=b"content")
    >>> api.upload_file(repo_id=repo_id, path_in_repo="lfs.bin", path_or_fileobj=b"content")
    >>> api.upload_file(repo_id=repo_id, path_in_repo="file.txt", path_or_fileobj=b"another_content")

    # Squash history
    >>> api.super_squash_history(repo_id=repo_id)
    ```
    """
    # Normalize and validate the repo type before building the endpoint URL.
    if repo_type is None:
        repo_type = REPO_TYPE_MODEL
    if repo_type not in REPO_TYPES:
        raise ValueError("Invalid repo type")
    if branch is None:
        branch = DEFAULT_REVISION

    # Prepare request (write token required: squashing rewrites the branch history).
    url = f"{self.endpoint}/api/{repo_type}s/{repo_id}/super-squash/{branch}"
    headers = self._build_hf_headers(token=token, is_write_action=True)
    commit_message = commit_message or f"Super-squash branch '{branch}' using huggingface_hub"

    # Super-squash: a single POST, the server performs the rewrite atomically.
    response = get_session().post(url=url, headers=headers, json={"message": commit_message})
    hf_raise_for_status(response)

@validate_hf_hub_args
def create_repo(
self,
Expand Down Expand Up @@ -5761,6 +5846,7 @@ def _parse_revision_from_pr_url(pr_url: str) -> str:
create_repo = api.create_repo
delete_repo = api.delete_repo
update_repo_visibility = api.update_repo_visibility
super_squash_history = api.super_squash_history
move_repo = api.move_repo
upload_file = api.upload_file
upload_folder = api.upload_folder
Expand Down
26 changes: 26 additions & 0 deletions tests/test_commit_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,32 @@ def _download(filename: str, revision: str) -> Path:
self.assertEqual(lfs_push2.read_text(), "binary content")
self.assertEqual(lfs_push3.read_text(), "binary content updated")

def test_sync_and_squash_history(self) -> None:
    """Push a commit from the scheduler and verify the repo history gets squashed."""
    # Prepare a local folder with a single tracked file.
    folder = self.cache_dir / "watched_folder"
    folder.mkdir(exist_ok=True, parents=True)
    tracked_file = folder / "file.txt"
    with tracked_file.open("a") as fh:
        fh.write("first line\n")

    # Schedule very frequent commits with history squashing enabled.
    self.scheduler = CommitScheduler(
        folder_path=folder,
        repo_id=self.repo_name,
        every=1 / 60,  # every 0.1s
        hf_api=self.api,
        squash_history=True,
    )

    # Give the background thread time to push at least once, then drain it.
    time.sleep(0.5)
    self.scheduler.stop()
    self.scheduler.last_future.result()

    # After squashing, exactly one commit remains on the branch.
    history = self.api.list_repo_commits(repo_id=self.scheduler.repo_id)
    self.assertEqual(len(history), 1)
    self.assertEqual(history[0].title, "Super-squash branch 'main' using huggingface_hub")


@pytest.mark.usefixtures("fx_cache_dir")
class TestPartialFileIO(unittest.TestCase):
Expand Down
33 changes: 33 additions & 0 deletions tests/test_hf_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2416,6 +2416,39 @@ def test_list_likes_on_production(self) -> None:
self.assertGreater(len(likes.spaces), 0)


class TestSquashHistory(HfApiCommonTest):
    """Integration tests for `HfApi.super_squash_history`."""

    @use_tmp_repo()
    def test_super_squash_history(self, repo_url: RepoUrl) -> None:
        repo_id = repo_url.repo_id

        # Build some history on main: regular file, LFS file, then an update.
        self._api.upload_file(repo_id=repo_id, path_in_repo="file.txt", path_or_fileobj=b"content")
        self._api.upload_file(repo_id=repo_id, path_in_repo="lfs.bin", path_or_fileobj=b"content")
        self._api.upload_file(repo_id=repo_id, path_in_repo="file.txt", path_or_fileobj=b"another_content")

        # Diverge: add one more commit on a side branch.
        self._api.create_branch(repo_id=repo_id, branch="v0.1", exist_ok=True)
        self._api.upload_file(repo_id=repo_id, path_in_repo="file.txt", path_or_fileobj=b"foo", revision="v0.1")

        # Squash main only.
        self._api.super_squash_history(repo_id=repo_id)

        main_history = self._api.list_repo_commits(repo_id=repo_id, revision="main")
        side_history = self._api.list_repo_commits(repo_id=repo_id, revision="v0.1")

        # Main is reduced to a single commit while the side branch keeps the full history.
        self.assertEqual(len(main_history), 1)
        self.assertEqual(main_history[0].title, "Super-squash branch 'main' using huggingface_hub")
        self.assertEqual(len(side_history), 5)
        self.assertEqual(side_history[-1].title, "initial commit")

        # Squashing also works on a non-default branch.
        self._api.super_squash_history(repo_id=repo_id, branch="v0.1")
        side_history = self._api.list_repo_commits(repo_id=repo_id, revision="v0.1")
        self.assertEqual(len(side_history), 1)
        self.assertEqual(side_history[0].title, "Super-squash branch 'v0.1' using huggingface_hub")


@pytest.mark.usefixtures("fx_production_space")
class TestSpaceAPIProduction(unittest.TestCase):
"""
Expand Down