Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prevent empty commits if files did not change #2389

Merged
merged 7 commits into from
Jul 16, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 31 additions & 1 deletion src/huggingface_hub/_commit_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
get_session,
hf_raise_for_status,
logging,
sha,
tqdm_stream_file,
validate_hf_hub_args,
)
Expand Down Expand Up @@ -146,6 +147,10 @@ class CommitOperationAdd:
# (server-side check)
_should_ignore: Optional[bool] = field(init=False, repr=False, default=None)

# set to the remote OID of the file if it has already been uploaded
# useful to determine if a commit will be empty or not
_remote_oid: Optional[str] = field(init=False, repr=False, default=None)

# set to True once the file has been uploaded as LFS
_is_uploaded: bool = field(init=False, repr=False, default=False)

Expand Down Expand Up @@ -246,6 +251,29 @@ def b64content(self) -> bytes:
with self.as_file() as file:
return base64.b64encode(file.read())

@property
def _local_oid(self) -> Optional[str]:
    """Return the OID of the local file, or `None` if the upload mode is not known yet.

    This OID is compared against `self._remote_oid` to detect whether the file content
    differs from what is already on the Hub. If it did not change, the upload is skipped
    to prevent empty commits.

    - For LFS files, the OID is the hex-encoded SHA256 of the file content (used as LFS ref).
    - For regular files, the OID is the git SHA1 of the file content.

    Note: this is slightly different from git OID computation, since the OID of an LFS file
    is usually the SHA1 of the pointer file content (not the actual file content). However,
    using the SHA256 is enough to detect changes and more convenient client-side.
    """
    mode = self._upload_mode
    if mode is None:
        # Upload mode has not been fetched from the server yet => nothing to compare.
        return None
    if mode == "lfs":
        return self.upload_info.sha256.hex()
    # Regular file => compute git-style sha1.
    # No need to read by chunk since the file is guaranteed to be <=5MB.
    with self.as_file() as stream:
        return sha.git_hash(stream.read())


def _validate_path_in_repo(path_in_repo: str) -> str:
# Validate `path_in_repo` value to prevent a server-side issue
Expand Down Expand Up @@ -483,6 +511,7 @@ def _fetch_upload_modes(
# Fetch upload mode (LFS or regular) chunk by chunk.
upload_modes: Dict[str, UploadMode] = {}
should_ignore_info: Dict[str, bool] = {}
oid_info: Dict[str, Optional[str]] = {}

for chunk in chunk_iterable(additions, 256):
payload: Dict = {
Expand All @@ -491,7 +520,6 @@ def _fetch_upload_modes(
"path": op.path_in_repo,
"sample": base64.b64encode(op.upload_info.sample).decode("ascii"),
"size": op.upload_info.size,
"sha": op.upload_info.sha256.hex(),
}
for op in chunk
]
Expand All @@ -509,11 +537,13 @@ def _fetch_upload_modes(
preupload_info = _validate_preupload_info(resp.json())
upload_modes.update(**{file["path"]: file["uploadMode"] for file in preupload_info["files"]})
should_ignore_info.update(**{file["path"]: file["shouldIgnore"] for file in preupload_info["files"]})
oid_info.update(**{file["path"]: file.get("oid") for file in preupload_info["files"]})

# Set upload mode for each addition operation
for addition in additions:
addition._upload_mode = upload_modes[addition.path_in_repo]
addition._should_ignore = should_ignore_info[addition.path_in_repo]
addition._remote_oid = oid_info[addition.path_in_repo]

# Empty files cannot be uploaded as LFS (S3 would fail with a 501 Not Implemented)
# => empty files are uploaded as "regular" to still allow users to commit them.
Expand Down
40 changes: 40 additions & 0 deletions src/huggingface_hub/hf_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3779,6 +3779,46 @@ def create_commit(
num_threads=num_threads,
free_memory=False, # do not remove `CommitOperationAdd.path_or_fileobj` on LFS files for "normal" users
)

# Remove no-op operations (files that have not changed)
operations_without_no_op = []
for operation in operations:
if (
isinstance(operation, CommitOperationAdd)
and operation._remote_oid is not None
and operation._remote_oid == operation._local_oid
):
# File already exists on the Hub and has not changed: we can skip it.
logger.debug(f"Skipping upload for '{operation.path_in_repo}' as the file has not changed.")
continue
operations_without_no_op.append(operation)
if len(operations) != len(operations_without_no_op):
logger.info(
f"Removing {len(operations) - len(operations_without_no_op)} file(s) from commit that have not changed."
)

# Return early if empty commit
if len(operations_without_no_op) == 0:
logger.warning("No files have been modified since last commit. Skipping to prevent empty commit.")

# Get latest commit info
try:
info = self.repo_info(repo_id=repo_id, repo_type=repo_type, revision=revision, token=token)
except RepositoryNotFoundError as e:
e.append_to_message(_CREATE_COMMIT_NO_REPO_ERROR_MESSAGE)
raise

# Return commit info based on latest commit
url_prefix = self.endpoint
if repo_type is not None and repo_type != REPO_TYPE_MODEL:
url_prefix = f"{url_prefix}/{repo_type}s"
return CommitInfo(
commit_url=f"{url_prefix}/{repo_id}/commit/{info.sha}",
commit_message=commit_message,
commit_description=commit_description,
oid=info.sha, # type: ignore[arg-type]
)

files_to_copy = _fetch_files_to_copy(
copies=copies,
repo_type=repo_type,
Expand Down
37 changes: 36 additions & 1 deletion src/huggingface_hub/utils/sha.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from typing import BinaryIO, Optional

from .insecure_hashlib import sha256
from .insecure_hashlib import sha1, sha256


def sha_fileobj(fileobj: BinaryIO, chunk_size: Optional[int] = None) -> bytes:
Expand All @@ -27,3 +27,38 @@ def sha_fileobj(fileobj: BinaryIO, chunk_size: Optional[int] = None) -> bytes:
if not chunk:
break
return sha.digest()


def git_hash(data: bytes) -> str:
    """
    Computes the sha1 hash of the given bytes, using the same algorithm as git.

    This is equivalent to running `git hash-object`. See https://git-scm.com/docs/git-hash-object
    for more details.

    Note: this method is valid for regular files. For LFS files, the proper git hash is supposed to be computed on the
    pointer file content, not the actual file content. However, for simplicity, we directly compare the sha256 of
    the LFS file content when we want to compare LFS files.

    Args:
        data (`bytes`):
            The data to compute the git-hash for.

    Returns:
        `str`: the git-hash of `data` as an hexadecimal string.

    Example:
    ```python
    >>> from huggingface_hub.utils.sha import git_hash
    >>> git_hash(b"Hello, World!")
    'b45ef6fec89518d314f546fd6c3025367b721684'
    ```
    """
    # git hashes a blob as sha1(b"blob <size>\0" + content). See https://gist.github.com/msabramo/763200.
    # Note: no need to optimize by reading the file in chunks as we're not supposed to hash huge files (5MB maximum).
    header = b"blob " + str(len(data)).encode() + b"\0"
    return sha1(header + data).hexdigest()
Loading
Loading