Skip to content

Commit

Permalink
Prevent empty commits if files did not change (#2389)
Browse files Browse the repository at this point in the history
* Prevent empty commits

* remove warnings in tests

* no newline in echo test

* fix windows?

* style

* Apply suggestions from code review

Co-authored-by: Julien Chaumond <julien@huggingface.co>

---------

Co-authored-by: Julien Chaumond <julien@huggingface.co>
  • Loading branch information
Wauplin and julien-c authored Jul 16, 2024
1 parent 05fdb76 commit e370fa6
Show file tree
Hide file tree
Showing 5 changed files with 301 additions and 59 deletions.
32 changes: 31 additions & 1 deletion src/huggingface_hub/_commit_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
get_session,
hf_raise_for_status,
logging,
sha,
tqdm_stream_file,
validate_hf_hub_args,
)
Expand Down Expand Up @@ -146,6 +147,10 @@ class CommitOperationAdd:
# (server-side check)
_should_ignore: Optional[bool] = field(init=False, repr=False, default=None)

# set to the remote OID of the file if it has already been uploaded
# useful to determine if a commit will be empty or not
_remote_oid: Optional[str] = field(init=False, repr=False, default=None)

# set to True once the file has been uploaded as LFS
_is_uploaded: bool = field(init=False, repr=False, default=False)

Expand Down Expand Up @@ -246,6 +251,29 @@ def b64content(self) -> bytes:
with self.as_file() as file:
return base64.b64encode(file.read())

@property
def _local_oid(self) -> Optional[str]:
    """Compute the OID of the local file, for comparison against `self._remote_oid`.

    If local and remote OIDs match, the file is unchanged on the Hub and can be
    skipped to avoid creating an empty commit.

    The OID depends on the upload mode:
      - LFS files: hex-encoded SHA256 of the file content (used as LFS ref).
      - regular files: git-style SHA1 of the file content.

    Note: this differs slightly from git's own OID computation, where an LFS
    file's OID is the git-SHA1 of its *pointer file* content rather than the
    actual content. Comparing SHA256 of the content is sufficient to detect
    changes and simpler client-side.

    Returns:
        `Optional[str]`: the hex OID, or `None` if the upload mode is not yet
        known (i.e. `_fetch_upload_modes` has not run for this operation).
    """
    mode = self._upload_mode
    if mode is None:
        # Upload mode not fetched yet => cannot tell which hash applies.
        return None
    if mode == "lfs":
        return self.upload_info.sha256.hex()
    # Regular file => git-SHA1 of the raw content.
    # Safe to read in one go: regular files are guaranteed to be <= 5MB.
    with self.as_file() as file:
        return sha.git_hash(file.read())


def _validate_path_in_repo(path_in_repo: str) -> str:
# Validate `path_in_repo` value to prevent a server-side issue
Expand Down Expand Up @@ -483,6 +511,7 @@ def _fetch_upload_modes(
# Fetch upload mode (LFS or regular) chunk by chunk.
upload_modes: Dict[str, UploadMode] = {}
should_ignore_info: Dict[str, bool] = {}
oid_info: Dict[str, Optional[str]] = {}

for chunk in chunk_iterable(additions, 256):
payload: Dict = {
Expand All @@ -491,7 +520,6 @@ def _fetch_upload_modes(
"path": op.path_in_repo,
"sample": base64.b64encode(op.upload_info.sample).decode("ascii"),
"size": op.upload_info.size,
"sha": op.upload_info.sha256.hex(),
}
for op in chunk
]
Expand All @@ -509,11 +537,13 @@ def _fetch_upload_modes(
preupload_info = _validate_preupload_info(resp.json())
upload_modes.update(**{file["path"]: file["uploadMode"] for file in preupload_info["files"]})
should_ignore_info.update(**{file["path"]: file["shouldIgnore"] for file in preupload_info["files"]})
oid_info.update(**{file["path"]: file.get("oid") for file in preupload_info["files"]})

# Set upload mode for each addition operation
for addition in additions:
addition._upload_mode = upload_modes[addition.path_in_repo]
addition._should_ignore = should_ignore_info[addition.path_in_repo]
addition._remote_oid = oid_info[addition.path_in_repo]

# Empty files cannot be uploaded as LFS (S3 would fail with a 501 Not Implemented)
# => empty files are uploaded as "regular" to still allow users to commit them.
Expand Down
40 changes: 40 additions & 0 deletions src/huggingface_hub/hf_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3779,6 +3779,46 @@ def create_commit(
num_threads=num_threads,
free_memory=False, # do not remove `CommitOperationAdd.path_or_fileobj` on LFS files for "normal" users
)

# Remove no-op operations (files that have not changed)
operations_without_no_op = []
for operation in operations:
if (
isinstance(operation, CommitOperationAdd)
and operation._remote_oid is not None
and operation._remote_oid == operation._local_oid
):
# File already exists on the Hub and has not changed: we can skip it.
logger.debug(f"Skipping upload for '{operation.path_in_repo}' as the file has not changed.")
continue
operations_without_no_op.append(operation)
if len(operations) != len(operations_without_no_op):
logger.info(
f"Removing {len(operations) - len(operations_without_no_op)} file(s) from commit that have not changed."
)

# Return early if empty commit
if len(operations_without_no_op) == 0:
logger.warning("No files have been modified since last commit. Skipping to prevent empty commit.")

# Get latest commit info
try:
info = self.repo_info(repo_id=repo_id, repo_type=repo_type, revision=revision, token=token)
except RepositoryNotFoundError as e:
e.append_to_message(_CREATE_COMMIT_NO_REPO_ERROR_MESSAGE)
raise

# Return commit info based on latest commit
url_prefix = self.endpoint
if repo_type is not None and repo_type != REPO_TYPE_MODEL:
url_prefix = f"{url_prefix}/{repo_type}s"
return CommitInfo(
commit_url=f"{url_prefix}/{repo_id}/commit/{info.sha}",
commit_message=commit_message,
commit_description=commit_description,
oid=info.sha, # type: ignore[arg-type]
)

files_to_copy = _fetch_files_to_copy(
copies=copies,
repo_type=repo_type,
Expand Down
37 changes: 36 additions & 1 deletion src/huggingface_hub/utils/sha.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from typing import BinaryIO, Optional

from .insecure_hashlib import sha256
from .insecure_hashlib import sha1, sha256


def sha_fileobj(fileobj: BinaryIO, chunk_size: Optional[int] = None) -> bytes:
Expand All @@ -27,3 +27,38 @@ def sha_fileobj(fileobj: BinaryIO, chunk_size: Optional[int] = None) -> bytes:
if not chunk:
break
return sha.digest()


def git_hash(data: bytes) -> str:
    """
    Compute the git-SHA1 hash of `data`, using the same algorithm as git itself.

    Equivalent to running `git hash-object` on a file containing `data`. See
    https://git-scm.com/docs/git-hash-object for details: git hashes the header
    `"blob <size>\\0"` followed by the raw content.

    Note: this is only valid for regular files. For LFS files the proper git
    hash would be computed on the *pointer file* content, not the actual file
    content; for simplicity we compare the SHA256 of LFS content directly
    instead.

    Args:
        data (`bytes`):
            The data to compute the git-hash for.

    Returns:
        `str`: the git-hash of `data` as an hexadecimal string.

    Example:
    ```python
    >>> from huggingface_hub.utils.sha import git_hash
    >>> git_hash(b"Hello, World!")
    'b45ef6fec89518d314f546fd6c3025367b721684'
    ```
    """
    # No chunked reading needed: callers only hash regular files (<= 5MB).
    # Build git's object header once, then feed header + payload to SHA1.
    header = b"blob " + str(len(data)).encode() + b"\0"
    hasher = sha1(header)
    hasher.update(data)
    return hasher.hexdigest()
Loading

0 comments on commit e370fa6

Please sign in to comment.