Skip to content

Commit

Permalink
Prevent empty commits if files did not change (#2389)
Browse files Browse the repository at this point in the history
* Prevent empty commits

* remove warnings in tests

* no newline in echo test

* fix windows?

* style

* Apply suggestions from code review

Co-authored-by: Julien Chaumond <julien@huggingface.co>

---------

Co-authored-by: Julien Chaumond <julien@huggingface.co>
  • Loading branch information
Wauplin and julien-c authored Jul 16, 2024
1 parent 05fdb76 commit e370fa6
Show file tree
Hide file tree
Showing 5 changed files with 301 additions and 59 deletions.
32 changes: 31 additions & 1 deletion src/huggingface_hub/_commit_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
get_session,
hf_raise_for_status,
logging,
sha,
tqdm_stream_file,
validate_hf_hub_args,
)
Expand Down Expand Up @@ -146,6 +147,10 @@ class CommitOperationAdd:
# (server-side check)
_should_ignore: Optional[bool] = field(init=False, repr=False, default=None)

# set to the remote OID of the file if it has already been uploaded
# useful to determine if a commit will be empty or not
_remote_oid: Optional[str] = field(init=False, repr=False, default=None)

# set to True once the file has been uploaded as LFS
_is_uploaded: bool = field(init=False, repr=False, default=False)

Expand Down Expand Up @@ -246,6 +251,29 @@ def b64content(self) -> bytes:
with self.as_file() as file:
return base64.b64encode(file.read())

@property
def _local_oid(self) -> Optional[str]:
    """Compute the OID of the local file, for comparison against `self._remote_oid`.

    If local and remote OIDs match, the file is unchanged on the Hub and can be
    skipped to avoid creating an empty commit.

    The OID depends on the upload mode:
      - LFS files: hex-encoded SHA256 of the file content (used as LFS ref).
      - regular files: git-style SHA1 of the file content.

    Note: this differs slightly from git's own OID computation, where an LFS
    file's OID is the git-SHA1 of its *pointer file* content rather than the
    actual content. Comparing SHA256 of the content is sufficient to detect
    changes and simpler client-side.

    Returns:
        `Optional[str]`: the hex OID, or `None` if the upload mode is not yet
        known (i.e. `_fetch_upload_modes` has not run for this operation).
    """
    mode = self._upload_mode
    if mode is None:
        # Upload mode not fetched yet => cannot tell which hash applies.
        return None
    if mode == "lfs":
        return self.upload_info.sha256.hex()
    # Regular file => git-SHA1 of the raw content.
    # Safe to read in one go: regular files are guaranteed to be <= 5MB.
    with self.as_file() as file:
        return sha.git_hash(file.read())


def _validate_path_in_repo(path_in_repo: str) -> str:
# Validate `path_in_repo` value to prevent a server-side issue
Expand Down Expand Up @@ -483,6 +511,7 @@ def _fetch_upload_modes(
# Fetch upload mode (LFS or regular) chunk by chunk.
upload_modes: Dict[str, UploadMode] = {}
should_ignore_info: Dict[str, bool] = {}
oid_info: Dict[str, Optional[str]] = {}

for chunk in chunk_iterable(additions, 256):
payload: Dict = {
Expand All @@ -491,7 +520,6 @@ def _fetch_upload_modes(
"path": op.path_in_repo,
"sample": base64.b64encode(op.upload_info.sample).decode("ascii"),
"size": op.upload_info.size,
"sha": op.upload_info.sha256.hex(),
}
for op in chunk
]
Expand All @@ -509,11 +537,13 @@ def _fetch_upload_modes(
preupload_info = _validate_preupload_info(resp.json())
upload_modes.update(**{file["path"]: file["uploadMode"] for file in preupload_info["files"]})
should_ignore_info.update(**{file["path"]: file["shouldIgnore"] for file in preupload_info["files"]})
oid_info.update(**{file["path"]: file.get("oid") for file in preupload_info["files"]})

# Set upload mode for each addition operation
for addition in additions:
addition._upload_mode = upload_modes[addition.path_in_repo]
addition._should_ignore = should_ignore_info[addition.path_in_repo]
addition._remote_oid = oid_info[addition.path_in_repo]

# Empty files cannot be uploaded as LFS (S3 would fail with a 501 Not Implemented)
# => empty files are uploaded as "regular" to still allow users to commit them.
Expand Down
40 changes: 40 additions & 0 deletions src/huggingface_hub/hf_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3779,6 +3779,46 @@ def create_commit(
num_threads=num_threads,
free_memory=False, # do not remove `CommitOperationAdd.path_or_fileobj` on LFS files for "normal" users
)

# Remove no-op operations (files that have not changed)
operations_without_no_op = []
for operation in operations:
if (
isinstance(operation, CommitOperationAdd)
and operation._remote_oid is not None
and operation._remote_oid == operation._local_oid
):
# File already exists on the Hub and has not changed: we can skip it.
logger.debug(f"Skipping upload for '{operation.path_in_repo}' as the file has not changed.")
continue
operations_without_no_op.append(operation)
if len(operations) != len(operations_without_no_op):
logger.info(
f"Removing {len(operations) - len(operations_without_no_op)} file(s) from commit that have not changed."
)

# Return early if empty commit
if len(operations_without_no_op) == 0:
logger.warning("No files have been modified since last commit. Skipping to prevent empty commit.")

# Get latest commit info
try:
info = self.repo_info(repo_id=repo_id, repo_type=repo_type, revision=revision, token=token)
except RepositoryNotFoundError as e:
e.append_to_message(_CREATE_COMMIT_NO_REPO_ERROR_MESSAGE)
raise

# Return commit info based on latest commit
url_prefix = self.endpoint
if repo_type is not None and repo_type != REPO_TYPE_MODEL:
url_prefix = f"{url_prefix}/{repo_type}s"
return CommitInfo(
commit_url=f"{url_prefix}/{repo_id}/commit/{info.sha}",
commit_message=commit_message,
commit_description=commit_description,
oid=info.sha, # type: ignore[arg-type]
)

files_to_copy = _fetch_files_to_copy(
copies=copies,
repo_type=repo_type,
Expand Down
37 changes: 36 additions & 1 deletion src/huggingface_hub/utils/sha.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from typing import BinaryIO, Optional

from .insecure_hashlib import sha256
from .insecure_hashlib import sha1, sha256


def sha_fileobj(fileobj: BinaryIO, chunk_size: Optional[int] = None) -> bytes:
Expand All @@ -27,3 +27,38 @@ def sha_fileobj(fileobj: BinaryIO, chunk_size: Optional[int] = None) -> bytes:
if not chunk:
break
return sha.digest()


def git_hash(data: bytes) -> str:
    """
    Compute the git-SHA1 hash of `data`, using the same algorithm as git itself.

    Equivalent to running `git hash-object` on a file containing `data`. See
    https://git-scm.com/docs/git-hash-object for details: git hashes the header
    `"blob <size>\\0"` followed by the raw content.

    Note: this is only valid for regular files. For LFS files the proper git
    hash would be computed on the *pointer file* content, not the actual file
    content; for simplicity we compare the SHA256 of LFS content directly
    instead.

    Args:
        data (`bytes`):
            The data to compute the git-hash for.

    Returns:
        `str`: the git-hash of `data` as an hexadecimal string.

    Example:
    ```python
    >>> from huggingface_hub.utils.sha import git_hash
    >>> git_hash(b"Hello, World!")
    'b45ef6fec89518d314f546fd6c3025367b721684'
    ```
    """
    # No chunked reading needed: callers only hash regular files (<= 5MB).
    # Build git's object header once, then feed header + payload to SHA1.
    header = b"blob " + str(len(data)).encode() + b"\0"
    hasher = sha1(header)
    hasher.update(data)
    return hasher.hexdigest()
Loading

0 comments on commit e370fa6

Please sign in to comment.