From c849c151b558ef34ac27585aae8668e64543a905 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Fri, 12 Jul 2024 10:34:12 +0200 Subject: [PATCH 1/6] Prevent empty commits --- src/huggingface_hub/_commit_api.py | 32 ++++- src/huggingface_hub/hf_api.py | 40 +++++++ src/huggingface_hub/utils/sha.py | 37 +++++- tests/test_hf_api.py | 184 ++++++++++++++++++++++++----- tests/test_utils_sha.py | 59 +++++---- 5 files changed, 296 insertions(+), 56 deletions(-) diff --git a/src/huggingface_hub/_commit_api.py b/src/huggingface_hub/_commit_api.py index 4c91c3965f..f005474c49 100644 --- a/src/huggingface_hub/_commit_api.py +++ b/src/huggingface_hub/_commit_api.py @@ -25,6 +25,7 @@ get_session, hf_raise_for_status, logging, + sha, tqdm_stream_file, validate_hf_hub_args, ) @@ -146,6 +147,10 @@ class CommitOperationAdd: # (server-side check) _should_ignore: Optional[bool] = field(init=False, repr=False, default=None) + # set to the remote OID of the file if it has already been uploaded + # useful to determine if a commit will be empty or not + _remote_oid: Optional[str] = field(init=False, repr=False, default=None) + # set to True once the file has been uploaded as LFS _is_uploaded: bool = field(init=False, repr=False, default=False) @@ -246,6 +251,29 @@ def b64content(self) -> bytes: with self.as_file() as file: return base64.b64encode(file.read()) + @property + def _local_oid(self) -> Optional[str]: + """Return the OID of the local file. + + This OID is then compared to `self._remote_oid` to check if the file has changed compared to the remote one. + If the file did not change, we won't upload it again to prevent empty commits. + + For LFS files, the OID corresponds to the SHA256 of the file content (used a LFS ref). + For regular files, the OID corresponds to the SHA1 of the file content. + Note: this is slightly different to git OID computation since the oid of an LFS file is usually the SHA1 of the + pointer file content (not the actual file content). However, using the SHA256 is enough to detect changes + and more convenient client-side. + """ + if self._upload_mode is None: + return None + elif self._upload_mode == "lfs": + return self.upload_info.sha256.hex() + else: + # Regular file => compute sha1 + # => no need to read by chunk since the file is guaranteed to be <=5MB. + with self.as_file() as file: + return sha.git_hash(file.read()) + def _validate_path_in_repo(path_in_repo: str) -> str: # Validate `path_in_repo` value to prevent a server-side issue @@ -483,6 +511,7 @@ def _fetch_upload_modes( # Fetch upload mode (LFS or regular) chunk by chunk. upload_modes: Dict[str, UploadMode] = {} should_ignore_info: Dict[str, bool] = {} + oid_info: Dict[str, Optional[str]] = {} for chunk in chunk_iterable(additions, 256): payload: Dict = { @@ -491,7 +520,6 @@ def _fetch_upload_modes( "path": op.path_in_repo, "sample": base64.b64encode(op.upload_info.sample).decode("ascii"), "size": op.upload_info.size, - "sha": op.upload_info.sha256.hex(), } for op in chunk ] @@ -509,11 +537,13 @@ def _fetch_upload_modes( preupload_info = _validate_preupload_info(resp.json()) upload_modes.update(**{file["path"]: file["uploadMode"] for file in preupload_info["files"]}) should_ignore_info.update(**{file["path"]: file["shouldIgnore"] for file in preupload_info["files"]}) + oid_info.update(**{file["path"]: file.get("oid") for file in preupload_info["files"]}) # Set upload mode for each addition operation for addition in additions: addition._upload_mode = upload_modes[addition.path_in_repo] addition._should_ignore = should_ignore_info[addition.path_in_repo] + addition._remote_oid = oid_info[addition.path_in_repo] # Empty files cannot be uploaded as LFS (S3 would fail with a 501 Not Implemented) # => empty files are uploaded as "regular" to still allow users to commit them. diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index eddaf9130f..bb44e8b8a3 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -3624,6 +3624,46 @@ def create_commit( num_threads=num_threads, free_memory=False, # do not remove `CommitOperationAdd.path_or_fileobj` on LFS files for "normal" users ) + + # Remove no-op operations (files that have not changed) + operations_without_no_op = [] + for operation in operations: + if ( + isinstance(operation, CommitOperationAdd) + and operation._remote_oid is not None + and operation._remote_oid == operation._local_oid + ): + # File already exists on the Hub and has not changed: we can skip it. + logger.debug(f"Skipping upload for '{operation.path_in_repo}' as the file has not changed.") + continue + operations_without_no_op.append(operation) + if len(operations) != len(operations_without_no_op): + logger.info( + f"Removing {len(operations) - len(operations_without_no_op)} file(s) from commit that have not changed." + ) + + # Return early if empty commit + if len(operations_without_no_op) == 0: + logger.warning("No files have been modified since last commit. Skipping to prevent empty commit.") + + # Get latest commit info + try: + info = self.repo_info(repo_id=repo_id, repo_type=repo_type, revision=revision, token=token) + except RepositoryNotFoundError as e: + e.append_to_message(_CREATE_COMMIT_NO_REPO_ERROR_MESSAGE) + raise + + # Return commit info based on latest commit + url_prefix = self.endpoint + if repo_type is not None and repo_type != REPO_TYPE_MODEL: + url_prefix = f"{url_prefix}/{repo_type}s" + return CommitInfo( + commit_url=f"{url_prefix}/{repo_id}/commit/{info.sha}", + commit_message=commit_message, + commit_description=commit_description, + oid=info.sha, # type: ignore[arg-type] + ) + files_to_copy = _fetch_files_to_copy( copies=copies, repo_type=repo_type, diff --git a/src/huggingface_hub/utils/sha.py b/src/huggingface_hub/utils/sha.py index 233ab074e6..cd3055c416 100644 --- a/src/huggingface_hub/utils/sha.py +++ b/src/huggingface_hub/utils/sha.py @@ -2,7 +2,7 @@ from typing import BinaryIO, Optional -from .insecure_hashlib import sha256 +from .insecure_hashlib import sha1, sha256 def sha_fileobj(fileobj: BinaryIO, chunk_size: Optional[int] = None) -> bytes: @@ -27,3 +27,38 @@ def sha_fileobj(fileobj: BinaryIO, chunk_size: Optional[int] = None) -> bytes: if not chunk: break return sha.digest() + + +def git_hash(data: bytes) -> str: + """ + Computes the sha1 hash of the given bytes, using the same algorithm as git. + + This is equivalent to running `git hash-object`. See https://git-scm.com/docs/git-hash-object + for more details. + + Note: this method is valid for regular files. For LFS files, the proper git hash is supposed to be computed on the + pointer file content, not the actual file content. However, for simplicity, we directly compare the sha256 of + the LFS file content when we want to compare LFS files. + + Args: + data (`bytes`): + The data to compute the git-hash for. + + Returns: + `str`: the git-hash of `data` as an hexadecimal string. + + Example: + ```python + >>> from huggingface_hub.utils.sha import git_hash + >>> git_hash(b"Hello, World!") + 'b45ef6fec89518d314f546fd6c3025367b721684' + ``` + """ + # Taken from https://gist.github.com/msabramo/763200 + # Note: no need to optimize by reading the file in chunks as we're not supposed to hash huge files (5MB maximum). + sha = sha1() + sha.update(b"blob ") + sha.update(str(len(data)).encode()) + sha.update(b"\0") + sha.update(data) + return sha.hexdigest() diff --git a/tests/test_hf_api.py b/tests/test_hf_api.py index 9a01362caa..aad4798128 100644 --- a/tests/test_hf_api.py +++ b/tests/test_hf_api.py @@ -596,7 +596,9 @@ def test_create_commit_create_pr_against_branch(self): # Create PR against non-main branch works resp = self._api.create_commit( - operations=[], + operations=[ + CommitOperationAdd(path_in_repo="file.txt", path_or_fileobj=b"content"), + ], commit_message="PR against existing branch", repo_id=repo_id, revision="test_branch", @@ -607,7 +609,9 @@ def test_create_commit_create_pr_against_branch(self): # Create PR against a oid fails with self.assertRaises(RevisionNotFoundError): self._api.create_commit( - operations=[], + operations=[ + CommitOperationAdd(path_in_repo="file.txt", path_or_fileobj=b"content"), + ], commit_message="PR against a oid", repo_id=repo_id, revision=head, @@ -617,7 +621,9 @@ def test_create_commit_create_pr_against_branch(self): # Create PR against a non-existing branch fails with self.assertRaises(RevisionNotFoundError): self._api.create_commit( - operations=[], + operations=[ + CommitOperationAdd(path_in_repo="file.txt", path_or_fileobj=b"content"), + ], commit_message="PR against missing branch", repo_id=repo_id, revision="missing_branch", @@ -715,32 +721,26 @@ def test_create_commit_conflict(self, repo_url: RepoUrl) -> None: def test_create_commit_repo_does_not_exist(self) -> None: """Test error message is detailed when creating a commit on a missing repo.""" - # Test once with empty commit and once with an addition commit. - for route, operations in ( - ("commit", []), - ("preupload", [CommitOperationAdd("config.json", b"content")]), - ): - with self.subTest(): - with self.assertRaises(RepositoryNotFoundError) as context: - self._api.create_commit( - repo_id=f"{USER}/repo_that_do_not_exist", - operations=operations, - commit_message="fake_message", - ) - - request_id = context.exception.response.headers.get("X-Request-Id") - expected_message = ( - f"404 Client Error. (Request ID: {request_id})\n\nRepository Not" - " Found for url:" - f" {self._api.endpoint}/api/models/{USER}/repo_that_do_not_exist/{route}/main.\nPlease" - " make sure you specified the correct `repo_id` and" - " `repo_type`.\nIf you are trying to access a private or gated" - " repo, make sure you are authenticated.\nNote: Creating a commit" - " assumes that the repo already exists on the Huggingface Hub." - " Please use `create_repo` if it's not the case." - ) + with self.assertRaises(RepositoryNotFoundError) as context: + self._api.create_commit( + repo_id=f"{USER}/repo_that_do_not_exist", + operations=[CommitOperationAdd("config.json", b"content")], + commit_message="fake_message", + ) + + request_id = context.exception.response.headers.get("X-Request-Id") + expected_message = ( + f"404 Client Error. (Request ID: {request_id})\n\nRepository Not" + " Found for url:" + f" {self._api.endpoint}/api/models/{USER}/repo_that_do_not_exist/preupload/main.\nPlease" + " make sure you specified the correct `repo_id` and" + " `repo_type`.\nIf you are trying to access a private or gated" + " repo, make sure you are authenticated.\nNote: Creating a commit" + " assumes that the repo already exists on the Huggingface Hub." + " Please use `create_repo` if it's not the case." + ) - self.assertEqual(str(context.exception), expected_message) + assert str(context.exception) == expected_message @patch("huggingface_hub.utils._headers.get_token", return_value=TOKEN) def test_create_commit_lfs_file_implicit_token(self, get_token_mock: Mock) -> None: @@ -1008,6 +1008,134 @@ def test_create_file_with_relative_path(self): self._api.upload_file(path_or_fileobj=b"content", path_in_repo="..\\ddd", repo_id=repo_id) assert cm.exception.response.status_code == 422 + @use_tmp_repo() + def test_prevent_empty_commit_if_no_op(self, repo_url: RepoUrl) -> None: + with self.assertLogs("huggingface_hub", level="INFO") as logs: + self._api.create_commit(repo_id=repo_url.repo_id, commit_message="Empty commit", operations=[]) + assert ( + logs.records[0].message + == "No files have been modified since last commit. Skipping to prevent empty commit." + ) + assert logs.records[0].levelname == "WARNING" + + @use_tmp_repo() + def test_prevent_empty_commit_if_no_new_addition(self, repo_url: RepoUrl) -> None: + self._api.create_commit( + repo_id=repo_url.repo_id, + commit_message="initial commit", + operations=[ + CommitOperationAdd(path_or_fileobj=b"Regular file content", path_in_repo="file.txt"), + CommitOperationAdd(path_or_fileobj=b"LFS content", path_in_repo="lfs.bin"), + ], + ) + with self.assertLogs("huggingface_hub", level="INFO") as logs: + self._api.create_commit( + repo_id=repo_url.repo_id, + commit_message="Empty commit", + operations=[ + CommitOperationAdd(path_or_fileobj=b"Regular file content", path_in_repo="file.txt"), + CommitOperationAdd(path_or_fileobj=b"LFS content", path_in_repo="lfs.bin"), + ], + ) + assert logs.records[0].message == "Removing 2 file(s) from commit that have not changed." + assert logs.records[0].levelname == "INFO" + + assert ( + logs.records[1].message + == "No files have been modified since last commit. Skipping to prevent empty commit." + ) + assert logs.records[1].levelname == "WARNING" + + @use_tmp_repo() + def test_continue_commit_without_existing_files(self, repo_url: RepoUrl) -> None: + self._api.create_commit( + repo_id=repo_url.repo_id, + commit_message="initial commit", + operations=[ + CommitOperationAdd(path_or_fileobj=b"content 1.0", path_in_repo="file.txt"), + CommitOperationAdd(path_or_fileobj=b"content 2.0", path_in_repo="file2.txt"), + CommitOperationAdd(path_or_fileobj=b"LFS content 1.0", path_in_repo="lfs.bin"), + CommitOperationAdd(path_or_fileobj=b"LFS content 2.0", path_in_repo="lfs2.bin"), + ], + ) + with self.assertLogs("huggingface_hub", level="DEBUG") as logs: + self._api.create_commit( + repo_id=repo_url.repo_id, + commit_message="second commit", + operations=[ + # Did not change => will be removed from commit + CommitOperationAdd(path_or_fileobj=b"content 1.0", path_in_repo="file.txt"), + # Change => will be kept + CommitOperationAdd(path_or_fileobj=b"content 2.1", path_in_repo="file2.txt"), + # New file => will be kept + CommitOperationAdd(path_or_fileobj=b"content 3.0", path_in_repo="file3.txt"), + # Did not change => will be removed from commit + CommitOperationAdd(path_or_fileobj=b"LFS content 1.0", path_in_repo="lfs.bin"), + # Change => will be kept + CommitOperationAdd(path_or_fileobj=b"LFS content 2.1", path_in_repo="lfs2.bin"), + # New file => will be kept + CommitOperationAdd(path_or_fileobj=b"LFS content 3.0", path_in_repo="lfs3.bin"), + ], + ) + debug_logs = [log.message for log in logs.records if log.levelname == "DEBUG"] + info_logs = [log.message for log in logs.records if log.levelname == "INFO"] + warning_logs = [log.message for log in logs.records if log.levelname == "WARNING"] + + assert "Skipping upload for 'file.txt' as the file has not changed." in debug_logs + assert "Skipping upload for 'lfs.bin' as the file has not changed." in debug_logs + assert "Removing 2 file(s) from commit that have not changed." in info_logs + assert len(warning_logs) == 0 # no warnings since the commit is not empty + + paths_info = { + item.path: item.last_commit + for item in self._api.get_paths_info( + repo_id=repo_url.repo_id, + paths=["file.txt", "file2.txt", "file3.txt", "lfs.bin", "lfs2.bin", "lfs3.bin"], + expand=True, + ) + } + + # Check which files are in the last commit + assert paths_info["file.txt"].title == "initial commit" + assert paths_info["file2.txt"].title == "second commit" + assert paths_info["file3.txt"].title == "second commit" + assert paths_info["lfs.bin"].title == "initial commit" + assert paths_info["lfs2.bin"].title == "second commit" + assert paths_info["lfs3.bin"].title == "second commit" + + @use_tmp_repo() + def test_continue_commit_if_only_deletion(self, repo_url: RepoUrl) -> None: + self._api.create_commit( + repo_id=repo_url.repo_id, + commit_message="initial commit", + operations=[ + CommitOperationAdd(path_or_fileobj=b"content 1.0", path_in_repo="file.txt"), + CommitOperationAdd(path_or_fileobj=b"content 2.0", path_in_repo="file2.txt"), + ], + ) + with self.assertLogs("huggingface_hub", level="DEBUG") as logs: + self._api.create_commit( + repo_id=repo_url.repo_id, + commit_message="second commit", + operations=[ + # Did not change => will be removed from commit + CommitOperationAdd(path_or_fileobj=b"content 1.0", path_in_repo="file.txt"), + # Delete operation => kept in any case + CommitOperationDelete(path_in_repo="file2.txt"), + ], + ) + debug_logs = [log.message for log in logs.records if log.levelname == "DEBUG"] + info_logs = [log.message for log in logs.records if log.levelname == "INFO"] + warning_logs = [log.message for log in logs.records if log.levelname == "WARNING"] + + assert "Skipping upload for 'file.txt' as the file has not changed." in debug_logs + assert "Removing 1 file(s) from commit that have not changed." in info_logs + assert len(warning_logs) == 0 # no warnings since the commit is not empty + + remote_files = self._api.list_repo_files(repo_id=repo_url.repo_id) + assert "file.txt" in remote_files + assert "file2.txt" not in remote_files + class HfApiUploadEmptyFileTest(HfApiCommonTest): @classmethod diff --git a/tests/test_utils_sha.py b/tests/test_utils_sha.py index 04a7ae98dc..f51053ff99 100644 --- a/tests/test_utils_sha.py +++ b/tests/test_utils_sha.py @@ -1,31 +1,38 @@ import os -import unittest +import subprocess from hashlib import sha256 from io import BytesIO from huggingface_hub.utils import SoftTemporaryDirectory -from huggingface_hub.utils.sha import sha_fileobj - - -class TestShaUtils(unittest.TestCase): - def test_sha_fileobj(self): - with SoftTemporaryDirectory() as tmpdir: - content = b"Random content" * 1000 - sha = sha256(content).digest() - - # Test with file object - filepath = os.path.join(tmpdir, "file.bin") - with open(filepath, "wb+") as file: - file.write(content) - - with open(filepath, "rb") as fileobj: - self.assertEqual(sha_fileobj(fileobj, None), sha) - with open(filepath, "rb") as fileobj: - self.assertEqual(sha_fileobj(fileobj, 50), sha) - with open(filepath, "rb") as fileobj: - self.assertEqual(sha_fileobj(fileobj, 50_000), sha) - - # Test with in-memory file object - self.assertEqual(sha_fileobj(BytesIO(content), None), sha) - self.assertEqual(sha_fileobj(BytesIO(content), 50), sha) - self.assertEqual(sha_fileobj(BytesIO(content), 50_000), sha) +from huggingface_hub.utils.sha import git_hash, sha_fileobj + + +def test_sha_fileobj(): + with SoftTemporaryDirectory() as tmpdir: + content = b"Random content" * 1000 + sha = sha256(content).digest() + + # Test with file object + filepath = os.path.join(tmpdir, "file.bin") + with open(filepath, "wb+") as file: + file.write(content) + + with open(filepath, "rb") as fileobj: + assert sha_fileobj(fileobj, None) == sha + with open(filepath, "rb") as fileobj: + assert sha_fileobj(fileobj, 50) == sha + with open(filepath, "rb") as fileobj: + assert sha_fileobj(fileobj, 50_000) == sha + + # Test with in-memory file object + assert sha_fileobj(BytesIO(content), None) == sha + assert sha_fileobj(BytesIO(content), 50) == sha + assert sha_fileobj(BytesIO(content), 50_000) == sha + + +def test_git_hash(): + """Test the `git_hash` output is the same as `git hash-object` command.""" + output = subprocess.run( + "echo 'Hello, World!' | git hash-object -w --stdin", shell=True, capture_output=True, text=True + ) + assert output.stdout.strip() == git_hash(b"Hello, World!\n") From f4653e131429eaec7706505af88c7494dc7bdbb6 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Fri, 12 Jul 2024 10:50:23 +0200 Subject: [PATCH 2/6] remove warnings in tests --- tests/test_hf_api.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_hf_api.py b/tests/test_hf_api.py index aad4798128..2e4135092a 100644 --- a/tests/test_hf_api.py +++ b/tests/test_hf_api.py @@ -343,8 +343,8 @@ def test_upload_file_str_path(self, repo_url: RepoUrl) -> None: @use_tmp_repo() def test_upload_file_pathlib_path(self, repo_url: RepoUrl) -> None: """Regression test for https://github.com/huggingface/huggingface_hub/issues/1246.""" - self._api.upload_file(path_or_fileobj=Path(self.tmp_file), path_in_repo="README.md", repo_id=repo_url.repo_id) - self.assertIn("README.md", self._api.list_repo_files(repo_id=repo_url.repo_id)) + self._api.upload_file(path_or_fileobj=Path(self.tmp_file), path_in_repo="file.txt", repo_id=repo_url.repo_id) + self.assertIn("file.txt", self._api.list_repo_files(repo_id=repo_url.repo_id)) @use_tmp_repo() def test_upload_file_fileobj(self, repo_url: RepoUrl) -> None: @@ -939,7 +939,7 @@ def test_pre_upload_before_commit(self, repo_url: RepoUrl) -> None: CommitOperationAdd(path_in_repo="lfs.bin", path_or_fileobj=b"content1"), CommitOperationAdd(path_in_repo="file.txt", path_or_fileobj=b"content"), CommitOperationAdd(path_in_repo="lfs2.bin", path_or_fileobj=b"content2"), - CommitOperationAdd(path_in_repo="file.txt", path_or_fileobj=b"content"), + CommitOperationAdd(path_in_repo="file2.txt", path_or_fileobj=b"content"), ] # First: preupload 1 by 1 From 638d568578f5ecec3adf3518e4ab0a1601c6b994 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Fri, 12 Jul 2024 11:23:07 +0200 Subject: [PATCH 3/6] no newline in echo test --- tests/test_utils_sha.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_utils_sha.py b/tests/test_utils_sha.py index f51053ff99..1847ee8668 100644 --- a/tests/test_utils_sha.py +++ b/tests/test_utils_sha.py @@ -33,6 +33,6 @@ def test_sha_fileobj(): def test_git_hash(): """Test the `git_hash` output is the same as `git hash-object` command.""" output = subprocess.run( - "echo 'Hello, World!' | git hash-object -w --stdin", shell=True, capture_output=True, text=True + "echo -n 'Hello, World!' | git hash-object -w --stdin", shell=True, capture_output=True, text=True ) - assert output.stdout.strip() == git_hash(b"Hello, World!\n") + assert output.stdout.strip() == git_hash(b"Hello, World!") From 8db64ccfe69685045cc2cf131680fbf69e844f35 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Fri, 12 Jul 2024 11:37:58 +0200 Subject: [PATCH 4/6] fix windows? --- tests/test_utils_sha.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_utils_sha.py b/tests/test_utils_sha.py index 1847ee8668..5dae5bbc3e 100644 --- a/tests/test_utils_sha.py +++ b/tests/test_utils_sha.py @@ -30,9 +30,13 @@ def test_sha_fileobj(): assert sha_fileobj(BytesIO(content), 50_000) == sha -def test_git_hash(): +def test_git_hash(tmpdir): """Test the `git_hash` output is the same as `git hash-object` command.""" + path = os.path.join(tmpdir, "file.txt") + with open(path, "wb") as file: + file.write(b"Hello, World!") + output = subprocess.run( - "echo -n 'Hello, World!' | git hash-object -w --stdin", shell=True, capture_output=True, text=True + f"git hash-object -t blob {path}", shell=True, capture_output=True, text=True ) assert output.stdout.strip() == git_hash(b"Hello, World!") From d17dc63634c5b77cfffb862b68842ccc94c3b0c6 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Fri, 12 Jul 2024 12:20:35 +0200 Subject: [PATCH 5/6] style --- tests/test_utils_sha.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_utils_sha.py b/tests/test_utils_sha.py index 5dae5bbc3e..c4551a3f04 100644 --- a/tests/test_utils_sha.py +++ b/tests/test_utils_sha.py @@ -36,7 +36,5 @@ def test_git_hash(tmpdir): with open(path, "wb") as file: file.write(b"Hello, World!") - output = subprocess.run( - f"git hash-object -t blob {path}", shell=True, capture_output=True, text=True - ) + output = subprocess.run(f"git hash-object -t blob {path}", shell=True, capture_output=True, text=True) assert output.stdout.strip() == git_hash(b"Hello, World!") From db05a9883b59f7dfddf44e09b06d0ee000715497 Mon Sep 17 00:00:00 2001 From: Lucain Date: Fri, 12 Jul 2024 13:18:43 +0200 Subject: [PATCH 6/6] Apply suggestions from code review Co-authored-by: Julien Chaumond --- src/huggingface_hub/_commit_api.py | 2 +- src/huggingface_hub/utils/sha.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/huggingface_hub/_commit_api.py b/src/huggingface_hub/_commit_api.py index f005474c49..ba0cfce238 100644 --- a/src/huggingface_hub/_commit_api.py +++ b/src/huggingface_hub/_commit_api.py @@ -260,7 +260,7 @@ def _local_oid(self) -> Optional[str]: For LFS files, the OID corresponds to the SHA256 of the file content (used a LFS ref). For regular files, the OID corresponds to the SHA1 of the file content. - Note: this is slightly different to git OID computation since the oid of an LFS file is usually the SHA1 of the + Note: this is slightly different to git OID computation since the oid of an LFS file is usually the git-SHA1 of the pointer file content (not the actual file content). However, using the SHA256 is enough to detect changes and more convenient client-side. """ diff --git a/src/huggingface_hub/utils/sha.py b/src/huggingface_hub/utils/sha.py index cd3055c416..001c3fe8b2 100644 --- a/src/huggingface_hub/utils/sha.py +++ b/src/huggingface_hub/utils/sha.py @@ -31,7 +31,7 @@ def sha_fileobj(fileobj: BinaryIO, chunk_size: Optional[int] = None) -> bytes: def git_hash(data: bytes) -> str: """ - Computes the sha1 hash of the given bytes, using the same algorithm as git. + Computes the git-sha1 hash of the given bytes, using the same algorithm as git. This is equivalent to running `git hash-object`. See https://git-scm.com/docs/git-hash-object for more details.