From ec778cede6a21e010c99083c5fda94335967a3ed Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Mon, 27 Nov 2023 16:54:52 +0100 Subject: [PATCH 1/4] Respect .gitignore file in commits --- docs/source/en/guides/upload.md | 13 ++++--- src/huggingface_hub/_commit_api.py | 47 ++++++++++++++++++----- src/huggingface_hub/hf_api.py | 60 +++++++++++++++++++++++++++++- 3 files changed, 104 insertions(+), 16 deletions(-) diff --git a/docs/source/en/guides/upload.md b/docs/source/en/guides/upload.md index 81ee761ff8..3dce73e634 100644 --- a/docs/source/en/guides/upload.md +++ b/docs/source/en/guides/upload.md @@ -73,12 +73,15 @@ folder to. Depending on your repository type, you can optionally set the reposit ... ) ``` -Use the `allow_patterns` and `ignore_patterns` arguments to specify which files to upload. These parameters accept either a single pattern or a list of patterns. -Patterns are Standard Wildcards (globbing patterns) as documented [here](https://tldp.org/LDP/GNU-Linux-Tools-Summary/html/x11655.htm). -If both `allow_patterns` and `ignore_patterns` are provided, both constraints apply. By default, all files from the folder are uploaded. +By default, the `.gitignore` file will be taken into account to know which files should +be committed or not. We first check if a `.gitignore` file is present in a commit, and if not, we check if it exists on the Hub. If you want to force the upload regardless of +the `.gitignore` file, you can pass `respect_gitignore=False`. Please be aware that only +a `.gitignore` file present at the root of the directory will be used. We do not check +for `.gitignore` files in subdirectories. -Any `.git/` folder present in any subdirectory will be ignored. However, please be aware that the `.gitignore` file is not taken into account. -This means you must use `allow_patterns` and `ignore_patterns` to specify which files to upload instead. 
+If you don't want to use a hardcoded `.gitignore` file, you can use the `allow_patterns` and `ignore_patterns` arguments to filter which files to upload. These parameters accept either a single pattern or a list of patterns. Patterns are Standard Wildcards (globbing patterns) as documented [here](https://tldp.org/LDP/GNU-Linux-Tools-Summary/html/x11655.htm). If both `allow_patterns` and `ignore_patterns` are provided, both constraints apply. + +Besides the `.gitignore` file and allow/ignore patterns, any `.git/` folder present in any subdirectory will be ignored. ```py >>> api.upload_folder( diff --git a/src/huggingface_hub/_commit_api.py b/src/huggingface_hub/_commit_api.py index 674075642e..2265df7e1d 100644 --- a/src/huggingface_hub/_commit_api.py +++ b/src/huggingface_hub/_commit_api.py @@ -137,13 +137,19 @@ class CommitOperationAdd: upload_info: UploadInfo = field(init=False, repr=False) # Internal attributes - _upload_mode: Optional[UploadMode] = field( - init=False, repr=False, default=None - ) # set to "lfs" or "regular" once known - _is_uploaded: bool = field( - init=False, repr=False, default=False - ) # set to True once the file has been uploaded as LFS - _is_committed: bool = field(init=False, repr=False, default=False) # set to True once the file has been committed + + # set to "lfs" or "regular" once known + _upload_mode: Optional[UploadMode] = field(init=False, repr=False, default=None) + + # set to True if .gitignore rules prevent the file from being uploaded as LFS + # (server-side check) + _should_ignore: Optional[bool] = field(init=False, repr=False, default=None) + + # set to True once the file has been uploaded as LFS + _is_uploaded: bool = field(init=False, repr=False, default=False) + + # set to True once the file has been committed + _is_committed: bool = field(init=False, repr=False, default=False) def __post_init__(self) -> None: """Validates `path_or_fileobj` and compute `upload_info`.""" @@ -439,6 +445,7 @@ def _fetch_upload_modes( 
revision: str, endpoint: Optional[str] = None, create_pr: bool = False, + gitignore_content: Optional[str] = None, ) -> None: """ Requests the Hub "preupload" endpoint to determine whether each input file should be uploaded as a regular git blob @@ -457,7 +464,11 @@ def _fetch_upload_modes( An authentication token ( See https://huggingface.co/settings/tokens ) revision (`str`): The git revision to upload the files to. Can be any valid git revision. - + gitignore_content (`str`, *optional*): + The content of the `.gitignore` file to know which files should be ignored. The order of priority + is to first check if `gitignore_content` is passed, then check if the `.gitignore` file is present + in the list of files to commit and finally default to the `.gitignore` file already hosted on the Hub + (if any). Raises: [`~utils.HfHubHTTPError`] If the Hub API returned an error. @@ -469,8 +480,10 @@ def _fetch_upload_modes( # Fetch upload mode (LFS or regular) chunk by chunk. upload_modes: Dict[str, UploadMode] = {} + should_ignore_info: Dict[str, bool] = {} + for chunk in chunk_iterable(additions, 256): - payload = { + payload: Dict = { "files": [ { "path": op.path_in_repo, @@ -481,6 +494,8 @@ def _fetch_upload_modes( for op in chunk ] } + if gitignore_content is not None: + payload["gitIgnore"] = gitignore_content resp = get_session().post( f"{endpoint}/api/{repo_type}s/{repo_id}/preupload/{revision}", @@ -491,10 +506,12 @@ def _fetch_upload_modes( hf_raise_for_status(resp) preupload_info = _validate_preupload_info(resp.json()) upload_modes.update(**{file["path"]: file["uploadMode"] for file in preupload_info["files"]}) + should_ignore_info.update(**{file["path"]: file["shouldIgnore"] for file in preupload_info["files"]}) # Set upload mode for each addition operation for addition in additions: addition._upload_mode = upload_modes[addition.path_in_repo] + addition._should_ignore = should_ignore_info[addition.path_in_repo] # Empty files cannot be uploaded as LFS (S3 would 
fail with a 501 Not Implemented) # => empty files are uploaded as "regular" to still allow users to commit them. @@ -571,6 +588,7 @@ def _prepare_commit_payload( commit_message: str, commit_description: Optional[str] = None, parent_commit: Optional[str] = None, + respect_gitignore: bool = True, ) -> Iterable[Dict[str, Any]]: """ Builds the payload to POST to the `/commit` API of the Hub. @@ -590,8 +608,16 @@ def _prepare_commit_payload( header_value["parentCommit"] = parent_commit yield {"key": "header", "value": header_value} + nb_ignored_files = 0 + # 2. Send operations, one per line for operation in operations: + # Skip ignored files + if respect_gitignore and isinstance(operation, CommitOperationAdd) and operation._should_ignore: + logger.debug(f"Skipping file '{operation.path_in_repo}' in commit (ignored by gitignore file).") + nb_ignored_files += 1 + continue + # 2.a. Case adding a regular file if isinstance(operation, CommitOperationAdd) and operation._upload_mode == "regular": yield { @@ -638,3 +664,6 @@ def _prepare_commit_payload( f"Unknown operation to commit. Operation: {operation}. Upload mode:" f" {getattr(operation, '_upload_mode', None)}" ) + + if nb_ignored_files > 0: + logger.info(f"Skipped {nb_ignored_files} file(s) in commit (ignored by gitignore file).") diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 69bf7f72eb..4385f02d0c 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -3332,6 +3332,7 @@ def create_commit( # type: ignore create_pr: Optional[bool] = None, num_threads: int = 5, parent_commit: Optional[str] = None, + respect_gitignore: bool = True, run_as_future: Literal[False] = ..., ) -> CommitInfo: ... @@ -3350,6 +3351,7 @@ def create_commit( create_pr: Optional[bool] = None, num_threads: int = 5, parent_commit: Optional[str] = None, + respect_gitignore: bool = True, run_as_future: Literal[True] = ..., ) -> Future[CommitInfo]: ... 
@@ -3369,6 +3371,7 @@ def create_commit( create_pr: Optional[bool] = None, num_threads: int = 5, parent_commit: Optional[str] = None, + respect_gitignore: bool = True, run_as_future: bool = False, ) -> Union[CommitInfo, Future[CommitInfo]]: """ @@ -3447,6 +3450,8 @@ def create_commit( is `True`, the pull request will be created from `parent_commit`. Specifying `parent_commit` ensures the repo has not changed before committing the changes, and can be especially useful if the repo is updated / committed to concurrently. + respect_gitignore (`bool`, *optional*): + Whether to respect the `.gitignore` file in the repo. Defaults to `True`. run_as_future (`bool`, *optional*): Whether or not to run this method in the background. Background jobs are run sequentially without blocking the main thread. Passing `run_as_future=True` will return a [Future](https://docs.python.org/3/library/concurrent.futures.html#future-objects) @@ -3518,6 +3523,7 @@ def create_commit( create_pr=create_pr, num_threads=num_threads, free_memory=False, # do not remove `CommitOperationAdd.path_or_fileobj` on LFS files for "normal" users + respect_gitignore=respect_gitignore, ) files_to_copy = _fetch_lfs_files_to_copy( copies=copies, @@ -3533,6 +3539,7 @@ def create_commit( commit_message=commit_message, commit_description=commit_description, parent_commit=parent_commit, + respect_gitignore=respect_gitignore, ) commit_url = f"{self.endpoint}/api/{repo_type}s/{repo_id}/commit/{revision}" @@ -3591,6 +3598,7 @@ def create_commits_on_pr( merge_pr: bool = True, num_threads: int = 5, # TODO: use to multithread uploads verbose: bool = False, + respect_gitignore: bool = True, ) -> str: """Push changes to the Hub in multiple commits. @@ -3651,6 +3659,9 @@ def create_commits_on_pr( If set to `True`, process will run on verbose mode i.e. print information about the ongoing tasks. Defaults to `False`. + respect_gitignore (`bool`, *optional*): + Whether to respect the `.gitignore` file in the repo. 
Defaults to `True`. + Returns: `str`: URL to the created PR. @@ -3799,6 +3810,7 @@ def create_commits_on_pr( num_threads=num_threads, operations=step.operations, create_pr=False, + respect_gitignore=respect_gitignore, ) step.completed = True nb_remaining -= 1 @@ -3887,6 +3899,8 @@ def preupload_lfs_files( create_pr: Optional[bool] = None, num_threads: int = 5, free_memory: bool = True, + respect_gitignore: bool = True, + gitignore_content: Optional[str] = None, ): """Pre-upload LFS files to S3 in preparation on a future commit. @@ -3933,6 +3947,15 @@ def preupload_lfs_files( Number of concurrent threads for uploading files. Defaults to 5. Setting it to 2 means at most 2 files will be uploaded concurrently. + respect_gitignore (`bool`, *optional*): + Whether or not to respect the `.gitignore` file in the repo. Defaults to `True`. + + gitignore_content (`str`, *optional*): + The content of the `.gitignore` file to know which files should be ignored. The order of priority + is to first check if `gitignore_content` is passed, then check if the `.gitignore` file is present + in the list of files to commit and finally default to the `.gitignore` file already hosted on the Hub + (if any). + Example: ```py >>> from huggingface_hub import CommitOperationAdd, preupload_lfs_files, create_commit, create_repo @@ -3957,6 +3980,15 @@ def preupload_lfs_files( revision = quote(revision, safe="") if revision is not None else DEFAULT_REVISION create_pr = create_pr if create_pr is not None else False + # Check if a `gitignore` file is being committed to the Hub. 
+ additions = list(additions) + if gitignore_content is None: + for addition in additions: + if addition.path_in_repo == ".gitignore": + with addition.as_file() as f: + gitignore_content = f.read().decode() + break + # Filter out already uploaded files new_additions = [addition for addition in additions if not addition._is_uploaded] @@ -3970,6 +4002,7 @@ def preupload_lfs_files( revision=revision, endpoint=self.endpoint, create_pr=create_pr or False, + gitignore_content=gitignore_content, ) except RepositoryNotFoundError as e: e.append_to_message(_CREATE_COMMIT_NO_REPO_ERROR_MESSAGE) @@ -3978,9 +4011,25 @@ def preupload_lfs_files( # Filter out regular files new_lfs_additions = [addition for addition in new_additions if addition._upload_mode == "lfs"] + # Filter out files listed in .gitignore + if respect_gitignore: + new_lfs_additions_to_upload = [] + for addition in new_lfs_additions: + if addition._should_ignore: + logger.debug( + f"Skipping upload for LFS file '{addition.path_in_repo}' (ignored by gitignore file)." + ) + else: + new_lfs_additions_to_upload.append(addition) + if len(new_lfs_additions) != len(new_lfs_additions_to_upload): + logger.info( + f"Skipped upload for {len(new_lfs_additions) - len(new_lfs_additions_to_upload)} LFS file(s) " + "(ignored by gitignore file)." + ) + # Upload new LFS files _upload_lfs_files( - additions=new_lfs_additions, + additions=new_lfs_additions_to_upload, repo_type=repo_type, repo_id=repo_id, token=token or self.token, @@ -3991,7 +4040,7 @@ def preupload_lfs_files( # PR (i.e. `revision`). 
revision=revision if not create_pr else None, ) - for addition in new_lfs_additions: + for addition in new_lfs_additions_to_upload: addition._is_uploaded = True if free_memory: addition.path_or_fileobj = b"" @@ -4207,6 +4256,7 @@ def upload_folder( # type: ignore delete_patterns: Optional[Union[List[str], str]] = None, multi_commits: bool = False, multi_commits_verbose: bool = False, + respect_gitignore: bool = True, run_as_future: Literal[False] = ..., ) -> str: ... @@ -4230,6 +4280,7 @@ def upload_folder( delete_patterns: Optional[Union[List[str], str]] = None, multi_commits: bool = False, multi_commits_verbose: bool = False, + respect_gitignore: bool = True, run_as_future: Literal[True] = ..., ) -> Future[str]: ... @@ -4254,6 +4305,7 @@ def upload_folder( delete_patterns: Optional[Union[List[str], str]] = None, multi_commits: bool = False, multi_commits_verbose: bool = False, + respect_gitignore: bool = True, run_as_future: bool = False, ) -> Union[str, Future[str]]: """ @@ -4326,6 +4378,8 @@ def upload_folder( If True, changes are pushed to a PR using a multi-commit process. Defaults to `False`. multi_commits_verbose (`bool`): If True and `multi_commits` is used, more information will be displayed to the user. + respect_gitignore (`bool`, *optional*): + Whether or not to respect the `.gitignore` file in the repo. Defaults to `True`. run_as_future (`bool`, *optional*): Whether or not to run this method in the background. Background jobs are run sequentially without blocking the main thread. 
Passing `run_as_future=True` will return a [Future](https://docs.python.org/3/library/concurrent.futures.html#future-objects) @@ -4454,6 +4508,7 @@ def upload_folder( token=token, merge_pr=not create_pr, verbose=multi_commits_verbose, + respect_gitignore=respect_gitignore, ) else: commit_info = self.create_commit( @@ -4466,6 +4521,7 @@ def upload_folder( revision=revision, create_pr=create_pr, parent_commit=parent_commit, + respect_gitignore=respect_gitignore, ) pr_url = commit_info.pr_url From 53cd33947a64a9590a37fd20cb999b7747a939ef Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Mon, 27 Nov 2023 17:09:39 +0100 Subject: [PATCH 2/4] add tests --- src/huggingface_hub/hf_api.py | 2 ++ tests/test_hf_api.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 4385f02d0c..3d382b57ad 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -4026,6 +4026,8 @@ def preupload_lfs_files( f"Skipped upload for {len(new_lfs_additions) - len(new_lfs_additions_to_upload)} LFS file(s) " "(ignored by gitignore file)." 
) + else: + new_lfs_additions_to_upload = new_lfs_additions # Upload new LFS files _upload_lfs_files( diff --git a/tests/test_hf_api.py b/tests/test_hf_api.py index eedcf52ed7..0084f6cd5a 100644 --- a/tests/test_hf_api.py +++ b/tests/test_hf_api.py @@ -562,6 +562,39 @@ def _create_file(*parts) -> None: {".gitattributes", ".git_something/file.txt", "file.git", "temp", "nested/file.bin"}, ) + @use_tmp_repo() + def test_upload_folder_gitignore_already_exists(self, repo_url: RepoUrl) -> None: + # Ignore nested folder + self._api.upload_file(path_or_fileobj=b"nested/*\n", path_in_repo=".gitignore", repo_id=repo_url.repo_id) + + # Upload folder + self._api.upload_folder(folder_path=self.tmp_dir, repo_id=repo_url.repo_id) + + # Check nested file not uploaded + assert not self._api.file_exists(repo_url.repo_id, "nested/file.bin") + + @use_tmp_repo() + def test_upload_folder_gitignore_in_commit(self, repo_url: RepoUrl) -> None: + # Create .gitignore file locally + (Path(self.tmp_dir) / ".gitignore").write_text("nested/*\n") + + # Upload folder + self._api.upload_folder(folder_path=self.tmp_dir, repo_id=repo_url.repo_id) + + # Check nested file not uploaded + assert not self._api.file_exists(repo_url.repo_id, "nested/file.bin") + + @use_tmp_repo() + def test_upload_folder_ignore_gitignore(self, repo_url: RepoUrl) -> None: + # Create .gitignore file locally + (Path(self.tmp_dir) / ".gitignore").write_text("nested/*\n") + + # Upload folder with `respect_gitignore=False` + self._api.upload_folder(folder_path=self.tmp_dir, repo_id=repo_url.repo_id, respect_gitignore=False) + + # Check nested file is uploaded + assert self._api.file_exists(repo_url.repo_id, "nested/file.bin") + def test_create_commit_create_pr(self): REPO_NAME = repo_name("create_commit_create_pr") self._api.create_repo(repo_id=REPO_NAME, exist_ok=False) From 3bfe94699b5ba182e6fc11f8a46b00a59dd015e7 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Mon, 27 Nov 2023 17:12:56 +0100 Subject: [PATCH 3/4] do not 
respect gitignore in --- src/huggingface_hub/hf_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 3d382b57ad..d65c9fae4b 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -4229,6 +4229,7 @@ def upload_file( revision=revision, create_pr=create_pr, parent_commit=parent_commit, + respect_gitignore=False, # force upload when uploading a single file ) if commit_info.pr_url is not None: From 4ca66e233d2c417f6968001efe44054e906a8dc7 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Mon, 27 Nov 2023 17:16:59 +0100 Subject: [PATCH 4/4] add param in upload_file --- src/huggingface_hub/hf_api.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index d65c9fae4b..2cee6049aa 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -4098,6 +4098,7 @@ def upload_file( commit_description: Optional[str] = None, create_pr: Optional[bool] = None, parent_commit: Optional[str] = None, + respect_gitignore: bool = True, run_as_future: bool = False, ) -> Union[str, Future[str]]: """ @@ -4140,6 +4141,8 @@ def upload_file( If specified and `create_pr` is `True`, the pull request will be created from `parent_commit`. Specifying `parent_commit` ensures the repo has not changed before committing the changes, and can be especially useful if the repo is updated / committed to concurrently. + respect_gitignore (`bool`, *optional*): + Whether or not to respect the `.gitignore` file in the repo. Defaults to `True`. run_as_future (`bool`, *optional*): Whether or not to run this method in the background. Background jobs are run sequentially without blocking the main thread. 
Passing `run_as_future=True` will return a [Future](https://docs.python.org/3/library/concurrent.futures.html#future-objects) @@ -4229,7 +4232,7 @@ def upload_file( revision=revision, create_pr=create_pr, parent_commit=parent_commit, - respect_gitignore=False, # force upload when uploading a single file + respect_gitignore=respect_gitignore, ) if commit_info.pr_url is not None: