Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Respect .gitignore file in commits #1868

Merged
merged 5 commits into from
Nov 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions docs/source/en/guides/upload.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,15 @@ folder to. Depending on your repository type, you can optionally set the reposit
... )
```

Use the `allow_patterns` and `ignore_patterns` arguments to specify which files to upload. These parameters accept either a single pattern or a list of patterns.
Patterns are Standard Wildcards (globbing patterns) as documented [here](https://tldp.org/LDP/GNU-Linux-Tools-Summary/html/x11655.htm).
If both `allow_patterns` and `ignore_patterns` are provided, both constraints apply. By default, all files from the folder are uploaded.
By default, the `.gitignore` file is taken into account to determine which files should
be committed. We first check whether a `.gitignore` file is present in the commit and, if not, whether one exists on the Hub. If you want to force the upload regardless of
the `.gitignore` file, you can pass `respect_gitignore=False`. Please be aware that only
a `.gitignore` file present at the root of the directory will be used. We do not check
for `.gitignore` files in subdirectories.

Any `.git/` folder present in any subdirectory will be ignored. However, please be aware that the `.gitignore` file is not taken into account.
This means you must use `allow_patterns` and `ignore_patterns` to specify which files to upload instead.
If you don't want to use a hardcoded `.gitignore` file, you can use the `allow_patterns` and `ignore_patterns` arguments to filter which files to upload. These parameters accept either a single pattern or a list of patterns. Patterns are Standard Wildcards (globbing patterns) as documented [here](https://tldp.org/LDP/GNU-Linux-Tools-Summary/html/x11655.htm). If both `allow_patterns` and `ignore_patterns` are provided, both constraints apply.

Besides the `.gitignore` file and allow/ignore patterns, any `.git/` folder present in any subdirectory will be ignored.

```py
>>> api.upload_folder(
Expand Down
47 changes: 38 additions & 9 deletions src/huggingface_hub/_commit_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,13 +137,19 @@ class CommitOperationAdd:
upload_info: UploadInfo = field(init=False, repr=False)

# Internal attributes
_upload_mode: Optional[UploadMode] = field(
init=False, repr=False, default=None
) # set to "lfs" or "regular" once known
_is_uploaded: bool = field(
init=False, repr=False, default=False
) # set to True once the file has been uploaded as LFS
_is_committed: bool = field(init=False, repr=False, default=False) # set to True once the file has been committed

# set to "lfs" or "regular" once known
_upload_mode: Optional[UploadMode] = field(init=False, repr=False, default=None)

# set to True if .gitignore rules say the file must be ignored (server-side check);
# ignored files are neither uploaded as LFS nor included in the commit
_should_ignore: Optional[bool] = field(init=False, repr=False, default=None)

# set to True once the file has been uploaded as LFS
_is_uploaded: bool = field(init=False, repr=False, default=False)

# set to True once the file has been committed
_is_committed: bool = field(init=False, repr=False, default=False)

def __post_init__(self) -> None:
"""Validates `path_or_fileobj` and compute `upload_info`."""
Expand Down Expand Up @@ -439,6 +445,7 @@ def _fetch_upload_modes(
revision: str,
endpoint: Optional[str] = None,
create_pr: bool = False,
gitignore_content: Optional[str] = None,
) -> None:
"""
Requests the Hub "preupload" endpoint to determine whether each input file should be uploaded as a regular git blob
Expand All @@ -457,7 +464,11 @@ def _fetch_upload_modes(
An authentication token ( See https://huggingface.co/settings/tokens )
revision (`str`):
The git revision to upload the files to. Can be any valid git revision.

gitignore_content (`str`, *optional*):
The content of the `.gitignore` file to know which files should be ignored. The order of priority
is to first check if `gitignore_content` is passed, then check if the `.gitignore` file is present
in the list of files to commit and finally default to the `.gitignore` file already hosted on the Hub
(if any).
Raises:
[`~utils.HfHubHTTPError`]
If the Hub API returned an error.
Expand All @@ -469,8 +480,10 @@ def _fetch_upload_modes(

# Fetch upload mode (LFS or regular) chunk by chunk.
upload_modes: Dict[str, UploadMode] = {}
should_ignore_info: Dict[str, bool] = {}

for chunk in chunk_iterable(additions, 256):
payload = {
payload: Dict = {
"files": [
{
"path": op.path_in_repo,
Expand All @@ -481,6 +494,8 @@ def _fetch_upload_modes(
for op in chunk
]
}
if gitignore_content is not None:
payload["gitIgnore"] = gitignore_content

resp = get_session().post(
f"{endpoint}/api/{repo_type}s/{repo_id}/preupload/{revision}",
Expand All @@ -491,10 +506,12 @@ def _fetch_upload_modes(
hf_raise_for_status(resp)
preupload_info = _validate_preupload_info(resp.json())
upload_modes.update(**{file["path"]: file["uploadMode"] for file in preupload_info["files"]})
should_ignore_info.update(**{file["path"]: file["shouldIgnore"] for file in preupload_info["files"]})

# Set upload mode for each addition operation
for addition in additions:
addition._upload_mode = upload_modes[addition.path_in_repo]
addition._should_ignore = should_ignore_info[addition.path_in_repo]

# Empty files cannot be uploaded as LFS (S3 would fail with a 501 Not Implemented)
# => empty files are uploaded as "regular" to still allow users to commit them.
Expand Down Expand Up @@ -571,6 +588,7 @@ def _prepare_commit_payload(
commit_message: str,
commit_description: Optional[str] = None,
parent_commit: Optional[str] = None,
respect_gitignore: bool = True,
) -> Iterable[Dict[str, Any]]:
"""
Builds the payload to POST to the `/commit` API of the Hub.
Expand All @@ -590,8 +608,16 @@ def _prepare_commit_payload(
header_value["parentCommit"] = parent_commit
yield {"key": "header", "value": header_value}

nb_ignored_files = 0

# 2. Send operations, one per line
for operation in operations:
# Skip ignored files
if respect_gitignore and isinstance(operation, CommitOperationAdd) and operation._should_ignore:
logger.debug(f"Skipping file '{operation.path_in_repo}' in commit (ignored by gitignore file).")
nb_ignored_files += 1
continue

# 2.a. Case adding a regular file
if isinstance(operation, CommitOperationAdd) and operation._upload_mode == "regular":
yield {
Expand Down Expand Up @@ -638,3 +664,6 @@ def _prepare_commit_payload(
f"Unknown operation to commit. Operation: {operation}. Upload mode:"
f" {getattr(operation, '_upload_mode', None)}"
)

if nb_ignored_files > 0:
logger.info(f"Skipped {nb_ignored_files} file(s) in commit (ignored by gitignore file).")
66 changes: 64 additions & 2 deletions src/huggingface_hub/hf_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3332,6 +3332,7 @@ def create_commit( # type: ignore
create_pr: Optional[bool] = None,
num_threads: int = 5,
parent_commit: Optional[str] = None,
respect_gitignore: bool = True,
run_as_future: Literal[False] = ...,
) -> CommitInfo:
...
Expand All @@ -3350,6 +3351,7 @@ def create_commit(
create_pr: Optional[bool] = None,
num_threads: int = 5,
parent_commit: Optional[str] = None,
respect_gitignore: bool = True,
run_as_future: Literal[True] = ...,
) -> Future[CommitInfo]:
...
Expand All @@ -3369,6 +3371,7 @@ def create_commit(
create_pr: Optional[bool] = None,
num_threads: int = 5,
parent_commit: Optional[str] = None,
respect_gitignore: bool = True,
run_as_future: bool = False,
) -> Union[CommitInfo, Future[CommitInfo]]:
"""
Expand Down Expand Up @@ -3447,6 +3450,8 @@ def create_commit(
is `True`, the pull request will be created from `parent_commit`. Specifying `parent_commit`
ensures the repo has not changed before committing the changes, and can be especially useful
if the repo is updated / committed to concurrently.
respect_gitignore (`bool`, *optional*):
Whether to respect the `.gitignore` file in the repo. Defaults to `True`.
run_as_future (`bool`, *optional*):
Whether or not to run this method in the background. Background jobs are run sequentially without
blocking the main thread. Passing `run_as_future=True` will return a [Future](https://docs.python.org/3/library/concurrent.futures.html#future-objects)
Expand Down Expand Up @@ -3518,6 +3523,7 @@ def create_commit(
create_pr=create_pr,
num_threads=num_threads,
free_memory=False, # do not remove `CommitOperationAdd.path_or_fileobj` on LFS files for "normal" users
respect_gitignore=respect_gitignore,
)
files_to_copy = _fetch_lfs_files_to_copy(
copies=copies,
Expand All @@ -3533,6 +3539,7 @@ def create_commit(
commit_message=commit_message,
commit_description=commit_description,
parent_commit=parent_commit,
respect_gitignore=respect_gitignore,
)
commit_url = f"{self.endpoint}/api/{repo_type}s/{repo_id}/commit/{revision}"

Expand Down Expand Up @@ -3591,6 +3598,7 @@ def create_commits_on_pr(
merge_pr: bool = True,
num_threads: int = 5, # TODO: use to multithread uploads
verbose: bool = False,
respect_gitignore: bool = True,
) -> str:
"""Push changes to the Hub in multiple commits.

Expand Down Expand Up @@ -3651,6 +3659,9 @@ def create_commits_on_pr(
If set to `True`, the process will run in verbose mode, i.e. print information about the ongoing tasks.
Defaults to `False`.

respect_gitignore (`bool`, *optional*):
Whether to respect the `.gitignore` file in the repo. Defaults to `True`.

Returns:
`str`: URL to the created PR.

Expand Down Expand Up @@ -3799,6 +3810,7 @@ def create_commits_on_pr(
num_threads=num_threads,
operations=step.operations,
create_pr=False,
respect_gitignore=respect_gitignore,
)
step.completed = True
nb_remaining -= 1
Expand Down Expand Up @@ -3887,6 +3899,8 @@ def preupload_lfs_files(
create_pr: Optional[bool] = None,
num_threads: int = 5,
free_memory: bool = True,
respect_gitignore: bool = True,
gitignore_content: Optional[str] = None,
):
"""Pre-upload LFS files to S3 in preparation for a future commit.

Expand Down Expand Up @@ -3933,6 +3947,15 @@ def preupload_lfs_files(
Number of concurrent threads for uploading files. Defaults to 5.
Setting it to 2 means at most 2 files will be uploaded concurrently.

respect_gitignore (`bool`, *optional*):
Whether or not to respect the `.gitignore` file in the repo. Defaults to `True`.

gitignore_content (`str`, *optional*):
The content of the `.gitignore` file to know which files should be ignored. The order of priority
is to first check if `gitignore_content` is passed, then check if the `.gitignore` file is present
in the list of files to commit and finally default to the `.gitignore` file already hosted on the Hub
(if any).

Example:
```py
>>> from huggingface_hub import CommitOperationAdd, preupload_lfs_files, create_commit, create_repo
Expand All @@ -3957,6 +3980,15 @@ def preupload_lfs_files(
revision = quote(revision, safe="") if revision is not None else DEFAULT_REVISION
create_pr = create_pr if create_pr is not None else False

# Check if a `gitignore` file is being committed to the Hub.
additions = list(additions)
if gitignore_content is None:
for addition in additions:
if addition.path_in_repo == ".gitignore":
with addition.as_file() as f:
gitignore_content = f.read().decode()
break

# Filter out already uploaded files
new_additions = [addition for addition in additions if not addition._is_uploaded]

Expand All @@ -3970,6 +4002,7 @@ def preupload_lfs_files(
revision=revision,
endpoint=self.endpoint,
create_pr=create_pr or False,
gitignore_content=gitignore_content,
)
except RepositoryNotFoundError as e:
e.append_to_message(_CREATE_COMMIT_NO_REPO_ERROR_MESSAGE)
Expand All @@ -3978,9 +4011,27 @@ def preupload_lfs_files(
# Filter out regular files
new_lfs_additions = [addition for addition in new_additions if addition._upload_mode == "lfs"]

# Filter out files listed in .gitignore
if respect_gitignore:
new_lfs_additions_to_upload = []
for addition in new_lfs_additions:
if addition._should_ignore:
logger.debug(
f"Skipping upload for LFS file '{addition.path_in_repo}' (ignored by gitignore file)."
)
else:
new_lfs_additions_to_upload.append(addition)
if len(new_lfs_additions) != len(new_lfs_additions_to_upload):
logger.info(
f"Skipped upload for {len(new_lfs_additions) - len(new_lfs_additions_to_upload)} LFS file(s) "
"(ignored by gitignore file)."
)
else:
new_lfs_additions_to_upload = new_lfs_additions

# Upload new LFS files
_upload_lfs_files(
additions=new_lfs_additions,
additions=new_lfs_additions_to_upload,
repo_type=repo_type,
repo_id=repo_id,
token=token or self.token,
Expand All @@ -3991,7 +4042,7 @@ def preupload_lfs_files(
# PR (i.e. `revision`).
revision=revision if not create_pr else None,
)
for addition in new_lfs_additions:
for addition in new_lfs_additions_to_upload:
addition._is_uploaded = True
if free_memory:
addition.path_or_fileobj = b""
Expand Down Expand Up @@ -4047,6 +4098,7 @@ def upload_file(
commit_description: Optional[str] = None,
create_pr: Optional[bool] = None,
parent_commit: Optional[str] = None,
respect_gitignore: bool = True,
run_as_future: bool = False,
) -> Union[str, Future[str]]:
"""
Expand Down Expand Up @@ -4089,6 +4141,8 @@ def upload_file(
If specified and `create_pr` is `True`, the pull request will be created from `parent_commit`.
Specifying `parent_commit` ensures the repo has not changed before committing the changes, and can be
especially useful if the repo is updated / committed to concurrently.
respect_gitignore (`bool`, *optional*):
Whether or not to respect the `.gitignore` file in the repo. Defaults to `True`.
run_as_future (`bool`, *optional*):
Whether or not to run this method in the background. Background jobs are run sequentially without
blocking the main thread. Passing `run_as_future=True` will return a [Future](https://docs.python.org/3/library/concurrent.futures.html#future-objects)
Expand Down Expand Up @@ -4178,6 +4232,7 @@ def upload_file(
revision=revision,
create_pr=create_pr,
parent_commit=parent_commit,
respect_gitignore=respect_gitignore,
)

if commit_info.pr_url is not None:
Expand Down Expand Up @@ -4207,6 +4262,7 @@ def upload_folder( # type: ignore
delete_patterns: Optional[Union[List[str], str]] = None,
multi_commits: bool = False,
multi_commits_verbose: bool = False,
respect_gitignore: bool = True,
run_as_future: Literal[False] = ...,
) -> str:
...
Expand All @@ -4230,6 +4286,7 @@ def upload_folder(
delete_patterns: Optional[Union[List[str], str]] = None,
multi_commits: bool = False,
multi_commits_verbose: bool = False,
respect_gitignore: bool = True,
run_as_future: Literal[True] = ...,
) -> Future[str]:
...
Expand All @@ -4254,6 +4311,7 @@ def upload_folder(
delete_patterns: Optional[Union[List[str], str]] = None,
multi_commits: bool = False,
multi_commits_verbose: bool = False,
respect_gitignore: bool = True,
run_as_future: bool = False,
) -> Union[str, Future[str]]:
"""
Expand Down Expand Up @@ -4326,6 +4384,8 @@ def upload_folder(
If True, changes are pushed to a PR using a multi-commit process. Defaults to `False`.
multi_commits_verbose (`bool`):
If True and `multi_commits` is used, more information will be displayed to the user.
respect_gitignore (`bool`, *optional*):
Whether or not to respect the `.gitignore` file in the repo. Defaults to `True`.
run_as_future (`bool`, *optional*):
Whether or not to run this method in the background. Background jobs are run sequentially without
blocking the main thread. Passing `run_as_future=True` will return a [Future](https://docs.python.org/3/library/concurrent.futures.html#future-objects)
Expand Down Expand Up @@ -4454,6 +4514,7 @@ def upload_folder(
token=token,
merge_pr=not create_pr,
verbose=multi_commits_verbose,
respect_gitignore=respect_gitignore,
)
else:
commit_info = self.create_commit(
Expand All @@ -4466,6 +4527,7 @@ def upload_folder(
revision=revision,
create_pr=create_pr,
parent_commit=parent_commit,
respect_gitignore=respect_gitignore,
)
pr_url = commit_info.pr_url

Expand Down
Loading
Loading