diff --git a/airflow-core/newsfragments/57069.significant.rst b/airflow-core/newsfragments/57069.significant.rst new file mode 100644 index 0000000000000..a4d1c22bb7af8 --- /dev/null +++ b/airflow-core/newsfragments/57069.significant.rst @@ -0,0 +1,16 @@ +Git provider: Remove '.git' folder from versions in GitDagBundle + +A new option (``prune_dotgit_folder``) has been added to the GitDagBundle to remove ``.git`` from +versioned bundles by default to reduce disk usage; set ``prune_dotgit_folder=False`` to keep +repo metadata in the dag bundle's versions folders. + +* Types of change + + * [ ] Dag changes + * [ ] Config changes + * [ ] API changes + * [ ] CLI changes + * [x] Behaviour changes + * [ ] Plugin changes + * [ ] Dependency changes + * [ ] Code interface changes diff --git a/providers/git/docs/bundles/index.rst b/providers/git/docs/bundles/index.rst index 4e9cb5dff2271..30e6ade89896b 100644 --- a/providers/git/docs/bundles/index.rst +++ b/providers/git/docs/bundles/index.rst @@ -35,6 +35,7 @@ Example of using the GitDagBundle: "subdir": "dags", "tracking_ref": "main", - "refresh_interval": 3600 + "refresh_interval": 3600, + "prune_dotgit_folder": true } } ]' diff --git a/providers/git/src/airflow/providers/git/bundles/git.py b/providers/git/src/airflow/providers/git/bundles/git.py index 2bf1f9980ae63..3418092f0d7bc 100644 --- a/providers/git/src/airflow/providers/git/bundles/git.py +++ b/providers/git/src/airflow/providers/git/bundles/git.py @@ -45,6 +45,12 @@ class GitDagBundle(BaseDagBundle): :param subdir: Subdirectory within the repository where the DAGs are stored (Optional) :param git_conn_id: Connection ID for SSH/token based connection to the repository (Optional) :param repo_url: Explicit Git repository URL to override the connection's host. (Optional) + :param prune_dotgit_folder: Remove .git folder from the versions after cloning.
+ + The per-version clone is not a full "git" copy (it makes use of git's `--local` ability + to share the object directory via hard links), but if you have a lot of current versions + running, or an especially large git repo, leaving this as True will save some disk space + at the expense of `git` operations not working in the bundle that Tasks run from. """ supports_versioning = True @@ -56,6 +62,7 @@ def __init__( subdir: str | None = None, git_conn_id: str | None = None, repo_url: str | None = None, + prune_dotgit_folder: bool = True, **kwargs, ) -> None: super().__init__(**kwargs) @@ -68,6 +75,7 @@ def __init__( self.repo_path = self.base_dir / "tracking_repo" self.git_conn_id = git_conn_id self.repo_url = repo_url + self.prune_dotgit_folder = prune_dotgit_folder self._log = log.bind( bundle_name=self.name, @@ -115,6 +123,8 @@ def _initialize(self): self.repo.remotes.origin.fetch() self.repo.head.set_reference(str(self.repo.commit(self.version))) self.repo.head.reset(index=True, working_tree=True) + if self.prune_dotgit_folder: + shutil.rmtree(self.repo_path / ".git") else: self.refresh() self.repo.close() diff --git a/providers/git/tests/unit/git/bundles/test_git.py b/providers/git/tests/unit/git/bundles/test_git.py index b6c110e35477b..824b85ed4b3c7 100644 --- a/providers/git/tests/unit/git/bundles/test_git.py +++ b/providers/git/tests/unit/git/bundles/test_git.py @@ -165,6 +165,7 @@ def test_get_specific_version(self, mock_githook, git_repo): git_conn_id=CONN_HTTPS, version=starting_commit.hexsha, tracking_ref=GIT_DEFAULT_BRANCH, + prune_dotgit_folder=False, ) bundle.initialize() @@ -196,6 +197,7 @@ def test_get_tag_version(self, mock_githook, git_repo): git_conn_id=CONN_HTTPS, version="test", tracking_ref=GIT_DEFAULT_BRANCH, + prune_dotgit_folder=False, ) bundle.initialize() assert bundle.get_current_version() == starting_commit.hexsha @@ -225,6 +227,47 @@ def test_get_latest(self, mock_githook, git_repo): assert_repo_is_closed(bundle) +
@mock.patch("airflow.providers.git.bundles.git.GitHook") + def test_removes_git_dir_for_versioned_bundle_by_default(self, mock_githook, git_repo): + repo_path, repo = git_repo + mock_githook.return_value.repo_url = repo_path + starting_commit = repo.head.commit + + bundle = GitDagBundle( + name="test", + git_conn_id=CONN_HTTPS, + version=starting_commit.hexsha, + tracking_ref=GIT_DEFAULT_BRANCH, + ) + bundle.initialize() + + assert not (bundle.repo_path / ".git").exists() + + files_in_repo = {f.name for f in bundle.path.iterdir() if f.is_file()} + assert {"test_dag.py"} == files_in_repo + + assert_repo_is_closed(bundle) + + @mock.patch("airflow.providers.git.bundles.git.GitHook") + def test_keeps_git_dir_when_disabled(self, mock_githook, git_repo): + repo_path, repo = git_repo + mock_githook.return_value.repo_url = repo_path + starting_commit = repo.head.commit + + bundle = GitDagBundle( + name="test", + git_conn_id=CONN_HTTPS, + version=starting_commit.hexsha, + tracking_ref=GIT_DEFAULT_BRANCH, + prune_dotgit_folder=False, + ) + bundle.initialize() + + assert (bundle.repo_path / ".git").exists() + assert bundle.get_current_version() == starting_commit.hexsha + + assert_repo_is_closed(bundle) + @pytest.mark.parametrize( "amend", [