From 4e5801efcc15ba12678d5da6c0e55aeea3ac6ab7 Mon Sep 17 00:00:00 2001 From: Ephraim Anierobi Date: Wed, 22 Oct 2025 15:50:56 +0100 Subject: [PATCH 1/7] Remove '.git' folder from versions in GitDagBundle To reduce storage size, this PR removes the git repo in versions when they are created since the git repo is not necessary on versions. --- .../src/airflow/providers/git/bundles/git.py | 5 +++ .../git/tests/unit/git/bundles/test_git.py | 43 +++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/providers/git/src/airflow/providers/git/bundles/git.py b/providers/git/src/airflow/providers/git/bundles/git.py index 2bf1f9980ae63..7d23447484b6e 100644 --- a/providers/git/src/airflow/providers/git/bundles/git.py +++ b/providers/git/src/airflow/providers/git/bundles/git.py @@ -45,6 +45,7 @@ class GitDagBundle(BaseDagBundle): :param subdir: Subdirectory within the repository where the DAGs are stored (Optional) :param git_conn_id: Connection ID for SSH/token based connection to the repository (Optional) :param repo_url: Explicit Git repository URL to override the connection's host. 
(Optional) + :param remove_git_repo_on_versions: Remove .git folder from the versions after cloning """ supports_versioning = True @@ -56,6 +57,7 @@ def __init__( subdir: str | None = None, git_conn_id: str | None = None, repo_url: str | None = None, + remove_git_repo_on_versions: bool = True, **kwargs, ) -> None: super().__init__(**kwargs) @@ -68,6 +70,7 @@ def __init__( self.repo_path = self.base_dir / "tracking_repo" self.git_conn_id = git_conn_id self.repo_url = repo_url + self.remove_git_repo_on_versions = remove_git_repo_on_versions self._log = log.bind( bundle_name=self.name, @@ -115,6 +118,8 @@ def _initialize(self): self.repo.remotes.origin.fetch() self.repo.head.set_reference(str(self.repo.commit(self.version))) self.repo.head.reset(index=True, working_tree=True) + if self.remove_git_repo_on_versions: + shutil.rmtree(self.repo_path / ".git") else: self.refresh() self.repo.close() diff --git a/providers/git/tests/unit/git/bundles/test_git.py b/providers/git/tests/unit/git/bundles/test_git.py index b6c110e35477b..74eeb90256350 100644 --- a/providers/git/tests/unit/git/bundles/test_git.py +++ b/providers/git/tests/unit/git/bundles/test_git.py @@ -165,6 +165,7 @@ def test_get_specific_version(self, mock_githook, git_repo): git_conn_id=CONN_HTTPS, version=starting_commit.hexsha, tracking_ref=GIT_DEFAULT_BRANCH, + remove_git_repo_on_versions=False, ) bundle.initialize() @@ -196,6 +197,7 @@ def test_get_tag_version(self, mock_githook, git_repo): git_conn_id=CONN_HTTPS, version="test", tracking_ref=GIT_DEFAULT_BRANCH, + remove_git_repo_on_versions=False, ) bundle.initialize() assert bundle.get_current_version() == starting_commit.hexsha @@ -225,6 +227,47 @@ def test_get_latest(self, mock_githook, git_repo): assert_repo_is_closed(bundle) + @mock.patch("airflow.providers.git.bundles.git.GitHook") + def test_removes_git_dir_for_versioned_bundle_by_default(self, mock_githook, git_repo): + repo_path, repo = git_repo + mock_githook.return_value.repo_url = repo_path + 
starting_commit = repo.head.commit + + bundle = GitDagBundle( + name="test", + git_conn_id=CONN_HTTPS, + version=starting_commit.hexsha, + tracking_ref=GIT_DEFAULT_BRANCH, + ) + bundle.initialize() + + assert not (bundle.repo_path / ".git").exists() + + files_in_repo = {f.name for f in bundle.path.iterdir() if f.is_file()} + assert {"test_dag.py"} == files_in_repo + + assert_repo_is_closed(bundle) + + @mock.patch("airflow.providers.git.bundles.git.GitHook") + def test_keeps_git_dir_when_disabled(self, mock_githook, git_repo): + repo_path, repo = git_repo + mock_githook.return_value.repo_url = repo_path + starting_commit = repo.head.commit + + bundle = GitDagBundle( + name="test", + git_conn_id=CONN_HTTPS, + version=starting_commit.hexsha, + tracking_ref=GIT_DEFAULT_BRANCH, + remove_git_repo_on_versions=False, + ) + bundle.initialize() + + assert (bundle.repo_path / ".git").exists() + assert bundle.get_current_version() == starting_commit.hexsha + + assert_repo_is_closed(bundle) + @pytest.mark.parametrize( "amend", [ From fa0de01d471479e6233a428bc466f73f92b66db3 Mon Sep 17 00:00:00 2001 From: Ephraim Anierobi Date: Wed, 22 Oct 2025 17:30:05 +0100 Subject: [PATCH 2/7] Apply suggestion from @ashb Co-authored-by: Ash Berlin-Taylor --- providers/git/src/airflow/providers/git/bundles/git.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/providers/git/src/airflow/providers/git/bundles/git.py b/providers/git/src/airflow/providers/git/bundles/git.py index 7d23447484b6e..ed969da6dde76 100644 --- a/providers/git/src/airflow/providers/git/bundles/git.py +++ b/providers/git/src/airflow/providers/git/bundles/git.py @@ -45,7 +45,12 @@ class GitDagBundle(BaseDagBundle): :param subdir: Subdirectory within the repository where the DAGs are stored (Optional) :param git_conn_id: Connection ID for SSH/token based connection to the repository (Optional) :param repo_url: Explicit Git repository URL to override the connection's host. 
(Optional) - :param remove_git_repo_on_versions: Remove .git folder from the versions after cloning + :param remove_git_repo_on_versions: Remove .git folder from the versions after cloning. + + The per-version clone is not a full "git" copy (it makes uses of git's `--local` ability + to share the object directory via hardlinks, but if you have a lot of current versions + running, or an especially large git repo setting this to False will save some disk space + at the expense of `git` operations not working in the bundle that Tasks run from. """ supports_versioning = True From e2cc01799bd08e546df053f7980a6efdd1d4b6ef Mon Sep 17 00:00:00 2001 From: Ephraim Anierobi Date: Wed, 22 Oct 2025 17:36:22 +0100 Subject: [PATCH 3/7] fixup! Apply suggestion from @ashb --- .../git/src/airflow/providers/git/bundles/git.py | 12 ++++++------ providers/git/tests/unit/git/bundles/test_git.py | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/providers/git/src/airflow/providers/git/bundles/git.py b/providers/git/src/airflow/providers/git/bundles/git.py index ed969da6dde76..15e85000960c2 100644 --- a/providers/git/src/airflow/providers/git/bundles/git.py +++ b/providers/git/src/airflow/providers/git/bundles/git.py @@ -45,11 +45,11 @@ class GitDagBundle(BaseDagBundle): :param subdir: Subdirectory within the repository where the DAGs are stored (Optional) :param git_conn_id: Connection ID for SSH/token based connection to the repository (Optional) :param repo_url: Explicit Git repository URL to override the connection's host. (Optional) - :param remove_git_repo_on_versions: Remove .git folder from the versions after cloning. + :param remove_git_folder_from_versions: Remove .git folder from the versions after cloning. 
- The per-version clone is not a full "git" copy (it makes uses of git's `--local` ability + The per-version clone is not a full "git" copy (it makes use of git's `--local` ability to share the object directory via hardlinks, but if you have a lot of current versions - running, or an especially large git repo setting this to False will save some disk space + running, or an especially large git repo setting this to True will save some disk space at the expense of `git` operations not working in the bundle that Tasks run from. """ @@ -62,7 +62,7 @@ def __init__( subdir: str | None = None, git_conn_id: str | None = None, repo_url: str | None = None, - remove_git_repo_on_versions: bool = True, + remove_git_folder_from_versions: bool = True, **kwargs, ) -> None: super().__init__(**kwargs) @@ -75,7 +75,7 @@ def __init__( self.repo_path = self.base_dir / "tracking_repo" self.git_conn_id = git_conn_id self.repo_url = repo_url - self.remove_git_repo_on_versions = remove_git_repo_on_versions + self.remove_git_folder_from_versions = remove_git_folder_from_versions self._log = log.bind( bundle_name=self.name, @@ -123,7 +123,7 @@ def _initialize(self): self.repo.remotes.origin.fetch() self.repo.head.set_reference(str(self.repo.commit(self.version))) self.repo.head.reset(index=True, working_tree=True) - if self.remove_git_repo_on_versions: + if self.remove_git_folder_from_versions: shutil.rmtree(self.repo_path / ".git") else: self.refresh() diff --git a/providers/git/tests/unit/git/bundles/test_git.py b/providers/git/tests/unit/git/bundles/test_git.py index 74eeb90256350..6735a6cfe5c83 100644 --- a/providers/git/tests/unit/git/bundles/test_git.py +++ b/providers/git/tests/unit/git/bundles/test_git.py @@ -165,7 +165,7 @@ def test_get_specific_version(self, mock_githook, git_repo): git_conn_id=CONN_HTTPS, version=starting_commit.hexsha, tracking_ref=GIT_DEFAULT_BRANCH, - remove_git_repo_on_versions=False, + remove_git_folder_from_versions=False, ) bundle.initialize() @@ -197,7 
+197,7 @@ def test_get_tag_version(self, mock_githook, git_repo): git_conn_id=CONN_HTTPS, version="test", tracking_ref=GIT_DEFAULT_BRANCH, - remove_git_repo_on_versions=False, + remove_git_folder_from_versions=False, ) bundle.initialize() assert bundle.get_current_version() == starting_commit.hexsha @@ -259,7 +259,7 @@ def test_keeps_git_dir_when_disabled(self, mock_githook, git_repo): git_conn_id=CONN_HTTPS, version=starting_commit.hexsha, tracking_ref=GIT_DEFAULT_BRANCH, - remove_git_repo_on_versions=False, + remove_git_folder_from_versions=False, ) bundle.initialize() From 4014c34c3a18852fce33374aaa0bd7f55ef650c5 Mon Sep 17 00:00:00 2001 From: Ephraim Anierobi Date: Wed, 22 Oct 2025 18:00:35 +0100 Subject: [PATCH 4/7] Rename arg to prune_dotgit_folder --- providers/git/src/airflow/providers/git/bundles/git.py | 8 ++++---- providers/git/tests/unit/git/bundles/test_git.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/providers/git/src/airflow/providers/git/bundles/git.py b/providers/git/src/airflow/providers/git/bundles/git.py index 15e85000960c2..7b78f99c82c29 100644 --- a/providers/git/src/airflow/providers/git/bundles/git.py +++ b/providers/git/src/airflow/providers/git/bundles/git.py @@ -45,7 +45,7 @@ class GitDagBundle(BaseDagBundle): :param subdir: Subdirectory within the repository where the DAGs are stored (Optional) :param git_conn_id: Connection ID for SSH/token based connection to the repository (Optional) :param repo_url: Explicit Git repository URL to override the connection's host. (Optional) - :param remove_git_folder_from_versions: Remove .git folder from the versions after cloning. + :param prune_dotgit_folder: Remove .git folder from the versions after cloning. 
The per-version clone is not a full "git" copy (it makes use of git's `--local` ability to share the object directory via hardlinks, but if you have a lot of current versions @@ -62,7 +62,7 @@ def __init__( subdir: str | None = None, git_conn_id: str | None = None, repo_url: str | None = None, - remove_git_folder_from_versions: bool = True, + prune_dotgit_folder: bool = True, **kwargs, ) -> None: super().__init__(**kwargs) @@ -75,7 +75,7 @@ def __init__( self.repo_path = self.base_dir / "tracking_repo" self.git_conn_id = git_conn_id self.repo_url = repo_url - self.remove_git_folder_from_versions = remove_git_folder_from_versions + self.prune_dotgit_folder = prune_dotgit_folder self._log = log.bind( bundle_name=self.name, @@ -123,7 +123,7 @@ def _initialize(self): self.repo.remotes.origin.fetch() self.repo.head.set_reference(str(self.repo.commit(self.version))) self.repo.head.reset(index=True, working_tree=True) - if self.remove_git_folder_from_versions: + if self.prune_dotgit_folder: shutil.rmtree(self.repo_path / ".git") else: self.refresh() diff --git a/providers/git/tests/unit/git/bundles/test_git.py b/providers/git/tests/unit/git/bundles/test_git.py index 6735a6cfe5c83..824b85ed4b3c7 100644 --- a/providers/git/tests/unit/git/bundles/test_git.py +++ b/providers/git/tests/unit/git/bundles/test_git.py @@ -165,7 +165,7 @@ def test_get_specific_version(self, mock_githook, git_repo): git_conn_id=CONN_HTTPS, version=starting_commit.hexsha, tracking_ref=GIT_DEFAULT_BRANCH, - remove_git_folder_from_versions=False, + prune_dotgit_folder=False, ) bundle.initialize() @@ -197,7 +197,7 @@ def test_get_tag_version(self, mock_githook, git_repo): git_conn_id=CONN_HTTPS, version="test", tracking_ref=GIT_DEFAULT_BRANCH, - remove_git_folder_from_versions=False, + prune_dotgit_folder=False, ) bundle.initialize() assert bundle.get_current_version() == starting_commit.hexsha @@ -259,7 +259,7 @@ def test_keeps_git_dir_when_disabled(self, mock_githook, git_repo): 
 git_conn_id=CONN_HTTPS, version=starting_commit.hexsha, tracking_ref=GIT_DEFAULT_BRANCH, - remove_git_folder_from_versions=False, + prune_dotgit_folder=False, ) bundle.initialize() From cb908f7ee14f568a66081d1c3459f93c0f7e8370 Mon Sep 17 00:00:00 2001 From: Ephraim Anierobi Date: Fri, 24 Oct 2025 18:14:42 +0100 Subject: [PATCH 5/7] Add news fragment item --- airflow-core/newsfragments/57069.significant.rst | 16 ++++++++++++++++ providers/git/docs/bundles/index.rst | 1 + 2 files changed, 17 insertions(+) create mode 100644 airflow-core/newsfragments/57069.significant.rst diff --git a/airflow-core/newsfragments/57069.significant.rst b/airflow-core/newsfragments/57069.significant.rst new file mode 100644 index 0000000000000..a4d1c22bb7af8 --- /dev/null +++ b/airflow-core/newsfragments/57069.significant.rst @@ -0,0 +1,16 @@ +Git provider: Remove '.git' folder from versions in GitDagBundle + +A new option (``prune_dotgit_folder``) has been added to the GitDagBundle to remove ``.git`` from +versioned bundles by default to reduce disk usage; set ``prune_dotgit_folder=False`` to keep +repo metadata in the dag bundle's versions folders. 
+ +* Types of change + + * [ ] Dag changes + * [ ] Config changes + * [ ] API changes + * [ ] CLI changes + * [x] Behaviour changes + * [ ] Plugin changes + * [ ] Dependency changes + * [ ] Code interface changes diff --git a/providers/git/docs/bundles/index.rst b/providers/git/docs/bundles/index.rst index 4e9cb5dff2271..30e6ade89896b 100644 --- a/providers/git/docs/bundles/index.rst +++ b/providers/git/docs/bundles/index.rst @@ -35,6 +35,7 @@ Example of using the GitDagBundle: "subdir": "dags", "tracking_ref": "main", - "refresh_interval": 3600 + "refresh_interval": 3600, + "prune_dotgit_folder": true } } ]' From 020a5b91ef5c2e0723eec0c219155c29c9f128ed Mon Sep 17 00:00:00 2001 From: Ephraim Anierobi Date: Fri, 24 Oct 2025 20:34:51 +0100 Subject: [PATCH 6/7] Fix typo --- providers/git/src/airflow/providers/git/bundles/git.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/providers/git/src/airflow/providers/git/bundles/git.py b/providers/git/src/airflow/providers/git/bundles/git.py index 7b78f99c82c29..46c57ceacfb66 100644 --- a/providers/git/src/airflow/providers/git/bundles/git.py +++ b/providers/git/src/airflow/providers/git/bundles/git.py @@ -48,7 +48,7 @@ class GitDagBundle(BaseDagBundle): :param prune_dotgit_folder: Remove .git folder from the versions after cloning. The per-version clone is not a full "git" copy (it makes use of git's `--local` ability - to share the object directory via hardlinks, but if you have a lot of current versions + to share the object directory via hard links, but if you have a lot of current versions running, or an especially large git repo setting this to True will save some disk space at the expense of `git` operations not working in the bundle that Tasks run from. 
""" From 9eea315a91ffd46855bc7823f859ceee71c5655f Mon Sep 17 00:00:00 2001 From: Ephraim Anierobi Date: Mon, 27 Oct 2025 12:01:00 +0100 Subject: [PATCH 7/7] Update comment about prune_dotgit_folder --- providers/git/src/airflow/providers/git/bundles/git.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/providers/git/src/airflow/providers/git/bundles/git.py b/providers/git/src/airflow/providers/git/bundles/git.py index 46c57ceacfb66..3418092f0d7bc 100644 --- a/providers/git/src/airflow/providers/git/bundles/git.py +++ b/providers/git/src/airflow/providers/git/bundles/git.py @@ -49,7 +49,7 @@ class GitDagBundle(BaseDagBundle): The per-version clone is not a full "git" copy (it makes use of git's `--local` ability to share the object directory via hard links, but if you have a lot of current versions - running, or an especially large git repo setting this to True will save some disk space + running, or an especially large git repo leaving this as True will save some disk space at the expense of `git` operations not working in the bundle that Tasks run from. """