Skip to content

Commit 57c171e

Browse files
committed
feat: Add helper to git clone and tarball
The new helper `git_clone_and_tarball()` is an opinionated function to clone a git repo from tag/ref with submodules and add the content into a tar ball. The tar ball does not include a `.git` directory, but it comes with a `.git_archival.txt` for `setuptools-scm`. An optional callback can be used to rewrite clone URLs. The base name of the tar ball and first-level directory are always based on the `prefix` argument. Signed-off-by: Christian Heimes <cheimes@redhat.com>
1 parent e0dc831 commit 57c171e

File tree

3 files changed

+326
-14
lines changed

3 files changed

+326
-14
lines changed

src/fromager/gitutils.py

Lines changed: 262 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,17 @@
11
import logging
2+
import os
23
import pathlib
4+
import re
5+
import tarfile
6+
import tempfile
7+
import typing
38
from urllib.parse import urlparse
49

510
from packaging.requirements import Requirement
11+
from packaging.utils import canonicalize_name
12+
from packaging.version import Version
613

7-
from fromager import context, external_commands
14+
from . import context, external_commands, tarballs
815

916
logger = logging.getLogger(__name__)
1017

@@ -61,3 +68,257 @@ def git_clone(
6168
)
6269

6370
return output_dir
71+
72+
73+
class BeforeSubmoduleCallback(typing.Protocol):
    """Call signature of the "before submodule update" hook

    Implementations accept keyword arguments only: the directory of the
    clone, the name of a submodule, and that submodule's remote URL. The
    return value is ignored.
    """

    def __call__(self, *, clonedir: pathlib.Path, name: str, remote: str) -> None:
        ...
78+
79+
80+
def git_clone_and_tarball(
    *,
    destdir: pathlib.Path,
    prefix: tuple[Requirement, Version] | str,
    repo_url: str,
    tag: str | None = None,
    ref: str | None = None,
    before_submodule_update: BeforeSubmoduleCallback | None = None,
    git_archival_tag_match: str | None = None,
) -> pathlib.Path:
    """Clone a git repository and generate a tar ball

    The repository is cloned into a temporary directory, its submodules are
    updated (non-recursive), a ``.git_archival.txt`` for setuptools-scm is
    generated, and the checkout is packed into ``<prefix>.tar.gz`` inside
    *destdir*. VCS directories and files are left out of the tar ball.

    :param destdir: directory where the tar ball is stored
    :param prefix: prefix of the tar ball and first level directory. A
        ``(Requirement, Version)`` tuple is turned into
        ``<canonical name>-<version>``.
    :param repo_url: git clone url
    :param tag: tag name to clone
    :param ref: git ref to clone (mutually exclusive with *tag*)
    :param before_submodule_update: callback that runs before
        ``git submodule update``. The callback is executed once for each
        submodule.
    :param git_archival_tag_match: git describe tag pattern for
        ``.git_archival.txt``
    :returns: path of the generated tar ball

    This example code creates a ``xformers-0.0.31.post1.tar.gz`` tar ball:

    .. code-block::

        def cb(*, clonedir: pathlib.Path, name: str, remote: str) -> None:
            subprocess.check_call(
                ["git", "config", "set", f"submodule.{name}.url", mirror(remote)],
                cwd=str(clonedir)
            )

        req = Requirement("xformers")
        tag = "v0.0.31.post1"
        version = Version(tag)
        repo_url = "https://github.com/facebookresearch/xformers.git"
        destdir = pathlib.Path("destdir").absolute()
        tarball = git_clone_and_tarball(
            prefix=(req, version),
            destdir=destdir,
            repo_url=repo_url,
            tag=tag,
            before_submodule_update=cb,
        )
    """
    # Normalize a (requirement, version) pair into the string form.
    if isinstance(prefix, tuple):
        requirement, release = prefix[0], prefix[1]
        assert isinstance(requirement, Requirement)
        assert isinstance(release, Version)
        prefix = f"{canonicalize_name(requirement.name)}-{release}"

    with tempfile.TemporaryDirectory() as scratch:
        checkout = pathlib.Path(scratch).absolute()
        _git_clone(clonedir=checkout, repo_url=repo_url, tag=tag, ref=ref)

        # Submodules are initialized but not fetched at this point, which
        # gives the callback a chance to adjust them first.
        remotes = _git_submodule_list(clonedir=checkout)
        hook = before_submodule_update
        if hook is not None:
            for submodule_name, submodule_remote in remotes.items():
                hook(
                    clonedir=checkout,
                    name=submodule_name,
                    remote=submodule_remote,
                )
        _get_submodule_update(clonedir=checkout)

        _make_git_archival_txt(
            clonedir=checkout,
            tag_match=git_archival_tag_match,
        )
        # The temporary checkout is removed when the ``with`` block exits;
        # the tar ball lives in *destdir* and survives.
        return _create_tarball(
            clonedir=checkout,
            destdir=destdir,
            prefix=prefix,
        )
159+
160+
161+
def _git_clone(
162+
*,
163+
clonedir: pathlib.Path,
164+
repo_url: str,
165+
tag: str | None,
166+
ref: str | None,
167+
) -> None:
168+
"""Clone a git repository into *clonedir*
169+
170+
Initializes submodules
171+
"""
172+
if not bool(tag) ^ bool(ref):
173+
raise ValueError("tag and ref are mutually exclusive")
174+
175+
# Create a clean URL without any credentials for logging
176+
parsed_url = urlparse(repo_url)
177+
clean_url = parsed_url._replace(netloc=parsed_url.hostname or "").geturl()
178+
logger.info(f"cloning {clean_url}, tag {tag}, ref {ref}, into {clonedir}")
179+
180+
cmd: list[str] = ["git", "clone"]
181+
if tag is not None:
182+
# --branch works with branches and tags, but not with commits
183+
cmd.extend(["--branch", tag, "--depth", "1"])
184+
cmd.extend([repo_url, str(clonedir)])
185+
external_commands.run(cmd, network_isolation=False)
186+
187+
# --branch only works with names, so we have to checkout the reference we
188+
# actually want if it is not a name
189+
if ref is not None:
190+
external_commands.run(
191+
["git", "checkout", "--force", ref],
192+
cwd=str(clonedir),
193+
network_isolation=False,
194+
)
195+
196+
# initialize submodule but do not fetch them, yet, to allow customization.
197+
external_commands.run(
198+
["git", "submodule", "init"],
199+
cwd=str(clonedir),
200+
network_isolation=False,
201+
)
202+
203+
204+
_SUBMODULE_RE = re.compile(r"^submodule\.(.*)\.url=(.*)$")
205+
206+
207+
def _git_submodule_list(*, clonedir: pathlib.Path) -> dict[str, str]:
    """Get submodule mapping of name -> remote

    Lists the repository-local git configuration of *clonedir* and collects
    every ``submodule.<name>.url=<remote>`` entry.

    Submodules must be initialized first.

    :param clonedir: directory of the git clone
    :returns: mapping of submodule name to remote URL
    """
    # The ``git config list`` subcommand only exists in Git >= 2.46. The
    # ``--list`` option prints the same output and is understood by older
    # and newer versions alike.
    out = external_commands.run(
        ["git", "config", "--local", "--list"],
        cwd=str(clonedir),
        network_isolation=False,
    )
    submodules: dict[str, str] = {}
    for line in out.split("\n"):
        if mo := _SUBMODULE_RE.match(line):
            name, remote = mo.groups()
            submodules[name] = remote
    logger.debug(f"found submodules: {submodules}")
    return submodules
224+
225+
226+
def _get_submodule_update(*, clonedir: pathlib.Path) -> None:
    """Update and fetch submodules

    Runs ``git submodule update --force --depth 1`` inside *clonedir*. The
    command is not recursive, so nested submodules are not requested.

    :param clonedir: directory of the git clone
    """
    external_commands.run(
        ["git", "submodule", "update", "--force", "--depth", "1"],
        cwd=str(clonedir),
        network_isolation=False,
    )
233+
234+
235+
def _make_git_archival_txt(
    clonedir: pathlib.Path,
    *,
    tag_match: str | None = None,
) -> str:
    """Generate a .git_archival.txt file for setuptools-scm

    The file is written to the top level of *clonedir* and holds the commit
    hash, the commit date, and a ``git describe`` style name of the most
    recent commit. An existing file is overwritten.

    https://setuptools-scm.readthedocs.io/en/latest/usage/#git-archives

    :param clonedir: directory of the git clone
    :param tag_match: tag pattern for the describe name, defaults to
        ``*[0-9]*``
    :returns: the content written to ``.git_archival.txt``
    """
    if not tag_match:
        tag_match = "*[0-9]*"
    # ignore existing .git_archival.txt template
    # TODO: Figure out how to use an existing file and replace its template variables.
    archival = clonedir / ".git_archival.txt"
    parts = [
        "node: %H",  # commit hash
        "node-date: %cI",  # commit date
        f"describe-name: %(describe:tags=true,match={tag_match})",  # tag + commits since tags
    ]
    sep = "\n"  # cannot use backslash in f-strings on Python 3.11
    out = external_commands.run(
        [
            "git",
            "log",
            f"--pretty=tformat:{sep.join(parts)}",
            "-1",
        ],
        cwd=str(clonedir),
        network_isolation=False,
    )
    # Explicit encoding, so the file content does not depend on the locale
    # of the process that builds the tar ball.
    archival.write_text(out, encoding="utf-8")
    logger.debug(f"Generated {archival} with content: \n{out}")
    return out
268+
269+
270+
def _create_tarball(
    *,
    clonedir: pathlib.Path,
    destdir: pathlib.Path,
    prefix: str,
) -> pathlib.Path:
    """Pack a git checkout into ``<destdir>/<prefix>.tar.gz``

    All archive members are placed below a first-level directory named
    *prefix*; VCS entries are excluded. An existing tar ball with the same
    name is removed first.

    :raises ValueError: if *prefix* contains the path separator
    """
    # The prefix is used as a file name and as a directory name, so it
    # must be a single path component.
    if os.sep in prefix:
        raise ValueError(f"{prefix=} cannot contain {os.sep}")

    archive_path = destdir.joinpath(f"{prefix}.tar.gz")
    if archive_path.is_file():
        logger.debug(f"removing stale tar ball {archive_path}")
        archive_path.unlink()

    # "x" mode: exclusive creation, the file must not exist at this point
    with tarfile.open(archive_path, "x:gz", format=tarfile.PAX_FORMAT) as archive:
        tarballs.tar_reproducible_with_prefix(
            tar=archive,
            basedir=clonedir,
            prefix=prefix,
            exclude_vcs=True,
        )
    return archive_path
294+
295+
296+
def test():
    """Manual smoke test: clone a sample project and print the tar ball path

    Needs network access. The tar ball is written to the current working
    directory.
    """
    logging.basicConfig(level=logging.DEBUG)

    def cb(*, clonedir: pathlib.Path, name: str, remote: str) -> None:
        print(name, remote)

    # Flip to False to exercise the fromager sample instead of xformers.
    use_xformers_sample = True
    if use_xformers_sample:
        tag = "v0.0.31.post1"
        req = Requirement("xformers")
        repo_url = "https://github.com/facebookresearch/xformers.git"
    else:
        tag = "0.54.0"
        req = Requirement("fromager")
        repo_url = "https://github.com/python-wheel-build/fromager.git"
    version = Version(tag)

    print(
        git_clone_and_tarball(
            destdir=pathlib.Path(".").absolute(),
            prefix=(req, version),
            repo_url=repo_url,
            tag=tag,
            before_submodule_update=cb,
        )
    )


if __name__ == "__main__":
    test()

src/fromager/tarballs.py

Lines changed: 48 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import pathlib
55
import stat
66
import tarfile
7+
import typing
78

89
VCS_DIRS = {".bzr", ".git", ".hg", ".svn"}
910

@@ -24,6 +25,24 @@ def _tar_reset(tarinfo: tarfile.TarInfo) -> tarfile.TarInfo:
2425
return tarinfo
2526

2627

28+
def _tar_content(
    *, basedir: pathlib.Path, exclude_vcs: bool = False
) -> typing.Iterable[str]:
    """Collect the sorted paths of *basedir* and everything below it

    The result starts out with *basedir* itself, followed by every directory
    and file found by walking the tree. With ``exclude_vcs`` set, entries
    named like a VCS directory (see ``VCS_DIRS``) are left out and not
    descended into.
    """
    paths: list[str] = [str(basedir)]  # include root
    for parent, subdirs, filenames in os.walk(basedir):
        if exclude_vcs:
            # Prune in place so os.walk does not descend into excluded
            # entries. The file list is filtered as well, because git
            # submodules have a `.git` file.
            subdirs[:] = [entry for entry in subdirs if entry not in VCS_DIRS]
            filenames[:] = [entry for entry in filenames if entry not in VCS_DIRS]
        paths.extend(os.path.join(parent, entry) for entry in subdirs)
        paths.extend(os.path.join(parent, entry) for entry in filenames)
    return sorted(paths)
44+
45+
2746
def tar_reproducible(
2847
tar: tarfile.TarFile,
2948
basedir: pathlib.Path,
@@ -39,21 +58,37 @@ def tar_reproducible(
3958
If ``exclude_vcs`` is True, then Bazaar, git, Mercurial, and subversion
4059
directories and files are excluded.
4160
"""
42-
content = [str(basedir)] # convert from pathlib.Path, if that's what we have
43-
for root, dirs, files in os.walk(basedir):
44-
if exclude_vcs:
45-
# modify lists in-place, so os.walk does not descent into the
46-
# excluded entries. git submodules have a `.git` file.
47-
dirs[:] = [directory for directory in dirs if directory not in VCS_DIRS]
48-
files[:] = [filename for filename in files if filename not in VCS_DIRS]
49-
for directory in dirs:
50-
content.append(os.path.join(root, directory))
51-
for filename in files:
52-
content.append(os.path.join(root, filename))
53-
content.sort()
54-
61+
content = _tar_content(basedir=basedir, exclude_vcs=exclude_vcs)
5562
for fn in content:
5663
# Ensure that the paths in the tarfile are rooted at the prefix
5764
# directory, if we have one.
5865
arcname = fn if prefix is None else os.path.relpath(fn, prefix)
5966
tar.add(fn, filter=_tar_reset, recursive=False, arcname=arcname)
67+
68+
69+
def tar_reproducible_with_prefix(
    tar: tarfile.TarFile,
    basedir: pathlib.Path,
    prefix: str,
    *,
    exclude_vcs: bool = False,
) -> None:
    """Create reproducible tar file with a prefix

    Add content from basedir to already opened tar. All archive names are
    relative to ``basedir`` and with ``prefix`` prepended. The ``prefix``
    must be relative and can be ``.``. This is equivalent to
    ``tar -czf $tarfile -C $basedir --transform 's,^,${prefix}/,' .`` or
    ``git archive --prefix ${prefix}/``.

    If ``exclude_vcs`` is True, then Bazaar, git, Mercurial, and subversion
    directories and files are excluded.

    :param tar: tar file, opened for writing
    :param basedir: directory whose content is added
    :param prefix: first-level directory of all archive names
    :param exclude_vcs: leave out VCS directories and files
    :raises ValueError: if ``prefix`` contains the path separator
    """
    if os.sep in prefix:
        # f-string, so the message shows the offending values
        raise ValueError(f"prefix {prefix} cannot contain {os.sep}")
    content = _tar_content(basedir=basedir, exclude_vcs=exclude_vcs)
    for fn in content:
        # archive names are relative to basedir
        # prefix is prepended and path is normalized
        arcname = os.path.normpath(os.path.join(prefix, os.path.relpath(fn, basedir)))
        # recursive=False: every entry is listed in content explicitly
        tar.add(fn, filter=_tar_reset, recursive=False, arcname=arcname)

tests/test_tarballs.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,3 +93,19 @@ def test_vcs_exclude(tmp_path: pathlib.Path) -> None:
9393
with tarfile.open(t1, "r") as tf:
9494
names = tf.getnames()
9595
assert names == [str(p).lstrip(os.sep) for p in [root, root / "a"]]
96+
97+
98+
def test_tar_reproducible_with_prefix(tmp_path: pathlib.Path) -> None:
    """Archive names are relative to basedir and start with the prefix"""
    root = tmp_path / "root"
    nested = root / "subdir"
    nested.mkdir(parents=True)
    (nested / "a").write_text("this is file a")

    archive = tmp_path / "out1.tar"
    with tarfile.open(archive, "w") as writer:
        tarballs.tar_reproducible_with_prefix(
            tar=writer, basedir=root, prefix="someprefix"
        )

    with tarfile.open(archive, "r") as reader:
        members = reader.getnames()
    assert members == ["someprefix", "someprefix/subdir", "someprefix/subdir/a"]

0 commit comments

Comments
 (0)