From 24a11ceb025172dcdca812df702c17104b605618 Mon Sep 17 00:00:00 2001
From: Michael Hanke
Date: Tue, 10 Oct 2023 17:52:35 +0200
Subject: [PATCH] Docker adaptor can retrieve an image from the local docker service
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes: #199

Demo

```
❯ datalad create busydemo
❯ cd busydemo
❯ datalad containers-add -u dhub://busybox:latest busy
❯ datalad drop .datalad/environments --reckless availability
❯ git annex info | grep 'local annex'
local annex keys: 0
local annex size: 0 bytes
❯ cat .datalad/config
[datalad "dataset"]
	id = b7adee52-a65a-43fc-a85b-c0d5e2d5b67c
[datalad "containers.busy"]
	image = .datalad/environments/busy/image
	cmdexec = {python} -m datalad_container.adapters.docker run {img} {cmd}
❯ datalad containers-run -n busy uname
[INFO ] Saved busybox:latest to /tmp/busydemo/.datalad/environments/busy/image/tmpzrdd7mj2
[INFO ] Making sure inputs are available (this may take some time)
[INFO ] == Command start (output follows) =====
Linux
[INFO ] == Command exit (modification check follows) =====
run(ok): /tmp/busydemo (dataset) [/home/mih/env/datalad-dev/bin/python -m ...]
```
---
 datalad_container/adapters/docker.py | 131 +++++++++++++++++++++++++++
 datalad_container/containers_run.py  |  29 ++++++
 2 files changed, 160 insertions(+)

diff --git a/datalad_container/adapters/docker.py b/datalad_container/adapters/docker.py
index 89498527..075fdc8a 100644
--- a/datalad_container/adapters/docker.py
+++ b/datalad_container/adapters/docker.py
@@ -13,6 +13,7 @@
 import json
 import os
 import os.path as op
+from pathlib import Path
 import subprocess as sp
 import sys
 import tarfile
@@ -88,6 +89,23 @@ def _list_images():
     return out.decode().splitlines()
 
 
+def _get_repotag_from_image_sha256(sha):
+    """Return the first repo tag docker reports for image `sha`.
+
+    An empty string is returned when the image carries no repo tag.
+    A failing `docker image inspect` call (e.g., unknown image) raises
+    `subprocess.CalledProcessError`.
+    """
+    out = sp.check_output(
+        ['docker', 'image', 'inspect', '--format',
+         '{{range $v := .RepoTags}}{{$v}} {{end}}',
+         sha])
+    # all tags come back space-separated on one line; an image can carry
+    # several, and only a single one is a usable image reference
+    tags = out.decode().split()
+    return tags[0] if tags else ''
+
+
 def get_image(path, repo_tag=None, config=None):
     """Return the image ID of the image extracted at `path`.
     """
@@ -153,6 +171,119 @@ def load(path, repo_tag, config):
     return image_id
 
 
+def repopulate_from_daemon(contds, imgpath: Path) -> None:
+    """Reinject locally missing image files from the local docker service.
+
+    Best-effort: if annexed content below `imgpath` is not present, the
+    image is exported from the local docker service into a temporary
+    directory inside `imgpath`, and every exported file that matches a
+    known annex key is handed to `git annex reinject`.
+
+    Parameters
+    ----------
+    contds : Dataset
+      Dataset tracking the container image.
+    imgpath : Path
+      Directory of the extracted docker image inside `contds`.
+
+    Raises
+    ------
+    ValueError
+      If `imgpath` is not a directory, does not contain exactly one
+      image config file, or its files do not share a single annex
+      backend.
+    Exception
+      Whatever exporting the image or calling git-annex raises, e.g.,
+      when no docker service is running or it does not have the image.
+    """
+    # crude check whether anything at the image location is not
+    # locally present
+    contrepo = contds.repo
+    if not contrepo.call_annex(
+        ['find', '--not', '--in', 'here'],
+        files=str(imgpath),
+    ):
+        # nothing is missing, we have nothing to do here
+        return
+
+    # a docker image is a collection of files in a directory.
+    # raise explicitly instead of `assert`, which `python -O` strips
+    if not imgpath.is_dir():
+        raise ValueError(f'image location is not a directory: {imgpath}')
+    # we could look into `manifest.json`, but it might also be
+    # annexed and not around. instead look for the config filename
+    imgcfg = [
+        p.name for p in imgpath.iterdir()
+        # a sha256 is 64 chars plus '.json'
+        if len(p.name) == 69 and p.name.endswith('.json')
+    ]
+    # there is only one
+    if len(imgcfg) != 1:
+        raise ValueError(
+            f'expected exactly one image config in {imgpath}, '
+            f'found {len(imgcfg)}')
+
+    # look for the employed annex backend, we need it for key reinject below
+    backends = set(contrepo.call_annex_oneline([
+        'find',
+        f'--branch=HEAD:{imgpath.relative_to(contds.pathobj)}',
+        # this needs git-annex 10.20230126 or later
+        '--anything',
+        # the trailing space is not a mistake!
+        '--format=${backend} ',
+    ]).split())
+    # we can only deal with a single homogeneous backend here
+    if len(backends) != 1:
+        raise ValueError(
+            f'expected a single annex backend in {imgpath}, '
+            f'found {sorted(backends)}')
+
+    # ID is filename, minus .json extension
+    img_id = imgcfg[0][:-5]
+
+    # make an effort to get the repotags matching the image sha256
+    # from docker. This is needed, because the query tag will end up
+    # in manifest.json, and the original addition was likely via a tag
+    # and not a sha256
+    repo_tag = None
+    try:
+        repo_tag = _get_repotag_from_image_sha256(img_id)
+    except Exception:
+        # however, we will go on without a tag. In the worst case, it
+        # would trigger a download of manifest.json (tiny file), but
+        # the large `layer.tar` will still be successfully extracted
+        # and reinjected via a query by ID/sha256
+        pass
+
+    # let docker dump into a TMPDIR inside the dataset
+    # this place is likely to have sufficient space
+    with tempfile.TemporaryDirectory(dir=imgpath) as tmpdir:
+        # try to export the image from a local docker instance
+        save(
+            # prefer the tag, but continue with ID (see above)
+            repo_tag or f'sha256:{img_id}',
+            tmpdir,
+        )
+        # the line above will raise an exception when
+        # - this docker does not have the image.
+        # - or there is no docker running at all.
+        # this is fine, we will just not proceed.
+
+        # now let git-annex reinject any file that matches a known
+        # key (given the backend determined above). This will populate
+        # as much as we can. This approach has built-in content verification.
+        # this means that even if this docker instance has different metadata
+        # we will be able to harvest any image piece that fits, and ignore
+        # anything else
+        contrepo.call_annex(
+            ['reinject', '--known', '--backend', backends.pop()],
+            files=[
+                str(p) for p in Path(tmpdir).glob('**/*')
+                if p.is_file()
+            ],
+        )
+
+
 # Command-line
 
 
diff --git a/datalad_container/containers_run.py b/datalad_container/containers_run.py
index d7b4e796..f51dcfa9 100644
--- a/datalad_container/containers_run.py
+++ b/datalad_container/containers_run.py
@@ -4,10 +4,12 @@
 
 import logging
 import os.path as op
+from pathlib import Path
 import sys
 
 from datalad.interface.base import Interface
 from datalad.interface.base import build_doc
+from datalad.support.exceptions import CapturedException
 from datalad.support.param import Parameter
 from datalad.distribution.dataset import datasetmethod
 from datalad.distribution.dataset import require_dataset
@@ -163,6 +165,33 @@ def __call__(cmd, container_name=None, dataset=None,
 
         lgr.debug("extra_inputs = %r", extra_inputs)
 
+        if '-m datalad_container.adapters.docker run' in cmd:
+            # this will use the docker adapter to execute the container.
+            # below we let the adaptor have a first look at the image
+            # it will run. The adaptor might query a local docker service,
+            # and try to populate missing image parts -- possibly avoiding
+            # a download (via the `get()` that `run()` would perform), whenever
+            # the local service already has the respective images.
+            # this is a scenario that would occur frequently in short-lived
+            # clones that are repeatedly generated on the same machine.
+            from datalad_container.adapters.docker import repopulate_from_daemon
+            contds = require_dataset(
+                container['parentds'], check_installed=True,
+                purpose='check for docker images')
+            try:
+                repopulate_from_daemon(
+                    contds,
+                    # we use the container report here too, and not any of the
+                    # processed variants from above to stay internally
+                    # consistent
+                    imgpath=Path(container['path']),
+                )
+            except Exception as e:
+                # get basic logging of a failure, but overall consider this
+                # a "best effort". if anything fails, we will silently fall
+                # back on a standard "get" via the `extra_inputs` below
+                CapturedException(e)
+
         with patch.dict('os.environ',
                         {CONTAINER_NAME_ENVVAR: container['name']}):
             # fire!