From d3181b5fe3f6fb2e1050000aea499cc08e052769 Mon Sep 17 00:00:00 2001
From: Andrew Olsen
Date: Tue, 31 Oct 2023 16:51:25 +1300
Subject: [PATCH] Don't push blobs with URLs to the remote

Also makes lfs command output more standardized across the various commands.
---
 CHANGELOG.md                          |  1 +
 kart/lfs_commands/__init__.py         | 78 +++++++++++++++++----------
 tests/byod/test_imports.py            | 24 ++++++++-
 tests/point_cloud/test_imports.py     |  4 +-
 tests/point_cloud/test_workingcopy.py |  4 +-
 5 files changed, 78 insertions(+), 33 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e18eb5ae..a083d37c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ _When adding new entries to the changelog, please include issue/PR numbers where
 - Adds information on referencing and citing Kart to `CITATION`. [#914](https://github.com/koordinates/kart/pull/914)
 - Fixes a bug where Kart would misidentify a non-Kart repo as a Kart V1 repo in some circumstances. [#918](https://github.com/koordinates/kart/issues/918)
 - Improve schema extraction for point cloud datasets. [#924](https://github.com/koordinates/kart/issues/924)
+- Some tweaks to `--dry-run` output of Kart LFS commands. [#932](https://github.com/koordinates/kart/pull/932)
 
 ## 0.14.2
 
diff --git a/kart/lfs_commands/__init__.py b/kart/lfs_commands/__init__.py
index c620576b..6e4b191c 100644
--- a/kart/lfs_commands/__init__.py
+++ b/kart/lfs_commands/__init__.py
@@ -120,25 +120,34 @@ def pre_push(ctx, remote_name, remote_url, dry_run):
     start_commits, stop_commits = get_start_and_stop_commits(sys.stdin)
 
-    lfs_oids = set()
+    # This is a dict {lfs_oid: file_size} - see _lfs_blobs() function:
+    to_push = {}
+
     for commit_id, path_match_result, pointer_blob in rev_list_tile_pointer_files(
         repo, start_commits, [f"--remotes={remote_name}", *stop_commits]
     ):
         # Because of the way a Kart repo is laid out, we know that:
         # All LFS pointer files are blobs inside **/.*-dataset.v?/tile/** and conversely,
         # All blobs inside **/.*-dataset.v?/tile/** are LFS pointer files.
-        lfs_oids.add(get_hash_from_pointer_file(pointer_blob))
+        pointer_dict = pointer_file_bytes_to_dict(pointer_blob)
+        if pointer_dict.get("url"):
+            # Currently, the rule is that we never push pointer files that contain a URL.
+            # If anyone - any clone of this repo - needs the blob, they can fetch it directly from the URL.
+            # We may decide to allow for more complicated flows in a later version of Kart.
+            continue
+        lfs_oid = get_hash_from_pointer_file(pointer_dict)
+        to_push[lfs_oid] = pointer_dict["size"]
 
     if dry_run:
         click.echo(
-            f"Running pre-push with --dry-run: pushing {len(lfs_oids)} LFS blobs"
+            f"Running pre-push with --dry-run: found {_lfs_blobs(to_push)} to push"
         )
-        for lfs_oid in lfs_oids:
+        for lfs_oid in to_push:
             click.echo(lfs_oid)
         return
 
-    if lfs_oids:
-        push_lfs_oids(repo, remote_name, lfs_oids)
+    if to_push:
+        push_lfs_oids(repo, remote_name, to_push)
 
 
 def get_start_and_stop_commits(input_iter):
@@ -282,10 +291,6 @@ def fetch_lfs_blobs_for_commits(
     )
 
 
-def _blobs(count):
-    return "1 blob" if count == 1 else f"{count} blobs"
-
-
 def fetch_lfs_blobs_for_pointer_files(
     repo, pointer_files, *, remote_name=None, dry_run=False, quiet=False
 ):
@@ -303,6 +308,10 @@ def fetch_lfs_blobs_for_pointer_files(
     urls = []
     non_urls = []
 
+    # These are dicts {lfs_oid: file_size} - see _lfs_blobs() function:
+    urls_sizes = {}
+    non_urls_sizes = {}
+
     for pointer_file in pointer_files:
         if isinstance(pointer_file, str):
             pointer_blob = repo[pointer_file]
@@ -321,8 +330,10 @@ def fetch_lfs_blobs_for_pointer_files(
 
         if url:
             urls.append((url, lfs_oid))
+            urls_sizes[lfs_oid] = pointer_dict["size"]
         else:
             non_urls.append((pointer_file_oid, lfs_oid))
+            non_urls_sizes[lfs_oid] = pointer_dict["size"]
 
         if dry_run:
             if url:
@@ -333,9 +344,11 @@ def fetch_lfs_blobs_for_pointer_files(
     if dry_run:
         click.echo("Running fetch with --dry-run:")
         if urls:
-            click.echo(f" Found {_blobs(len(urls))} to fetch from specific URLs")
+            click.echo(f" Found {_lfs_blobs(urls_sizes)} to fetch from specific URLs")
         if non_urls:
-            found_non_urls = f" Found {_blobs(len(non_urls))} to fetch from the remote"
+            found_non_urls = (
+                f" Found {_lfs_blobs(non_urls_sizes)} to fetch from the remote"
+            )
             found_non_urls += "" if remote_name else " - but no remote is configured"
             click.echo(found_non_urls)
         if not urls and not non_urls:
@@ -411,7 +424,7 @@ def _do_fetch_from_remote(repo, pointer_file_and_lfs_oids, remote_name, quiet=Fa
 @click.option(
     "--dry-run",
     is_flag=True,
-    help="Don't fetch anything, just show what would be fetched",
+    help="Don't garbage-collect anything, just show what would be garbage-collected",
 )
 def gc(ctx, dry_run):
     """
@@ -440,11 +453,9 @@ def gc(ctx, dry_run):
         if dataset.path not in non_checkout_datasets:
             checked_out_lfs_oids.update(dataset.tile_lfs_hashes(spatial_filter))
 
-    to_delete = set()
-    total_size_to_delete = 0
-
-    to_delete_once_pushed = set()
-    total_size_to_delete_once_pushed = 0
+    # These are dicts {lfs_oid: file_size} - see _lfs_blobs() function.
+    to_delete = {}
+    to_delete_once_pushed = {}
 
     for file in (repo.gitdir_path / "lfs" / "objects").glob("**/*"):
         if not file.is_file() or not LFS_OID_PATTERN.fullmatch(file.name):
@@ -454,32 +465,45 @@ def gc(ctx, dry_run):
             continue  # Can't garbage-collect anything that's currently checked out.
 
         if file.name in unpushed_lfs_oids:
-            to_delete_once_pushed.add(file)
-            total_size_to_delete_once_pushed += file.stat().st_size
+            to_delete_once_pushed[file] = file.stat().st_size
         else:
-            to_delete.add(file)
-            total_size_to_delete += file.stat().st_size
+            to_delete[file] = file.stat().st_size
 
     if to_delete_once_pushed:
-        size_desc = human_readable_bytes(total_size_to_delete_once_pushed)
         click.echo(
-            f"Can't delete {len(to_delete_once_pushed)} LFS blobs ({size_desc}) from the cache since they have not been pushed to a remote"
+            f"Can't delete {_lfs_blobs(to_delete_once_pushed)} from the cache since they have not been pushed to a remote"
         )
 
-    size_desc = human_readable_bytes(total_size_to_delete)
     if dry_run:
         click.echo(
-            f"Running gc with --dry-run: found {len(to_delete)} LFS blobs ({size_desc}) to delete from the cache"
+            f"Running gc with --dry-run: found {_lfs_blobs(to_delete)} to delete from the cache"
        )
         for file in sorted(to_delete, key=lambda f: f.name):
             click.echo(file.name)
         return
 
-    click.echo(f"Deleting {len(to_delete)} LFS blobs ({size_desc}) from the cache...")
+    click.echo(f"Deleting {_lfs_blobs(to_delete)} from the cache...")
     for file in to_delete:
         file.unlink()
 
 
+def _lfs_blobs(file_size_dict):
+    """
+    Returns a string looking something like "5 LFS blobs (1MiB)".
+    Takes a dict of the form {lfs_oid: file_size_in_bytes}, where the length of the dict is the count of unique blobs.
+    This is because building a dict like this is a straightforward way of getting a unique set of OIDs along
+    with a way of finding their total size; maintaining two separate variables - a set of OIDs and a total size -
+    makes the code more complicated.
+    """
+
+    count = len(file_size_dict)
+    total_size = sum(file_size_dict.values())
+
+    blobs = "blob" if count == 1 else "blobs"
+    size_desc = human_readable_bytes(total_size)
+    return f"{count} LFS {blobs} ({size_desc})"
+
+
 def human_readable_bytes(num):
     for unit in ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"):
         if num < 1024:
diff --git a/tests/byod/test_imports.py b/tests/byod/test_imports.py
index e42cdaa0..66a070b3 100644
--- a/tests/byod/test_imports.py
+++ b/tests/byod/test_imports.py
@@ -1,11 +1,15 @@
 import json
 import os
+import subprocess
 
 import pytest
 
 from kart.repo import KartRepo
 
+DUMMY_REPO = "git@example.com/example.git"
+
+
 @pytest.mark.slow
 def test_byod_point_cloud_import(
     tmp_path,
@@ -101,7 +105,7 @@ def test_byod_point_cloud_import(
     assert r.exit_code == 0, r.stderr
     assert r.stdout.splitlines()[:6] == [
         "Running fetch with --dry-run:",
-        " Found 16 blobs to fetch from specific URLs",
+        " Found 16 LFS blobs (373KiB) to fetch from specific URLs",
         "",
         "LFS blob OID: (Pointer file OID):",
         "03e3d4dc6fc8e75c65ffdb39b630ffe26e4b95982b9765c919e34fb940e66fc0 (ecb9c281c7e8cc354600d41e88d733faf2e991e1) → s3://kart-bring-your-own-data-poc/auckland-small-laz1.2/auckland_3_2.laz",
@@ -215,7 +219,7 @@ def test_byod_raster_import(
     assert r.exit_code == 0, r.stderr
     assert r.stdout.splitlines() == [
         "Running fetch with --dry-run:",
-        " Found 2 blobs to fetch from specific URLs",
+        " Found 2 LFS blobs (627KiB) to fetch from specific URLs",
         "",
         "LFS blob OID: (Pointer file OID):",
         "c4bbea4d7cfd54f4cdbca887a1b358a81710e820a6aed97cdf3337fd3e14f5aa (6864fc3291a79b2ce9e4c89004172aa698b84d7c) → s3://kart-bring-your-own-data-poc/erorisk_si/erorisk_silcdb4.tif",
@@ -237,3 +241,19 @@ def test_byod_raster_import(
         "Running fetch with --dry-run:",
         " Found nothing to fetch",
     ]
+
+    # Make sure LFS blobs with URLs are not pushed to the remote:
+    r = cli_runner.invoke(["remote", "add", "origin", DUMMY_REPO])
+    assert r.exit_code == 0, r.stderr
+    repo.config[f"lfs.{DUMMY_REPO}/info/lfs.locksverify"] = False
+
+    head_sha = repo.head_commit.hex
+    stdout = subprocess.check_output(
+        ["kart", "lfs+", "pre-push", "origin", DUMMY_REPO, "--dry-run"],
+        input=f"main {head_sha} main 0000000000000000000000000000000000000000\n",
+        encoding="utf8",
+    )
+    assert (
+        stdout.splitlines()[0]
+        == "Running pre-push with --dry-run: found 0 LFS blobs (0B) to push"
+    )
diff --git a/tests/point_cloud/test_imports.py b/tests/point_cloud/test_imports.py
index aafcb730..6e8ed7b4 100644
--- a/tests/point_cloud/test_imports.py
+++ b/tests/point_cloud/test_imports.py
@@ -174,7 +174,7 @@ def test_import_single_las__convert(
     )
     assert (
         stdout.splitlines()[0]
-        == "Running pre-push with --dry-run: pushing 1 LFS blobs"
+        == "Running pre-push with --dry-run: found 1 LFS blob (3.5KiB) to push"
     )
 
     assert (repo_path / "autzen" / "autzen.copc.laz").is_file()
@@ -316,7 +316,7 @@ def test_import_several_laz__convert(
     )
     assert (
         stdout.splitlines()[0]
-        == "Running pre-push with --dry-run: pushing 16 LFS blobs"
+        == "Running pre-push with --dry-run: found 16 LFS blobs (410KiB) to push"
     )
 
     for x in range(4):
diff --git a/tests/point_cloud/test_workingcopy.py b/tests/point_cloud/test_workingcopy.py
index 18d98385..275a1234 100644
--- a/tests/point_cloud/test_workingcopy.py
+++ b/tests/point_cloud/test_workingcopy.py
@@ -1102,7 +1102,7 @@ def test_lfs_fetch(cli_runner, data_archive):
     assert r.exit_code == 0, r.stderr
     assert r.stdout.splitlines() == [
         "Running fetch with --dry-run:",
-        " Found 16 blobs to fetch from the remote",
+        " Found 16 LFS blobs (410KiB) to fetch from the remote",
         "",
         "LFS blob OID: (Pointer file OID):",
         "0a696f35ab1404bbe9663e52774aaa800b0cf308ad2e5e5a9735d1c8e8b0a8c4 (7e8e0ec75c9c6b6654ebfc3b73f525a53f9db1de)",
@@ -1179,7 +1179,7 @@ def test_lfs_gc(cli_runner, data_archive, monkeypatch):
     assert r.exit_code == 0, r.stderr
     assert r.stdout.splitlines() == [
         "Running fetch with --dry-run:",
-        " Found 4 blobs to fetch from the remote",
+        " Found 4 LFS blobs (82KiB) to fetch from the remote",
         "",
         "LFS blob OID: (Pointer file OID):",
         "0a696f35ab1404bbe9663e52774aaa800b0cf308ad2e5e5a9735d1c8e8b0a8c4 (7e8e0ec75c9c6b6654ebfc3b73f525a53f9db1de)",
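
As a rough illustration of the rule this patch adds to pre_push - pointer files that carry a "url" field are skipped entirely, while the rest are gathered into a {lfs_oid: file_size} dict that _lfs_blobs() summarises - here is a small self-contained sketch. The summarise_push helper, the sample pointer dicts, and the plain byte count in the output are invented for demonstration (the real code formats sizes with human_readable_bytes()); none of this is part of Kart itself.

# Standalone sketch - mirrors the skip-URL rule and the {lfs_oid: size}
# bookkeeping described above, with a simplified size summary.

def summarise_push(pointer_dicts):
    to_push = {}
    for pointer in pointer_dicts:
        if pointer.get("url"):
            # Pointer files with a URL are never pushed; any clone can fetch
            # the blob directly from that URL instead.
            continue
        to_push[pointer["oid"]] = pointer["size"]

    count = len(to_push)
    total_size = sum(to_push.values())
    blobs = "blob" if count == 1 else "blobs"
    return f"found {count} LFS {blobs} ({total_size} bytes) to push"


# Example: one ordinary pointer file, and one that references an external URL.
pointers = [
    {"oid": "1111aaaa", "size": 3584},
    {"oid": "2222bbbb", "size": 2048, "url": "s3://example-bucket/tile.laz"},
]
print(summarise_push(pointers))  # found 1 LFS blob (3584 bytes) to push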