Don't push blobs with URLs to the remote #932

Merged: 1 commit, Nov 3, 2023

1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -11,6 +11,7 @@ _When adding new entries to the changelog, please include issue/PR numbers where
- Adds information on referencing and citing Kart to `CITATION`. [#914](https://github.com/koordinates/kart/pull/914)
- Fixes a bug where Kart would misidentify a non-Kart repo as a Kart V1 repo in some circumstances. [#918](https://github.com/koordinates/kart/issues/918)
- Improve schema extraction for point cloud datasets. [#924](https://github.com/koordinates/kart/issues/924)
- Some tweaks to `--dry-run` output of Kart LFS commands. [#932](https://github.com/koordinates/kart/pull/932)

## 0.14.2

78 changes: 51 additions & 27 deletions kart/lfs_commands/__init__.py
@@ -120,25 +120,34 @@ def pre_push(ctx, remote_name, remote_url, dry_run):

start_commits, stop_commits = get_start_and_stop_commits(sys.stdin)

lfs_oids = set()
# This is a dict {lfs_oid: file_size} - see _lfs_blobs() function:
to_push = {}

for commit_id, path_match_result, pointer_blob in rev_list_tile_pointer_files(
repo, start_commits, [f"--remotes={remote_name}", *stop_commits]
):
# Because of the way a Kart repo is laid out, we know that:
# All LFS pointer files are blobs inside **/.*-dataset.v?/tile/** and conversely,
# All blobs inside **/.*-dataset.v?/tile/** are LFS pointer files.
lfs_oids.add(get_hash_from_pointer_file(pointer_blob))
pointer_dict = pointer_file_bytes_to_dict(pointer_blob)
if pointer_dict.get("url"):
# Currently, the rule is that we never push pointer files that contain a URL.
# If anyone - any clone of this repo - needs the blob, they can fetch it directly from the URL.
# We may decide to allow for more complicated flows in a later version of Kart.
continue
lfs_oid = get_hash_from_pointer_file(pointer_dict)
to_push[lfs_oid] = pointer_dict["size"]

if dry_run:
click.echo(
f"Running pre-push with --dry-run: pushing {len(lfs_oids)} LFS blobs"
f"Running pre-push with --dry-run: found {_lfs_blobs(to_push)} to push"
)
for lfs_oid in lfs_oids:
for lfs_oid in to_push:
click.echo(lfs_oid)
return

if lfs_oids:
push_lfs_oids(repo, remote_name, lfs_oids)
if to_push:
push_lfs_oids(repo, remote_name, to_push)


def get_start_and_stop_commits(input_iter):
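For readers skimming this hunk: a Git LFS pointer file is a small key/value text blob, and Kart's "bring your own data" pointers additionally carry a `url` entry pointing at the externally hosted tile. A rough illustration of the structure the `pointer_dict.get("url")` check above branches on - key names follow the standard LFS pointer format, and the values are illustrative, not taken from this PR:

```python
# A plain LFS pointer file (stored as a git blob) parses to roughly:
plain_pointer = {
    "version": "https://git-lfs.github.com/spec/v1",
    "oid": "sha256:03e3d4dc6fc8e75c65ffdb39b630ffe26e4b95982b9765c919e34fb940e66fc0",
    "size": 12345,
}

# A pointer for an externally hosted tile also records where the data really lives:
byod_pointer = {**plain_pointer, "url": "s3://example-bucket/tiles/auckland_3_2.laz"}

for pointer_dict in (plain_pointer, byod_pointer):
    if pointer_dict.get("url"):
        print("fetched straight from its URL - never pushed to the LFS remote")
    else:
        print("pushed to / fetched from the configured LFS remote")
```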
@@ -282,10 +291,6 @@ def fetch_lfs_blobs_for_commits(
)


def _blobs(count):
return "1 blob" if count == 1 else f"{count} blobs"


def fetch_lfs_blobs_for_pointer_files(
repo, pointer_files, *, remote_name=None, dry_run=False, quiet=False
):
@@ -303,6 +308,10 @@
urls = []
non_urls = []

# These are dicts {lfs_oid: file_size} - see _lfs_blobs() function:
urls_sizes = {}
non_urls_sizes = {}

for pointer_file in pointer_files:
if isinstance(pointer_file, str):
pointer_blob = repo[pointer_file]
@@ -321,8 +330,10 @@

if url:
urls.append((url, lfs_oid))
urls_sizes[lfs_oid] = pointer_dict["size"]
else:
non_urls.append((pointer_file_oid, lfs_oid))
non_urls_sizes[lfs_oid] = pointer_dict["size"]

if dry_run:
if url:
@@ -333,9 +344,11 @@
if dry_run:
click.echo("Running fetch with --dry-run:")
if urls:
click.echo(f" Found {_blobs(len(urls))} to fetch from specific URLs")
click.echo(f" Found {_lfs_blobs(urls_sizes)} to fetch from specific URLs")
if non_urls:
found_non_urls = f" Found {_blobs(len(non_urls))} to fetch from the remote"
found_non_urls = (
f" Found {_lfs_blobs(non_urls_sizes)} to fetch from the remote"
)
found_non_urls += "" if remote_name else " - but no remote is configured"
click.echo(found_non_urls)
if not urls and not non_urls:
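Worth spelling out why the sizes live in dicts keyed by LFS OID rather than in plain counters: several pointer files can reference the same underlying blob, so keying by OID counts and sizes each blob exactly once in the dry-run summary (the `_lfs_blobs()` docstring later in this diff gives the same rationale). A toy illustration with invented OIDs and sizes:

```python
# Three pointer files are seen, but they reference only two distinct LFS blobs.
seen_pointer_files = [
    ("aaaa1111", 1024),  # (lfs_oid, size) extracted from a pointer file
    ("aaaa1111", 1024),  # another tile that references the same blob
    ("bbbb2222", 2048),
]

non_urls_sizes = {}
for lfs_oid, size in seen_pointer_files:
    non_urls_sizes[lfs_oid] = size  # re-assigning an existing key is harmless

assert len(non_urls_sizes) == 2               # unique blobs
assert sum(non_urls_sizes.values()) == 3072   # total bytes, each blob counted once
```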
@@ -411,7 +424,7 @@ def _do_fetch_from_remote(repo, pointer_file_and_lfs_oids, remote_name, quiet=Fa
@click.option(
"--dry-run",
is_flag=True,
help="Don't fetch anything, just show what would be fetched",
help="Don't garbage-collect anything, just show what would be garbage-collected",
)
def gc(ctx, dry_run):
"""
@@ -440,11 +453,9 @@ def gc(ctx, dry_run):
if dataset.path not in non_checkout_datasets:
checked_out_lfs_oids.update(dataset.tile_lfs_hashes(spatial_filter))

to_delete = set()
total_size_to_delete = 0

to_delete_once_pushed = set()
total_size_to_delete_once_pushed = 0
# These are dicts {lfs_oid: file_size} - see _lfs_blobs() function.
to_delete = {}
to_delete_once_pushed = {}

for file in (repo.gitdir_path / "lfs" / "objects").glob("**/*"):
if not file.is_file() or not LFS_OID_PATTERN.fullmatch(file.name):
@@ -454,32 +465,45 @@
continue # Can't garbage-collect anything that's currently checked out.

if file.name in unpushed_lfs_oids:
to_delete_once_pushed.add(file)
total_size_to_delete_once_pushed += file.stat().st_size
to_delete_once_pushed[file] = file.stat().st_size
else:
to_delete.add(file)
total_size_to_delete += file.stat().st_size
to_delete[file] = file.stat().st_size

if to_delete_once_pushed:
size_desc = human_readable_bytes(total_size_to_delete_once_pushed)
click.echo(
f"Can't delete {len(to_delete_once_pushed)} LFS blobs ({size_desc}) from the cache since they have not been pushed to a remote"
f"Can't delete {_lfs_blobs(to_delete_once_pushed)} from the cache since they have not been pushed to a remote"
)

size_desc = human_readable_bytes(total_size_to_delete)
if dry_run:
click.echo(
f"Running gc with --dry-run: found {len(to_delete)} LFS blobs ({size_desc}) to delete from the cache"
f"Running gc with --dry-run: found {_lfs_blobs(to_delete)} to delete from the cache"
)
for file in sorted(to_delete, key=lambda f: f.name):
click.echo(file.name)
return

click.echo(f"Deleting {len(to_delete)} LFS blobs ({size_desc}) from the cache...")
click.echo(f"Deleting {_lfs_blobs(to_delete)} from the cache...")
for file in to_delete:
file.unlink()
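For context on the directory walk in `gc()` above: Git LFS caches downloaded objects under `.git/lfs/objects/<aa>/<bb>/<oid>`, where `<aa>` and `<bb>` are the first two byte-pairs of the SHA-256 OID, so a recursive glob plus a filename check is enough to enumerate the cached blobs. A minimal sketch of that enumeration (the regex stands in for kart's `LFS_OID_PATTERN`, which is defined elsewhere in the module and not shown in this diff):

```python
import re
from pathlib import Path

# Assumed to match kart's LFS_OID_PATTERN: a full lowercase SHA-256 hex digest.
LFS_OID_PATTERN = re.compile(r"[0-9a-f]{64}")

def iter_cached_lfs_objects(gitdir_path):
    # e.g. .git/lfs/objects/03/e3/03e3d4dc6fc8e75c65ffdb39b630ffe26e4b95982b9765c919e34fb940e66fc0
    objects_dir = Path(gitdir_path) / "lfs" / "objects"
    for file in objects_dir.glob("**/*"):
        if file.is_file() and LFS_OID_PATTERN.fullmatch(file.name):
            yield file
```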


def _lfs_blobs(file_size_dict):
"""
Returns a string looking something like "5 LFS blobs (1MiB)".
Takes a dict of the form {lfs_oid: file_size_in_bytes}, where the length of the dict is the count of unique blobs.
This is because building a dict like this is a straightforward way of getting a unique set of OIDs along
with a way of finding their total size; maintaining two separate variables - a set of OIDs and a total size -
makes the code more complicated.
"""

count = len(file_size_dict)
total_size = sum(file_size_dict.values())

blobs = "blob" if count == 1 else "blobs"
size_desc = human_readable_bytes(total_size)
return f"{count} LFS {blobs} ({size_desc})"


def human_readable_bytes(num):
for unit in ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"):
if num < 1024:
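The diff truncates `human_readable_bytes()` here; its body is unchanged by this PR. For readers without the full file, a helper along these lines (an assumed sketch, not the exact Kart implementation) is all `_lfs_blobs()` needs, and a quick demonstration shows the kind of strings the tests below assert on:

```python
def human_readable_bytes(num):
    # Sketch only: scale down by 1024 until the value fits the current binary-prefix unit.
    for unit in ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"):
        if num < 1024:
            return f"{num:.3g}{unit}B"
        num /= 1024
    return f"{num:.3g}YiB"

# Assuming _lfs_blobs() from the hunk above is in scope; the sizes are invented.
print(_lfs_blobs({"aaaa1111": 1536, "bbbb2222": 512}))  # e.g. "2 LFS blobs (2KiB)"
print(_lfs_blobs({}))  # "0 LFS blobs (0B)" - the string asserted in the new pre-push test
```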
24 changes: 22 additions & 2 deletions tests/byod/test_imports.py
@@ -1,11 +1,15 @@
import json
import os
import subprocess

import pytest

from kart.repo import KartRepo


DUMMY_REPO = "git@example.com/example.git"


@pytest.mark.slow
def test_byod_point_cloud_import(
tmp_path,
@@ -101,7 +105,7 @@ def test_byod_point_cloud_import(
assert r.exit_code == 0, r.stderr
assert r.stdout.splitlines()[:6] == [
"Running fetch with --dry-run:",
" Found 16 blobs to fetch from specific URLs",
" Found 16 LFS blobs (373KiB) to fetch from specific URLs",
"",
"LFS blob OID: (Pointer file OID):",
"03e3d4dc6fc8e75c65ffdb39b630ffe26e4b95982b9765c919e34fb940e66fc0 (ecb9c281c7e8cc354600d41e88d733faf2e991e1) → s3://kart-bring-your-own-data-poc/auckland-small-laz1.2/auckland_3_2.laz",
@@ -215,7 +219,7 @@ def test_byod_raster_import(
assert r.exit_code == 0, r.stderr
assert r.stdout.splitlines() == [
"Running fetch with --dry-run:",
" Found 2 blobs to fetch from specific URLs",
" Found 2 LFS blobs (627KiB) to fetch from specific URLs",
"",
"LFS blob OID: (Pointer file OID):",
"c4bbea4d7cfd54f4cdbca887a1b358a81710e820a6aed97cdf3337fd3e14f5aa (6864fc3291a79b2ce9e4c89004172aa698b84d7c) → s3://kart-bring-your-own-data-poc/erorisk_si/erorisk_silcdb4.tif",
@@ -237,3 +241,19 @@
"Running fetch with --dry-run:",
" Found nothing to fetch",
]

# Make sure LFS blobs with URLs are not pushed to the remote:
r = cli_runner.invoke(["remote", "add", "origin", DUMMY_REPO])
assert r.exit_code == 0, r.stderr
repo.config[f"lfs.{DUMMY_REPO}/info/lfs.locksverify"] = False

head_sha = repo.head_commit.hex
stdout = subprocess.check_output(
["kart", "lfs+", "pre-push", "origin", "DUMMY_REPO", "--dry-run"],
input=f"main {head_sha} main 0000000000000000000000000000000000000000\n",
encoding="utf8",
)
assert (
stdout.splitlines()[0]
== "Running pre-push with --dry-run: found 0 LFS blobs (0B) to push"
)
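The raw line piped to `kart lfs+ pre-push` in the new test mimics what git itself writes to a pre-push hook's stdin: one line per ref being pushed, in the form `<local-ref> <local-oid> <remote-ref> <remote-oid>`, where an all-zero remote OID means the remote does not yet have that ref. A small sketch of constructing that line (the helper name is ours, not Kart's):

```python
ZERO_OID = "0" * 40  # git's placeholder for "no such object on the remote"

def pre_push_stdin_line(local_ref, local_oid, remote_ref, remote_oid=ZERO_OID):
    # One line per ref, exactly as git feeds it to a pre-push hook.
    return f"{local_ref} {local_oid} {remote_ref} {remote_oid}\n"

# Equivalent to the input= argument used in the test above:
# pre_push_stdin_line("main", head_sha, "main")
```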
4 changes: 2 additions & 2 deletions tests/point_cloud/test_imports.py
@@ -174,7 +174,7 @@ def test_import_single_las__convert(
)
assert (
stdout.splitlines()[0]
== "Running pre-push with --dry-run: pushing 1 LFS blobs"
== "Running pre-push with --dry-run: found 1 LFS blob (3.5KiB) to push"
)

assert (repo_path / "autzen" / "autzen.copc.laz").is_file()
@@ -316,7 +316,7 @@ def test_import_several_laz__convert(
)
assert (
stdout.splitlines()[0]
== "Running pre-push with --dry-run: pushing 16 LFS blobs"
== "Running pre-push with --dry-run: found 16 LFS blobs (410KiB) to push"
)

for x in range(4):
4 changes: 2 additions & 2 deletions tests/point_cloud/test_workingcopy.py
@@ -1102,7 +1102,7 @@ def test_lfs_fetch(cli_runner, data_archive):
assert r.exit_code == 0, r.stderr
assert r.stdout.splitlines() == [
"Running fetch with --dry-run:",
" Found 16 blobs to fetch from the remote",
" Found 16 LFS blobs (410KiB) to fetch from the remote",
"",
"LFS blob OID: (Pointer file OID):",
"0a696f35ab1404bbe9663e52774aaa800b0cf308ad2e5e5a9735d1c8e8b0a8c4 (7e8e0ec75c9c6b6654ebfc3b73f525a53f9db1de)",
@@ -1179,7 +1179,7 @@ def test_lfs_gc(cli_runner, data_archive, monkeypatch):
assert r.exit_code == 0, r.stderr
assert r.stdout.splitlines() == [
"Running fetch with --dry-run:",
" Found 4 blobs to fetch from the remote",
" Found 4 LFS blobs (82KiB) to fetch from the remote",
"",
"LFS blob OID: (Pointer file OID):",
"0a696f35ab1404bbe9663e52774aaa800b0cf308ad2e5e5a9735d1c8e8b0a8c4 (7e8e0ec75c9c6b6654ebfc3b73f525a53f9db1de)",