From d3181b5fe3f6fb2e1050000aea499cc08e052769 Mon Sep 17 00:00:00 2001
From: Andrew Olsen
Date: Tue, 31 Oct 2023 16:51:25 +1300
Subject: [PATCH] Don't push blobs with URLs to the remote

Also makes lfs command output more standardized across the various commands.
---
 CHANGELOG.md                          |  1 +
 kart/lfs_commands/__init__.py         | 78 +++++++++++++++++----------
 tests/byod/test_imports.py            | 24 ++++++++-
 tests/point_cloud/test_imports.py     |  4 +-
 tests/point_cloud/test_workingcopy.py |  4 +-
 5 files changed, 78 insertions(+), 33 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e18eb5ae..a083d37c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ _When adding new entries to the changelog, please include issue/PR numbers where
 - Adds information on referencing and citing Kart to `CITATION`. [#914](https://github.com/koordinates/kart/pull/914)
 - Fixes a bug where Kart would misidentify a non-Kart repo as a Kart V1 repo in some circumstances. [#918](https://github.com/koordinates/kart/issues/918)
 - Improve schema extraction for point cloud datasets. [#924](https://github.com/koordinates/kart/issues/924)
+- Some tweaks to `--dry-run` output of Kart LFS commands. [#932](https://github.com/koordinates/kart/pull/932)
 
 ## 0.14.2
 
diff --git a/kart/lfs_commands/__init__.py b/kart/lfs_commands/__init__.py
index c620576b..6e4b191c 100644
--- a/kart/lfs_commands/__init__.py
+++ b/kart/lfs_commands/__init__.py
@@ -120,25 +120,34 @@ def pre_push(ctx, remote_name, remote_url, dry_run):
     start_commits, stop_commits = get_start_and_stop_commits(sys.stdin)
 
-    lfs_oids = set()
+    # This is a dict {lfs_oid: file_size} - see _lfs_blobs() function:
+    to_push = {}
+
     for commit_id, path_match_result, pointer_blob in rev_list_tile_pointer_files(
         repo, start_commits, [f"--remotes={remote_name}", *stop_commits]
     ):
         # Because of the way a Kart repo is laid out, we know that:
         # All LFS pointer files are blobs inside **/.*-dataset.v?/tile/** and conversely,
         # All blobs inside **/.*-dataset.v?/tile/** are LFS pointer files.
-        lfs_oids.add(get_hash_from_pointer_file(pointer_blob))
+        pointer_dict = pointer_file_bytes_to_dict(pointer_blob)
+        if pointer_dict.get("url"):
+            # Currently, the rule is that we never push pointer files that contain a URL.
+            # If anyone - any clone of this repo - needs the blob, they can fetch it directly from the URL.
+            # We may decide to allow for more complicated flows in a later version of Kart.
+            continue
+        lfs_oid = get_hash_from_pointer_file(pointer_dict)
+        to_push[lfs_oid] = pointer_dict["size"]
 
     if dry_run:
         click.echo(
-            f"Running pre-push with --dry-run: pushing {len(lfs_oids)} LFS blobs"
+            f"Running pre-push with --dry-run: found {_lfs_blobs(to_push)} to push"
         )
-        for lfs_oid in lfs_oids:
+        for lfs_oid in to_push:
             click.echo(lfs_oid)
         return
 
-    if lfs_oids:
-        push_lfs_oids(repo, remote_name, lfs_oids)
+    if to_push:
+        push_lfs_oids(repo, remote_name, to_push)
 
 
 def get_start_and_stop_commits(input_iter):
@@ -282,10 +291,6 @@ def fetch_lfs_blobs_for_commits(
     )
 
 
-def _blobs(count):
-    return "1 blob" if count == 1 else f"{count} blobs"
-
-
 def fetch_lfs_blobs_for_pointer_files(
     repo, pointer_files, *, remote_name=None, dry_run=False, quiet=False
 ):
@@ -303,6 +308,10 @@ def fetch_lfs_blobs_for_pointer_files(
     urls = []
     non_urls = []
 
+    # These are dicts {lfs_oid: file_size} - see _lfs_blobs() function:
+    urls_sizes = {}
+    non_urls_sizes = {}
+
     for pointer_file in pointer_files:
         if isinstance(pointer_file, str):
             pointer_blob = repo[pointer_file]
@@ -321,8 +330,10 @@ def fetch_lfs_blobs_for_pointer_files(
 
         if url:
             urls.append((url, lfs_oid))
+            urls_sizes[lfs_oid] = pointer_dict["size"]
         else:
             non_urls.append((pointer_file_oid, lfs_oid))
+            non_urls_sizes[lfs_oid] = pointer_dict["size"]
 
         if dry_run:
             if url:
@@ -333,9 +344,11 @@ def fetch_lfs_blobs_for_pointer_files(
     if dry_run:
         click.echo("Running fetch with --dry-run:")
         if urls:
-            click.echo(f" Found {_blobs(len(urls))} to fetch from specific URLs")
+            click.echo(f" Found {_lfs_blobs(urls_sizes)} to fetch from specific URLs")
         if non_urls:
-            found_non_urls = f" Found {_blobs(len(non_urls))} to fetch from the remote"
+            found_non_urls = (
+                f" Found {_lfs_blobs(non_urls_sizes)} to fetch from the remote"
+            )
             found_non_urls += "" if remote_name else " - but no remote is configured"
             click.echo(found_non_urls)
         if not urls and not non_urls:
@@ -411,7 +424,7 @@ def _do_fetch_from_remote(repo, pointer_file_and_lfs_oids, remote_name, quiet=Fa
 @click.option(
     "--dry-run",
     is_flag=True,
-    help="Don't fetch anything, just show what would be fetched",
+    help="Don't garbage-collect anything, just show what would be garbage-collected",
 )
 def gc(ctx, dry_run):
     """
@@ -440,11 +453,9 @@ def gc(ctx, dry_run):
         if dataset.path not in non_checkout_datasets:
             checked_out_lfs_oids.update(dataset.tile_lfs_hashes(spatial_filter))
 
-    to_delete = set()
-    total_size_to_delete = 0
-
-    to_delete_once_pushed = set()
-    total_size_to_delete_once_pushed = 0
+    # These are dicts {lfs_oid: file_size} - see _lfs_blobs() function.
+    to_delete = {}
+    to_delete_once_pushed = {}
 
     for file in (repo.gitdir_path / "lfs" / "objects").glob("**/*"):
         if not file.is_file() or not LFS_OID_PATTERN.fullmatch(file.name):
@@ -454,32 +465,45 @@ def gc(ctx, dry_run):
             continue  # Can't garbage-collect anything that's currently checked out.
 
         if file.name in unpushed_lfs_oids:
-            to_delete_once_pushed.add(file)
-            total_size_to_delete_once_pushed += file.stat().st_size
+            to_delete_once_pushed[file] = file.stat().st_size
         else:
-            to_delete.add(file)
-            total_size_to_delete += file.stat().st_size
+            to_delete[file] = file.stat().st_size
 
     if to_delete_once_pushed:
-        size_desc = human_readable_bytes(total_size_to_delete_once_pushed)
         click.echo(
-            f"Can't delete {len(to_delete_once_pushed)} LFS blobs ({size_desc}) from the cache since they have not been pushed to a remote"
+            f"Can't delete {_lfs_blobs(to_delete_once_pushed)} from the cache since they have not been pushed to a remote"
         )
 
-    size_desc = human_readable_bytes(total_size_to_delete)
     if dry_run:
         click.echo(
-            f"Running gc with --dry-run: found {len(to_delete)} LFS blobs ({size_desc}) to delete from the cache"
+            f"Running gc with --dry-run: found {_lfs_blobs(to_delete)} to delete from the cache"
        )
         for file in sorted(to_delete, key=lambda f: f.name):
             click.echo(file.name)
         return
 
-    click.echo(f"Deleting {len(to_delete)} LFS blobs ({size_desc}) from the cache...")
+    click.echo(f"Deleting {_lfs_blobs(to_delete)} from the cache...")
     for file in to_delete:
         file.unlink()
 
 
+def _lfs_blobs(file_size_dict):
+    """
+    Returns a string looking something like "5 LFS blobs (1MiB)".
+    Takes a dict of the form {lfs_oid: file_size_in_bytes}, where the length of the dict is the count of unique blobs.
+    This is because building a dict like this is a straightforward way of getting a unique set of OIDs along
+    with a way of finding their total size; maintaining two separate variables - a set of OIDs and a total size -
+    makes the code more complicated.
+    """
+
+    count = len(file_size_dict)
+    total_size = sum(file_size_dict.values())
+
+    blobs = "blob" if count == 1 else "blobs"
+    size_desc = human_readable_bytes(total_size)
+    return f"{count} LFS {blobs} ({size_desc})"
+
+
 def human_readable_bytes(num):
     for unit in ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"):
         if num < 1024:
diff --git a/tests/byod/test_imports.py b/tests/byod/test_imports.py
index e42cdaa0..66a070b3 100644
--- a/tests/byod/test_imports.py
+++ b/tests/byod/test_imports.py
@@ -1,11 +1,15 @@
 import json
 import os
+import subprocess
 
 import pytest
 
 from kart.repo import KartRepo
 
+DUMMY_REPO = "git@example.com/example.git"
+
+
 @pytest.mark.slow
 def test_byod_point_cloud_import(
     tmp_path,
@@ -101,7 +105,7 @@ def test_byod_point_cloud_import(
     assert r.exit_code == 0, r.stderr
     assert r.stdout.splitlines()[:6] == [
         "Running fetch with --dry-run:",
-        " Found 16 blobs to fetch from specific URLs",
+        " Found 16 LFS blobs (373KiB) to fetch from specific URLs",
         "",
         "LFS blob OID: (Pointer file OID):",
         "03e3d4dc6fc8e75c65ffdb39b630ffe26e4b95982b9765c919e34fb940e66fc0 (ecb9c281c7e8cc354600d41e88d733faf2e991e1) → s3://kart-bring-your-own-data-poc/auckland-small-laz1.2/auckland_3_2.laz",
@@ -215,7 +219,7 @@ def test_byod_raster_import(
     assert r.exit_code == 0, r.stderr
     assert r.stdout.splitlines() == [
         "Running fetch with --dry-run:",
-        " Found 2 blobs to fetch from specific URLs",
+        " Found 2 LFS blobs (627KiB) to fetch from specific URLs",
         "",
         "LFS blob OID: (Pointer file OID):",
         "c4bbea4d7cfd54f4cdbca887a1b358a81710e820a6aed97cdf3337fd3e14f5aa (6864fc3291a79b2ce9e4c89004172aa698b84d7c) → s3://kart-bring-your-own-data-poc/erorisk_si/erorisk_silcdb4.tif",
@@ -237,3 +241,19 @@ def test_byod_raster_import(
         "Running fetch with --dry-run:",
         " Found nothing to fetch",
     ]
+
+    # Make sure LFS blobs with URLs are not pushed to the remote:
+    r = cli_runner.invoke(["remote", "add", "origin", DUMMY_REPO])
+    assert r.exit_code == 0, r.stderr
+    repo.config[f"lfs.{DUMMY_REPO}/info/lfs.locksverify"] = False
+
+    head_sha = repo.head_commit.hex
+    stdout = subprocess.check_output(
+        ["kart", "lfs+", "pre-push", "origin", DUMMY_REPO, "--dry-run"],
+        input=f"main {head_sha} main 0000000000000000000000000000000000000000\n",
+        encoding="utf8",
+    )
+    assert (
+        stdout.splitlines()[0]
+        == "Running pre-push with --dry-run: found 0 LFS blobs (0B) to push"
+    )
diff --git a/tests/point_cloud/test_imports.py b/tests/point_cloud/test_imports.py
index aafcb730..6e8ed7b4 100644
--- a/tests/point_cloud/test_imports.py
+++ b/tests/point_cloud/test_imports.py
@@ -174,7 +174,7 @@ def test_import_single_las__convert(
     )
     assert (
         stdout.splitlines()[0]
-        == "Running pre-push with --dry-run: pushing 1 LFS blobs"
+        == "Running pre-push with --dry-run: found 1 LFS blob (3.5KiB) to push"
     )
 
     assert (repo_path / "autzen" / "autzen.copc.laz").is_file()
@@ -316,7 +316,7 @@ def test_import_several_laz__convert(
     )
     assert (
         stdout.splitlines()[0]
-        == "Running pre-push with --dry-run: pushing 16 LFS blobs"
+        == "Running pre-push with --dry-run: found 16 LFS blobs (410KiB) to push"
     )
 
     for x in range(4):
diff --git a/tests/point_cloud/test_workingcopy.py b/tests/point_cloud/test_workingcopy.py
index 18d98385..275a1234 100644
--- a/tests/point_cloud/test_workingcopy.py
+++ b/tests/point_cloud/test_workingcopy.py
@@ -1102,7 +1102,7 @@ def test_lfs_fetch(cli_runner, data_archive):
     assert r.exit_code == 0, r.stderr
     assert r.stdout.splitlines() == [
         "Running fetch with --dry-run:",
-        " Found 16 blobs to fetch from the remote",
+        " Found 16 LFS blobs (410KiB) to fetch from the remote",
         "",
         "LFS blob OID: (Pointer file OID):",
         "0a696f35ab1404bbe9663e52774aaa800b0cf308ad2e5e5a9735d1c8e8b0a8c4 (7e8e0ec75c9c6b6654ebfc3b73f525a53f9db1de)",
@@ -1179,7 +1179,7 @@ def test_lfs_gc(cli_runner, data_archive, monkeypatch):
     assert r.exit_code == 0, r.stderr
     assert r.stdout.splitlines() == [
         "Running fetch with --dry-run:",
-        " Found 4 blobs to fetch from the remote",
+        " Found 4 LFS blobs (82KiB) to fetch from the remote",
         "",
         "LFS blob OID: (Pointer file OID):",
         "0a696f35ab1404bbe9663e52774aaa800b0cf308ad2e5e5a9735d1c8e8b0a8c4 (7e8e0ec75c9c6b6654ebfc3b73f525a53f9db1de)",
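
As a rough illustration of the rule this patch adds to pre_push - pointer files that carry a "url" field are skipped entirely, while the rest are gathered into a {lfs_oid: file_size} dict that _lfs_blobs() summarises - here is a small self-contained sketch. The summarise_push helper, the sample pointer dicts, and the plain byte count in the output are invented for demonstration (the real code formats sizes with human_readable_bytes()); none of this is part of Kart itself.

# Standalone sketch - mirrors the skip-URL rule and the {lfs_oid: size}
# bookkeeping described above, with a simplified size summary.

def summarise_push(pointer_dicts):
    to_push = {}
    for pointer in pointer_dicts:
        if pointer.get("url"):
            # Pointer files with a URL are never pushed; any clone can fetch
            # the blob directly from that URL instead.
            continue
        to_push[pointer["oid"]] = pointer["size"]

    count = len(to_push)
    total_size = sum(to_push.values())
    blobs = "blob" if count == 1 else "blobs"
    return f"found {count} LFS {blobs} ({total_size} bytes) to push"


# Example: one ordinary pointer file, and one that references an external URL.
pointers = [
    {"oid": "1111aaaa", "size": 3584},
    {"oid": "2222bbbb", "size": 2048, "url": "s3://example-bucket/tile.laz"},
]
print(summarise_push(pointers))  # found 1 LFS blob (3584 bytes) to push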