Fetch S3 tiles: address review comments
olsen232 committed Oct 12, 2023
1 parent 1adac79 commit 2edd89c
Showing 4 changed files with 29 additions and 17 deletions.
11 changes: 6 additions & 5 deletions kart/lfs_commands/__init__.py
@@ -366,13 +366,14 @@ def _do_fetch_from_urls(repo, urls_and_lfs_oids, quiet=False):
             f"Invalid URL - only S3 URLs are currently supported for BYOD repos: {non_s3_url}"
         )
 
-    urls_and_paths = [
-        (url, get_local_path_from_lfs_hash(repo, lfs_oid))
+    urls_and_paths_and_oids = [
+        (url, get_local_path_from_lfs_hash(repo, lfs_oid), lfs_oid)
         for (url, lfs_oid) in urls_and_lfs_oids
     ]
-    for url, path in urls_and_paths:
-        path.parent.mkdir(parents=True, exist_ok=True)
-    fetch_multiple_from_s3(urls_and_paths, quiet=quiet)
+    path_parents = {path.parent for url, path, lfs_oid in urls_and_paths_and_oids}
+    for path_parent in path_parents:
+        path_parent.mkdir(parents=True, exist_ok=True)
+    fetch_multiple_from_s3(urls_and_paths_and_oids, quiet=quiet)
 
 
 def _do_fetch_from_remote(repo, pointer_file_and_lfs_oids, remote_name, quiet=False):
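A note on the change above: collecting path.parent values into a set before calling mkdir means each parent directory is created exactly once, rather than once per tile, since many tiles share a directory. A minimal standalone sketch of the same pattern (the paths are made up for illustration):

from pathlib import Path

# Hypothetical cache paths - several files share a parent directory.
paths = [
    Path("lfs/objects/11/22/aabbcc"),
    Path("lfs/objects/11/22/ddeeff"),
    Path("lfs/objects/33/44/112233"),
]

# The set comprehension de-duplicates parents, so mkdir runs once per
# directory instead of once per file.
path_parents = {path.parent for path in paths}
for path_parent in path_parents:
    path_parent.mkdir(parents=True, exist_ok=True)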
9 changes: 7 additions & 2 deletions kart/point_cloud/metadata_util.py
@@ -159,11 +159,16 @@ def extract_pc_tile_metadata(pc_tile_path, oid_and_size=None):
     same dataset to be homogenous enough that the meta items format.json, schema.json and crs.wkt
     describe *all* of the tiles in that dataset. The "tile" field is where we keep all information
     that can be different for every tile in the dataset, which is why it must be stored in pointer files.
+
+    pc_tile_path - a pathlib.Path or a string containing the path to a file or an S3 url.
+    oid_and_size - a tuple (sha256_oid, filesize) if already known, to avoid repeated work.
     """
+    pc_tile_path = str(pc_tile_path)
+
     pipeline = [
         {
             "type": "readers.las",
-            "filename": str(pc_tile_path),
+            "filename": pc_tile_path,
             "count": 0,  # Don't read any individual points.
         },
         {"type": "filters.info"},
@@ -198,7 +203,7 @@ def extract_pc_tile_metadata(pc_tile_path, oid_and_size=None):
         oid, size = get_hash_and_size_of_file(pc_tile_path)
 
     name = Path(pc_tile_path).name
-    url = str(pc_tile_path) if str(pc_tile_path).startswith("s3://") else None
+    url = pc_tile_path if pc_tile_path.startswith("s3://") else None
     # Keep tile info keys in alphabetical order, except oid and size should be last.
     tile_info = {
         "name": name,
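For context on the pipeline above: the "count": 0 option tells readers.las to read only the LAS/LAZ header and no point records, which keeps metadata extraction cheap even when the tile lives on S3. Below is a rough sketch of running such a pipeline through the PDAL Python bindings - this is not Kart's actual code, and it assumes the pdal package is installed and a tile.laz file exists:

import json

import pdal  # PDAL's Python bindings, assumed available.

# "count": 0 stops readers.las before any individual points are read,
# so only header-level metadata is produced.
pipeline_spec = [
    {
        "type": "readers.las",
        "filename": "tile.laz",  # hypothetical path
        "count": 0,
    },
    {"type": "filters.info"},
]

pipeline = pdal.Pipeline(json.dumps(pipeline_spec))
pipeline.execute()

# A nested dict keyed by stage name (a JSON string in older bindings);
# the LAS header fields live under the readers.las entry.
print(pipeline.metadata)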
24 changes: 15 additions & 9 deletions kart/raster/metadata_util.py
@@ -131,16 +131,21 @@ def extract_raster_tile_metadata(raster_tile_path, oid_and_size=None):
     same dataset to be homogenous enough that the meta items format.json, schema.json and crs.wkt
     describe *all* of the tiles in that dataset. The "tile" field is where we keep all information
     that can be different for every tile in the dataset, which is why it must be stored in pointer files.
+
+    pc_tile_path - a pathlib.Path or a string containing the path to a file or an S3 url.
+    oid_and_size - a tuple (sha256_oid, filesize) if already known, to avoid repeated work.
     """
     from osgeo import gdal
 
-    gdal_path_str = str(raster_tile_path)
-    if gdal_path_str.startswith("s3://"):
-        gdal_path_str = gdal_path_str.replace("s3://", "/vsis3/")
-    metadata = gdal.Info(gdal_path_str, options=["-json", "-norat", "-noct"])
+    raster_tile_path = str(raster_tile_path)
+
+    gdal_path_spec = raster_tile_path
+    if gdal_path_spec.startswith("s3://"):
+        gdal_path_spec = gdal_path_spec.replace("s3://", "/vsis3/")
+    metadata = gdal.Info(gdal_path_spec, options=["-json", "-norat", "-noct"])
 
-    full_check = not gdal_path_str.startswith("/vsi")
-    warnings, errors, details = validate_cogtiff(gdal_path_str, full_check=full_check)
+    full_check = not gdal_path_spec.startswith("/vsi")
+    warnings, errors, details = validate_cogtiff(gdal_path_spec, full_check=full_check)
     is_cog = not errors
 
     format_json = {
@@ -160,7 +165,7 @@ def extract_raster_tile_metadata(raster_tile_path, oid_and_size=None):
         oid, size = get_hash_and_size_of_file(raster_tile_path)
 
     name = Path(raster_tile_path).name
-    url = str(raster_tile_path) if str(raster_tile_path).startswith("s3://") else None
+    url = raster_tile_path if raster_tile_path.startswith("s3://") else None
     # Keep tile info keys in alphabetical order, except oid and size should be last.
     tile_info = {
         "name": name,
@@ -240,11 +245,12 @@ def gdalinfo_band_to_kart_columnschema(gdalinfo_band):
 
 
 def _find_and_add_pam_info(raster_tile_path, raster_tile_metadata):
+    raster_tile_path = str(raster_tile_path)
     tile_info = raster_tile_metadata["tile"]
 
-    if str(raster_tile_path).startswith("s3://"):
+    if raster_tile_path.startswith("s3://"):
         try:
-            pam_url = str(raster_tile_path) + PAM_SUFFIX
+            pam_url = raster_tile_path + PAM_SUFFIX
             pam_path = fetch_from_s3(pam_url)
             raster_tile_metadata.update(extract_aux_xml_metadata(pam_path))
             pam_oid, pam_size = get_hash_and_size_of_file(pam_path)
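One detail worth noting in the change above: GDAL does not understand s3:// URLs natively - its S3 support goes through the /vsis3/ virtual filesystem, hence the prefix rewrite before gdal.Info is called. A standalone sketch of the same conversion, assuming the GDAL Python bindings are installed, AWS credentials are available in the environment, and a made-up tile URL:

from osgeo import gdal

gdal.UseExceptions()  # raise Python exceptions instead of returning None

url = "s3://example-bucket/tiles/tile.tif"  # hypothetical tile URL

# Rewrite the s3:// scheme to GDAL's /vsis3/ virtual filesystem prefix.
gdal_path_spec = url.replace("s3://", "/vsis3/") if url.startswith("s3://") else url

# With "-json" among the options, gdal.Info returns a parsed dict.
metadata = gdal.Info(gdal_path_spec, options=["-json", "-norat", "-noct"])
print(metadata["driverShortName"], metadata["size"])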
2 changes: 1 addition & 1 deletion kart/s3_util.py
@@ -173,7 +173,7 @@ def fetch_multiple_from_s3(s3_urls_and_paths, quiet=False):
     ) as executor:
         futures = [executor.submit(fetch_from_s3, *args) for args in s3_urls_and_paths]
         for future in concurrent.futures.as_completed(futures):
-            assert future.result() is not None
+            future.result()  # Raises any exception that occurred in the worker thread.
             p.update(1)
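The assert removal is more than cosmetic: assert statements are stripped entirely when Python runs with -O, and the real purpose of calling future.result() is that it re-raises any exception from the worker thread, so a failed download cannot pass silently. A small self-contained demonstration of that behaviour, with the failing download simulated:

import concurrent.futures


def fake_fetch(url):
    # Stand-in for fetch_from_s3: the second URL fails.
    if "missing" in url:
        raise RuntimeError(f"download failed: {url}")
    return url


urls = ["s3://bucket/a", "s3://bucket/missing"]

try:
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(fake_fetch, url) for url in urls]
        for future in concurrent.futures.as_completed(futures):
            # result() re-raises the worker's exception in the main thread;
            # an assert on the result would vanish under "python -O".
            future.result()
except RuntimeError as e:
    print("caught:", e)  # caught: download failed: s3://bucket/missing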
