Fetch S3 tiles: address review comments
olsen232 committed Oct 12, 2023
1 parent 1adac79 commit 2edd89c
Showing 4 changed files with 29 additions and 17 deletions.
11 changes: 6 additions & 5 deletions kart/lfs_commands/__init__.py
@@ -366,13 +366,14 @@ def _do_fetch_from_urls(repo, urls_and_lfs_oids, quiet=False):
             f"Invalid URL - only S3 URLs are currently supported for BYOD repos: {non_s3_url}"
         )
 
-    urls_and_paths = [
-        (url, get_local_path_from_lfs_hash(repo, lfs_oid))
+    urls_and_paths_and_oids = [
+        (url, get_local_path_from_lfs_hash(repo, lfs_oid), lfs_oid)
         for (url, lfs_oid) in urls_and_lfs_oids
     ]
-    for url, path in urls_and_paths:
-        path.parent.mkdir(parents=True, exist_ok=True)
-    fetch_multiple_from_s3(urls_and_paths, quiet=quiet)
+    path_parents = {path.parent for url, path, lfs_oid in urls_and_paths_and_oids}
+    for path_parent in path_parents:
+        path_parent.mkdir(parents=True, exist_ok=True)
+    fetch_multiple_from_s3(urls_and_paths_and_oids, quiet=quiet)
 
 
 def _do_fetch_from_remote(repo, pointer_file_and_lfs_oids, remote_name, quiet=False):
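A note on the change above: collecting path.parent values into a set before calling mkdir means each parent directory is created exactly once, rather than once per tile, since many tiles share a directory. A minimal standalone sketch of the same pattern (the paths are made up for illustration):

from pathlib import Path

# Hypothetical cache paths - several files share a parent directory.
paths = [
    Path("lfs/objects/11/22/aabbcc"),
    Path("lfs/objects/11/22/ddeeff"),
    Path("lfs/objects/33/44/112233"),
]

# The set comprehension de-duplicates parents, so mkdir runs once per
# directory instead of once per file.
path_parents = {path.parent for path in paths}
for path_parent in path_parents:
    path_parent.mkdir(parents=True, exist_ok=True)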
9 changes: 7 additions & 2 deletions kart/point_cloud/metadata_util.py
@@ -159,11 +159,16 @@ def extract_pc_tile_metadata(pc_tile_path, oid_and_size=None):
     same dataset to be homogenous enough that the meta items format.json, schema.json and crs.wkt
     describe *all* of the tiles in that dataset. The "tile" field is where we keep all information
     that can be different for every tile in the dataset, which is why it must be stored in pointer files.
+
+    pc_tile_path - a pathlib.Path or a string containing the path to a file or an S3 url.
+    oid_and_size - a tuple (sha256_oid, filesize) if already known, to avoid repeated work.
     """
+    pc_tile_path = str(pc_tile_path)
+
     pipeline = [
         {
             "type": "readers.las",
-            "filename": str(pc_tile_path),
+            "filename": pc_tile_path,
             "count": 0,  # Don't read any individual points.
         },
         {"type": "filters.info"},
@@ -198,7 +203,7 @@ def extract_pc_tile_metadata(pc_tile_path, oid_and_size=None):
         oid, size = get_hash_and_size_of_file(pc_tile_path)
 
     name = Path(pc_tile_path).name
-    url = str(pc_tile_path) if str(pc_tile_path).startswith("s3://") else None
+    url = pc_tile_path if pc_tile_path.startswith("s3://") else None
     # Keep tile info keys in alphabetical order, except oid and size should be last.
     tile_info = {
         "name": name,
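For context on the pipeline above: the "count": 0 option tells readers.las to read only the LAS/LAZ header and no point records, which keeps metadata extraction cheap even when the tile lives on S3. Below is a rough sketch of running such a pipeline through the PDAL Python bindings - this is not Kart's actual code, and it assumes the pdal package is installed and a tile.laz file exists:

import json

import pdal  # PDAL's Python bindings, assumed available.

# "count": 0 stops readers.las before any individual points are read,
# so only header-level metadata is produced.
pipeline_spec = [
    {
        "type": "readers.las",
        "filename": "tile.laz",  # hypothetical path
        "count": 0,
    },
    {"type": "filters.info"},
]

pipeline = pdal.Pipeline(json.dumps(pipeline_spec))
pipeline.execute()

# A nested dict keyed by stage name (a JSON string in older bindings);
# the LAS header fields live under the readers.las entry.
print(pipeline.metadata)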
24 changes: 15 additions & 9 deletions kart/raster/metadata_util.py
@@ -131,16 +131,21 @@ def extract_raster_tile_metadata(raster_tile_path, oid_and_size=None):
     same dataset to be homogenous enough that the meta items format.json, schema.json and crs.wkt
     describe *all* of the tiles in that dataset. The "tile" field is where we keep all information
     that can be different for every tile in the dataset, which is why it must be stored in pointer files.
+
+    pc_tile_path - a pathlib.Path or a string containing the path to a file or an S3 url.
+    oid_and_size - a tuple (sha256_oid, filesize) if already known, to avoid repeated work.
     """
     from osgeo import gdal
 
-    gdal_path_str = str(raster_tile_path)
-    if gdal_path_str.startswith("s3://"):
-        gdal_path_str = gdal_path_str.replace("s3://", "/vsis3/")
-    metadata = gdal.Info(gdal_path_str, options=["-json", "-norat", "-noct"])
+    raster_tile_path = str(raster_tile_path)
+
+    gdal_path_spec = raster_tile_path
+    if gdal_path_spec.startswith("s3://"):
+        gdal_path_spec = gdal_path_spec.replace("s3://", "/vsis3/")
+    metadata = gdal.Info(gdal_path_spec, options=["-json", "-norat", "-noct"])
 
-    full_check = not gdal_path_str.startswith("/vsi")
-    warnings, errors, details = validate_cogtiff(gdal_path_str, full_check=full_check)
+    full_check = not gdal_path_spec.startswith("/vsi")
+    warnings, errors, details = validate_cogtiff(gdal_path_spec, full_check=full_check)
     is_cog = not errors
 
     format_json = {
@@ -160,7 +165,7 @@ def extract_raster_tile_metadata(raster_tile_path, oid_and_size=None):
         oid, size = get_hash_and_size_of_file(raster_tile_path)
 
     name = Path(raster_tile_path).name
-    url = str(raster_tile_path) if str(raster_tile_path).startswith("s3://") else None
+    url = raster_tile_path if raster_tile_path.startswith("s3://") else None
     # Keep tile info keys in alphabetical order, except oid and size should be last.
     tile_info = {
         "name": name,
@@ -240,11 +245,12 @@ def gdalinfo_band_to_kart_columnschema(gdalinfo_band):
 
 
 def _find_and_add_pam_info(raster_tile_path, raster_tile_metadata):
+    raster_tile_path = str(raster_tile_path)
     tile_info = raster_tile_metadata["tile"]
 
-    if str(raster_tile_path).startswith("s3://"):
+    if raster_tile_path.startswith("s3://"):
         try:
-            pam_url = str(raster_tile_path) + PAM_SUFFIX
+            pam_url = raster_tile_path + PAM_SUFFIX
             pam_path = fetch_from_s3(pam_url)
             raster_tile_metadata.update(extract_aux_xml_metadata(pam_path))
             pam_oid, pam_size = get_hash_and_size_of_file(pam_path)
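One detail worth noting in the change above: GDAL does not understand s3:// URLs natively - its S3 support goes through the /vsis3/ virtual filesystem, hence the prefix rewrite before gdal.Info is called. A standalone sketch of the same conversion, assuming the GDAL Python bindings are installed, AWS credentials are available in the environment, and a made-up tile URL:

from osgeo import gdal

gdal.UseExceptions()  # raise Python exceptions instead of returning None

url = "s3://example-bucket/tiles/tile.tif"  # hypothetical tile URL

# Rewrite the s3:// scheme to GDAL's /vsis3/ virtual filesystem prefix.
gdal_path_spec = url.replace("s3://", "/vsis3/") if url.startswith("s3://") else url

# With "-json" among the options, gdal.Info returns a parsed dict.
metadata = gdal.Info(gdal_path_spec, options=["-json", "-norat", "-noct"])
print(metadata["driverShortName"], metadata["size"])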
2 changes: 1 addition & 1 deletion kart/s3_util.py
@@ -173,7 +173,7 @@ def fetch_multiple_from_s3(s3_urls_and_paths, quiet=False):
     ) as executor:
         futures = [executor.submit(fetch_from_s3, *args) for args in s3_urls_and_paths]
         for future in concurrent.futures.as_completed(futures):
-            assert future.result() is not None
+            future.result()  # Raises any exception that occurred in the worker thread.
             p.update(1)
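The assert removal is more than cosmetic: assert statements are stripped entirely when Python runs with -O, and the real purpose of calling future.result() is that it re-raises any exception from the worker thread, so a failed download cannot pass silently. A small self-contained demonstration of that behaviour, with the failing download simulated:

import concurrent.futures


def fake_fetch(url):
    # Stand-in for fetch_from_s3: the second URL fails.
    if "missing" in url:
        raise RuntimeError(f"download failed: {url}")
    return url


urls = ["s3://bucket/a", "s3://bucket/missing"]

try:
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(fake_fetch, url) for url in urls]
        for future in concurrent.futures.as_completed(futures):
            # result() re-raises the worker's exception in the main thread;
            # an assert on the result would vanish under "python -O".
            future.result()
except RuntimeError as e:
    print("caught:", e)  # caught: download failed: s3://bucket/missing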
