From 263f32e3e32ff9628d04823a3497445c437315e3 Mon Sep 17 00:00:00 2001 From: Andrew Olsen Date: Mon, 4 Sep 2023 14:38:41 +1200 Subject: [PATCH 1/6] Adds an experimental byod-point-cloud-import command Imports the metadata from some PC tiles hosted on S3 without downloading them, except, right now it does download them, since a) PDAL can't extract metadata from S3 without downloading the whole tile anyway and b) boto3 is now configured to fetch from S3 during testing, but PDAL is not, and in the long run, we might not need it to anyway (see TODO below). Still TODO: use boto3 or similar to download certain ranges of a PC file, which we can then extract metadata from, without downloading all the points. (Or fix PDAL so that it can do this). --- kart/byod/importer.py | 84 +++++++++++++++++++ kart/byod/point_cloud_import.py | 132 ++++++++++++++++++++++++++++++ kart/cli.py | 1 + kart/point_cloud/metadata_util.py | 7 +- kart/s3_util.py | 84 +++++++++++++++++++ kart/tile/importer.py | 27 ++++-- tests/byod/test_imports.py | 69 ++++++++++++++++ tests/conftest.py | 25 ++++++ 8 files changed, 420 insertions(+), 9 deletions(-) create mode 100644 kart/byod/importer.py create mode 100644 kart/byod/point_cloud_import.py create mode 100644 kart/s3_util.py create mode 100644 tests/byod/test_imports.py diff --git a/kart/byod/importer.py b/kart/byod/importer.py new file mode 100644 index 00000000..c4f0feef --- /dev/null +++ b/kart/byod/importer.py @@ -0,0 +1,84 @@ +import logging + +import click + +from kart.fast_import import ( + write_blob_to_stream, +) +from kart.lfs_util import dict_to_pointer_file_bytes +from kart.progress_util import progress_bar +from kart.s3_util import expand_s3_glob + +L = logging.getLogger(__name__) + + +class ByodTileImporter: + """ + Subclassable logic for importing the metadata from tile-based datasets, + while leaving the data in-place on existing hosted storage. + """ + + EXTRACT_TILE_METADATA_STEP = "Fetching tile metadata" + + def sanity_check_sources(self, sources): + for source in sources: + if not source.startswith("s3://"): + raise click.UsageError(f"SOURCE {source} should be an S3 url") + + for source in list(sources): + if "*" in source: + sources.remove(source) + sources += expand_s3_glob(source) + + def extract_tile_metadata(self, tile_location): + raise NotImplementedError() + + def get_conversion_func(self, source_metadata): + return None + + def import_tiles_to_stream(self, stream, sources): + progress = progress_bar( + total=len(sources), unit="tile", desc="Writing tile metadata" + ) + with progress as p: + for source in sources: + tilename = self.DATASET_CLASS.tilename_from_path(source) + rel_blob_path = self.DATASET_CLASS.tilename_to_blob_path( + tilename, relative=True + ) + blob_path = f"{self.dataset_inner_path}/{rel_blob_path}" + + # Check if tile has already been imported previously: + if self.existing_dataset is not None: + existing_summary = self.existing_dataset.get_tile_summary( + tilename, missing_ok=True + ) + if existing_summary: + source_oid = self.source_to_hash_and_size[source][0] + if self.existing_tile_matches_source( + source_oid, existing_summary + ): + # This tile has already been imported before. Reuse it rather than re-importing it. + # Re-importing it could cause it to be re-converted, which is a waste of time, + # and it may not convert the same the second time, which is then a waste of space + # and shows up as a pointless diff. 
+ write_blob_to_stream( + stream, + blob_path, + (self.existing_dataset.inner_tree / rel_blob_path).data, + ) + self.include_existing_metadata = True + continue + + # Tile hasn't been imported previously. + pointer_data = dict_to_pointer_file_bytes( + self.source_to_metadata[source]["tile"] + ) + write_blob_to_stream(stream, blob_path, pointer_data) + + p.update(1) + + self.source_to_imported_metadata = self.source_to_metadata + + def prompt_for_convert_to_cloud_optimized(self): + return False diff --git a/kart/byod/point_cloud_import.py b/kart/byod/point_cloud_import.py new file mode 100644 index 00000000..6c9f7629 --- /dev/null +++ b/kart/byod/point_cloud_import.py @@ -0,0 +1,132 @@ +import logging + +import click + +from kart.byod.importer import ByodTileImporter +from kart.cli_util import StringFromFile, MutexOption, KartCommand +from kart.point_cloud.import_ import PointCloudImporter +from kart.point_cloud.metadata_util import extract_pc_tile_metadata +from kart.s3_util import get_hash_and_size_of_s3_object, fetch_from_s3 + + +L = logging.getLogger(__name__) + + +@click.command("byod-point-cloud-import", hidden=True, cls=KartCommand) +@click.pass_context +@click.option( + "--message", + "-m", + type=StringFromFile(encoding="utf-8"), + help="Commit message. By default this is auto-generated.", +) +@click.option( + "--checkout/--no-checkout", + "do_checkout", + is_flag=True, + default=True, + help="Whether to create a working copy once the import is finished, if no working copy exists yet.", +) +@click.option( + "--replace-existing", + is_flag=True, + cls=MutexOption, + exclusive_with=["--delete", "--update-existing"], + help="Replace existing dataset at the same path.", +) +@click.option( + "--update-existing", + is_flag=True, + cls=MutexOption, + exclusive_with=["--replace-existing"], + help=( + "Update existing dataset at the same path. " + "Tiles will be replaced by source tiles with the same name. " + "Tiles in the existing dataset which are not present in SOURCES will remain untouched." + ), +) +@click.option( + "--delete", + type=StringFromFile(encoding="utf-8"), + cls=MutexOption, + exclusive_with=["--replace-existing"], + multiple=True, + help=("Deletes the given tile. Can be used multiple times."), +) +@click.option( + "--amend", + default=False, + is_flag=True, + help="Amend the previous commit instead of adding a new commit", +) +@click.option( + "--allow-empty", + is_flag=True, + default=False, + help=( + "Usually recording a commit that has the exact same tree as its sole " + "parent commit is a mistake, and the command prevents you from making " + "such a commit. This option bypasses the safety" + ), +) +@click.option( + "--num-workers", + "--num-processes", + type=click.INT, + help="How many import workers to run in parallel. Defaults to the number of available CPU cores.", + default=None, + hidden=True, +) +@click.option("--dataset-path", "--dataset", help="The dataset's path once imported") +@click.argument( + "sources", + nargs=-1, + metavar="SOURCE [SOURCES...]", +) +def byod_point_cloud_import( + ctx, + message, + do_checkout, + replace_existing, + update_existing, + delete, + amend, + allow_empty, + num_workers, + dataset_path, + sources, +): + """ + Experimental. Import a dataset of point-cloud tiles from S3. Doesn't fetch the tiles, does store the tiles original location. + + SOURCES should be one or more LAZ or LAS files (or wildcards that match multiple LAZ or LAS files). 
+ """ + repo = ctx.obj.repo + + ByodPointCloudImporter(repo, ctx).import_tiles( + convert_to_cloud_optimized=False, + dataset_path=dataset_path, + message=message, + do_checkout=do_checkout, + replace_existing=replace_existing, + update_existing=update_existing, + delete=delete, + amend=amend, + allow_empty=allow_empty, + sources=list(sources), + num_workers=num_workers, + ) + + +class ByodPointCloudImporter(ByodTileImporter, PointCloudImporter): + def extract_tile_metadata(self, tile_location): + oid_and_size = get_hash_and_size_of_s3_object(tile_location) + # TODO - download only certain ranges of the file, and extract metadata from those. + tmp_downloaded_tile = fetch_from_s3(tile_location) + result = extract_pc_tile_metadata( + tmp_downloaded_tile, oid_and_size=oid_and_size + ) + tmp_downloaded_tile.unlink() + # TODO - format still not definite, we might not put the whole URL in here. + result["tile"]["url"] = tile_location + return result diff --git a/kart/cli.py b/kart/cli.py index 063dc6d9..4f2eda1f 100755 --- a/kart/cli.py +++ b/kart/cli.py @@ -54,6 +54,7 @@ "point_cloud.import_": {"point-cloud-import"}, "install": {"install"}, "add_dataset": {"add-dataset"}, + "byod.point_cloud_import": {"byod-point-cloud-import"}, } # These commands aren't valid Python symbols, even when we change dash to underscore. diff --git a/kart/point_cloud/metadata_util.py b/kart/point_cloud/metadata_util.py index fc036cd5..f97120fa 100644 --- a/kart/point_cloud/metadata_util.py +++ b/kart/point_cloud/metadata_util.py @@ -141,7 +141,7 @@ def get_native_extent(info): ) -def extract_pc_tile_metadata(pc_tile_path): +def extract_pc_tile_metadata(pc_tile_path, oid_and_size=None): """ Use pdal to get any and all point-cloud metadata we can make use of in Kart. This includes metadata that must be dataset-homogenous and would be stored in the dataset's /meta/ folder, @@ -192,7 +192,10 @@ def extract_pc_tile_metadata(pc_tile_path): } schema_json = pdal_schema_to_kart_schema(metadata["filters.info"]["schema"]) - oid, size = get_hash_and_size_of_file(pc_tile_path) + if oid_and_size: + oid, size = oid_and_size + else: + oid, size = get_hash_and_size_of_file(pc_tile_path) # Keep tile info keys in alphabetical order, except oid and size should be last. tile_info = { diff --git a/kart/s3_util.py b/kart/s3_util.py new file mode 100644 index 00000000..93d42e52 --- /dev/null +++ b/kart/s3_util.py @@ -0,0 +1,84 @@ +from base64 import standard_b64decode +import functools +import os +from pathlib import Path +import tempfile +from urllib.parse import urlparse + +import boto3 + +# Utility functions for dealing with S3 - not yet launched. + + +@functools.lru_cache(maxsize=1) +def get_s3_config(): + # TODO - add an option --s3-region to commands where it would be useful. + return None + + +@functools.lru_cache(maxsize=1) +def get_s3_client(): + return boto3.client("s3", config=get_s3_config()) + + +@functools.lru_cache(maxsize=1) +def get_s3_resource(): + return boto3.resource("s3", config=get_s3_config()) + + +@functools.lru_cache() +def get_bucket(name): + return get_s3_resource().Bucket(name) + + +def fetch_from_s3(s3_url, output_path=None): + """ + Downloads the object at s3_url to output_path. + If output-path is not set, creates a temporary file using tempfile.mkstemp() + """ + # TODO: handle failure. 
+ parsed = urlparse(s3_url) + bucket = get_bucket(parsed.netloc) + if output_path is None: + fd, path = tempfile.mkstemp() + # If we keep it open, boto3 won't be able to write to it (on Windows): + os.close(fd) + output_path = Path(path) + bucket.download_file(parsed.path.lstrip("/"), str(output_path.resolve())) + return output_path + + +def expand_s3_glob(source_spec): + """ + Given an s3_path with wildcard in, uses prefix and suffix matching to find all S3 objects that match. + """ + # TODO: handle any kind of failure, sanity check to make sure we don't match a million objects. + if "*" not in source_spec: + yield source_spec + return + else: + parsed = urlparse(source_spec) + bucket = get_bucket(parsed.netloc) + prefix, suffix = parsed.path.split("*", maxsplit=1) + prefix = prefix.lstrip("/") + matches = bucket.objects.filter(Prefix=prefix) + for match in matches: + if match.key.endswith(suffix): + yield f"s3://{match.bucket_name}/{match.key}" + + +def get_hash_and_size_of_s3_object(s3_url): + """Returns the (SHA256-hash-in-Base64, filesize) of an S3 object.""" + parsed = urlparse(s3_url) + bucket = parsed.netloc + key = parsed.path.lstrip("/") + response = get_s3_client().get_object_attributes( + Bucket=bucket, + Key=key, + ObjectAttributes=["Checksum", "ObjectSize"], + ) + # TODO - handle failure (eg missing SHA256 checksum), which is extremely likely. + sha256 = standard_b64decode(response["Checksum"]["ChecksumSHA256"]).hex() + size = response["ObjectSize"] + + return sha256, size diff --git a/kart/tile/importer.py b/kart/tile/importer.py index 75881527..8d5a4b66 100644 --- a/kart/tile/importer.py +++ b/kart/tile/importer.py @@ -4,6 +4,7 @@ import math import os from pathlib import Path +import re import sys import uuid @@ -66,6 +67,8 @@ def __init__(self, repo, ctx): # even though it's not really relevant to tile imports. assert self.repo.table_dataset_version in SUPPORTED_VERSIONS + EXTRACT_TILE_METADATA_STEP = "Checking tiles" + def import_tiles( self, *, @@ -149,9 +152,7 @@ def import_tiles( # we're importing (or even a subset of that dataset). 
But this'll do for now self.repo.working_copy.workdir.check_not_dirty() - for source in sources: - if not (Path() / source).is_file(): - raise NotFound(f"No data found at {source}", exit_code=NO_IMPORT_SOURCE) + self.sanity_check_sources(sources) self.existing_dataset = self.get_existing_dataset() self.existing_metadata = ( @@ -180,7 +181,7 @@ def import_tiles( ) progress = progress_bar( - total=len(sources), unit="tile", desc="Checking tiles" + total=len(sources), unit="tile", desc=self.EXTRACT_TILE_METADATA_STEP ) with progress as p: for source, tile_metadata in self.extract_multiple_tiles_metadata( @@ -201,7 +202,7 @@ def import_tiles( all_metadata = list(self.source_to_metadata.values()) merged_source_metadata = self.get_merged_source_metadata(all_metadata) self.check_for_non_homogenous_metadata( - merged_source_metadata, future_tense=0 + merged_source_metadata, future_tense=False ) if self.include_existing_metadata: all_metadata.append(self.existing_metadata) @@ -323,8 +324,6 @@ def import_tiles( def infer_dataset_path(self, sources): """Given a list of sources to import, choose a reasonable name for the dataset.""" - if len(sources) == 1: - return self.DATASET_CLASS.remove_tile_extension(Path(sources[0]).name) names = set() parent_names = set() for source in sources: @@ -344,6 +343,20 @@ def _common_prefix(self, collection, min_length=4): return None return prefix + URI_PATTERN = re.compile(r"([A-Za-z0-9-]{,20})://") + + def sanity_check_sources(self, sources): + for source in sources: + m = self.URI_PATTERN.match(source) + if m: + raise click.UsageError( + f"SOURCE {source} should be a path to a file, not a {m.group(1)} URI" + ) + + for source in sources: + if not (Path() / source).is_file(): + raise NotFound(f"No data found at {source}", exit_code=NO_IMPORT_SOURCE) + def get_default_message(self): """Return a default commit message to describe this import.""" raise NotImplementedError() diff --git a/tests/byod/test_imports.py b/tests/byod/test_imports.py new file mode 100644 index 00000000..698ad245 --- /dev/null +++ b/tests/byod/test_imports.py @@ -0,0 +1,69 @@ +import json +import os + +import pytest + + +@pytest.mark.slow +def test_byod_point_cloud_import( + tmp_path, + chdir, + cli_runner, + s3_test_data_point_clouds, +): + # Using postgres here because it has the best type preservation + repo_path = tmp_path / "point-cloud-repo" + r = cli_runner.invoke(["init", repo_path]) + assert r.exit_code == 0 + + with chdir(repo_path): + r = cli_runner.invoke( + [ + "byod-point-cloud-import", + s3_test_data_point_clouds, + "--dataset-path=auckland", + ] + ) + assert r.exit_code == 0, r.stderr + + r = cli_runner.invoke(["data", "ls"]) + assert r.exit_code == 0, r.stderr + assert r.stdout.splitlines() == ["auckland"] + + r = cli_runner.invoke(["show", "-o", "json"]) + assert r.exit_code == 0, r.stderr + output = json.loads(r.stdout) + auckland = output["kart.diff/v1+hexwkb"]["auckland"] + assert auckland["meta"]["schema.json"]["+"] == [ + {"name": "X", "dataType": "float", "size": 64}, + {"name": "Y", "dataType": "float", "size": 64}, + {"name": "Z", "dataType": "float", "size": 64}, + {"name": "Intensity", "dataType": "integer", "size": 16}, + {"name": "ReturnNumber", "dataType": "integer", "size": 8}, + {"name": "NumberOfReturns", "dataType": "integer", "size": 8}, + {"name": "ScanDirectionFlag", "dataType": "integer", "size": 8}, + {"name": "EdgeOfFlightLine", "dataType": "integer", "size": 8}, + {"name": "Classification", "dataType": "integer", "size": 8}, + {"name": "ScanAngleRank", 
"dataType": "float", "size": 32}, + {"name": "UserData", "dataType": "integer", "size": 8}, + {"name": "PointSourceId", "dataType": "integer", "size": 16}, + {"name": "GpsTime", "dataType": "float", "size": 64}, + {"name": "Red", "dataType": "integer", "size": 16}, + {"name": "Green", "dataType": "integer", "size": 16}, + {"name": "Blue", "dataType": "integer", "size": 16}, + ] + + tile_0_url = os.path.join( + s3_test_data_point_clouds.split("*")[0], "auckland_0_0.laz" + ) + + assert auckland["tile"][0]["+"] == { + "name": "auckland_0_0.laz", + "crs84Extent": "POLYGON((174.7384483 -36.8512371,174.7382443 -36.8422277,174.7494540 -36.8420632,174.7496594 -36.8510726,174.7384483 -36.8512371))", + "format": "laz-1.2", + "nativeExtent": "1754987.85,1755987.77,5920219.76,5921219.64,-1.66,99.83", + "pointCount": 4231, + "url": tile_0_url, + "oid": "sha256:6b980ce4d7f4978afd3b01e39670e2071a792fba441aca45be69be81cb48b08c", + "size": 51489, + } diff --git a/tests/conftest.py b/tests/conftest.py index a7d1dbd0..99d1470f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1166,6 +1166,31 @@ def ctx(create=False): return ctx +@pytest.fixture() +def s3_test_data_point_clouds(monkeypatch_session): + """ + You can run tests that fetch a copy of the auckland test data from S3 (and so test Kart's S3 behaviour) + by setting KART_S3_TEST_DATA_POINT_CLOUDS=s3://some-bucket/path-to-auckland-tiles/*.laz + The tiles hosted there should be the ones found in tests/data/point-cloud/laz-auckland.tgz + """ + if "KART_S3_TEST_DATA_POINT_CLOUDS" not in os.environ: + raise pytest.skip( + "S3 tests require configuration - read docstring at conftest.s3_test_data_point_clouds" + ) + + # $HOME isn't the user's real homedir during tests - look for AWS_CONFIG_FILE in the real homedir, + # unless AWS_CONFIG_FILE is already set to look somewhere else. Same for AWS_SHARED_CREDENTIALS_FILE. 
+ for var in ("AWS_CONFIG_FILE", "AWS_SHARED_CREDENTIALS_FILE"): + if var not in os.environ: + orig_home = os.path.expanduser("~" + os.getenv("USER", "")) + filename = var.split("_")[-2].lower() + config_path = os.path.join(orig_home, ".aws", filename) + if os.path.exists(config_path): + monkeypatch_session.setenv(var, config_path) + + return os.environ["KART_S3_TEST_DATA_POINT_CLOUDS"] + + @pytest.fixture() def dodgy_restore(cli_runner): """ From 88b8fad945e718c9d53d01a28768976a4983abb5 Mon Sep 17 00:00:00 2001 From: Andrew Olsen Date: Thu, 7 Sep 2023 14:32:04 +1200 Subject: [PATCH 2/6] Add boto3 to requirements --- requirements/dev.txt | 1 + requirements/docs.txt | 7 +++++-- requirements/requirements.in | 1 + requirements/requirements.txt | 20 +++++++++++++++++++- requirements/test.txt | 1 + 5 files changed, 27 insertions(+), 3 deletions(-) diff --git a/requirements/dev.txt b/requirements/dev.txt index 578657be..25ce47b8 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -55,6 +55,7 @@ pygments==2.13.0 six==1.16.0 # via # -c docs.txt + # -c requirements.txt # -c test.txt # asttokens stack-data==0.6.2 diff --git a/requirements/docs.txt b/requirements/docs.txt index e0de61fb..b1e2501f 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -47,6 +47,7 @@ requests==2.31.0 # via sphinx six==1.16.0 # via + # -c requirements.txt # -c test.txt # livereload snowballstemmer==2.2.0 @@ -73,5 +74,7 @@ sphinxcontrib-serializinghtml==1.1.5 # via sphinx tornado==6.2 # via livereload -urllib3==1.26.13 - # via requests +urllib3==1.26.16 + # via + # -c requirements.txt + # requests diff --git a/requirements/requirements.in b/requirements/requirements.in index e01e61ab..a272faf7 100644 --- a/requirements/requirements.in +++ b/requirements/requirements.in @@ -1,3 +1,4 @@ +boto3 certifi click~=8.1 docutils<0.18 diff --git a/requirements/requirements.txt b/requirements/requirements.txt index dbacde23..41519dec 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -6,6 +6,12 @@ # attrs==22.1.0 # via jsonschema +boto3==1.28.42 + # via -r requirements.in +botocore==1.31.42 + # via + # boto3 + # s3transfer certifi==2022.12.7 # via -r requirements.in #cffi==1.15.1 @@ -16,7 +22,7 @@ certifi==2022.12.7 # reflink click==8.1.4 # via -r requirements.in -#cryptography==41.0.0 +#cryptography==41.0.3 # via -r vendor-wheels.txt docutils==0.17.1 # via @@ -26,6 +32,10 @@ docutils==0.17.1 # via -r vendor-wheels.txt greenlet==2.0.1 # via sqlalchemy +jmespath==1.0.1 + # via + # boto3 + # botocore jsonschema==4.17.3 # via -r requirements.in msgpack==0.6.2 @@ -48,13 +58,21 @@ pyrsistent==0.19.2 # via jsonschema #pysqlite3==0.4.5 # via -r vendor-wheels.txt +python-dateutil==2.8.2 + # via botocore #reflink==0.2.1 # via -r vendor-wheels.txt rst2txt==1.1.0 # via -r requirements.in +s3transfer==0.6.2 + # via boto3 shellingham==1.5.0 # via -r requirements.in +six==1.16.0 + # via python-dateutil sqlalchemy==1.4.45 # via -r requirements.in tqdm==4.64.1 # via -r requirements.in +urllib3==1.26.16 + # via botocore diff --git a/requirements/test.txt b/requirements/test.txt index 0b7159b1..ca2ba219 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -65,6 +65,7 @@ pytest-xdist==3.1.0 # via -r test.in six==1.16.0 # via + # -c requirements.txt # html5lib # pytest-profiling termcolor==2.1.1 From b267d1035c6d932c1c04b2f5b7d9421769631e68 Mon Sep 17 00:00:00 2001 From: Andrew Olsen Date: Fri, 8 Sep 2023 14:01:08 +1200 Subject: [PATCH 3/6] Add byod as python package --- kart.spec | 1 + 
kart/byod/__init__.py | 0 2 files changed, 1 insertion(+) create mode 100644 kart/byod/__init__.py diff --git a/kart.spec b/kart.spec index 8dbae594..4b538c45 100644 --- a/kart.spec +++ b/kart.spec @@ -157,6 +157,7 @@ pyi_analysis = Analysis( # TODO: improve this somehow *collect_submodules('kart'), *collect_submodules('kart.annotations'), + *collect_submodules('kart.byod'), *collect_submodules('kart.lfs_commands'), *collect_submodules('kart.point_cloud'), *collect_submodules('kart.sqlalchemy'), diff --git a/kart/byod/__init__.py b/kart/byod/__init__.py new file mode 100644 index 00000000..e69de29b From 71452918ea59119098a5201e5a19f3fba1f4ca74 Mon Sep 17 00:00:00 2001 From: Andrew Olsen Date: Fri, 8 Sep 2023 14:48:03 +1200 Subject: [PATCH 4/6] experimental byod-import: address review comments --- .github/workflows/build.yml | 4 ++++ kart/s3_util.py | 43 ++++++++++++++++++++++++++----------- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 8605b406..da85ed6c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -31,6 +31,7 @@ jobs: KART_POSTGIS_URL: "postgresql://postgres:@localhost:5432/postgres" KART_SQLSERVER_URL: "mssql://sa:PassWord1@localhost:1433/master" KART_MYSQL_URL: "mysql://root:PassWord1@localhost:3306" + KART_S3_TEST_DATA_POINT_CLOUDS: "s3://kart-bring-your-own-data-poc/auckland-small-laz1.2/*.laz" services: postgis: @@ -268,6 +269,7 @@ jobs: KART_POSTGIS_URL: "postgresql://postgres:@postgis:5432/postgres" SQLSERVER_URL: "mssql://sa:PassWord1@sqlserver:1433/master?TrustServerCertificate=yes" KART_MYSQL_URL: "mysql://root:PassWord1@mysql:3306" + KART_S3_TEST_DATA_POINT_CLOUDS: "s3://kart-bring-your-own-data-poc/auckland-small-laz1.2/*.laz" ARCH: "${{ matrix.os.arch }}" ARCH_TRIPLET: "${{ matrix.os.arch_triplet }}" GITHUB_WORKSPACE: "/__w/kart/kart" # FIXME? @@ -739,6 +741,7 @@ jobs: MACOS_NOTARIZE_KEYCHAIN_PROFILE: "NOTARIZE_AUTH" # X.Y version needs to match PY_VER: PY_VER_INSTALLER: "https://www.python.org/ftp/python/3.10.9/python-3.10.9-macos11.pkg" + KART_S3_TEST_DATA_POINT_CLOUDS: "s3://kart-bring-your-own-data-poc/auckland-small-laz1.2/*.laz" steps: - uses: actions/checkout@v3 @@ -983,6 +986,7 @@ jobs: env: NINJA_VERSION: "~1.10.2" # workaround for python logging output buffering noise SIGN_AZURE_CERTIFICATE: ${{ secrets.WIN_SIGN_AZURE_CERTIFICATE }} + KART_S3_TEST_DATA_POINT_CLOUDS: "s3://kart-bring-your-own-data-poc/auckland-small-laz1.2/*.laz" # We want to run on external PRs, but not on our own internal PRs as they'll be run # by the push to the branch. diff --git a/kart/s3_util.py b/kart/s3_util.py index 93d42e52..a255725f 100644 --- a/kart/s3_util.py +++ b/kart/s3_util.py @@ -6,6 +6,9 @@ from urllib.parse import urlparse import boto3 +import click + +from kart.exceptions import NotFound, NO_IMPORT_SOURCE # Utility functions for dealing with S3 - not yet launched. @@ -50,21 +53,37 @@ def fetch_from_s3(s3_url, output_path=None): def expand_s3_glob(source_spec): """ - Given an s3_path with wildcard in, uses prefix and suffix matching to find all S3 objects that match. + Given an s3_path with '*' wildcard in, uses prefix and suffix matching to find all S3 objects that match. 
+ Subdirectories (or the S3 equivalent - S3 is not exactly a directory hierarchy) are not matched - + that is, s3://bucket/path/*.txt matches s3://bucket/path/example.txt but not s3://bucket/path/subpath/example.txt """ # TODO: handle any kind of failure, sanity check to make sure we don't match a million objects. if "*" not in source_spec: - yield source_spec - return - else: - parsed = urlparse(source_spec) - bucket = get_bucket(parsed.netloc) - prefix, suffix = parsed.path.split("*", maxsplit=1) - prefix = prefix.lstrip("/") - matches = bucket.objects.filter(Prefix=prefix) - for match in matches: - if match.key.endswith(suffix): - yield f"s3://{match.bucket_name}/{match.key}" + return [source_spec] + + parsed = urlparse(source_spec) + prefix, suffix = parsed.path.split("*", maxsplit=1) + if "*" in suffix: + raise click.UsageError( + f"Two wildcards '*' found in {source_spec} - only one wildcard is supported" + ) + prefix = prefix.lstrip("/") + prefix_len = len(prefix) + + bucket = get_bucket(parsed.netloc) + matches = bucket.objects.filter(Prefix=prefix) + result = [] + for match in matches: + assert match.key.startswith(prefix) + match_suffix = match.key[prefix_len:] + if match_suffix.endswith(suffix) and "/" not in match_suffix: + result.append(f"s3://{match.bucket_name}/{match.key}") + + if not result: + raise NotFound( + f"No S3 objects found at {source_spec}", exit_code=NO_IMPORT_SOURCE + ) + return result def get_hash_and_size_of_s3_object(s3_url): From 7f339313d88cbf4a04d9ae8520d81ee994c16269 Mon Sep 17 00:00:00 2001 From: Andrew Olsen Date: Mon, 18 Sep 2023 11:35:19 +1200 Subject: [PATCH 5/6] Fix test config to work for arbiter as well as boto3 --- kart/byod/point_cloud_import.py | 6 +----- tests/conftest.py | 32 ++++++++++++++++++++++++-------- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/kart/byod/point_cloud_import.py b/kart/byod/point_cloud_import.py index 6c9f7629..be1b32a3 100644 --- a/kart/byod/point_cloud_import.py +++ b/kart/byod/point_cloud_import.py @@ -122,11 +122,7 @@ class ByodPointCloudImporter(ByodTileImporter, PointCloudImporter): def extract_tile_metadata(self, tile_location): oid_and_size = get_hash_and_size_of_s3_object(tile_location) # TODO - download only certain ranges of the file, and extract metadata from those. - tmp_downloaded_tile = fetch_from_s3(tile_location) - result = extract_pc_tile_metadata( - tmp_downloaded_tile, oid_and_size=oid_and_size - ) - tmp_downloaded_tile.unlink() + result = extract_pc_tile_metadata(tile_location, oid_and_size=oid_and_size) # TODO - format still not definite, we might not put the whole URL in here. result["tile"]["url"] = tile_location return result diff --git a/tests/conftest.py b/tests/conftest.py index 99d1470f..40fdb12c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1166,6 +1166,18 @@ def ctx(create=False): return ctx +USER = os.getenv("USER", "") + +DOT_AWS_FILES = { + os.path.join(f"~{USER}", ".aws", "config"): ["AWS_CONFIG_FILE"], + os.path.join(f"~{USER}", ".aws", "credentials"): [ + # Either one of these can inform boto3 where to look, but Arbiter only respects the first. + "AWS_CREDENTIAL_FILE", + "AWS_SHARED_CREDENTIALS_FILE", + ], +} + + @pytest.fixture() def s3_test_data_point_clouds(monkeypatch_session): """ @@ -1179,14 +1191,18 @@ def s3_test_data_point_clouds(monkeypatch_session): ) # $HOME isn't the user's real homedir during tests - look for AWS_CONFIG_FILE in the real homedir, - # unless AWS_CONFIG_FILE is already set to look somewhere else. 
Same for AWS_SHARED_CREDENTIALS_FILE. - for var in ("AWS_CONFIG_FILE", "AWS_SHARED_CREDENTIALS_FILE"): - if var not in os.environ: - orig_home = os.path.expanduser("~" + os.getenv("USER", "")) - filename = var.split("_")[-2].lower() - config_path = os.path.join(orig_home, ".aws", filename) - if os.path.exists(config_path): - monkeypatch_session.setenv(var, config_path) + # unless AWS_CONFIG_FILE is already set to look somewhere else. Same for AWS_CREDENTIAL_FILE. + for path, env_vars in DOT_AWS_FILES.items(): + val = any(os.environ.get(k) for k in env_vars) + if not val: + path = os.path.expanduser(path) + if os.path.exists(path): + val = path + if not val: + continue + for k in env_vars: + if k not in os.environ: + os.environ[k] = val return os.environ["KART_S3_TEST_DATA_POINT_CLOUDS"] From 2195933496654e0e34b95e63c87f54e3e6941e54 Mon Sep 17 00:00:00 2001 From: Andrew Olsen Date: Tue, 19 Sep 2023 16:10:11 +1200 Subject: [PATCH 6/6] Fix S3 tests --- .github/workflows/build.yml | 6 ++---- kart/byod/point_cloud_import.py | 6 +++++- kart/s3_util.py | 21 ++++++++++++--------- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index da85ed6c..036a545c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -7,6 +7,8 @@ env: PY_VER: "3.10" CMAKE_VERSION: "~3.25.0" FORCE_COLOR: "YES" + KART_S3_TEST_DATA_POINT_CLOUDS: "s3://kart-bring-your-own-data-poc/auckland-small-laz1.2/*.laz" + AWS_NO_SIGN_REQUEST: "1" jobs: # @@ -31,7 +33,6 @@ jobs: KART_POSTGIS_URL: "postgresql://postgres:@localhost:5432/postgres" KART_SQLSERVER_URL: "mssql://sa:PassWord1@localhost:1433/master" KART_MYSQL_URL: "mysql://root:PassWord1@localhost:3306" - KART_S3_TEST_DATA_POINT_CLOUDS: "s3://kart-bring-your-own-data-poc/auckland-small-laz1.2/*.laz" services: postgis: @@ -269,7 +270,6 @@ jobs: KART_POSTGIS_URL: "postgresql://postgres:@postgis:5432/postgres" SQLSERVER_URL: "mssql://sa:PassWord1@sqlserver:1433/master?TrustServerCertificate=yes" KART_MYSQL_URL: "mysql://root:PassWord1@mysql:3306" - KART_S3_TEST_DATA_POINT_CLOUDS: "s3://kart-bring-your-own-data-poc/auckland-small-laz1.2/*.laz" ARCH: "${{ matrix.os.arch }}" ARCH_TRIPLET: "${{ matrix.os.arch_triplet }}" GITHUB_WORKSPACE: "/__w/kart/kart" # FIXME? @@ -741,7 +741,6 @@ jobs: MACOS_NOTARIZE_KEYCHAIN_PROFILE: "NOTARIZE_AUTH" # X.Y version needs to match PY_VER: PY_VER_INSTALLER: "https://www.python.org/ftp/python/3.10.9/python-3.10.9-macos11.pkg" - KART_S3_TEST_DATA_POINT_CLOUDS: "s3://kart-bring-your-own-data-poc/auckland-small-laz1.2/*.laz" steps: - uses: actions/checkout@v3 @@ -986,7 +985,6 @@ jobs: env: NINJA_VERSION: "~1.10.2" # workaround for python logging output buffering noise SIGN_AZURE_CERTIFICATE: ${{ secrets.WIN_SIGN_AZURE_CERTIFICATE }} - KART_S3_TEST_DATA_POINT_CLOUDS: "s3://kart-bring-your-own-data-poc/auckland-small-laz1.2/*.laz" # We want to run on external PRs, but not on our own internal PRs as they'll be run # by the push to the branch. diff --git a/kart/byod/point_cloud_import.py b/kart/byod/point_cloud_import.py index be1b32a3..6c9f7629 100644 --- a/kart/byod/point_cloud_import.py +++ b/kart/byod/point_cloud_import.py @@ -122,7 +122,11 @@ class ByodPointCloudImporter(ByodTileImporter, PointCloudImporter): def extract_tile_metadata(self, tile_location): oid_and_size = get_hash_and_size_of_s3_object(tile_location) # TODO - download only certain ranges of the file, and extract metadata from those. 
- result = extract_pc_tile_metadata(tile_location, oid_and_size=oid_and_size) + tmp_downloaded_tile = fetch_from_s3(tile_location) + result = extract_pc_tile_metadata( + tmp_downloaded_tile, oid_and_size=oid_and_size + ) + tmp_downloaded_tile.unlink() # TODO - format still not definite, we might not put the whole URL in here. result["tile"]["url"] = tile_location return result diff --git a/kart/s3_util.py b/kart/s3_util.py index a255725f..f9d94279 100644 --- a/kart/s3_util.py +++ b/kart/s3_util.py @@ -21,12 +21,18 @@ def get_s3_config(): @functools.lru_cache(maxsize=1) def get_s3_client(): - return boto3.client("s3", config=get_s3_config()) + client = boto3.client("s3", config=get_s3_config()) + if "AWS_NO_SIGN_REQUEST" in os.environ: + client._request_signer.sign = lambda *args, **kwargs: None + return client @functools.lru_cache(maxsize=1) def get_s3_resource(): - return boto3.resource("s3", config=get_s3_config()) + resource = boto3.resource("s3", config=get_s3_config()) + if "AWS_NO_SIGN_REQUEST" in os.environ: + resource.meta.client._request_signer.sign = lambda *args, **kwargs: None + return resource @functools.lru_cache() @@ -91,13 +97,10 @@ def get_hash_and_size_of_s3_object(s3_url): parsed = urlparse(s3_url) bucket = parsed.netloc key = parsed.path.lstrip("/") - response = get_s3_client().get_object_attributes( - Bucket=bucket, - Key=key, - ObjectAttributes=["Checksum", "ObjectSize"], + response = get_s3_client().head_object( + Bucket=bucket, Key=key, ChecksumMode="ENABLED" ) # TODO - handle failure (eg missing SHA256 checksum), which is extremely likely. - sha256 = standard_b64decode(response["Checksum"]["ChecksumSHA256"]).hex() - size = response["ObjectSize"] - + sha256 = standard_b64decode(response["ChecksumSHA256"]).hex() + size = response["ContentLength"] return sha256, size
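
A note on the ranged-read TODO (from the first commit message and extract_tile_metadata): S3 honours the HTTP Range header through boto3's get_object(), so the public header block of a LAZ/LAS tile could be fetched without pulling down the point data. The sketch below only illustrates that call and is not something these patches implement; the helper name and bucket/key are made up, 375 bytes is the size of the largest LAS public header block (LAS 1.4; LAS 1.2 uses 227), and the fetched bytes would still need a reader other than stock PDAL, which currently wants the whole file.

    from urllib.parse import urlparse

    import boto3


    def fetch_s3_byte_range(s3_url, start, length):
        """Fetch only bytes [start, start + length) of an S3 object."""
        parsed = urlparse(s3_url)
        response = boto3.client("s3").get_object(
            Bucket=parsed.netloc,
            Key=parsed.path.lstrip("/"),
            # HTTP Range is inclusive of both endpoints.
            Range=f"bytes={start}-{start + length - 1}",
        )
        return response["Body"].read()


    # Hypothetical usage: grab just the LAS public header block of one tile.
    header = fetch_s3_byte_range("s3://example-bucket/tiles/auckland_0_0.laz", 0, 375)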
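
A note on the other TODO, the "extremely likely" missing SHA256 checksum in get_hash_and_size_of_s3_object: head_object() only returns ChecksumSHA256 when the object was uploaded with that checksum enabled, and objects uploaded via multipart upload store a composite checksum rather than a SHA-256 of the full content, so a fallback of hashing the object ourselves is probably unavoidable. A minimal sketch, assuming it sits in kart/s3_util.py next to the existing helpers (get_s3_client, urlparse and standard_b64decode are already imported there); detecting the composite-checksum case is left out:

    import hashlib


    def get_hash_and_size_with_fallback(s3_url):
        """Like get_hash_and_size_of_s3_object, but hashes the object if S3 has no SHA256 for it."""
        parsed = urlparse(s3_url)
        bucket, key = parsed.netloc, parsed.path.lstrip("/")
        client = get_s3_client()
        response = client.head_object(Bucket=bucket, Key=key, ChecksumMode="ENABLED")
        size = response["ContentLength"]
        checksum = response.get("ChecksumSHA256")
        if checksum:
            return standard_b64decode(checksum).hex(), size
        # No stored SHA-256: stream the object and hash it ourselves.
        body = client.get_object(Bucket=bucket, Key=key)["Body"]
        sha256 = hashlib.sha256()
        for chunk in iter(lambda: body.read(1 << 20), b""):
            sha256.update(chunk)
        return sha256.hexdigest(), size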