
feat: create collection TDE-453 (#177)
* feat: create add items to collection from json files

* fix: calm mypy

* fix: formatting

* feat: change arguments

based on being run after flatten

* fix: use paginator

* fix: format
MDavidson17 authored Oct 25, 2022
1 parent 0229df8 commit 20f5f34
Showing 6 changed files with 105 additions and 30 deletions.
69 changes: 69 additions & 0 deletions scripts/collection_from_items.py
@@ -0,0 +1,69 @@
import argparse
import json
import os

from boto3 import client
from linz_logger import get_log

from scripts.files.files_helper import is_json
from scripts.files.fs import read, write
from scripts.files.fs_s3 import bucket_name_from_path, prefix_from_path
from scripts.stac.imagery.collection import ImageryCollection


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--uri", dest="uri", help="s3 path to items and collection.json write location", required=True)
    parser.add_argument("--collection_id", dest="collection_id", required=True)
    parser.add_argument("--title", dest="title", help="collection title", required=True)
    parser.add_argument("--description", dest="description", help="collection description", required=True)

    arguments = parser.parse_args()

    uri = arguments.uri
    collection = ImageryCollection(
        title=arguments.title, description=arguments.description, collection_id=arguments.collection_id
    )

    if not uri.startswith("s3://"):
        msg = f"uri is not a s3 path: {uri}"
        raise argparse.ArgumentTypeError(msg)

    s3_client = client("s3")

    paginator = s3_client.get_paginator("list_objects_v2")
    response_iterator = paginator.paginate(Bucket=bucket_name_from_path(uri), Prefix=prefix_from_path(uri))
    for response in response_iterator:
        for contents_data in response["Contents"]:
            key = contents_data["Key"]

            file = os.path.join(f"s3://{bucket_name_from_path(uri)}", key)

            if not is_json(file):
                get_log().info("skipping file as not json", file=file, action="collection_from_items", reason="skip")
                continue

            item_stac = json.loads(read(file).decode("utf-8"))

            if not arguments.collection_id == item_stac["collection"]:
                get_log().info(
                    "skipping file as item.collection does not match collection_id",
                    file=file,
                    action="collection_from_items",
                    reason="skip",
                )
                continue

            collection.add_item(item_stac)
            get_log().info("item added to collection", item=item_stac["id"], file=file)

    valid_item_count = [dictionary["rel"] for dictionary in collection.stac["links"]].count("item")
    get_log().info("All valid items added to collection", valid_item_count=valid_item_count)

    destination = os.path.join(uri, "collection.json")
    write(destination, json.dumps(collection.stac).encode("utf-8"))
    get_log().info("collection written", destination=destination)


if __name__ == "__main__":
    main()
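
A note on the paginator used above: a single list_objects_v2 call returns at most 1,000 keys, so looping over one response would silently miss items in larger buckets. A minimal sketch of the pattern, with a made-up bucket and prefix:

from boto3 import client

s3_client = client("s3")
paginator = s3_client.get_paginator("list_objects_v2")

# "example-bucket" and "flattened/" are placeholders, not values from this repo
for page in paginator.paginate(Bucket="example-bucket", Prefix="flattened/"):
    # "Contents" is omitted from empty pages, hence the defensive default
    for obj in page.get("Contents", []):
        print(obj["Key"])

The script above indexes response["Contents"] directly, which assumes every page lists at least one key; the .get() form in this sketch is a defensive variant, not what the commit does.
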
26 changes: 13 additions & 13 deletions scripts/create_stac.py
@@ -19,41 +19,40 @@ def main() -> None:
     parser = argparse.ArgumentParser()
     parser.add_argument("--source", dest="source", nargs="+", required=True)
     parser.add_argument("--collection_id", dest="collection_id", help="Unique id for collection", required=False)
-    parser.add_argument("--title", dest="title", help="collection title", required=True)
-    parser.add_argument("--description", dest="description", help="collection description", required=True)
     parser.add_argument(
         "--start_datetime", dest="start_datetime", help="start datetime in format YYYY-MM-DD", type=valid_date, required=True
     )
     parser.add_argument(
         "--end_datetime", dest="end_datetime", help="end datetime in format YYYY-MM-DD", type=valid_date, required=True
     )
+    parser.add_argument("--title", dest="title", help="collection title", required=True)
+    parser.add_argument("--description", dest="description", help="collection description", required=True)
 
     arguments = parser.parse_args()
 
     source = format_source(arguments.source)
+    title = arguments.title
+    description = arguments.description
+    collection_id = arguments.collection_id
+    start_datetime = format_date(arguments.start_datetime)
+    end_datetime = format_date(arguments.end_datetime)
 
     if arguments.collection_id:
-        collection = ImageryCollection(title=arguments.title, description=arguments.description, collection_id=arguments.ulid)
+        collection = ImageryCollection(title=title, description=description, collection_id=collection_id)
     else:
-        collection = ImageryCollection(title=arguments.title, description=arguments.description)
+        collection = ImageryCollection(title=title, description=description)
 
     for file in source:
         if not is_tiff(file):
             get_log().trace("file_not_tiff_skipped", file=file)
             continue
         gdalinfo_result = gdal_info(file)
-        item = create_item(
-            file,
-            format_date(arguments.start_datetime),
-            format_date(arguments.end_datetime),
-            arguments.collection_id,
-            gdalinfo_result,
-        )
+        item = create_item(file, start_datetime, end_datetime, collection_id, gdalinfo_result)
         tmp_file_path = os.path.join("/tmp/", f"{item.stac['id']}.json")
         write(tmp_file_path, json.dumps(item.stac).encode("utf-8"))
-        get_log().info("imagery_stac_item_created", file=file)
+        get_log().info("stac item written to tmp", location=tmp_file_path)
 
-        collection.add_item(item)
+        collection.add_item(item.stac)
 
     tmp_file_path = os.path.join("/tmp/", "collection.json")
     write(tmp_file_path, json.dumps(collection.stac).encode("utf-8"))
@@ -78,6 +77,7 @@ def create_item(
     item.update_spatial(geometry, bbox)
     item.add_collection(collection_id)
 
+    get_log().info("imagery stac item created", file=file)
     return item


10 changes: 10 additions & 0 deletions scripts/files/fs_s3.py
@@ -65,3 +65,13 @@ def read(path: str, needs_credentials: bool = False) -> bytes:
 
     get_log().debug("read_s3_success", path=path, duration=time_in_ms() - start_time)
     return file
+
+
+def bucket_name_from_path(path: str) -> str:
+    path_parts = path.replace("s3://", "").split("/")
+    return path_parts.pop(0)
+
+
+def prefix_from_path(path: str) -> str:
+    bucket_name = bucket_name_from_path(path)
+    return path.replace(f"s3://{bucket_name}/", "")
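
For illustration, the two new helpers split an s3:// URI into the bucket name and key prefix that the list_objects_v2 paginator expects. The path below is hypothetical:

from scripts.files.fs_s3 import bucket_name_from_path, prefix_from_path

path = "s3://example-bucket/flattened/item.json"  # made-up path for illustration
assert bucket_name_from_path(path) == "example-bucket"
assert prefix_from_path(path) == "flattened/item.json"

Since prefix_from_path only strips a leading "s3://<bucket>/", a path without the scheme would pass through unchanged rather than raise.
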
14 changes: 6 additions & 8 deletions scripts/stac/imagery/collection.py
@@ -1,13 +1,10 @@
 from datetime import datetime
-from typing import TYPE_CHECKING, Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional
 
 import ulid
 
 from scripts.stac.util.STAC_VERSION import STAC_VERSION
 
-if TYPE_CHECKING:
-    from scripts.stac.imagery.item import ImageryItem
-
 
 class ImageryCollection:
     stac: Dict[str, Any]
@@ -26,12 +23,13 @@ def __init__(self, title: str, description: str, collection_id: Optional[str] =
             "links": [{"rel": "self", "href": "./collection.json", "type": "application/json"}],
         }
 
-    def add_item(self, item: "ImageryItem") -> None:
-        item_self_link = next((feat for feat in item.stac["links"] if feat["rel"] == "self"), None)
+    def add_item(self, item: Dict[Any, Any]) -> None:
+
+        item_self_link = next((feat for feat in item["links"] if feat["rel"] == "self"), None)
         if item_self_link:
             self.add_link(href=item_self_link["href"])
-        self.update_temporal_extent(item.stac["properties"]["start_datetime"], item.stac["properties"]["end_datetime"])
-        self.update_spatial_extent(item.stac["bbox"])
+        self.update_temporal_extent(item["properties"]["start_datetime"], item["properties"]["end_datetime"])
+        self.update_spatial_extent(item["bbox"])
 
     def add_link(self, href: str, rel: str = "item", file_type: str = "application/json") -> None:
         self.stac["links"].append({"rel": rel, "href": href, "type": file_type})
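
With add_item now taking the item's raw STAC dictionary, callers pass item.stac (as create_stac.py and the test below do), and the TYPE_CHECKING-only import of ImageryItem is no longer needed. A minimal sketch of the shape add_item reads; the values are hypothetical:

from scripts.stac.imagery.collection import ImageryCollection

collection = ImageryCollection(title="Example title", description="Example description")
collection.add_item(
    {
        # add_item reads only "links", "properties" and "bbox"
        "links": [{"rel": "self", "href": "./item-1.json", "type": "application/json"}],
        "properties": {"start_datetime": "2021-01-27 00:00:00Z", "end_datetime": "2021-01-27 00:00:00Z"},
        "bbox": [174.0, -41.0, 175.0, -40.0],
    }
)
# appends an "item" link for ./item-1.json and updates the temporal/spatial extents
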
2 changes: 1 addition & 1 deletion scripts/stac/tests/collection_test.py
@@ -65,7 +65,7 @@ def test_add_item(mocker) -> None:  # type: ignore
     end_datetime = "2021-01-27 00:00:00Z"
     item.update_spatial(geometry, bbox)
     item.update_datetime(start_datetime, end_datetime)
-    collection.add_item(item)
+    collection.add_item(item.stac)
 
     assert {"rel": "item", "href": f"./{id_}.json", "type": "application/json"} in collection.stac["links"]
     assert collection.stac["extent"]["temporal"]["interval"] == [[start_datetime, end_datetime]]
14 changes: 6 additions & 8 deletions scripts/standardise_validate.py
@@ -31,6 +31,9 @@ def main() -> None:
     arguments = parser.parse_args()
 
     source = format_source(arguments.source)
+    start_datetime = format_date(arguments.start_datetime)
+    end_datetime = format_date(arguments.end_datetime)
+    collection_id = arguments.collection_id
 
     if is_argo():
         concurrency = 4
@@ -47,16 +50,11 @@ def main() -> None:
             continue
         gdalinfo_result = gdal_info(file)
         qa_file(file, srs, gdalinfo_result)
-        item = create_item(
-            file,
-            format_date(arguments.start_datetime),
-            format_date(arguments.end_datetime),
-            arguments.collection_id,
-            gdalinfo_result,
-        )
+        item = create_item(file, start_datetime, end_datetime, collection_id, gdalinfo_result)
+
         tmp_file_path = os.path.join("/tmp/", f"{item.stac['id']}.json")
         write(tmp_file_path, json.dumps(item.stac).encode("utf-8"))
-        get_log().info("imagery_stac_item_created", file=file)
+        get_log().info("stac item written to tmp", location=tmp_file_path)
 
 if __name__ == "__main__":
     main()

