From 308b748c1b790a9d5cc4efedae0330a8db01b46b Mon Sep 17 00:00:00 2001 From: Odd Eirik Igland Date: Fri, 19 Jan 2024 16:01:43 +0100 Subject: [PATCH 1/7] filter by date --- torchgeo/datasets/geo.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/torchgeo/datasets/geo.py b/torchgeo/datasets/geo.py index 4c98b048e62..234609d31b1 100644 --- a/torchgeo/datasets/geo.py +++ b/torchgeo/datasets/geo.py @@ -624,6 +624,7 @@ def __init__( # Populate the dataset index i = 0 + filename_regex = re.compile(self.filename_regex, re.VERBOSE) for filepath in self.files: try: with fiona.open(filepath) as src: @@ -640,6 +641,10 @@ def __init__( else: mint = 0 maxt = sys.maxsize + match = re.match(filename_regex, os.path.basename(filepath)) + if "date" in match.groupdict(): + date = match.group("date") + mint, maxt = disambiguate_timestamp(date, self.date_format) coords = (minx, maxx, miny, maxy, mint, maxt) self.index.insert(i, coords, filepath) i += 1 From c634f0ddf47204cf4a2f39f140ae4f65b84324a8 Mon Sep 17 00:00:00 2001 From: Odd Eirik Igland Date: Fri, 19 Jan 2024 16:30:47 +0100 Subject: [PATCH 2/7] added date format docstring --- torchgeo/datasets/geo.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/torchgeo/datasets/geo.py b/torchgeo/datasets/geo.py index 234609d31b1..91099295e39 100644 --- a/torchgeo/datasets/geo.py +++ b/torchgeo/datasets/geo.py @@ -588,6 +588,19 @@ def _load_warp_file(self, filepath: str) -> DatasetReader: class VectorDataset(GeoDataset): """Abstract base class for :class:`GeoDataset` stored as vector files.""" + #: Regular expression used to extract date from filename. + #: + #: The expression should use named groups. The expression may contain any number of + #: groups. The following groups are specifically searched for by the base class: + #: + #: * ``date``: used to calculate ``mint`` and ``maxt`` for ``index`` insertion + filename_regex = ".*" + + #: Date format string used to parse date from filename. 
+ #: + #: Not used if :attr:`filename_regex` does not contain a ``date`` group. + date_format = "%Y%m%d" + def __init__( self, paths: Union[str, Iterable[str]] = "data", From 3f8d1720ec23f94fc12ddd377fbf2d09bf2ca980 Mon Sep 17 00:00:00 2001 From: Odd Eirik Igland Date: Fri, 19 Jan 2024 18:01:41 +0100 Subject: [PATCH 3/7] added test --- ...{vector.geojson => vector_20240119T111111.geojson} | 0 tests/datasets/test_geo.py | 11 ++++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) rename tests/data/vector/{vector.geojson => vector_20240119T111111.geojson} (100%) diff --git a/tests/data/vector/vector.geojson b/tests/data/vector/vector_20240119T111111.geojson similarity index 100% rename from tests/data/vector/vector.geojson rename to tests/data/vector/vector_20240119T111111.geojson diff --git a/tests/datasets/test_geo.py b/tests/datasets/test_geo.py index 31b140e91f2..563d2e24677 100644 --- a/tests/datasets/test_geo.py +++ b/tests/datasets/test_geo.py @@ -2,6 +2,7 @@ # Licensed under the MIT License. 
import os import pickle +import sys from collections.abc import Iterable from pathlib import Path from typing import Optional, Union @@ -51,6 +52,10 @@ def __getitem__(self, query: BoundingBox) -> dict[str, BoundingBox]: class CustomVectorDataset(VectorDataset): filename_glob = "*.geojson" + date_format = "%Y%m%dT%H%M%S" + filename_regex = r""" + ^vector_(?P<date>\d{8}T\d{6})\.geojson + """ class CustomSentinelDataset(Sentinel2): @@ -305,6 +310,10 @@ def test_getitem(self, dataset: CustomVectorDataset) -> None: torch.tensor([0, 1], dtype=torch.uint8), ) + def test_time_index(self, dataset: CustomVectorDataset) -> None: + assert dataset.index.bounds[4] > 0 + assert dataset.index.bounds[5] < sys.maxsize + def test_getitem_multilabel(self, multilabel: CustomVectorDataset) -> None: x = multilabel[multilabel.bounds] assert isinstance(x, dict) @@ -316,7 +325,7 @@ def test_getitem_multilabel(self, multilabel: CustomVectorDataset) -> None: ) def test_empty_shapes(self, dataset: CustomVectorDataset) -> None: - query = BoundingBox(1.1, 1.9, 1.1, 1.9, 0, 0) + query = BoundingBox(1.1, 1.9, 1.1, 1.9, 0, sys.maxsize) x = dataset[query] assert torch.equal(x["mask"], torch.zeros(8, 8, dtype=torch.uint8)) From 8afad7148b0e1e778d94ddc365837cfd8203214d Mon Sep 17 00:00:00 2001 From: Odd Eirik Igland Date: Fri, 19 Jan 2024 18:08:30 +0100 Subject: [PATCH 4/7] moving match --- torchgeo/datasets/geo.py | 45 ++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/torchgeo/datasets/geo.py b/torchgeo/datasets/geo.py index 91099295e39..13c14367746 100644 --- a/torchgeo/datasets/geo.py +++ b/torchgeo/datasets/geo.py @@ -639,28 +639,29 @@ def __init__( i = 0 filename_regex = re.compile(self.filename_regex, re.VERBOSE) for filepath in self.files: - try: - with fiona.open(filepath) as src: - if crs is None: - crs = CRS.from_dict(src.crs) - - minx, miny, maxx, maxy = src.bounds - (minx, maxx), (miny, maxy) = fiona.transform.transform( - src.crs, 
crs.to_dict(), [minx, maxx], [miny, maxy] - ) - except fiona.errors.FionaValueError: - # Skip files that fiona is unable to read - continue - else: - mint = 0 - maxt = sys.maxsize - match = re.match(filename_regex, os.path.basename(filepath)) - if "date" in match.groupdict(): - date = match.group("date") - mint, maxt = disambiguate_timestamp(date, self.date_format) - coords = (minx, maxx, miny, maxy, mint, maxt) - self.index.insert(i, coords, filepath) - i += 1 + match = re.match(filename_regex, os.path.basename(filepath)) + if match is not None: + try: + with fiona.open(filepath) as src: + if crs is None: + crs = CRS.from_dict(src.crs) + + minx, miny, maxx, maxy = src.bounds + (minx, maxx), (miny, maxy) = fiona.transform.transform( + src.crs, crs.to_dict(), [minx, maxx], [miny, maxy] + ) + except fiona.errors.FionaValueError: + # Skip files that fiona is unable to read + continue + else: + mint = 0 + maxt = sys.maxsize + if "date" in match.groupdict(): + date = match.group("date") + mint, maxt = disambiguate_timestamp(date, self.date_format) + coords = (minx, maxx, miny, maxy, mint, maxt) + self.index.insert(i, coords, filepath) + i += 1 if i == 0: raise DatasetNotFoundError(self) From 4f5e53d64527eba7fd7ff83caa00d9b469c8a2df Mon Sep 17 00:00:00 2001 From: Odd Eirik Igland Date: Fri, 19 Jan 2024 18:10:26 +0100 Subject: [PATCH 5/7] simpler date --- .../{vector_20240119T111111.geojson => vector_2024.geojson} | 0 tests/datasets/test_geo.py | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename tests/data/vector/{vector_20240119T111111.geojson => vector_2024.geojson} (100%) diff --git a/tests/data/vector/vector_20240119T111111.geojson b/tests/data/vector/vector_2024.geojson similarity index 100% rename from tests/data/vector/vector_20240119T111111.geojson rename to tests/data/vector/vector_2024.geojson diff --git a/tests/datasets/test_geo.py b/tests/datasets/test_geo.py index 563d2e24677..0e1a4ce17e3 100644 --- a/tests/datasets/test_geo.py +++ 
b/tests/datasets/test_geo.py @@ -52,9 +52,9 @@ def __getitem__(self, query: BoundingBox) -> dict[str, BoundingBox]: class CustomVectorDataset(VectorDataset): filename_glob = "*.geojson" - date_format = "%Y%m%dT%H%M%S" + date_format = "%Y" filename_regex = r""" - ^vector_(?P<date>\d{8}T\d{6})\.geojson + ^vector_(?P<date>\d{4})\.geojson """ From 6fd4de78171efb9ae83c5556511d56e3175f312f Mon Sep 17 00:00:00 2001 From: Odd Eirik Igland Date: Fri, 19 Jan 2024 18:13:37 +0100 Subject: [PATCH 6/7] changed the name in data --- tests/data/vector/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data/vector/data.py b/tests/data/vector/data.py index 6c538a94fad..abf63c3ee66 100755 --- a/tests/data/vector/data.py +++ b/tests/data/vector/data.py @@ -56,5 +56,5 @@ ], } -with open("vector.geojson", "w") as f: +with open("vector_2024.geojson", "w") as f: json.dump(geojson, f) From 743b60919accf4a7faabf81567bb769085fbc875 Mon Sep 17 00:00:00 2001 From: Odd Eirik Igland Date: Fri, 19 Jan 2024 18:37:43 +0100 Subject: [PATCH 7/7] mypy --- torchgeo/datasets/geo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchgeo/datasets/geo.py b/torchgeo/datasets/geo.py index 13c14367746..611b363871d 100644 --- a/torchgeo/datasets/geo.py +++ b/torchgeo/datasets/geo.py @@ -654,8 +654,8 @@ def __init__( # Skip files that fiona is unable to read continue else: - mint = 0 - maxt = sys.maxsize + mint: float = 0 + maxt: float = sys.maxsize if "date" in match.groupdict(): date = match.group("date") mint, maxt = disambiguate_timestamp(date, self.date_format)