diff --git a/sno/dataset2.py b/sno/dataset2.py index b9e349833..ace75b388 100644 --- a/sno/dataset2.py +++ b/sno/dataset2.py @@ -64,6 +64,13 @@ def _bytes(data): return data +def _text(data): + """data (str or bytes) -> str. Utf-8.""" + if isinstance(data, bytes): + return data.decode('utf8') + return data + + def find_blobs_in_tree(tree, max_depth=4): """ Recursively yields possible blobs in the given directory tree, @@ -395,33 +402,34 @@ def get_data_at(self, path, as_str=False, missing_ok=False): leaf = None if hasattr(leaf, "data"): - return leaf.data.decode('utf8') if as_str else leaf.data + return _text(leaf.data) if as_str else leaf.data elif missing_ok: return None raise KeyError(f"No data found at path {path}, type={type(leaf)}") @functools.lru_cache() def get_meta_item(self, path, missing_ok=True): - from . import dataset2_gpkg + from . import gpkg_adapter # These items are not stored, but generated from other items that are stored. - # TODO: Maybe move gpkg specific things out of the dataset2 interface. 
- if path == "gpkg_contents": - return dataset2_gpkg.v2_to_gpkg_contents(self) - elif path == "gpkg_geometry_columns": - return dataset2_gpkg.v2_to_gpkg_geometry_columns(self) - elif path == "gpkg_spatial_ref_sys": - return dataset2_gpkg.v2_to_gpkg_spatial_ref_sys(self) - elif path == "sqlite_table_info": - return dataset2_gpkg.v2_to_sqlite_table_info(self) - elif path == "gpkg_metadata" or path == "gpkg_metadata_reference": - return None + if path in gpkg_adapter.GPKG_META_ITEMS: + return gpkg_adapter.get_meta_item(self, path) content_is_str = not path.startswith("legend/") return self.get_data_at( self.META_PATH + path, as_str=content_is_str, missing_ok=missing_ok ) + def get_srs_definition(self, srs_name): + """Return the SRS definition stored with the given name.""" + return self.get_meta_item(f"srs/{srs_name}.wkt") + + def srs_definitions(self): + """Yield (name, definition) pairs for all stored srs definitions.""" + for blob in find_blobs_in_tree(self.tree / self.SRS_PATH): + # -4 -> Remove ".wkt" + yield blob.name[:-4], _text(blob.data) + @functools.lru_cache() def get_legend(self, legend_hash=None, *, path=None): """Load the legend with the given hash / at the given path from this dataset.""" @@ -559,7 +567,7 @@ def repo_path(self, rel_path): return f"{self.path}/{rel_path}" def import_iter_meta_blobs(self, repo, source): - schema = source.get_v2_schema() + schema = source.schema meta_blobs = [ (self.VERSION_PATH, self.VERSION_IMPORT), (self.TITLE_PATH, source.get_meta_item("title")), @@ -568,18 +576,15 @@ def import_iter_meta_blobs(self, repo, source): self.encode_legend(schema.legend), ] - # TODO - tidy up SRS ID code. 
- for srs in source.get_meta_item("gpkg_spatial_ref_sys"): - meta_blobs.append( - (f"{self.SRS_PATH}EPSG:{srs['srs_id']}.wkt", srs["definition"]) - ) + for path, definition in source.srs_definitions(): + meta_blobs.append((f"{self.SRS_PATH}{path}.wkt", definition)) for meta_path, meta_content in meta_blobs: if meta_content is not None: yield self.repo_path(meta_path), _bytes(meta_content) def import_iter_feature_blobs(self, resultset, source): - schema = source.get_v2_schema() + schema = source.schema for feature in resultset: feature_path, feature_content = self.encode_feature(feature, schema) yield self.repo_path(feature_path), feature_content diff --git a/sno/dataset2_gpkg.py b/sno/dataset2_gpkg.py deleted file mode 100644 index 4d61ff98a..000000000 --- a/sno/dataset2_gpkg.py +++ /dev/null @@ -1,182 +0,0 @@ -import re -from .dataset2 import Schema, ColumnSchema - - -def v2_to_gpkg_contents(dataset2): - """Generate a gpkg_contents meta item from a dataset v2""" - geom_columns = _get_geometry_columns(dataset2.schema) - is_spatial = bool(geom_columns) - result = { - "identifier": dataset2.get_meta_item("title"), - "description": dataset2.get_meta_item("description"), - "table_name": dataset2.tree.name, - "data_type": "features" if is_spatial else "attributes", - } - if is_spatial: - result["srs_id"] = srs_str_to_int( - geom_columns[0].extra_type_info["geometrySRS"] - ) - return result - - -def v2_to_gpkg_geometry_columns(dataset2): - """Generate a gpkg_geometry_columns meta item from a dataset v2""" - geom_columns = _get_geometry_columns(dataset2.schema) - if not geom_columns: - return None - - geom_column = geom_columns[0] - type_name, *zm = geom_column.extra_type_info["geometryType"].split(" ", 1) - srs_id = srs_str_to_int(geom_column.extra_type_info["geometrySRS"]) - zm = zm[0] if zm else "" - z = 1 if "Z" in zm else 0 - m = 1 if "M" in zm else 0 - return { - "table_name": dataset2.tree.name, - "column_name": geom_column.name, - "geometry_type_name": type_name, 
- "srs_id": srs_id, - "z": z, - "m": m, - } - - -def v2_to_gpkg_spatial_ref_sys(dataset2): - """Generate a gpkg_spatial_ref_sys meta item from a dataset v2""" - geom_columns = _get_geometry_columns(dataset2.schema) - if not geom_columns: - return [] - - srs_str = geom_columns[0].extra_type_info["geometrySRS"] - srs_id = srs_str_to_int(srs_str) - definition = dataset2.get_meta_item(f"srs/{srs_str}.wkt") - # This should be more complicated too. - # TODO: srs_name, description. - return [ - { - "srs_name": srs_str, # This name is not quite right. - "definition": definition, - "organization": "EPSG", - "srs_id": srs_id, - "organization_coordsys_id": srs_id, - } - ] - - -def v2_to_sqlite_table_info(dataset2): - return [_columnschema_to_gpkg(i, col) for i, col in enumerate(dataset2.schema)] - - -def _get_geometry_columns(schema): - return [c for c in schema.columns if c.data_type == "geometry"] - - -def srs_str_to_int(srs_str): - # This should be more complicated. - if srs_str.startswith("EPSG:"): - srs_str = srs_str[5:] - if srs_str.isdigit(): - return int(srs_str) - raise ValueError(f"Can't parse SRS ID: {srs_str}") - - -def srs_int_to_str(srs_int): - # This should be more complicated - return f"EPSG:{srs_int}" - - -def gpkg_to_v2_schema(sqlite_table_info, gpkg_geometry_columns, id_salt): - """Generate a v2 Schema from the given gpkg meta items.""" - return Schema( - [ - _gkpg_to_columnschema(col, gpkg_geometry_columns, id_salt) - for col in sorted(sqlite_table_info, key=_sort_by_cid) - ] - ) - - -def _sort_by_cid(sqlite_col_info): - return sqlite_col_info["cid"] - - -def _gkpg_to_columnschema(sqlite_col_info, gpkg_geometry_columns, id_salt): - name = sqlite_col_info["name"] - pk_index = 0 if sqlite_col_info["pk"] == 1 else None - if gpkg_geometry_columns and name == gpkg_geometry_columns["column_name"]: - data_type, extra_type_info = _gkpg_geometry_columns_to_v2_type( - gpkg_geometry_columns - ) - else: - data_type, extra_type_info = 
gpkg_type_to_v2_type(sqlite_col_info["type"]) - - col_id = ColumnSchema.deterministic_id(name, data_type, id_salt) - return ColumnSchema(col_id, name, data_type, pk_index, **extra_type_info) - - -def _columnschema_to_gpkg(cid, column_schema): - is_pk = 1 if column_schema.pk_index is not None else 0 - return { - "cid": cid, - "name": column_schema.name, - "pk": is_pk, - "type": v2_type_to_gpkg_type(column_schema), - "notnull": 0, - "dflt_value": None, - } - - -_GPKG_TYPE_TO_V2_TYPE = { - "SMALLINT": ("integer", {"size": 16}), - "MEDIUMINT": ("integer", {"size": 32}), - "INTEGER": ("integer", {"size": 64}), - "REAL": ("float", {"size": 32}), - "FLOAT": ("float", {"size": 32}), - "DOUBLE": ("float", {"size": 64}), -} - - -_V2_TYPE_TO_GPKG_TYPE = { - "integer": {0: "INTEGER", 16: "SMALLINT", 32: "MEDIUMINT", 64: "INTEGER"}, - "float": {0: "FLOAT", 32: "FLOAT", 64: "DOUBLE"}, -} - - -def gpkg_type_to_v2_type(gkpg_type): - """Convert a gpkg type to v2 schema type.""" - m = re.match(r"^(TEXT|BLOB)\(([0-9]+)\)$", gkpg_type) - if m: - return m.group(1).lower(), {"length": int(m.group(2))} - v2_type = _GPKG_TYPE_TO_V2_TYPE.get(gkpg_type) - if v2_type is None: - v2_type = (gkpg_type.lower(), {}) - return v2_type - - -def _gkpg_geometry_columns_to_v2_type(ggc): - geometry_type = ggc["geometry_type_name"] - z = "Z" if ggc["z"] else "" - m = "M" if ggc["m"] else "" - srs_id = ggc["srs_id"] - extra_type_info = { - "geometryType": f"{geometry_type} {z}{m}".strip(), - "geometrySRS": srs_int_to_str(srs_id), - } - return "geometry", extra_type_info - - -def v2_type_to_gpkg_type(column_schema): - """Convert a v2 schema type to a gpkg type.""" - v2_type = column_schema.data_type - extra_type_info = column_schema.extra_type_info - if column_schema.data_type == "geometry": - return extra_type_info["geometryType"].split(" ", 1)[0] - - gpkg_types = _V2_TYPE_TO_GPKG_TYPE.get(v2_type) - if gpkg_types: - return gpkg_types.get(extra_type_info.get("size", 0)) - - length = 
extra_type_info.get("length", None) - if length: - return f"{v2_type.upper()}({length})" - - return v2_type.upper() diff --git a/sno/gpkg_adapter.py b/sno/gpkg_adapter.py new file mode 100644 index 000000000..d8d824239 --- /dev/null +++ b/sno/gpkg_adapter.py @@ -0,0 +1,246 @@ +import re +from .dataset2 import Schema, ColumnSchema +from osgeo.osr import SpatialReference + + +GPKG_META_ITEMS = ( + "gpkg_contents", + "gpkg_geometry_columns", + "gpkg_spatial_ref_sys", + "sqlite_table_info", + "gpkg_metadata", + "gpkg_metadata_reference", +) + + +def get_meta_item(dataset, path): + if path not in GPKG_META_ITEMS: + raise KeyError(f"Not a gpkg meta_item: {path}") + + if path == "gpkg_contents": + return generate_gpkg_contents(dataset) + elif path == "gpkg_geometry_columns": + return generate_gpkg_geometry_columns(dataset) + elif path == "gpkg_spatial_ref_sys": + return generate_gpkg_spatial_ref_sys(dataset) + elif path == "sqlite_table_info": + return generate_sqlite_table_info(dataset) + elif path == "gpkg_metadata" or path == "gpkg_metadata_reference": + return None + + +def generate_gpkg_contents(dataset2): + """Generate a gpkg_contents meta item from a dataset.""" + gpkg_spatial_ref_sys = dataset2.get_meta_item("gpkg_spatial_ref_sys") + is_spatial = bool(gpkg_spatial_ref_sys) + + result = { + "identifier": dataset2.get_meta_item("title"), + "description": dataset2.get_meta_item("description"), + "table_name": dataset2.tree.name, + "data_type": "features" if is_spatial else "attributes", + } + if is_spatial: + result["srs_id"] = gpkg_spatial_ref_sys[0]["srs_id"] + return result + + +def generate_gpkg_geometry_columns(dataset2): + """Generate a gpkg_geometry_columns meta item from a dataset.""" + geom_columns = _get_geometry_columns(dataset2.schema) + if not geom_columns: + return None + + geometry_type = geom_columns[0].extra_type_info["geometryType"] + type_name, *zm = geometry_type.split(" ", 1) + zm = zm[0] if zm else "" + z = 1 if "Z" in zm else 0 + m = 1 if "M" 
in zm else 0 + gpkg_spatial_ref_sys = dataset2.get_meta_item("gpkg_spatial_ref_sys") + return { + "table_name": dataset2.tree.name, + "column_name": geom_columns[0].name, + "geometry_type_name": type_name, + "srs_id": gpkg_spatial_ref_sys[0]["srs_id"], + "z": z, + "m": m, + } + + +def generate_gpkg_spatial_ref_sys(dataset): + """Generate a gpkg_spatial_ref_sys meta item from a dataset.""" + geom_columns = _get_geometry_columns(dataset.schema) + if not geom_columns: + return [] + + srs_pathname = geom_columns[0].extra_type_info["geometrySRS"] + definition = dataset.get_srs_definition(srs_pathname) + return wkt_to_gpkg_spatial_ref_sys(definition) + + +DEFAULT_GPKG_SPATIAL_REF_SYS = [ + { + "srs_name": "Unknown CRS", + "definition": "", + "organization": "EPSG", + "srs_id": 0, + "organization_coordsys_id": 0, + "description": None, + } +] + + +def wkt_to_gpkg_spatial_ref_sys(wkt): + """Given a WKT srs definition, generate a gpkg_spatial_ref_sys meta item.""" + return _gpkg_spatial_ref_sys(SpatialReference(wkt), wkt) + + +def osgeo_to_gpkg_spatial_ref_sys(spatial_ref): + """Given an osgeo SpatialReference, generate a gpkg_spatial_ref_sys meta item.""" + + return _gpkg_spatial_ref_sys(spatial_ref, spatial_ref.ExportToWkt()) + + +def _gpkg_spatial_ref_sys(spatial_ref, wkt): + spatial_ref.AutoIdentifyEPSG() + return [ + { + "srs_name": spatial_ref.GetName(), + "definition": wkt, + "organization": spatial_ref.GetAuthorityName(None), + "srs_id": spatial_ref.GetAuthorityCode(None), + "organization_coordsys_id": spatial_ref.GetAuthorityCode(None), + "description": None, + } + ] + + +DEFAULT_SRS_STR = "EPSG:0" + + +def wkt_to_srs_str(wkt): + """Given a WKT srs definition, generate a sensible name for it.""" + return osgeo_to_srs_str(SpatialReference(wkt)) + + +def osgeo_to_srs_str(spatial_ref): + """Given a osgeo SpatialReference, generate a sensible name for it.""" + auth_name = spatial_ref.GetAuthorityName(None) + auth_code = spatial_ref.GetAuthorityCode(None) + return 
f"{auth_name}:{auth_code}" + + +def generate_sqlite_table_info(dataset): + """Generate a sqlite_table_info meta item from a dataset.""" + return [_columnschema_to_gpkg(i, col) for i, col in enumerate(dataset.schema)] + + +def _get_geometry_columns(schema): + return [c for c in schema.columns if c.data_type == "geometry"] + + +def gpkg_to_v2_schema( + sqlite_table_info, gpkg_geometry_columns, gpkg_spatial_ref_sys, id_salt +): + """Generate a v2 Schema from the given gpkg meta items.""" + return Schema( + [ + _gpkg_to_column_schema( + col, gpkg_geometry_columns, gpkg_spatial_ref_sys, id_salt + ) + for col in sorted(sqlite_table_info, key=_sort_by_cid) + ] + ) + + +def _sort_by_cid(sqlite_col_info): + return sqlite_col_info["cid"] + + +def _gpkg_to_column_schema( + sqlite_col_info, gpkg_geometry_columns, gpkg_spatial_ref_sys, id_salt +): + name = sqlite_col_info["name"] + pk_index = 0 if sqlite_col_info["pk"] == 1 else None + if gpkg_geometry_columns and name == gpkg_geometry_columns["column_name"]: + data_type, extra_type_info = _gkpg_geometry_columns_to_v2_type( + gpkg_geometry_columns, gpkg_spatial_ref_sys, + ) + else: + data_type, extra_type_info = gpkg_type_to_v2_type(sqlite_col_info["type"]) + + col_id = ColumnSchema.deterministic_id(name, data_type, id_salt) + return ColumnSchema(col_id, name, data_type, pk_index, **extra_type_info) + + +def _columnschema_to_gpkg(cid, column_schema): + is_pk = 1 if column_schema.pk_index is not None else 0 + return { + "cid": cid, + "name": column_schema.name, + "pk": is_pk, + "type": v2_type_to_gpkg_type(column_schema), + "notnull": 0, + "dflt_value": None, + } + + +_GPKG_TYPE_TO_V2_TYPE = { + "SMALLINT": ("integer", {"size": 16}), + "MEDIUMINT": ("integer", {"size": 32}), + "INTEGER": ("integer", {"size": 64}), + "REAL": ("float", {"size": 32}), + "FLOAT": ("float", {"size": 32}), + "DOUBLE": ("float", {"size": 64}), +} + + +_V2_TYPE_TO_GPKG_TYPE = { + "integer": {0: "INTEGER", 16: "SMALLINT", 32: "MEDIUMINT", 64: "INTEGER"}, 
+ "float": {0: "FLOAT", 32: "FLOAT", 64: "DOUBLE"}, +} + + +def gpkg_type_to_v2_type(gkpg_type): + """Convert a gpkg type to v2 schema type.""" + m = re.match(r"^(TEXT|BLOB)\(([0-9]+)\)$", gkpg_type) + if m: + return m.group(1).lower(), {"length": int(m.group(2))} + v2_type = _GPKG_TYPE_TO_V2_TYPE.get(gkpg_type) + if v2_type is None: + v2_type = (gkpg_type.lower(), {}) + return v2_type + + +def _gkpg_geometry_columns_to_v2_type(ggc, gsrs): + geometry_type = ggc["geometry_type_name"] + z = "Z" if ggc["z"] else "" + m = "M" if ggc["m"] else "" + + srs_str = DEFAULT_SRS_STR + if gsrs and gsrs[0]["definition"]: + srs_str = wkt_to_srs_str(gsrs[0]["definition"]) + + extra_type_info = { + "geometryType": f"{geometry_type} {z}{m}".strip(), + "geometrySRS": srs_str, + } + return "geometry", extra_type_info + + +def v2_type_to_gpkg_type(column_schema): + """Convert a v2 schema type to a gpkg type.""" + v2_type = column_schema.data_type + extra_type_info = column_schema.extra_type_info + if column_schema.data_type == "geometry": + return extra_type_info["geometryType"].split(" ", 1)[0] + + gpkg_types = _V2_TYPE_TO_GPKG_TYPE.get(v2_type) + if gpkg_types: + return gpkg_types.get(extra_type_info.get("size", 0)) + + length = extra_type_info.get("length", None) + if length: + return f"{v2_type.upper()}({length})" + + return v2_type.upper() diff --git a/sno/init.py b/sno/init.py index c57a6c4c0..13d85e8cd 100644 --- a/sno/init.py +++ b/sno/init.py @@ -26,6 +26,12 @@ from .output_util import dump_json_output, get_input_mode, InputMode from .timestamps import datetime_to_iso8601_utc from .utils import ungenerator +from .gpkg_adapter import ( + osgeo_to_gpkg_spatial_ref_sys, + DEFAULT_GPKG_SPATIAL_REF_SYS, + osgeo_to_srs_str, + DEFAULT_SRS_STR, +) # This defines what formats are allowed, as well as mapping @@ -459,6 +465,23 @@ def get_meta_item(self, key): elif key == "gpkg_spatial_ref_sys": return self.get_meta_spatial_ref_sys() + def get_srs_definition(self, srs_name): + if 
self.is_spatial and srs_name == self._srs_name(): + return self._srs_definition() + raise KeyError(srs_name) + + def srs_definitions(self): + if self.is_spatial: + yield (self._srs_name(), self._srs_definition()) + + def _srs_name(self): + srs = self.ogrlayer.GetSpatialRef() + return osgeo_to_srs_str(srs) if srs else DEFAULT_SRS_STR + + def _srs_definition(self): + srs = self.ogrlayer.GetSpatialRef() + return srs.ExportToWkt() if srs else "" + def _get_meta_geometry_type(self): # remove Z/M components ogr_geom_type = ogr.GT_Flatten(self.ogrlayer.GetGeomType()) @@ -543,8 +566,9 @@ def _field_to_v2_column_schema(self, fd): ColumnSchema.new_id(), fd.GetName(), data_type, None, **extra_type_info ) + @property @functools.lru_cache(maxsize=1) - def get_v2_schema(self): + def schema(self): from .dataset2 import Schema, ColumnSchema ld = self.ogrlayer.GetLayerDefn() @@ -590,18 +614,12 @@ def get_meta_column_info(self): "pk": int(name == self.primary_key), } - @ungenerator(list) def get_meta_spatial_ref_sys(self): srs = self.ogrlayer.GetSpatialRef() - srid = self._get_meta_srid() - yield { - 'srs_name': srs.GetName() if srs else 'Unknown CRS', - 'srs_id': srid, - 'organization': 'EPSG', - 'organization_coordsys_id': srid, - 'definition': srs.ExportToWkt() if srs else '', - 'description': None, - } + if srs: + return osgeo_to_gpkg_spatial_ref_sys(srs) + else: + return DEFAULT_GPKG_SPATIAL_REF_SYS _KNOWN_METADATA_URIS = { 'GDALMultiDomainMetadata': 'http://gdal.org', diff --git a/sno/upgrade/upgrade_02_05.py b/sno/upgrade/upgrade_02_05.py index 2bad580c2..eaf31a85d 100644 --- a/sno/upgrade/upgrade_02_05.py +++ b/sno/upgrade/upgrade_02_05.py @@ -8,7 +8,7 @@ from sno.core import walk_tree from sno.dataset1 import Dataset1 -from sno.dataset2_gpkg import gpkg_to_v2_schema +from sno.gpkg_adapter import gpkg_to_v2_schema, wkt_to_srs_str from sno.fast_import import fast_import_tables @@ -126,6 +126,8 @@ def _find_datasets(source_tree): class ImportV1Dataset: + # TODO: make 
ImportV1Dataset the same class as Dataset1 - they are almost the same already. + def __init__(self, dataset): assert dataset.version == "1.0" self.dataset = dataset @@ -133,12 +135,17 @@ def __init__(self, dataset): self.table = self.path self.source = "v1-sno-repo" + @property @functools.lru_cache(maxsize=1) - def get_v2_schema(self): + def schema(self): sqlite_table_info = self.dataset.get_meta_item("sqlite_table_info") gpkg_geometry_columns = self.dataset.get_meta_item("gpkg_geometry_columns") + gpkg_spatial_ref_sys = self.dataset.get_meta_item("gpkg_spatial_ref_sys") return gpkg_to_v2_schema( - sqlite_table_info, gpkg_geometry_columns, id_salt=self.path + sqlite_table_info, + gpkg_geometry_columns, + gpkg_spatial_ref_sys, + id_salt=self.path, ) def get_meta_item(self, key): @@ -149,6 +156,12 @@ def get_meta_item(self, key): else: return self.dataset.get_meta_item(key) + def srs_definitions(self): + gsrs = self.dataset.get_meta_item("gpkg_spatial_ref_sys") + if gsrs and gsrs[0]["definition"]: + definition = gsrs[0]["definition"] + yield wkt_to_srs_str(definition), definition + def iter_features(self): for _, feature in self.dataset.features(): yield feature diff --git a/tests/test_structure.py b/tests/test_structure.py index da44d326c..043fbbba4 100644 --- a/tests/test_structure.py +++ b/tests/test_structure.py @@ -898,7 +898,7 @@ def test_write_feature_performance( "cast_primary_key": False, } else: - kwargs = {"schema": source.get_v2_schema()} + kwargs = {"schema": source.schema} def _write_feature(): feature = next(feature_iter)